diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,34154 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 9747, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006155740227762388, + "grad_norm": 5.71875, + "learning_rate": 1.0238907849829352e-08, + "loss": 1.2493510246276855, + "step": 2 + }, + { + "epoch": 0.0012311480455524776, + "grad_norm": 11.4375, + "learning_rate": 3.071672354948805e-08, + "loss": 1.9457250833511353, + "step": 4 + }, + { + "epoch": 0.0018467220683287165, + "grad_norm": 15.25, + "learning_rate": 5.119453924914676e-08, + "loss": 1.6203389167785645, + "step": 6 + }, + { + "epoch": 0.0024622960911049553, + "grad_norm": 16.625, + "learning_rate": 7.167235494880547e-08, + "loss": 1.9556751251220703, + "step": 8 + }, + { + "epoch": 0.0030778701138811943, + "grad_norm": 16.375, + "learning_rate": 9.215017064846416e-08, + "loss": 2.2844934463500977, + "step": 10 + }, + { + "epoch": 0.003693444136657433, + "grad_norm": 14.5, + "learning_rate": 1.1262798634812287e-07, + "loss": 1.4806036949157715, + "step": 12 + }, + { + "epoch": 0.0043090181594336715, + "grad_norm": 38.75, + "learning_rate": 1.3310580204778158e-07, + "loss": 2.605030059814453, + "step": 14 + }, + { + "epoch": 0.0049245921822099106, + "grad_norm": 24.625, + "learning_rate": 1.5358361774744026e-07, + "loss": 1.8370442390441895, + "step": 16 + }, + { + "epoch": 0.00554016620498615, + "grad_norm": 16.875, + "learning_rate": 1.7406143344709898e-07, + "loss": 1.6506725549697876, + "step": 18 + }, + { + "epoch": 0.006155740227762389, + "grad_norm": 11.8125, + "learning_rate": 1.9453924914675767e-07, + "loss": 1.5109401941299438, + "step": 20 + }, + { + "epoch": 0.006771314250538628, + "grad_norm": 6.34375, + "learning_rate": 2.1501706484641638e-07, + "loss": 1.8124639987945557, + "step": 22 + }, + { + "epoch": 0.007386888273314866, + "grad_norm": 18.375, + "learning_rate": 2.354948805460751e-07, + "loss": 1.8528108596801758, + "step": 24 + }, + { + "epoch": 0.008002462296091105, + "grad_norm": 7.40625, + "learning_rate": 2.559726962457338e-07, + "loss": 1.7236714363098145, + "step": 26 + }, + { + "epoch": 0.008618036318867343, + "grad_norm": 26.625, + "learning_rate": 2.764505119453925e-07, + "loss": 1.6073427200317383, + "step": 28 + }, + { + "epoch": 0.009233610341643583, + "grad_norm": 7.03125, + "learning_rate": 2.969283276450512e-07, + "loss": 1.8255822658538818, + "step": 30 + }, + { + "epoch": 0.009849184364419821, + "grad_norm": 14.6875, + "learning_rate": 3.174061433447099e-07, + "loss": 1.890990138053894, + "step": 32 + }, + { + "epoch": 0.010464758387196061, + "grad_norm": 13.1875, + "learning_rate": 3.378839590443686e-07, + "loss": 2.264435291290283, + "step": 34 + }, + { + "epoch": 0.0110803324099723, + "grad_norm": 6.4375, + "learning_rate": 3.583617747440273e-07, + "loss": 1.5108485221862793, + "step": 36 + }, + { + "epoch": 0.011695906432748537, + "grad_norm": 40.0, + "learning_rate": 3.78839590443686e-07, + "loss": 1.935383677482605, + "step": 38 + }, + { + "epoch": 0.012311480455524777, + "grad_norm": 5.6875, + "learning_rate": 3.9931740614334473e-07, + "loss": 1.2184486389160156, + "step": 40 + }, + { + "epoch": 0.012927054478301015, + "grad_norm": 10.5625, + "learning_rate": 4.197952218430034e-07, + "loss": 1.2659871578216553, + "step": 42 + }, + { + "epoch": 0.013542628501077255, + "grad_norm": 18.625, + "learning_rate": 4.402730375426621e-07, + "loss": 2.0540785789489746, + "step": 44 + }, + { + "epoch": 0.014158202523853494, + "grad_norm": 12.375, + "learning_rate": 4.6075085324232084e-07, + "loss": 1.4828977584838867, + "step": 46 + }, + { + "epoch": 0.014773776546629732, + "grad_norm": 27.625, + "learning_rate": 4.812286689419796e-07, + "loss": 1.7720004320144653, + "step": 48 + }, + { + "epoch": 0.015389350569405972, + "grad_norm": 5.34375, + "learning_rate": 5.017064846416382e-07, + "loss": 1.3994762897491455, + "step": 50 + }, + { + "epoch": 0.01600492459218221, + "grad_norm": 22.625, + "learning_rate": 5.22184300341297e-07, + "loss": 1.7023072242736816, + "step": 52 + }, + { + "epoch": 0.01662049861495845, + "grad_norm": 55.25, + "learning_rate": 5.426621160409556e-07, + "loss": 2.0461301803588867, + "step": 54 + }, + { + "epoch": 0.017236072637734686, + "grad_norm": 22.375, + "learning_rate": 5.631399317406143e-07, + "loss": 1.5622308254241943, + "step": 56 + }, + { + "epoch": 0.017851646660510926, + "grad_norm": 10.8125, + "learning_rate": 5.83617747440273e-07, + "loss": 2.0164151191711426, + "step": 58 + }, + { + "epoch": 0.018467220683287166, + "grad_norm": 6.375, + "learning_rate": 6.040955631399317e-07, + "loss": 1.8669424057006836, + "step": 60 + }, + { + "epoch": 0.019082794706063406, + "grad_norm": 29.125, + "learning_rate": 6.245733788395904e-07, + "loss": 1.5571472644805908, + "step": 62 + }, + { + "epoch": 0.019698368728839642, + "grad_norm": 4.6875, + "learning_rate": 6.450511945392492e-07, + "loss": 1.204810619354248, + "step": 64 + }, + { + "epoch": 0.020313942751615882, + "grad_norm": 28.25, + "learning_rate": 6.655290102389079e-07, + "loss": 1.7420827150344849, + "step": 66 + }, + { + "epoch": 0.020929516774392122, + "grad_norm": 7.1875, + "learning_rate": 6.860068259385666e-07, + "loss": 1.3108768463134766, + "step": 68 + }, + { + "epoch": 0.02154509079716836, + "grad_norm": 24.625, + "learning_rate": 7.064846416382253e-07, + "loss": 2.4043214321136475, + "step": 70 + }, + { + "epoch": 0.0221606648199446, + "grad_norm": 13.4375, + "learning_rate": 7.269624573378839e-07, + "loss": 1.9473189115524292, + "step": 72 + }, + { + "epoch": 0.02277623884272084, + "grad_norm": 12.6875, + "learning_rate": 7.474402730375426e-07, + "loss": 1.7476930618286133, + "step": 74 + }, + { + "epoch": 0.023391812865497075, + "grad_norm": 39.75, + "learning_rate": 7.679180887372013e-07, + "loss": 2.1216416358947754, + "step": 76 + }, + { + "epoch": 0.024007386888273315, + "grad_norm": 41.0, + "learning_rate": 7.883959044368601e-07, + "loss": 2.0714941024780273, + "step": 78 + }, + { + "epoch": 0.024622960911049555, + "grad_norm": 10.8125, + "learning_rate": 8.088737201365188e-07, + "loss": 1.2925407886505127, + "step": 80 + }, + { + "epoch": 0.02523853493382579, + "grad_norm": 11.1875, + "learning_rate": 8.293515358361775e-07, + "loss": 1.6049776077270508, + "step": 82 + }, + { + "epoch": 0.02585410895660203, + "grad_norm": 37.5, + "learning_rate": 8.498293515358362e-07, + "loss": 1.492782473564148, + "step": 84 + }, + { + "epoch": 0.02646968297937827, + "grad_norm": 12.625, + "learning_rate": 8.703071672354949e-07, + "loss": 1.536354899406433, + "step": 86 + }, + { + "epoch": 0.02708525700215451, + "grad_norm": 26.125, + "learning_rate": 8.907849829351535e-07, + "loss": 1.6706409454345703, + "step": 88 + }, + { + "epoch": 0.027700831024930747, + "grad_norm": 23.0, + "learning_rate": 9.112627986348123e-07, + "loss": 2.148848533630371, + "step": 90 + }, + { + "epoch": 0.028316405047706987, + "grad_norm": 20.75, + "learning_rate": 9.31740614334471e-07, + "loss": 1.4585912227630615, + "step": 92 + }, + { + "epoch": 0.028931979070483227, + "grad_norm": 8.9375, + "learning_rate": 9.522184300341297e-07, + "loss": 1.9553436040878296, + "step": 94 + }, + { + "epoch": 0.029547553093259463, + "grad_norm": 8.1875, + "learning_rate": 9.726962457337883e-07, + "loss": 1.6098637580871582, + "step": 96 + }, + { + "epoch": 0.030163127116035703, + "grad_norm": 15.5625, + "learning_rate": 9.93174061433447e-07, + "loss": 1.8004854917526245, + "step": 98 + }, + { + "epoch": 0.030778701138811943, + "grad_norm": 9.25, + "learning_rate": 1.0136518771331057e-06, + "loss": 1.1443376541137695, + "step": 100 + }, + { + "epoch": 0.03139427516158818, + "grad_norm": 9.75, + "learning_rate": 1.0341296928327646e-06, + "loss": 1.6315455436706543, + "step": 102 + }, + { + "epoch": 0.03200984918436442, + "grad_norm": 10.75, + "learning_rate": 1.0546075085324233e-06, + "loss": 1.6523348093032837, + "step": 104 + }, + { + "epoch": 0.03262542320714066, + "grad_norm": 15.75, + "learning_rate": 1.075085324232082e-06, + "loss": 1.861860752105713, + "step": 106 + }, + { + "epoch": 0.0332409972299169, + "grad_norm": 13.6875, + "learning_rate": 1.0955631399317406e-06, + "loss": 1.5432401895523071, + "step": 108 + }, + { + "epoch": 0.03385657125269314, + "grad_norm": 10.4375, + "learning_rate": 1.1160409556313993e-06, + "loss": 1.820489525794983, + "step": 110 + }, + { + "epoch": 0.03447214527546937, + "grad_norm": 34.75, + "learning_rate": 1.136518771331058e-06, + "loss": 2.099825143814087, + "step": 112 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 18.875, + "learning_rate": 1.156996587030717e-06, + "loss": 2.152771472930908, + "step": 114 + }, + { + "epoch": 0.03570329332102185, + "grad_norm": 6.5625, + "learning_rate": 1.1774744027303754e-06, + "loss": 1.5281484127044678, + "step": 116 + }, + { + "epoch": 0.03631886734379809, + "grad_norm": 34.75, + "learning_rate": 1.197952218430034e-06, + "loss": 1.9934353828430176, + "step": 118 + }, + { + "epoch": 0.03693444136657433, + "grad_norm": 10.625, + "learning_rate": 1.2184300341296928e-06, + "loss": 1.680020809173584, + "step": 120 + }, + { + "epoch": 0.03755001538935057, + "grad_norm": 11.0, + "learning_rate": 1.2389078498293515e-06, + "loss": 1.525346279144287, + "step": 122 + }, + { + "epoch": 0.03816558941212681, + "grad_norm": 5.625, + "learning_rate": 1.2593856655290101e-06, + "loss": 1.124660849571228, + "step": 124 + }, + { + "epoch": 0.038781163434903045, + "grad_norm": 10.4375, + "learning_rate": 1.279863481228669e-06, + "loss": 1.4922833442687988, + "step": 126 + }, + { + "epoch": 0.039396737457679284, + "grad_norm": 51.5, + "learning_rate": 1.3003412969283277e-06, + "loss": 1.9254200458526611, + "step": 128 + }, + { + "epoch": 0.040012311480455524, + "grad_norm": 13.125, + "learning_rate": 1.3208191126279864e-06, + "loss": 1.4378312826156616, + "step": 130 + }, + { + "epoch": 0.040627885503231764, + "grad_norm": 13.625, + "learning_rate": 1.341296928327645e-06, + "loss": 1.788328766822815, + "step": 132 + }, + { + "epoch": 0.041243459526008004, + "grad_norm": 14.875, + "learning_rate": 1.3617747440273038e-06, + "loss": 1.807824969291687, + "step": 134 + }, + { + "epoch": 0.041859033548784244, + "grad_norm": 6.8125, + "learning_rate": 1.3822525597269625e-06, + "loss": 0.9935953617095947, + "step": 136 + }, + { + "epoch": 0.04247460757156048, + "grad_norm": 25.125, + "learning_rate": 1.4027303754266212e-06, + "loss": 1.6776387691497803, + "step": 138 + }, + { + "epoch": 0.04309018159433672, + "grad_norm": 26.75, + "learning_rate": 1.4232081911262799e-06, + "loss": 1.9442132711410522, + "step": 140 + }, + { + "epoch": 0.04370575561711296, + "grad_norm": 18.875, + "learning_rate": 1.4436860068259385e-06, + "loss": 1.4468404054641724, + "step": 142 + }, + { + "epoch": 0.0443213296398892, + "grad_norm": 10.0625, + "learning_rate": 1.4641638225255972e-06, + "loss": 1.5079838037490845, + "step": 144 + }, + { + "epoch": 0.04493690366266544, + "grad_norm": 15.375, + "learning_rate": 1.484641638225256e-06, + "loss": 1.6095199584960938, + "step": 146 + }, + { + "epoch": 0.04555247768544168, + "grad_norm": 11.0625, + "learning_rate": 1.5051194539249148e-06, + "loss": 2.0582830905914307, + "step": 148 + }, + { + "epoch": 0.046168051708217916, + "grad_norm": 12.5625, + "learning_rate": 1.5255972696245735e-06, + "loss": 1.5852106809616089, + "step": 150 + }, + { + "epoch": 0.04678362573099415, + "grad_norm": 31.25, + "learning_rate": 1.5460750853242322e-06, + "loss": 1.6878998279571533, + "step": 152 + }, + { + "epoch": 0.04739919975377039, + "grad_norm": 10.375, + "learning_rate": 1.5665529010238909e-06, + "loss": 1.7584779262542725, + "step": 154 + }, + { + "epoch": 0.04801477377654663, + "grad_norm": 15.0625, + "learning_rate": 1.5870307167235496e-06, + "loss": 1.3234977722167969, + "step": 156 + }, + { + "epoch": 0.04863034779932287, + "grad_norm": 5.46875, + "learning_rate": 1.6075085324232083e-06, + "loss": 1.4745519161224365, + "step": 158 + }, + { + "epoch": 0.04924592182209911, + "grad_norm": 24.0, + "learning_rate": 1.627986348122867e-06, + "loss": 1.8369395732879639, + "step": 160 + }, + { + "epoch": 0.04986149584487535, + "grad_norm": 7.75, + "learning_rate": 1.6484641638225254e-06, + "loss": 1.3853015899658203, + "step": 162 + }, + { + "epoch": 0.05047706986765158, + "grad_norm": 14.0625, + "learning_rate": 1.668941979522184e-06, + "loss": 1.332862377166748, + "step": 164 + }, + { + "epoch": 0.05109264389042782, + "grad_norm": 38.75, + "learning_rate": 1.6894197952218432e-06, + "loss": 1.777852177619934, + "step": 166 + }, + { + "epoch": 0.05170821791320406, + "grad_norm": 5.53125, + "learning_rate": 1.709897610921502e-06, + "loss": 1.8864566087722778, + "step": 168 + }, + { + "epoch": 0.0523237919359803, + "grad_norm": 13.6875, + "learning_rate": 1.7303754266211606e-06, + "loss": 1.484859585762024, + "step": 170 + }, + { + "epoch": 0.05293936595875654, + "grad_norm": 55.0, + "learning_rate": 1.7508532423208193e-06, + "loss": 1.9417674541473389, + "step": 172 + }, + { + "epoch": 0.05355493998153278, + "grad_norm": 20.5, + "learning_rate": 1.771331058020478e-06, + "loss": 1.4582123756408691, + "step": 174 + }, + { + "epoch": 0.05417051400430902, + "grad_norm": 14.75, + "learning_rate": 1.7918088737201367e-06, + "loss": 1.568121314048767, + "step": 176 + }, + { + "epoch": 0.054786088027085254, + "grad_norm": 17.25, + "learning_rate": 1.8122866894197953e-06, + "loss": 1.8054301738739014, + "step": 178 + }, + { + "epoch": 0.055401662049861494, + "grad_norm": 17.5, + "learning_rate": 1.832764505119454e-06, + "loss": 1.3522915840148926, + "step": 180 + }, + { + "epoch": 0.056017236072637734, + "grad_norm": 16.625, + "learning_rate": 1.8532423208191125e-06, + "loss": 1.4976370334625244, + "step": 182 + }, + { + "epoch": 0.056632810095413974, + "grad_norm": 20.875, + "learning_rate": 1.8737201365187712e-06, + "loss": 1.0564684867858887, + "step": 184 + }, + { + "epoch": 0.057248384118190214, + "grad_norm": 21.625, + "learning_rate": 1.8941979522184299e-06, + "loss": 1.8197346925735474, + "step": 186 + }, + { + "epoch": 0.057863958140966454, + "grad_norm": 20.75, + "learning_rate": 1.9146757679180888e-06, + "loss": 1.5595773458480835, + "step": 188 + }, + { + "epoch": 0.05847953216374269, + "grad_norm": 17.625, + "learning_rate": 1.9351535836177475e-06, + "loss": 1.7840571403503418, + "step": 190 + }, + { + "epoch": 0.05909510618651893, + "grad_norm": 8.5625, + "learning_rate": 1.955631399317406e-06, + "loss": 1.4016996622085571, + "step": 192 + }, + { + "epoch": 0.05971068020929517, + "grad_norm": 6.53125, + "learning_rate": 1.976109215017065e-06, + "loss": 1.2504699230194092, + "step": 194 + }, + { + "epoch": 0.060326254232071407, + "grad_norm": 8.6875, + "learning_rate": 1.9965870307167235e-06, + "loss": 1.5754780769348145, + "step": 196 + }, + { + "epoch": 0.060941828254847646, + "grad_norm": 13.125, + "learning_rate": 2.0170648464163822e-06, + "loss": 1.3269094228744507, + "step": 198 + }, + { + "epoch": 0.061557402277623886, + "grad_norm": 5.1875, + "learning_rate": 2.037542662116041e-06, + "loss": 1.445792555809021, + "step": 200 + }, + { + "epoch": 0.062172976300400126, + "grad_norm": 26.375, + "learning_rate": 2.0580204778156996e-06, + "loss": 1.6902906894683838, + "step": 202 + }, + { + "epoch": 0.06278855032317636, + "grad_norm": 8.1875, + "learning_rate": 2.0784982935153583e-06, + "loss": 1.0760879516601562, + "step": 204 + }, + { + "epoch": 0.0634041243459526, + "grad_norm": 15.5625, + "learning_rate": 2.098976109215017e-06, + "loss": 1.7913198471069336, + "step": 206 + }, + { + "epoch": 0.06401969836872884, + "grad_norm": 15.9375, + "learning_rate": 2.1194539249146757e-06, + "loss": 1.7480218410491943, + "step": 208 + }, + { + "epoch": 0.06463527239150507, + "grad_norm": 12.6875, + "learning_rate": 2.1399317406143343e-06, + "loss": 1.8400408029556274, + "step": 210 + }, + { + "epoch": 0.06525084641428132, + "grad_norm": 23.75, + "learning_rate": 2.1604095563139935e-06, + "loss": 1.8554953336715698, + "step": 212 + }, + { + "epoch": 0.06586642043705755, + "grad_norm": 102.0, + "learning_rate": 2.180887372013652e-06, + "loss": 1.7532259225845337, + "step": 214 + }, + { + "epoch": 0.0664819944598338, + "grad_norm": 16.875, + "learning_rate": 2.201365187713311e-06, + "loss": 1.4390134811401367, + "step": 216 + }, + { + "epoch": 0.06709756848261003, + "grad_norm": 14.875, + "learning_rate": 2.2218430034129695e-06, + "loss": 1.714995265007019, + "step": 218 + }, + { + "epoch": 0.06771314250538628, + "grad_norm": 19.125, + "learning_rate": 2.242320819112628e-06, + "loss": 1.4827989339828491, + "step": 220 + }, + { + "epoch": 0.06832871652816251, + "grad_norm": 3.609375, + "learning_rate": 2.262798634812287e-06, + "loss": 1.4392755031585693, + "step": 222 + }, + { + "epoch": 0.06894429055093874, + "grad_norm": 8.125, + "learning_rate": 2.2832764505119456e-06, + "loss": 1.1972272396087646, + "step": 224 + }, + { + "epoch": 0.06955986457371499, + "grad_norm": 5.40625, + "learning_rate": 2.3037542662116043e-06, + "loss": 1.4912139177322388, + "step": 226 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 17.0, + "learning_rate": 2.324232081911263e-06, + "loss": 1.4952316284179688, + "step": 228 + }, + { + "epoch": 0.07079101261926747, + "grad_norm": 32.5, + "learning_rate": 2.3447098976109216e-06, + "loss": 1.7515941858291626, + "step": 230 + }, + { + "epoch": 0.0714065866420437, + "grad_norm": 28.625, + "learning_rate": 2.3651877133105803e-06, + "loss": 1.332948088645935, + "step": 232 + }, + { + "epoch": 0.07202216066481995, + "grad_norm": 15.25, + "learning_rate": 2.385665529010239e-06, + "loss": 1.3763008117675781, + "step": 234 + }, + { + "epoch": 0.07263773468759618, + "grad_norm": 6.0625, + "learning_rate": 2.4061433447098977e-06, + "loss": 1.3950610160827637, + "step": 236 + }, + { + "epoch": 0.07325330871037242, + "grad_norm": 14.9375, + "learning_rate": 2.4266211604095564e-06, + "loss": 1.6538128852844238, + "step": 238 + }, + { + "epoch": 0.07386888273314866, + "grad_norm": 10.9375, + "learning_rate": 2.447098976109215e-06, + "loss": 1.3810994625091553, + "step": 240 + }, + { + "epoch": 0.0744844567559249, + "grad_norm": 10.625, + "learning_rate": 2.4675767918088738e-06, + "loss": 1.2243789434432983, + "step": 242 + }, + { + "epoch": 0.07510003077870114, + "grad_norm": 7.125, + "learning_rate": 2.4880546075085325e-06, + "loss": 1.4241597652435303, + "step": 244 + }, + { + "epoch": 0.07571560480147738, + "grad_norm": 20.375, + "learning_rate": 2.508532423208191e-06, + "loss": 1.6715261936187744, + "step": 246 + }, + { + "epoch": 0.07633117882425362, + "grad_norm": 13.375, + "learning_rate": 2.52901023890785e-06, + "loss": 1.5806057453155518, + "step": 248 + }, + { + "epoch": 0.07694675284702986, + "grad_norm": 4.71875, + "learning_rate": 2.5494880546075085e-06, + "loss": 1.0935168266296387, + "step": 250 + }, + { + "epoch": 0.07756232686980609, + "grad_norm": 10.8125, + "learning_rate": 2.569965870307167e-06, + "loss": 1.5497374534606934, + "step": 252 + }, + { + "epoch": 0.07817790089258234, + "grad_norm": 10.0, + "learning_rate": 2.590443686006826e-06, + "loss": 1.7406679391860962, + "step": 254 + }, + { + "epoch": 0.07879347491535857, + "grad_norm": 23.375, + "learning_rate": 2.6109215017064846e-06, + "loss": 1.9079008102416992, + "step": 256 + }, + { + "epoch": 0.07940904893813482, + "grad_norm": 17.125, + "learning_rate": 2.6313993174061433e-06, + "loss": 1.521014928817749, + "step": 258 + }, + { + "epoch": 0.08002462296091105, + "grad_norm": 21.75, + "learning_rate": 2.6518771331058024e-06, + "loss": 1.6001818180084229, + "step": 260 + }, + { + "epoch": 0.08064019698368728, + "grad_norm": 26.125, + "learning_rate": 2.672354948805461e-06, + "loss": 1.6336616277694702, + "step": 262 + }, + { + "epoch": 0.08125577100646353, + "grad_norm": 13.625, + "learning_rate": 2.6928327645051198e-06, + "loss": 1.6789780855178833, + "step": 264 + }, + { + "epoch": 0.08187134502923976, + "grad_norm": 9.625, + "learning_rate": 2.7133105802047784e-06, + "loss": 1.3298799991607666, + "step": 266 + }, + { + "epoch": 0.08248691905201601, + "grad_norm": 10.6875, + "learning_rate": 2.733788395904437e-06, + "loss": 1.4433543682098389, + "step": 268 + }, + { + "epoch": 0.08310249307479224, + "grad_norm": 3.765625, + "learning_rate": 2.754266211604096e-06, + "loss": 1.3586629629135132, + "step": 270 + }, + { + "epoch": 0.08371806709756849, + "grad_norm": 12.5, + "learning_rate": 2.7747440273037545e-06, + "loss": 1.6898560523986816, + "step": 272 + }, + { + "epoch": 0.08433364112034472, + "grad_norm": 9.9375, + "learning_rate": 2.795221843003413e-06, + "loss": 1.2337279319763184, + "step": 274 + }, + { + "epoch": 0.08494921514312095, + "grad_norm": 7.125, + "learning_rate": 2.8156996587030715e-06, + "loss": 1.5503768920898438, + "step": 276 + }, + { + "epoch": 0.0855647891658972, + "grad_norm": 26.875, + "learning_rate": 2.83617747440273e-06, + "loss": 1.4721266031265259, + "step": 278 + }, + { + "epoch": 0.08618036318867343, + "grad_norm": 20.0, + "learning_rate": 2.856655290102389e-06, + "loss": 1.5436632633209229, + "step": 280 + }, + { + "epoch": 0.08679593721144968, + "grad_norm": 10.0, + "learning_rate": 2.8771331058020475e-06, + "loss": 1.6570696830749512, + "step": 282 + }, + { + "epoch": 0.08741151123422591, + "grad_norm": 37.0, + "learning_rate": 2.8976109215017066e-06, + "loss": 1.4481430053710938, + "step": 284 + }, + { + "epoch": 0.08802708525700216, + "grad_norm": 22.75, + "learning_rate": 2.9180887372013653e-06, + "loss": 2.1153812408447266, + "step": 286 + }, + { + "epoch": 0.0886426592797784, + "grad_norm": 16.5, + "learning_rate": 2.938566552901024e-06, + "loss": 1.0929564237594604, + "step": 288 + }, + { + "epoch": 0.08925823330255463, + "grad_norm": 13.25, + "learning_rate": 2.9590443686006827e-06, + "loss": 1.6294186115264893, + "step": 290 + }, + { + "epoch": 0.08987380732533087, + "grad_norm": 14.125, + "learning_rate": 2.9795221843003414e-06, + "loss": 1.1360838413238525, + "step": 292 + }, + { + "epoch": 0.0904893813481071, + "grad_norm": 12.3125, + "learning_rate": 3e-06, + "loss": 1.477271318435669, + "step": 294 + }, + { + "epoch": 0.09110495537088335, + "grad_norm": 8.6875, + "learning_rate": 2.9999997349793134e-06, + "loss": 1.6074985265731812, + "step": 296 + }, + { + "epoch": 0.09172052939365959, + "grad_norm": 18.125, + "learning_rate": 2.99999893991737e-06, + "loss": 1.2198758125305176, + "step": 298 + }, + { + "epoch": 0.09233610341643583, + "grad_norm": 15.375, + "learning_rate": 2.999997614814521e-06, + "loss": 1.467660903930664, + "step": 300 + }, + { + "epoch": 0.09295167743921207, + "grad_norm": 10.8125, + "learning_rate": 2.999995759671352e-06, + "loss": 1.0507367849349976, + "step": 302 + }, + { + "epoch": 0.0935672514619883, + "grad_norm": 27.375, + "learning_rate": 2.9999933744886825e-06, + "loss": 1.9891753196716309, + "step": 304 + }, + { + "epoch": 0.09418282548476455, + "grad_norm": 30.125, + "learning_rate": 2.9999904592675653e-06, + "loss": 1.7953660488128662, + "step": 306 + }, + { + "epoch": 0.09479839950754078, + "grad_norm": 3.65625, + "learning_rate": 2.9999870140092888e-06, + "loss": 0.8793841600418091, + "step": 308 + }, + { + "epoch": 0.09541397353031703, + "grad_norm": 42.75, + "learning_rate": 2.9999830387153745e-06, + "loss": 2.1185731887817383, + "step": 310 + }, + { + "epoch": 0.09602954755309326, + "grad_norm": 15.375, + "learning_rate": 2.9999785333875786e-06, + "loss": 1.90293288230896, + "step": 312 + }, + { + "epoch": 0.09664512157586949, + "grad_norm": 19.75, + "learning_rate": 2.9999734980278905e-06, + "loss": 1.8595056533813477, + "step": 314 + }, + { + "epoch": 0.09726069559864574, + "grad_norm": 6.125, + "learning_rate": 2.9999679326385347e-06, + "loss": 0.754072368144989, + "step": 316 + }, + { + "epoch": 0.09787626962142197, + "grad_norm": 4.65625, + "learning_rate": 2.9999618372219697e-06, + "loss": 1.2867162227630615, + "step": 318 + }, + { + "epoch": 0.09849184364419822, + "grad_norm": 18.125, + "learning_rate": 2.9999552117808872e-06, + "loss": 1.009380578994751, + "step": 320 + }, + { + "epoch": 0.09910741766697445, + "grad_norm": 3.28125, + "learning_rate": 2.9999480563182143e-06, + "loss": 1.360229253768921, + "step": 322 + }, + { + "epoch": 0.0997229916897507, + "grad_norm": 52.25, + "learning_rate": 2.999940370837111e-06, + "loss": 1.5169321298599243, + "step": 324 + }, + { + "epoch": 0.10033856571252693, + "grad_norm": 18.875, + "learning_rate": 2.999932155340973e-06, + "loss": 1.471400499343872, + "step": 326 + }, + { + "epoch": 0.10095413973530316, + "grad_norm": 14.1875, + "learning_rate": 2.999923409833428e-06, + "loss": 1.649963617324829, + "step": 328 + }, + { + "epoch": 0.10156971375807941, + "grad_norm": 16.0, + "learning_rate": 2.9999141343183392e-06, + "loss": 1.264378547668457, + "step": 330 + }, + { + "epoch": 0.10218528778085564, + "grad_norm": 15.1875, + "learning_rate": 2.9999043287998035e-06, + "loss": 1.2814195156097412, + "step": 332 + }, + { + "epoch": 0.10280086180363189, + "grad_norm": 10.875, + "learning_rate": 2.999893993282153e-06, + "loss": 1.59095299243927, + "step": 334 + }, + { + "epoch": 0.10341643582640812, + "grad_norm": 3.3125, + "learning_rate": 2.999883127769951e-06, + "loss": 1.2800519466400146, + "step": 336 + }, + { + "epoch": 0.10403200984918437, + "grad_norm": 12.375, + "learning_rate": 2.999871732267998e-06, + "loss": 1.1677653789520264, + "step": 338 + }, + { + "epoch": 0.1046475838719606, + "grad_norm": 19.0, + "learning_rate": 2.999859806781328e-06, + "loss": 1.6544604301452637, + "step": 340 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 10.875, + "learning_rate": 2.9998473513152085e-06, + "loss": 1.2596807479858398, + "step": 342 + }, + { + "epoch": 0.10587873191751308, + "grad_norm": 37.75, + "learning_rate": 2.999834365875139e-06, + "loss": 1.5575487613677979, + "step": 344 + }, + { + "epoch": 0.10649430594028932, + "grad_norm": 5.71875, + "learning_rate": 2.999820850466857e-06, + "loss": 1.3673763275146484, + "step": 346 + }, + { + "epoch": 0.10710987996306556, + "grad_norm": 8.0625, + "learning_rate": 2.999806805096332e-06, + "loss": 1.4206829071044922, + "step": 348 + }, + { + "epoch": 0.1077254539858418, + "grad_norm": 12.625, + "learning_rate": 2.9997922297697676e-06, + "loss": 1.0419559478759766, + "step": 350 + }, + { + "epoch": 0.10834102800861804, + "grad_norm": 5.0, + "learning_rate": 2.999777124493602e-06, + "loss": 1.1992754936218262, + "step": 352 + }, + { + "epoch": 0.10895660203139428, + "grad_norm": 12.6875, + "learning_rate": 2.999761489274507e-06, + "loss": 1.6422064304351807, + "step": 354 + }, + { + "epoch": 0.10957217605417051, + "grad_norm": 12.6875, + "learning_rate": 2.9997453241193892e-06, + "loss": 1.4691483974456787, + "step": 356 + }, + { + "epoch": 0.11018775007694676, + "grad_norm": 10.6875, + "learning_rate": 2.999728629035388e-06, + "loss": 1.451507806777954, + "step": 358 + }, + { + "epoch": 0.11080332409972299, + "grad_norm": 12.9375, + "learning_rate": 2.999711404029878e-06, + "loss": 1.5703644752502441, + "step": 360 + }, + { + "epoch": 0.11141889812249924, + "grad_norm": 17.375, + "learning_rate": 2.999693649110467e-06, + "loss": 1.541607141494751, + "step": 362 + }, + { + "epoch": 0.11203447214527547, + "grad_norm": 42.0, + "learning_rate": 2.999675364284999e-06, + "loss": 1.6094763278961182, + "step": 364 + }, + { + "epoch": 0.11265004616805172, + "grad_norm": 21.0, + "learning_rate": 2.999656549561548e-06, + "loss": 0.6293473243713379, + "step": 366 + }, + { + "epoch": 0.11326562019082795, + "grad_norm": 15.1875, + "learning_rate": 2.999637204948427e-06, + "loss": 1.6930108070373535, + "step": 368 + }, + { + "epoch": 0.11388119421360418, + "grad_norm": 20.75, + "learning_rate": 2.9996173304541787e-06, + "loss": 1.4504094123840332, + "step": 370 + }, + { + "epoch": 0.11449676823638043, + "grad_norm": 8.1875, + "learning_rate": 2.9995969260875816e-06, + "loss": 1.029111385345459, + "step": 372 + }, + { + "epoch": 0.11511234225915666, + "grad_norm": 17.625, + "learning_rate": 2.99957599185765e-06, + "loss": 1.711849331855774, + "step": 374 + }, + { + "epoch": 0.11572791628193291, + "grad_norm": 6.625, + "learning_rate": 2.999554527773629e-06, + "loss": 1.3091596364974976, + "step": 376 + }, + { + "epoch": 0.11634349030470914, + "grad_norm": 14.9375, + "learning_rate": 2.999532533845001e-06, + "loss": 1.4927258491516113, + "step": 378 + }, + { + "epoch": 0.11695906432748537, + "grad_norm": 8.3125, + "learning_rate": 2.9995100100814786e-06, + "loss": 1.2266597747802734, + "step": 380 + }, + { + "epoch": 0.11757463835026162, + "grad_norm": 11.25, + "learning_rate": 2.9994869564930125e-06, + "loss": 1.2330584526062012, + "step": 382 + }, + { + "epoch": 0.11819021237303785, + "grad_norm": 15.3125, + "learning_rate": 2.9994633730897832e-06, + "loss": 1.7261693477630615, + "step": 384 + }, + { + "epoch": 0.1188057863958141, + "grad_norm": 21.5, + "learning_rate": 2.99943925988221e-06, + "loss": 1.339095115661621, + "step": 386 + }, + { + "epoch": 0.11942136041859033, + "grad_norm": 17.25, + "learning_rate": 2.9994146168809423e-06, + "loss": 1.7015557289123535, + "step": 388 + }, + { + "epoch": 0.12003693444136658, + "grad_norm": 18.5, + "learning_rate": 2.9993894440968657e-06, + "loss": 1.0266611576080322, + "step": 390 + }, + { + "epoch": 0.12065250846414281, + "grad_norm": 20.875, + "learning_rate": 2.9993637415410987e-06, + "loss": 1.4492390155792236, + "step": 392 + }, + { + "epoch": 0.12126808248691905, + "grad_norm": 22.375, + "learning_rate": 2.9993375092249934e-06, + "loss": 1.4749621152877808, + "step": 394 + }, + { + "epoch": 0.12188365650969529, + "grad_norm": 6.4375, + "learning_rate": 2.999310747160138e-06, + "loss": 1.355834722518921, + "step": 396 + }, + { + "epoch": 0.12249923053247153, + "grad_norm": 13.5, + "learning_rate": 2.999283455358353e-06, + "loss": 1.307915210723877, + "step": 398 + }, + { + "epoch": 0.12311480455524777, + "grad_norm": 9.0625, + "learning_rate": 2.9992556338316925e-06, + "loss": 1.5071804523468018, + "step": 400 + }, + { + "epoch": 0.123730378578024, + "grad_norm": 13.25, + "learning_rate": 2.9992272825924454e-06, + "loss": 1.6163640022277832, + "step": 402 + }, + { + "epoch": 0.12434595260080025, + "grad_norm": 14.1875, + "learning_rate": 2.9991984016531344e-06, + "loss": 1.0656492710113525, + "step": 404 + }, + { + "epoch": 0.12496152662357649, + "grad_norm": 21.25, + "learning_rate": 2.999168991026518e-06, + "loss": 1.2286940813064575, + "step": 406 + }, + { + "epoch": 0.12557710064635272, + "grad_norm": 19.625, + "learning_rate": 2.9991390507255847e-06, + "loss": 1.4255913496017456, + "step": 408 + }, + { + "epoch": 0.12619267466912895, + "grad_norm": 6.90625, + "learning_rate": 2.99910858076356e-06, + "loss": 1.3530218601226807, + "step": 410 + }, + { + "epoch": 0.1268082486919052, + "grad_norm": 11.5, + "learning_rate": 2.9990775811539026e-06, + "loss": 1.2902710437774658, + "step": 412 + }, + { + "epoch": 0.12742382271468145, + "grad_norm": 11.6875, + "learning_rate": 2.9990460519103045e-06, + "loss": 1.5794827938079834, + "step": 414 + }, + { + "epoch": 0.12803939673745768, + "grad_norm": 25.25, + "learning_rate": 2.9990139930466934e-06, + "loss": 1.5939992666244507, + "step": 416 + }, + { + "epoch": 0.1286549707602339, + "grad_norm": 18.25, + "learning_rate": 2.9989814045772287e-06, + "loss": 0.8886748552322388, + "step": 418 + }, + { + "epoch": 0.12927054478301014, + "grad_norm": 14.4375, + "learning_rate": 2.9989482865163058e-06, + "loss": 1.484604835510254, + "step": 420 + }, + { + "epoch": 0.1298861188057864, + "grad_norm": 25.75, + "learning_rate": 2.9989146388785516e-06, + "loss": 1.5589027404785156, + "step": 422 + }, + { + "epoch": 0.13050169282856264, + "grad_norm": 11.3125, + "learning_rate": 2.9988804616788287e-06, + "loss": 1.100600004196167, + "step": 424 + }, + { + "epoch": 0.13111726685133887, + "grad_norm": 11.625, + "learning_rate": 2.998845754932234e-06, + "loss": 1.454401969909668, + "step": 426 + }, + { + "epoch": 0.1317328408741151, + "grad_norm": 17.875, + "learning_rate": 2.9988105186540964e-06, + "loss": 1.2516822814941406, + "step": 428 + }, + { + "epoch": 0.13234841489689136, + "grad_norm": 10.8125, + "learning_rate": 2.998774752859981e-06, + "loss": 1.4223829507827759, + "step": 430 + }, + { + "epoch": 0.1329639889196676, + "grad_norm": 13.875, + "learning_rate": 2.998738457565685e-06, + "loss": 1.5337071418762207, + "step": 432 + }, + { + "epoch": 0.13357956294244383, + "grad_norm": 17.0, + "learning_rate": 2.99870163278724e-06, + "loss": 1.8929377794265747, + "step": 434 + }, + { + "epoch": 0.13419513696522006, + "grad_norm": 29.375, + "learning_rate": 2.998664278540911e-06, + "loss": 1.9043139219284058, + "step": 436 + }, + { + "epoch": 0.1348107109879963, + "grad_norm": 28.625, + "learning_rate": 2.9986263948431986e-06, + "loss": 1.4659531116485596, + "step": 438 + }, + { + "epoch": 0.13542628501077256, + "grad_norm": 9.0, + "learning_rate": 2.998587981710835e-06, + "loss": 1.0244336128234863, + "step": 440 + }, + { + "epoch": 0.1360418590335488, + "grad_norm": 17.25, + "learning_rate": 2.9985490391607883e-06, + "loss": 1.6359564065933228, + "step": 442 + }, + { + "epoch": 0.13665743305632502, + "grad_norm": 7.875, + "learning_rate": 2.998509567210259e-06, + "loss": 1.3486268520355225, + "step": 444 + }, + { + "epoch": 0.13727300707910126, + "grad_norm": 12.6875, + "learning_rate": 2.9984695658766814e-06, + "loss": 1.4941446781158447, + "step": 446 + }, + { + "epoch": 0.1378885811018775, + "grad_norm": 25.75, + "learning_rate": 2.9984290351777252e-06, + "loss": 2.0870232582092285, + "step": 448 + }, + { + "epoch": 0.13850415512465375, + "grad_norm": 11.5, + "learning_rate": 2.9983879751312923e-06, + "loss": 1.317858099937439, + "step": 450 + }, + { + "epoch": 0.13911972914742998, + "grad_norm": 21.25, + "learning_rate": 2.9983463857555184e-06, + "loss": 1.5850346088409424, + "step": 452 + }, + { + "epoch": 0.13973530317020622, + "grad_norm": 20.875, + "learning_rate": 2.9983042670687745e-06, + "loss": 1.2323607206344604, + "step": 454 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 16.375, + "learning_rate": 2.9982616190896635e-06, + "loss": 1.528209924697876, + "step": 456 + }, + { + "epoch": 0.14096645121575868, + "grad_norm": 70.5, + "learning_rate": 2.9982184418370242e-06, + "loss": 1.527733564376831, + "step": 458 + }, + { + "epoch": 0.14158202523853494, + "grad_norm": 17.125, + "learning_rate": 2.9981747353299277e-06, + "loss": 1.887821912765503, + "step": 460 + }, + { + "epoch": 0.14219759926131118, + "grad_norm": 16.75, + "learning_rate": 2.998130499587679e-06, + "loss": 1.7688817977905273, + "step": 462 + }, + { + "epoch": 0.1428131732840874, + "grad_norm": 11.375, + "learning_rate": 2.9980857346298167e-06, + "loss": 1.4512908458709717, + "step": 464 + }, + { + "epoch": 0.14342874730686364, + "grad_norm": 21.625, + "learning_rate": 2.9980404404761143e-06, + "loss": 1.417380928993225, + "step": 466 + }, + { + "epoch": 0.1440443213296399, + "grad_norm": 15.625, + "learning_rate": 2.997994617146578e-06, + "loss": 1.804483413696289, + "step": 468 + }, + { + "epoch": 0.14465989535241613, + "grad_norm": 94.5, + "learning_rate": 2.997948264661447e-06, + "loss": 1.7305872440338135, + "step": 470 + }, + { + "epoch": 0.14527546937519237, + "grad_norm": 9.4375, + "learning_rate": 2.9979013830411973e-06, + "loss": 1.4537417888641357, + "step": 472 + }, + { + "epoch": 0.1458910433979686, + "grad_norm": 21.75, + "learning_rate": 2.9978539723065345e-06, + "loss": 1.5609411001205444, + "step": 474 + }, + { + "epoch": 0.14650661742074483, + "grad_norm": 12.5625, + "learning_rate": 2.997806032478402e-06, + "loss": 1.7527482509613037, + "step": 476 + }, + { + "epoch": 0.1471221914435211, + "grad_norm": 6.15625, + "learning_rate": 2.9977575635779727e-06, + "loss": 1.0448154211044312, + "step": 478 + }, + { + "epoch": 0.14773776546629733, + "grad_norm": 14.1875, + "learning_rate": 2.997708565626657e-06, + "loss": 1.3397457599639893, + "step": 480 + }, + { + "epoch": 0.14835333948907356, + "grad_norm": 11.5, + "learning_rate": 2.997659038646097e-06, + "loss": 1.3842558860778809, + "step": 482 + }, + { + "epoch": 0.1489689135118498, + "grad_norm": 8.8125, + "learning_rate": 2.9976089826581675e-06, + "loss": 1.3095613718032837, + "step": 484 + }, + { + "epoch": 0.14958448753462603, + "grad_norm": 13.9375, + "learning_rate": 2.997558397684981e-06, + "loss": 1.6145265102386475, + "step": 486 + }, + { + "epoch": 0.1502000615574023, + "grad_norm": 13.4375, + "learning_rate": 2.9975072837488783e-06, + "loss": 1.6280035972595215, + "step": 488 + }, + { + "epoch": 0.15081563558017852, + "grad_norm": 27.875, + "learning_rate": 2.9974556408724377e-06, + "loss": 1.5152499675750732, + "step": 490 + }, + { + "epoch": 0.15143120960295475, + "grad_norm": 24.125, + "learning_rate": 2.99740346907847e-06, + "loss": 1.25870680809021, + "step": 492 + }, + { + "epoch": 0.15204678362573099, + "grad_norm": 16.375, + "learning_rate": 2.997350768390019e-06, + "loss": 0.947930634021759, + "step": 494 + }, + { + "epoch": 0.15266235764850725, + "grad_norm": 36.0, + "learning_rate": 2.9972975388303636e-06, + "loss": 1.5841643810272217, + "step": 496 + }, + { + "epoch": 0.15327793167128348, + "grad_norm": 15.5625, + "learning_rate": 2.997243780423014e-06, + "loss": 1.7457714080810547, + "step": 498 + }, + { + "epoch": 0.1538935056940597, + "grad_norm": 11.5, + "learning_rate": 2.9971894931917164e-06, + "loss": 1.27567720413208, + "step": 500 + }, + { + "epoch": 0.15450907971683595, + "grad_norm": 9.8125, + "learning_rate": 2.9971346771604488e-06, + "loss": 1.5354039669036865, + "step": 502 + }, + { + "epoch": 0.15512465373961218, + "grad_norm": 13.3125, + "learning_rate": 2.997079332353425e-06, + "loss": 1.7307982444763184, + "step": 504 + }, + { + "epoch": 0.15574022776238844, + "grad_norm": 10.125, + "learning_rate": 2.9970234587950887e-06, + "loss": 1.5125446319580078, + "step": 506 + }, + { + "epoch": 0.15635580178516467, + "grad_norm": 10.3125, + "learning_rate": 2.996967056510121e-06, + "loss": 1.3819962739944458, + "step": 508 + }, + { + "epoch": 0.1569713758079409, + "grad_norm": 12.1875, + "learning_rate": 2.9969101255234336e-06, + "loss": 1.5607001781463623, + "step": 510 + }, + { + "epoch": 0.15758694983071714, + "grad_norm": 14.0625, + "learning_rate": 2.996852665860174e-06, + "loss": 1.8390923738479614, + "step": 512 + }, + { + "epoch": 0.15820252385349337, + "grad_norm": 9.9375, + "learning_rate": 2.9967946775457216e-06, + "loss": 1.5656869411468506, + "step": 514 + }, + { + "epoch": 0.15881809787626963, + "grad_norm": 10.3125, + "learning_rate": 2.9967361606056903e-06, + "loss": 1.39482581615448, + "step": 516 + }, + { + "epoch": 0.15943367189904586, + "grad_norm": 12.75, + "learning_rate": 2.996677115065927e-06, + "loss": 1.5271615982055664, + "step": 518 + }, + { + "epoch": 0.1600492459218221, + "grad_norm": 15.4375, + "learning_rate": 2.9966175409525118e-06, + "loss": 1.4671976566314697, + "step": 520 + }, + { + "epoch": 0.16066481994459833, + "grad_norm": 16.5, + "learning_rate": 2.9965574382917595e-06, + "loss": 1.8529220819473267, + "step": 522 + }, + { + "epoch": 0.16128039396737456, + "grad_norm": 13.125, + "learning_rate": 2.996496807110216e-06, + "loss": 1.4948711395263672, + "step": 524 + }, + { + "epoch": 0.16189596799015082, + "grad_norm": 12.0625, + "learning_rate": 2.996435647434664e-06, + "loss": 1.8076704740524292, + "step": 526 + }, + { + "epoch": 0.16251154201292706, + "grad_norm": 14.625, + "learning_rate": 2.9963739592921166e-06, + "loss": 1.5516061782836914, + "step": 528 + }, + { + "epoch": 0.1631271160357033, + "grad_norm": 17.125, + "learning_rate": 2.9963117427098225e-06, + "loss": 1.004725694656372, + "step": 530 + }, + { + "epoch": 0.16374269005847952, + "grad_norm": 6.8125, + "learning_rate": 2.996248997715261e-06, + "loss": 1.3982107639312744, + "step": 532 + }, + { + "epoch": 0.16435826408125578, + "grad_norm": 10.1875, + "learning_rate": 2.996185724336149e-06, + "loss": 1.3941655158996582, + "step": 534 + }, + { + "epoch": 0.16497383810403202, + "grad_norm": 18.25, + "learning_rate": 2.996121922600432e-06, + "loss": 1.4721497297286987, + "step": 536 + }, + { + "epoch": 0.16558941212680825, + "grad_norm": 22.75, + "learning_rate": 2.9960575925362933e-06, + "loss": 1.7670438289642334, + "step": 538 + }, + { + "epoch": 0.16620498614958448, + "grad_norm": 8.8125, + "learning_rate": 2.995992734172147e-06, + "loss": 1.3671045303344727, + "step": 540 + }, + { + "epoch": 0.16682056017236072, + "grad_norm": 10.8125, + "learning_rate": 2.9959273475366404e-06, + "loss": 1.5833959579467773, + "step": 542 + }, + { + "epoch": 0.16743613419513698, + "grad_norm": 24.5, + "learning_rate": 2.995861432658656e-06, + "loss": 1.4065296649932861, + "step": 544 + }, + { + "epoch": 0.1680517082179132, + "grad_norm": 9.6875, + "learning_rate": 2.995794989567307e-06, + "loss": 0.9803017973899841, + "step": 546 + }, + { + "epoch": 0.16866728224068944, + "grad_norm": 21.875, + "learning_rate": 2.995728018291943e-06, + "loss": 1.305236577987671, + "step": 548 + }, + { + "epoch": 0.16928285626346568, + "grad_norm": 10.3125, + "learning_rate": 2.995660518862144e-06, + "loss": 1.4379463195800781, + "step": 550 + }, + { + "epoch": 0.1698984302862419, + "grad_norm": 29.0, + "learning_rate": 2.995592491307726e-06, + "loss": 1.6318714618682861, + "step": 552 + }, + { + "epoch": 0.17051400430901817, + "grad_norm": 25.0, + "learning_rate": 2.995523935658735e-06, + "loss": 1.435588002204895, + "step": 554 + }, + { + "epoch": 0.1711295783317944, + "grad_norm": 38.0, + "learning_rate": 2.9954548519454537e-06, + "loss": 1.6767455339431763, + "step": 556 + }, + { + "epoch": 0.17174515235457063, + "grad_norm": 11.3125, + "learning_rate": 2.995385240198395e-06, + "loss": 1.7289783954620361, + "step": 558 + }, + { + "epoch": 0.17236072637734687, + "grad_norm": 21.375, + "learning_rate": 2.9953151004483084e-06, + "loss": 1.3982090950012207, + "step": 560 + }, + { + "epoch": 0.17297630040012313, + "grad_norm": 9.1875, + "learning_rate": 2.995244432726173e-06, + "loss": 1.022979974746704, + "step": 562 + }, + { + "epoch": 0.17359187442289936, + "grad_norm": 15.3125, + "learning_rate": 2.9951732370632034e-06, + "loss": 1.4568507671356201, + "step": 564 + }, + { + "epoch": 0.1742074484456756, + "grad_norm": 27.125, + "learning_rate": 2.9951015134908465e-06, + "loss": 0.9370282888412476, + "step": 566 + }, + { + "epoch": 0.17482302246845183, + "grad_norm": 15.0625, + "learning_rate": 2.9950292620407835e-06, + "loss": 1.2536559104919434, + "step": 568 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 12.4375, + "learning_rate": 2.994956482744927e-06, + "loss": 1.5881582498550415, + "step": 570 + }, + { + "epoch": 0.17605417051400432, + "grad_norm": 14.1875, + "learning_rate": 2.994883175635425e-06, + "loss": 1.379671573638916, + "step": 572 + }, + { + "epoch": 0.17666974453678055, + "grad_norm": 15.0, + "learning_rate": 2.9948093407446564e-06, + "loss": 1.261031985282898, + "step": 574 + }, + { + "epoch": 0.1772853185595568, + "grad_norm": 23.0, + "learning_rate": 2.9947349781052336e-06, + "loss": 1.6905066967010498, + "step": 576 + }, + { + "epoch": 0.17790089258233302, + "grad_norm": 5.5625, + "learning_rate": 2.9946600877500037e-06, + "loss": 0.9953762292861938, + "step": 578 + }, + { + "epoch": 0.17851646660510925, + "grad_norm": 14.3125, + "learning_rate": 2.9945846697120454e-06, + "loss": 1.0212024450302124, + "step": 580 + }, + { + "epoch": 0.1791320406278855, + "grad_norm": 17.0, + "learning_rate": 2.9945087240246713e-06, + "loss": 1.3336846828460693, + "step": 582 + }, + { + "epoch": 0.17974761465066175, + "grad_norm": 21.5, + "learning_rate": 2.994432250721426e-06, + "loss": 1.8026593923568726, + "step": 584 + }, + { + "epoch": 0.18036318867343798, + "grad_norm": 26.125, + "learning_rate": 2.9943552498360883e-06, + "loss": 1.6114405393600464, + "step": 586 + }, + { + "epoch": 0.1809787626962142, + "grad_norm": 10.125, + "learning_rate": 2.9942777214026696e-06, + "loss": 1.6910456418991089, + "step": 588 + }, + { + "epoch": 0.18159433671899045, + "grad_norm": 12.75, + "learning_rate": 2.994199665455414e-06, + "loss": 1.414069652557373, + "step": 590 + }, + { + "epoch": 0.1822099107417667, + "grad_norm": 6.78125, + "learning_rate": 2.9941210820287994e-06, + "loss": 1.504883050918579, + "step": 592 + }, + { + "epoch": 0.18282548476454294, + "grad_norm": 15.9375, + "learning_rate": 2.994041971157536e-06, + "loss": 1.501386046409607, + "step": 594 + }, + { + "epoch": 0.18344105878731917, + "grad_norm": 9.75, + "learning_rate": 2.993962332876567e-06, + "loss": 1.4829013347625732, + "step": 596 + }, + { + "epoch": 0.1840566328100954, + "grad_norm": 4.1875, + "learning_rate": 2.9938821672210684e-06, + "loss": 0.966797411441803, + "step": 598 + }, + { + "epoch": 0.18467220683287167, + "grad_norm": 24.25, + "learning_rate": 2.9938014742264505e-06, + "loss": 0.9075067043304443, + "step": 600 + }, + { + "epoch": 0.1852877808556479, + "grad_norm": 17.0, + "learning_rate": 2.9937202539283544e-06, + "loss": 1.418922781944275, + "step": 602 + }, + { + "epoch": 0.18590335487842413, + "grad_norm": 35.75, + "learning_rate": 2.993638506362656e-06, + "loss": 1.4964491128921509, + "step": 604 + }, + { + "epoch": 0.18651892890120036, + "grad_norm": 20.875, + "learning_rate": 2.993556231565463e-06, + "loss": 1.4599058628082275, + "step": 606 + }, + { + "epoch": 0.1871345029239766, + "grad_norm": 7.875, + "learning_rate": 2.993473429573116e-06, + "loss": 1.621177315711975, + "step": 608 + }, + { + "epoch": 0.18775007694675286, + "grad_norm": 9.5625, + "learning_rate": 2.993390100422189e-06, + "loss": 1.665502905845642, + "step": 610 + }, + { + "epoch": 0.1883656509695291, + "grad_norm": 14.75, + "learning_rate": 2.993306244149488e-06, + "loss": 0.6662383675575256, + "step": 612 + }, + { + "epoch": 0.18898122499230532, + "grad_norm": 10.5, + "learning_rate": 2.9932218607920542e-06, + "loss": 1.319474220275879, + "step": 614 + }, + { + "epoch": 0.18959679901508156, + "grad_norm": 17.25, + "learning_rate": 2.9931369503871573e-06, + "loss": 1.4323532581329346, + "step": 616 + }, + { + "epoch": 0.1902123730378578, + "grad_norm": 20.5, + "learning_rate": 2.993051512972304e-06, + "loss": 1.3529409170150757, + "step": 618 + }, + { + "epoch": 0.19082794706063405, + "grad_norm": 16.875, + "learning_rate": 2.992965548585232e-06, + "loss": 1.02125883102417, + "step": 620 + }, + { + "epoch": 0.19144352108341028, + "grad_norm": 13.6875, + "learning_rate": 2.9928790572639117e-06, + "loss": 1.5696746110916138, + "step": 622 + }, + { + "epoch": 0.19205909510618652, + "grad_norm": 27.875, + "learning_rate": 2.9927920390465453e-06, + "loss": 1.7022054195404053, + "step": 624 + }, + { + "epoch": 0.19267466912896275, + "grad_norm": 18.125, + "learning_rate": 2.9927044939715703e-06, + "loss": 1.096092700958252, + "step": 626 + }, + { + "epoch": 0.19329024315173898, + "grad_norm": 27.5, + "learning_rate": 2.992616422077655e-06, + "loss": 1.7465717792510986, + "step": 628 + }, + { + "epoch": 0.19390581717451524, + "grad_norm": 41.75, + "learning_rate": 2.9925278234037014e-06, + "loss": 1.9313154220581055, + "step": 630 + }, + { + "epoch": 0.19452139119729148, + "grad_norm": 12.875, + "learning_rate": 2.9924386979888424e-06, + "loss": 1.3957619667053223, + "step": 632 + }, + { + "epoch": 0.1951369652200677, + "grad_norm": 7.25, + "learning_rate": 2.9923490458724457e-06, + "loss": 1.2672849893569946, + "step": 634 + }, + { + "epoch": 0.19575253924284394, + "grad_norm": 8.8125, + "learning_rate": 2.99225886709411e-06, + "loss": 1.5344815254211426, + "step": 636 + }, + { + "epoch": 0.1963681132656202, + "grad_norm": 13.4375, + "learning_rate": 2.992168161693669e-06, + "loss": 1.2782959938049316, + "step": 638 + }, + { + "epoch": 0.19698368728839644, + "grad_norm": 12.375, + "learning_rate": 2.9920769297111856e-06, + "loss": 1.523129940032959, + "step": 640 + }, + { + "epoch": 0.19759926131117267, + "grad_norm": 25.625, + "learning_rate": 2.991985171186958e-06, + "loss": 1.7012336254119873, + "step": 642 + }, + { + "epoch": 0.1982148353339489, + "grad_norm": 11.3125, + "learning_rate": 2.9918928861615156e-06, + "loss": 1.3830182552337646, + "step": 644 + }, + { + "epoch": 0.19883040935672514, + "grad_norm": 11.625, + "learning_rate": 2.9918000746756205e-06, + "loss": 1.4474706649780273, + "step": 646 + }, + { + "epoch": 0.1994459833795014, + "grad_norm": 13.875, + "learning_rate": 2.9917067367702693e-06, + "loss": 1.2021737098693848, + "step": 648 + }, + { + "epoch": 0.20006155740227763, + "grad_norm": 16.0, + "learning_rate": 2.9916128724866877e-06, + "loss": 1.569669246673584, + "step": 650 + }, + { + "epoch": 0.20067713142505386, + "grad_norm": 18.125, + "learning_rate": 2.9915184818663356e-06, + "loss": 1.5191471576690674, + "step": 652 + }, + { + "epoch": 0.2012927054478301, + "grad_norm": 15.9375, + "learning_rate": 2.991423564950907e-06, + "loss": 1.0736037492752075, + "step": 654 + }, + { + "epoch": 0.20190827947060633, + "grad_norm": 22.0, + "learning_rate": 2.991328121782325e-06, + "loss": 1.7012343406677246, + "step": 656 + }, + { + "epoch": 0.2025238534933826, + "grad_norm": 18.125, + "learning_rate": 2.9912321524027485e-06, + "loss": 1.4998762607574463, + "step": 658 + }, + { + "epoch": 0.20313942751615882, + "grad_norm": 14.125, + "learning_rate": 2.9911356568545667e-06, + "loss": 1.5033804178237915, + "step": 660 + }, + { + "epoch": 0.20375500153893505, + "grad_norm": 7.1875, + "learning_rate": 2.9910386351804014e-06, + "loss": 1.372567892074585, + "step": 662 + }, + { + "epoch": 0.2043705755617113, + "grad_norm": 14.3125, + "learning_rate": 2.9909410874231075e-06, + "loss": 1.382359504699707, + "step": 664 + }, + { + "epoch": 0.20498614958448755, + "grad_norm": 7.125, + "learning_rate": 2.990843013625772e-06, + "loss": 0.7901002764701843, + "step": 666 + }, + { + "epoch": 0.20560172360726378, + "grad_norm": 14.875, + "learning_rate": 2.990744413831715e-06, + "loss": 1.2381068468093872, + "step": 668 + }, + { + "epoch": 0.20621729763004001, + "grad_norm": 11.625, + "learning_rate": 2.990645288084487e-06, + "loss": 1.49295973777771, + "step": 670 + }, + { + "epoch": 0.20683287165281625, + "grad_norm": 9.1875, + "learning_rate": 2.990545636427872e-06, + "loss": 1.5857125520706177, + "step": 672 + }, + { + "epoch": 0.20744844567559248, + "grad_norm": 15.0, + "learning_rate": 2.990445458905886e-06, + "loss": 1.3795162439346313, + "step": 674 + }, + { + "epoch": 0.20806401969836874, + "grad_norm": 16.125, + "learning_rate": 2.9903447555627782e-06, + "loss": 1.2645190954208374, + "step": 676 + }, + { + "epoch": 0.20867959372114497, + "grad_norm": 10.6875, + "learning_rate": 2.9902435264430303e-06, + "loss": 1.7142274379730225, + "step": 678 + }, + { + "epoch": 0.2092951677439212, + "grad_norm": 14.625, + "learning_rate": 2.990141771591353e-06, + "loss": 1.548229455947876, + "step": 680 + }, + { + "epoch": 0.20991074176669744, + "grad_norm": 19.75, + "learning_rate": 2.990039491052694e-06, + "loss": 1.3462605476379395, + "step": 682 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 13.375, + "learning_rate": 2.9899366848722284e-06, + "loss": 1.3686890602111816, + "step": 684 + }, + { + "epoch": 0.21114188981224993, + "grad_norm": 13.5625, + "learning_rate": 2.9898333530953674e-06, + "loss": 1.6143863201141357, + "step": 686 + }, + { + "epoch": 0.21175746383502617, + "grad_norm": 17.75, + "learning_rate": 2.9897294957677522e-06, + "loss": 1.8706777095794678, + "step": 688 + }, + { + "epoch": 0.2123730378578024, + "grad_norm": 7.9375, + "learning_rate": 2.989625112935257e-06, + "loss": 1.5297441482543945, + "step": 690 + }, + { + "epoch": 0.21298861188057863, + "grad_norm": 112.0, + "learning_rate": 2.9895202046439872e-06, + "loss": 1.1892492771148682, + "step": 692 + }, + { + "epoch": 0.21360418590335487, + "grad_norm": 17.5, + "learning_rate": 2.989414770940282e-06, + "loss": 1.5064308643341064, + "step": 694 + }, + { + "epoch": 0.21421975992613113, + "grad_norm": 10.9375, + "learning_rate": 2.9893088118707103e-06, + "loss": 1.5546306371688843, + "step": 696 + }, + { + "epoch": 0.21483533394890736, + "grad_norm": 6.6875, + "learning_rate": 2.989202327482076e-06, + "loss": 1.3973575830459595, + "step": 698 + }, + { + "epoch": 0.2154509079716836, + "grad_norm": 8.3125, + "learning_rate": 2.989095317821411e-06, + "loss": 1.2875769138336182, + "step": 700 + }, + { + "epoch": 0.21606648199445982, + "grad_norm": 11.5, + "learning_rate": 2.9889877829359837e-06, + "loss": 1.3528231382369995, + "step": 702 + }, + { + "epoch": 0.21668205601723609, + "grad_norm": 9.75, + "learning_rate": 2.9888797228732908e-06, + "loss": 1.139479637145996, + "step": 704 + }, + { + "epoch": 0.21729763004001232, + "grad_norm": 18.25, + "learning_rate": 2.9887711376810643e-06, + "loss": 1.7238795757293701, + "step": 706 + }, + { + "epoch": 0.21791320406278855, + "grad_norm": 6.84375, + "learning_rate": 2.988662027407265e-06, + "loss": 1.401573657989502, + "step": 708 + }, + { + "epoch": 0.21852877808556478, + "grad_norm": 17.125, + "learning_rate": 2.9885523921000877e-06, + "loss": 1.4582817554473877, + "step": 710 + }, + { + "epoch": 0.21914435210834102, + "grad_norm": 11.625, + "learning_rate": 2.988442231807958e-06, + "loss": 1.4865729808807373, + "step": 712 + }, + { + "epoch": 0.21975992613111728, + "grad_norm": 11.875, + "learning_rate": 2.988331546579534e-06, + "loss": 1.4546085596084595, + "step": 714 + }, + { + "epoch": 0.2203755001538935, + "grad_norm": 15.9375, + "learning_rate": 2.9882203364637058e-06, + "loss": 1.2999931573867798, + "step": 716 + }, + { + "epoch": 0.22099107417666974, + "grad_norm": 17.75, + "learning_rate": 2.9881086015095945e-06, + "loss": 1.215187907218933, + "step": 718 + }, + { + "epoch": 0.22160664819944598, + "grad_norm": 7.4375, + "learning_rate": 2.9879963417665544e-06, + "loss": 1.5152490139007568, + "step": 720 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 23.625, + "learning_rate": 2.98788355728417e-06, + "loss": 1.7452555894851685, + "step": 722 + }, + { + "epoch": 0.22283779624499847, + "grad_norm": 11.75, + "learning_rate": 2.9877702481122586e-06, + "loss": 0.7922830581665039, + "step": 724 + }, + { + "epoch": 0.2234533702677747, + "grad_norm": 14.875, + "learning_rate": 2.987656414300869e-06, + "loss": 1.6105217933654785, + "step": 726 + }, + { + "epoch": 0.22406894429055094, + "grad_norm": 14.9375, + "learning_rate": 2.9875420559002812e-06, + "loss": 1.8192713260650635, + "step": 728 + }, + { + "epoch": 0.22468451831332717, + "grad_norm": 23.125, + "learning_rate": 2.9874271729610083e-06, + "loss": 1.4074468612670898, + "step": 730 + }, + { + "epoch": 0.22530009233610343, + "grad_norm": 10.8125, + "learning_rate": 2.9873117655337934e-06, + "loss": 1.2618192434310913, + "step": 732 + }, + { + "epoch": 0.22591566635887966, + "grad_norm": 19.75, + "learning_rate": 2.987195833669613e-06, + "loss": 1.167978286743164, + "step": 734 + }, + { + "epoch": 0.2265312403816559, + "grad_norm": 17.125, + "learning_rate": 2.987079377419673e-06, + "loss": 1.499091386795044, + "step": 736 + }, + { + "epoch": 0.22714681440443213, + "grad_norm": 14.6875, + "learning_rate": 2.9869623968354133e-06, + "loss": 1.4751125574111938, + "step": 738 + }, + { + "epoch": 0.22776238842720836, + "grad_norm": 18.25, + "learning_rate": 2.986844891968505e-06, + "loss": 1.824317455291748, + "step": 740 + }, + { + "epoch": 0.22837796244998462, + "grad_norm": 21.625, + "learning_rate": 2.9867268628708485e-06, + "loss": 1.7516217231750488, + "step": 742 + }, + { + "epoch": 0.22899353647276086, + "grad_norm": 15.5625, + "learning_rate": 2.9866083095945785e-06, + "loss": 1.0053646564483643, + "step": 744 + }, + { + "epoch": 0.2296091104955371, + "grad_norm": 11.5, + "learning_rate": 2.98648923219206e-06, + "loss": 1.4207651615142822, + "step": 746 + }, + { + "epoch": 0.23022468451831332, + "grad_norm": 24.875, + "learning_rate": 2.9863696307158894e-06, + "loss": 1.2401245832443237, + "step": 748 + }, + { + "epoch": 0.23084025854108955, + "grad_norm": 6.40625, + "learning_rate": 2.9862495052188947e-06, + "loss": 1.2797411680221558, + "step": 750 + }, + { + "epoch": 0.23145583256386582, + "grad_norm": 17.25, + "learning_rate": 2.9861288557541357e-06, + "loss": 1.4601941108703613, + "step": 752 + }, + { + "epoch": 0.23207140658664205, + "grad_norm": 12.75, + "learning_rate": 2.986007682374903e-06, + "loss": 1.5369291305541992, + "step": 754 + }, + { + "epoch": 0.23268698060941828, + "grad_norm": 8.6875, + "learning_rate": 2.9858859851347193e-06, + "loss": 1.4711337089538574, + "step": 756 + }, + { + "epoch": 0.23330255463219451, + "grad_norm": 12.4375, + "learning_rate": 2.9857637640873394e-06, + "loss": 1.3970842361450195, + "step": 758 + }, + { + "epoch": 0.23391812865497075, + "grad_norm": 8.9375, + "learning_rate": 2.985641019286746e-06, + "loss": 1.535696029663086, + "step": 760 + }, + { + "epoch": 0.234533702677747, + "grad_norm": 21.5, + "learning_rate": 2.9855177507871586e-06, + "loss": 1.2274210453033447, + "step": 762 + }, + { + "epoch": 0.23514927670052324, + "grad_norm": 17.25, + "learning_rate": 2.9853939586430222e-06, + "loss": 1.777534008026123, + "step": 764 + }, + { + "epoch": 0.23576485072329947, + "grad_norm": 13.75, + "learning_rate": 2.9852696429090186e-06, + "loss": 1.7154924869537354, + "step": 766 + }, + { + "epoch": 0.2363804247460757, + "grad_norm": 11.75, + "learning_rate": 2.9851448036400562e-06, + "loss": 1.6001613140106201, + "step": 768 + }, + { + "epoch": 0.23699599876885197, + "grad_norm": 12.375, + "learning_rate": 2.9850194408912777e-06, + "loss": 1.2444454431533813, + "step": 770 + }, + { + "epoch": 0.2376115727916282, + "grad_norm": 16.0, + "learning_rate": 2.984893554718055e-06, + "loss": 1.2592495679855347, + "step": 772 + }, + { + "epoch": 0.23822714681440443, + "grad_norm": 15.8125, + "learning_rate": 2.984767145175993e-06, + "loss": 1.7219655513763428, + "step": 774 + }, + { + "epoch": 0.23884272083718067, + "grad_norm": 30.5, + "learning_rate": 2.9846402123209276e-06, + "loss": 1.068901777267456, + "step": 776 + }, + { + "epoch": 0.2394582948599569, + "grad_norm": 17.75, + "learning_rate": 2.9845127562089237e-06, + "loss": 1.4635910987854004, + "step": 778 + }, + { + "epoch": 0.24007386888273316, + "grad_norm": 17.875, + "learning_rate": 2.9843847768962794e-06, + "loss": 1.433922529220581, + "step": 780 + }, + { + "epoch": 0.2406894429055094, + "grad_norm": 19.75, + "learning_rate": 2.9842562744395232e-06, + "loss": 1.1140307188034058, + "step": 782 + }, + { + "epoch": 0.24130501692828563, + "grad_norm": 5.03125, + "learning_rate": 2.984127248895415e-06, + "loss": 1.5116472244262695, + "step": 784 + }, + { + "epoch": 0.24192059095106186, + "grad_norm": 13.875, + "learning_rate": 2.983997700320946e-06, + "loss": 1.7026944160461426, + "step": 786 + }, + { + "epoch": 0.2425361649738381, + "grad_norm": 41.5, + "learning_rate": 2.9838676287733367e-06, + "loss": 1.4336001873016357, + "step": 788 + }, + { + "epoch": 0.24315173899661435, + "grad_norm": 39.25, + "learning_rate": 2.9837370343100405e-06, + "loss": 1.4539496898651123, + "step": 790 + }, + { + "epoch": 0.24376731301939059, + "grad_norm": 14.4375, + "learning_rate": 2.9836059169887415e-06, + "loss": 1.3547931909561157, + "step": 792 + }, + { + "epoch": 0.24438288704216682, + "grad_norm": 13.9375, + "learning_rate": 2.983474276867354e-06, + "loss": 1.308903455734253, + "step": 794 + }, + { + "epoch": 0.24499846106494305, + "grad_norm": 18.375, + "learning_rate": 2.9833421140040242e-06, + "loss": 1.670867681503296, + "step": 796 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 10.5625, + "learning_rate": 2.983209428457127e-06, + "loss": 1.5410202741622925, + "step": 798 + }, + { + "epoch": 0.24622960911049555, + "grad_norm": 10.75, + "learning_rate": 2.9830762202852714e-06, + "loss": 1.6194417476654053, + "step": 800 + }, + { + "epoch": 0.24684518313327178, + "grad_norm": 11.125, + "learning_rate": 2.9829424895472952e-06, + "loss": 1.698732614517212, + "step": 802 + }, + { + "epoch": 0.247460757156048, + "grad_norm": 19.5, + "learning_rate": 2.982808236302267e-06, + "loss": 1.6608161926269531, + "step": 804 + }, + { + "epoch": 0.24807633117882424, + "grad_norm": 31.875, + "learning_rate": 2.982673460609486e-06, + "loss": 1.8473336696624756, + "step": 806 + }, + { + "epoch": 0.2486919052016005, + "grad_norm": 10.375, + "learning_rate": 2.9825381625284846e-06, + "loss": 1.4293184280395508, + "step": 808 + }, + { + "epoch": 0.24930747922437674, + "grad_norm": 42.5, + "learning_rate": 2.982402342119023e-06, + "loss": 1.045412302017212, + "step": 810 + }, + { + "epoch": 0.24992305324715297, + "grad_norm": 22.5, + "learning_rate": 2.982265999441093e-06, + "loss": 1.688213586807251, + "step": 812 + }, + { + "epoch": 0.2505386272699292, + "grad_norm": 6.84375, + "learning_rate": 2.9821291345549178e-06, + "loss": 1.1688685417175293, + "step": 814 + }, + { + "epoch": 0.25115420129270544, + "grad_norm": 6.4375, + "learning_rate": 2.9819917475209513e-06, + "loss": 1.2294797897338867, + "step": 816 + }, + { + "epoch": 0.25176977531548167, + "grad_norm": 21.5, + "learning_rate": 2.9818538383998756e-06, + "loss": 1.0619512796401978, + "step": 818 + }, + { + "epoch": 0.2523853493382579, + "grad_norm": 6.53125, + "learning_rate": 2.981715407252608e-06, + "loss": 1.170961618423462, + "step": 820 + }, + { + "epoch": 0.2530009233610342, + "grad_norm": 79.0, + "learning_rate": 2.9815764541402914e-06, + "loss": 1.622196912765503, + "step": 822 + }, + { + "epoch": 0.2536164973838104, + "grad_norm": 5.71875, + "learning_rate": 2.981436979124302e-06, + "loss": 1.3246055841445923, + "step": 824 + }, + { + "epoch": 0.25423207140658666, + "grad_norm": 10.375, + "learning_rate": 2.9812969822662474e-06, + "loss": 1.241051197052002, + "step": 826 + }, + { + "epoch": 0.2548476454293629, + "grad_norm": 12.0625, + "learning_rate": 2.981156463627963e-06, + "loss": 0.7300369739532471, + "step": 828 + }, + { + "epoch": 0.2554632194521391, + "grad_norm": 16.125, + "learning_rate": 2.981015423271517e-06, + "loss": 1.586928129196167, + "step": 830 + }, + { + "epoch": 0.25607879347491536, + "grad_norm": 5.875, + "learning_rate": 2.9808738612592065e-06, + "loss": 1.3401447534561157, + "step": 832 + }, + { + "epoch": 0.2566943674976916, + "grad_norm": 13.5625, + "learning_rate": 2.980731777653559e-06, + "loss": 1.4773958921432495, + "step": 834 + }, + { + "epoch": 0.2573099415204678, + "grad_norm": 14.875, + "learning_rate": 2.980589172517334e-06, + "loss": 1.0504045486450195, + "step": 836 + }, + { + "epoch": 0.25792551554324405, + "grad_norm": 24.375, + "learning_rate": 2.9804460459135203e-06, + "loss": 0.729271650314331, + "step": 838 + }, + { + "epoch": 0.2585410895660203, + "grad_norm": 20.75, + "learning_rate": 2.9803023979053365e-06, + "loss": 1.6884655952453613, + "step": 840 + }, + { + "epoch": 0.2591566635887966, + "grad_norm": 10.125, + "learning_rate": 2.9801582285562325e-06, + "loss": 1.403031349182129, + "step": 842 + }, + { + "epoch": 0.2597722376115728, + "grad_norm": 13.0625, + "learning_rate": 2.980013537929888e-06, + "loss": 1.461706280708313, + "step": 844 + }, + { + "epoch": 0.26038781163434904, + "grad_norm": 21.25, + "learning_rate": 2.9798683260902125e-06, + "loss": 1.5261836051940918, + "step": 846 + }, + { + "epoch": 0.2610033856571253, + "grad_norm": 12.3125, + "learning_rate": 2.979722593101348e-06, + "loss": 1.8656940460205078, + "step": 848 + }, + { + "epoch": 0.2616189596799015, + "grad_norm": 5.625, + "learning_rate": 2.979576339027663e-06, + "loss": 1.134974718093872, + "step": 850 + }, + { + "epoch": 0.26223453370267774, + "grad_norm": 11.5, + "learning_rate": 2.979429563933759e-06, + "loss": 1.4083569049835205, + "step": 852 + }, + { + "epoch": 0.262850107725454, + "grad_norm": 11.1875, + "learning_rate": 2.9792822678844656e-06, + "loss": 1.208848237991333, + "step": 854 + }, + { + "epoch": 0.2634656817482302, + "grad_norm": 34.5, + "learning_rate": 2.979134450944845e-06, + "loss": 1.7384560108184814, + "step": 856 + }, + { + "epoch": 0.26408125577100644, + "grad_norm": 14.3125, + "learning_rate": 2.9789861131801877e-06, + "loss": 1.7467668056488037, + "step": 858 + }, + { + "epoch": 0.26469682979378273, + "grad_norm": 2.90625, + "learning_rate": 2.978837254656015e-06, + "loss": 1.5746519565582275, + "step": 860 + }, + { + "epoch": 0.26531240381655896, + "grad_norm": 12.75, + "learning_rate": 2.9786878754380767e-06, + "loss": 1.5870490074157715, + "step": 862 + }, + { + "epoch": 0.2659279778393352, + "grad_norm": 20.125, + "learning_rate": 2.9785379755923553e-06, + "loss": 1.548311710357666, + "step": 864 + }, + { + "epoch": 0.2665435518621114, + "grad_norm": 17.5, + "learning_rate": 2.9783875551850606e-06, + "loss": 0.8423838019371033, + "step": 866 + }, + { + "epoch": 0.26715912588488766, + "grad_norm": 7.8125, + "learning_rate": 2.9782366142826335e-06, + "loss": 1.2350690364837646, + "step": 868 + }, + { + "epoch": 0.2677746999076639, + "grad_norm": 9.1875, + "learning_rate": 2.978085152951745e-06, + "loss": 1.2413235902786255, + "step": 870 + }, + { + "epoch": 0.2683902739304401, + "grad_norm": 13.75, + "learning_rate": 2.9779331712592967e-06, + "loss": 1.14556086063385, + "step": 872 + }, + { + "epoch": 0.26900584795321636, + "grad_norm": 10.0625, + "learning_rate": 2.977780669272418e-06, + "loss": 1.0823867321014404, + "step": 874 + }, + { + "epoch": 0.2696214219759926, + "grad_norm": 14.9375, + "learning_rate": 2.977627647058469e-06, + "loss": 1.029285192489624, + "step": 876 + }, + { + "epoch": 0.2702369959987688, + "grad_norm": 13.5, + "learning_rate": 2.9774741046850404e-06, + "loss": 1.5127992630004883, + "step": 878 + }, + { + "epoch": 0.2708525700215451, + "grad_norm": 7.09375, + "learning_rate": 2.9773200422199524e-06, + "loss": 1.503528118133545, + "step": 880 + }, + { + "epoch": 0.27146814404432135, + "grad_norm": 13.0625, + "learning_rate": 2.9771654597312527e-06, + "loss": 0.9524327516555786, + "step": 882 + }, + { + "epoch": 0.2720837180670976, + "grad_norm": 17.625, + "learning_rate": 2.977010357287223e-06, + "loss": 1.3801374435424805, + "step": 884 + }, + { + "epoch": 0.2726992920898738, + "grad_norm": 13.25, + "learning_rate": 2.976854734956371e-06, + "loss": 1.24113130569458, + "step": 886 + }, + { + "epoch": 0.27331486611265005, + "grad_norm": 9.875, + "learning_rate": 2.9766985928074356e-06, + "loss": 1.4660515785217285, + "step": 888 + }, + { + "epoch": 0.2739304401354263, + "grad_norm": 12.0, + "learning_rate": 2.976541930909385e-06, + "loss": 1.6487599611282349, + "step": 890 + }, + { + "epoch": 0.2745460141582025, + "grad_norm": 13.375, + "learning_rate": 2.9763847493314152e-06, + "loss": 1.4463802576065063, + "step": 892 + }, + { + "epoch": 0.27516158818097874, + "grad_norm": 21.625, + "learning_rate": 2.976227048142956e-06, + "loss": 1.746794581413269, + "step": 894 + }, + { + "epoch": 0.275777162203755, + "grad_norm": 14.75, + "learning_rate": 2.9760688274136632e-06, + "loss": 1.4405450820922852, + "step": 896 + }, + { + "epoch": 0.27639273622653127, + "grad_norm": 12.4375, + "learning_rate": 2.975910087213423e-06, + "loss": 1.3549253940582275, + "step": 898 + }, + { + "epoch": 0.2770083102493075, + "grad_norm": 55.75, + "learning_rate": 2.975750827612351e-06, + "loss": 1.2329659461975098, + "step": 900 + }, + { + "epoch": 0.27762388427208373, + "grad_norm": 8.9375, + "learning_rate": 2.9755910486807922e-06, + "loss": 1.2978835105895996, + "step": 902 + }, + { + "epoch": 0.27823945829485996, + "grad_norm": 8.9375, + "learning_rate": 2.9754307504893223e-06, + "loss": 1.1330984830856323, + "step": 904 + }, + { + "epoch": 0.2788550323176362, + "grad_norm": 16.25, + "learning_rate": 2.9752699331087436e-06, + "loss": 1.4983842372894287, + "step": 906 + }, + { + "epoch": 0.27947060634041243, + "grad_norm": 6.65625, + "learning_rate": 2.9751085966100907e-06, + "loss": 1.216892123222351, + "step": 908 + }, + { + "epoch": 0.28008618036318866, + "grad_norm": 14.625, + "learning_rate": 2.974946741064625e-06, + "loss": 1.4386199712753296, + "step": 910 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 17.75, + "learning_rate": 2.9747843665438393e-06, + "loss": 0.9468835592269897, + "step": 912 + }, + { + "epoch": 0.28131732840874113, + "grad_norm": 9.125, + "learning_rate": 2.9746214731194534e-06, + "loss": 0.9744749665260315, + "step": 914 + }, + { + "epoch": 0.28193290243151736, + "grad_norm": 8.625, + "learning_rate": 2.9744580608634188e-06, + "loss": 0.960205078125, + "step": 916 + }, + { + "epoch": 0.28254847645429365, + "grad_norm": 19.375, + "learning_rate": 2.9742941298479137e-06, + "loss": 1.7566368579864502, + "step": 918 + }, + { + "epoch": 0.2831640504770699, + "grad_norm": 21.25, + "learning_rate": 2.9741296801453476e-06, + "loss": 1.284712314605713, + "step": 920 + }, + { + "epoch": 0.2837796244998461, + "grad_norm": 21.125, + "learning_rate": 2.9739647118283574e-06, + "loss": 1.3857460021972656, + "step": 922 + }, + { + "epoch": 0.28439519852262235, + "grad_norm": 16.125, + "learning_rate": 2.9737992249698107e-06, + "loss": 0.9767564535140991, + "step": 924 + }, + { + "epoch": 0.2850107725453986, + "grad_norm": 24.625, + "learning_rate": 2.9736332196428024e-06, + "loss": 1.5778262615203857, + "step": 926 + }, + { + "epoch": 0.2856263465681748, + "grad_norm": 29.375, + "learning_rate": 2.9734666959206575e-06, + "loss": 1.5621557235717773, + "step": 928 + }, + { + "epoch": 0.28624192059095105, + "grad_norm": 20.75, + "learning_rate": 2.9732996538769293e-06, + "loss": 1.3215174674987793, + "step": 930 + }, + { + "epoch": 0.2868574946137273, + "grad_norm": 143.0, + "learning_rate": 2.9731320935854016e-06, + "loss": 1.1781163215637207, + "step": 932 + }, + { + "epoch": 0.2874730686365035, + "grad_norm": 13.875, + "learning_rate": 2.9729640151200845e-06, + "loss": 1.2226213216781616, + "step": 934 + }, + { + "epoch": 0.2880886426592798, + "grad_norm": 13.0, + "learning_rate": 2.9727954185552193e-06, + "loss": 1.3734395503997803, + "step": 936 + }, + { + "epoch": 0.28870421668205604, + "grad_norm": 5.375, + "learning_rate": 2.9726263039652757e-06, + "loss": 1.252492070198059, + "step": 938 + }, + { + "epoch": 0.28931979070483227, + "grad_norm": 11.125, + "learning_rate": 2.9724566714249505e-06, + "loss": 1.4262735843658447, + "step": 940 + }, + { + "epoch": 0.2899353647276085, + "grad_norm": 5.9375, + "learning_rate": 2.9722865210091717e-06, + "loss": 1.052842140197754, + "step": 942 + }, + { + "epoch": 0.29055093875038474, + "grad_norm": 10.9375, + "learning_rate": 2.9721158527930945e-06, + "loss": 0.9967107176780701, + "step": 944 + }, + { + "epoch": 0.29116651277316097, + "grad_norm": 24.375, + "learning_rate": 2.971944666852104e-06, + "loss": 1.2896000146865845, + "step": 946 + }, + { + "epoch": 0.2917820867959372, + "grad_norm": 40.25, + "learning_rate": 2.9717729632618123e-06, + "loss": 1.8066881895065308, + "step": 948 + }, + { + "epoch": 0.29239766081871343, + "grad_norm": 15.5625, + "learning_rate": 2.9716007420980614e-06, + "loss": 1.358643651008606, + "step": 950 + }, + { + "epoch": 0.29301323484148967, + "grad_norm": 17.875, + "learning_rate": 2.9714280034369213e-06, + "loss": 1.4859265089035034, + "step": 952 + }, + { + "epoch": 0.29362880886426596, + "grad_norm": 13.5625, + "learning_rate": 2.9712547473546918e-06, + "loss": 1.2124885320663452, + "step": 954 + }, + { + "epoch": 0.2942443828870422, + "grad_norm": 19.0, + "learning_rate": 2.971080973927899e-06, + "loss": 1.194804072380066, + "step": 956 + }, + { + "epoch": 0.2948599569098184, + "grad_norm": 10.4375, + "learning_rate": 2.9709066832332996e-06, + "loss": 1.4251993894577026, + "step": 958 + }, + { + "epoch": 0.29547553093259465, + "grad_norm": 14.6875, + "learning_rate": 2.970731875347877e-06, + "loss": 1.6262626647949219, + "step": 960 + }, + { + "epoch": 0.2960911049553709, + "grad_norm": 19.875, + "learning_rate": 2.9705565503488456e-06, + "loss": 1.4257208108901978, + "step": 962 + }, + { + "epoch": 0.2967066789781471, + "grad_norm": 103.0, + "learning_rate": 2.9703807083136464e-06, + "loss": 1.3943548202514648, + "step": 964 + }, + { + "epoch": 0.29732225300092335, + "grad_norm": 11.5, + "learning_rate": 2.970204349319948e-06, + "loss": 1.5504299402236938, + "step": 966 + }, + { + "epoch": 0.2979378270236996, + "grad_norm": 11.875, + "learning_rate": 2.970027473445649e-06, + "loss": 1.0902049541473389, + "step": 968 + }, + { + "epoch": 0.2985534010464758, + "grad_norm": 15.125, + "learning_rate": 2.969850080768876e-06, + "loss": 1.1666667461395264, + "step": 970 + }, + { + "epoch": 0.29916897506925205, + "grad_norm": 15.25, + "learning_rate": 2.9696721713679825e-06, + "loss": 0.994618833065033, + "step": 972 + }, + { + "epoch": 0.29978454909202834, + "grad_norm": 18.25, + "learning_rate": 2.969493745321552e-06, + "loss": 1.4349355697631836, + "step": 974 + }, + { + "epoch": 0.3004001231148046, + "grad_norm": 17.5, + "learning_rate": 2.969314802708396e-06, + "loss": 1.5411276817321777, + "step": 976 + }, + { + "epoch": 0.3010156971375808, + "grad_norm": 17.5, + "learning_rate": 2.9691353436075527e-06, + "loss": 1.2102165222167969, + "step": 978 + }, + { + "epoch": 0.30163127116035704, + "grad_norm": 11.0625, + "learning_rate": 2.96895536809829e-06, + "loss": 1.524396300315857, + "step": 980 + }, + { + "epoch": 0.3022468451831333, + "grad_norm": 21.25, + "learning_rate": 2.968774876260103e-06, + "loss": 1.6361780166625977, + "step": 982 + }, + { + "epoch": 0.3028624192059095, + "grad_norm": 8.9375, + "learning_rate": 2.968593868172715e-06, + "loss": 1.2558728456497192, + "step": 984 + }, + { + "epoch": 0.30347799322868574, + "grad_norm": 23.375, + "learning_rate": 2.9684123439160782e-06, + "loss": 1.4768342971801758, + "step": 986 + }, + { + "epoch": 0.30409356725146197, + "grad_norm": 26.875, + "learning_rate": 2.9682303035703714e-06, + "loss": 1.4299650192260742, + "step": 988 + }, + { + "epoch": 0.3047091412742382, + "grad_norm": 5.40625, + "learning_rate": 2.9680477472160018e-06, + "loss": 1.1562550067901611, + "step": 990 + }, + { + "epoch": 0.3053247152970145, + "grad_norm": 12.1875, + "learning_rate": 2.9678646749336048e-06, + "loss": 1.4728153944015503, + "step": 992 + }, + { + "epoch": 0.3059402893197907, + "grad_norm": 27.625, + "learning_rate": 2.967681086804045e-06, + "loss": 1.624634027481079, + "step": 994 + }, + { + "epoch": 0.30655586334256696, + "grad_norm": 17.875, + "learning_rate": 2.9674969829084126e-06, + "loss": 1.3707025051116943, + "step": 996 + }, + { + "epoch": 0.3071714373653432, + "grad_norm": 12.3125, + "learning_rate": 2.967312363328025e-06, + "loss": 1.451162338256836, + "step": 998 + }, + { + "epoch": 0.3077870113881194, + "grad_norm": 49.5, + "learning_rate": 2.9671272281444314e-06, + "loss": 1.2668009996414185, + "step": 1000 + }, + { + "epoch": 0.30840258541089566, + "grad_norm": 29.375, + "learning_rate": 2.9669415774394046e-06, + "loss": 1.7370235919952393, + "step": 1002 + }, + { + "epoch": 0.3090181594336719, + "grad_norm": 8.75, + "learning_rate": 2.9667554112949477e-06, + "loss": 1.146094799041748, + "step": 1004 + }, + { + "epoch": 0.3096337334564481, + "grad_norm": 17.375, + "learning_rate": 2.9665687297932896e-06, + "loss": 1.7455390691757202, + "step": 1006 + }, + { + "epoch": 0.31024930747922436, + "grad_norm": 3.953125, + "learning_rate": 2.9663815330168885e-06, + "loss": 1.3706302642822266, + "step": 1008 + }, + { + "epoch": 0.3108648815020006, + "grad_norm": 12.3125, + "learning_rate": 2.9661938210484287e-06, + "loss": 1.325300693511963, + "step": 1010 + }, + { + "epoch": 0.3114804555247769, + "grad_norm": 12.125, + "learning_rate": 2.966005593970823e-06, + "loss": 1.3024173974990845, + "step": 1012 + }, + { + "epoch": 0.3120960295475531, + "grad_norm": 8.0625, + "learning_rate": 2.965816851867212e-06, + "loss": 1.2130205631256104, + "step": 1014 + }, + { + "epoch": 0.31271160357032934, + "grad_norm": 15.875, + "learning_rate": 2.965627594820963e-06, + "loss": 1.447901725769043, + "step": 1016 + }, + { + "epoch": 0.3133271775931056, + "grad_norm": 11.875, + "learning_rate": 2.9654378229156708e-06, + "loss": 1.5102931261062622, + "step": 1018 + }, + { + "epoch": 0.3139427516158818, + "grad_norm": 9.0, + "learning_rate": 2.965247536235159e-06, + "loss": 1.594402551651001, + "step": 1020 + }, + { + "epoch": 0.31455832563865804, + "grad_norm": 11.5, + "learning_rate": 2.9650567348634753e-06, + "loss": 1.2735745906829834, + "step": 1022 + }, + { + "epoch": 0.3151738996614343, + "grad_norm": 10.5, + "learning_rate": 2.9648654188848986e-06, + "loss": 1.1038970947265625, + "step": 1024 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 15.5, + "learning_rate": 2.964673588383933e-06, + "loss": 0.7514253854751587, + "step": 1026 + }, + { + "epoch": 0.31640504770698674, + "grad_norm": 10.375, + "learning_rate": 2.9644812434453106e-06, + "loss": 1.4254611730575562, + "step": 1028 + }, + { + "epoch": 0.31702062172976303, + "grad_norm": 12.25, + "learning_rate": 2.964288384153989e-06, + "loss": 1.0978376865386963, + "step": 1030 + }, + { + "epoch": 0.31763619575253926, + "grad_norm": 9.0625, + "learning_rate": 2.9640950105951563e-06, + "loss": 1.5165598392486572, + "step": 1032 + }, + { + "epoch": 0.3182517697753155, + "grad_norm": 14.125, + "learning_rate": 2.9639011228542236e-06, + "loss": 1.4810335636138916, + "step": 1034 + }, + { + "epoch": 0.31886734379809173, + "grad_norm": 19.125, + "learning_rate": 2.9637067210168337e-06, + "loss": 1.6129978895187378, + "step": 1036 + }, + { + "epoch": 0.31948291782086796, + "grad_norm": 7.75, + "learning_rate": 2.9635118051688527e-06, + "loss": 1.1115729808807373, + "step": 1038 + }, + { + "epoch": 0.3200984918436442, + "grad_norm": 24.5, + "learning_rate": 2.963316375396375e-06, + "loss": 1.5309553146362305, + "step": 1040 + }, + { + "epoch": 0.32071406586642043, + "grad_norm": 17.375, + "learning_rate": 2.9631204317857236e-06, + "loss": 1.5364904403686523, + "step": 1042 + }, + { + "epoch": 0.32132963988919666, + "grad_norm": 18.75, + "learning_rate": 2.9629239744234453e-06, + "loss": 1.5094444751739502, + "step": 1044 + }, + { + "epoch": 0.3219452139119729, + "grad_norm": 9.1875, + "learning_rate": 2.9627270033963164e-06, + "loss": 1.279676914215088, + "step": 1046 + }, + { + "epoch": 0.3225607879347491, + "grad_norm": 12.5, + "learning_rate": 2.962529518791339e-06, + "loss": 1.359884262084961, + "step": 1048 + }, + { + "epoch": 0.3231763619575254, + "grad_norm": 22.75, + "learning_rate": 2.962331520695742e-06, + "loss": 1.2789549827575684, + "step": 1050 + }, + { + "epoch": 0.32379193598030165, + "grad_norm": 15.5625, + "learning_rate": 2.9621330091969828e-06, + "loss": 1.1411724090576172, + "step": 1052 + }, + { + "epoch": 0.3244075100030779, + "grad_norm": 23.75, + "learning_rate": 2.961933984382742e-06, + "loss": 0.9410820007324219, + "step": 1054 + }, + { + "epoch": 0.3250230840258541, + "grad_norm": 11.9375, + "learning_rate": 2.9617344463409305e-06, + "loss": 1.3496623039245605, + "step": 1056 + }, + { + "epoch": 0.32563865804863035, + "grad_norm": 19.0, + "learning_rate": 2.9615343951596846e-06, + "loss": 1.4122042655944824, + "step": 1058 + }, + { + "epoch": 0.3262542320714066, + "grad_norm": 15.1875, + "learning_rate": 2.9613338309273664e-06, + "loss": 1.2870073318481445, + "step": 1060 + }, + { + "epoch": 0.3268698060941828, + "grad_norm": 13.9375, + "learning_rate": 2.961132753732566e-06, + "loss": 1.7000384330749512, + "step": 1062 + }, + { + "epoch": 0.32748538011695905, + "grad_norm": 95.0, + "learning_rate": 2.960931163664099e-06, + "loss": 1.5783706903457642, + "step": 1064 + }, + { + "epoch": 0.3281009541397353, + "grad_norm": 48.75, + "learning_rate": 2.960729060811007e-06, + "loss": 1.178473949432373, + "step": 1066 + }, + { + "epoch": 0.32871652816251157, + "grad_norm": 33.0, + "learning_rate": 2.9605264452625615e-06, + "loss": 0.9566202163696289, + "step": 1068 + }, + { + "epoch": 0.3293321021852878, + "grad_norm": 22.375, + "learning_rate": 2.960323317108256e-06, + "loss": 1.2555310726165771, + "step": 1070 + }, + { + "epoch": 0.32994767620806403, + "grad_norm": 44.0, + "learning_rate": 2.9601196764378128e-06, + "loss": 1.7536683082580566, + "step": 1072 + }, + { + "epoch": 0.33056325023084027, + "grad_norm": 15.75, + "learning_rate": 2.959915523341181e-06, + "loss": 1.7330282926559448, + "step": 1074 + }, + { + "epoch": 0.3311788242536165, + "grad_norm": 23.375, + "learning_rate": 2.959710857908535e-06, + "loss": 1.600478172302246, + "step": 1076 + }, + { + "epoch": 0.33179439827639273, + "grad_norm": 10.0625, + "learning_rate": 2.959505680230275e-06, + "loss": 1.4186145067214966, + "step": 1078 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 3.375, + "learning_rate": 2.959299990397029e-06, + "loss": 1.4805734157562256, + "step": 1080 + }, + { + "epoch": 0.3330255463219452, + "grad_norm": 18.25, + "learning_rate": 2.95909378849965e-06, + "loss": 1.3895663022994995, + "step": 1082 + }, + { + "epoch": 0.33364112034472143, + "grad_norm": 19.75, + "learning_rate": 2.9588870746292177e-06, + "loss": 1.277693271636963, + "step": 1084 + }, + { + "epoch": 0.33425669436749766, + "grad_norm": 8.5, + "learning_rate": 2.9586798488770386e-06, + "loss": 1.3742859363555908, + "step": 1086 + }, + { + "epoch": 0.33487226839027395, + "grad_norm": 19.375, + "learning_rate": 2.958472111334643e-06, + "loss": 1.028365135192871, + "step": 1088 + }, + { + "epoch": 0.3354878424130502, + "grad_norm": 4.0625, + "learning_rate": 2.958263862093789e-06, + "loss": 1.2564489841461182, + "step": 1090 + }, + { + "epoch": 0.3361034164358264, + "grad_norm": 20.875, + "learning_rate": 2.958055101246463e-06, + "loss": 1.198014259338379, + "step": 1092 + }, + { + "epoch": 0.33671899045860265, + "grad_norm": 11.125, + "learning_rate": 2.9578458288848717e-06, + "loss": 0.5163677930831909, + "step": 1094 + }, + { + "epoch": 0.3373345644813789, + "grad_norm": 18.625, + "learning_rate": 2.957636045101453e-06, + "loss": 1.3656609058380127, + "step": 1096 + }, + { + "epoch": 0.3379501385041551, + "grad_norm": 5.71875, + "learning_rate": 2.957425749988868e-06, + "loss": 1.1800310611724854, + "step": 1098 + }, + { + "epoch": 0.33856571252693135, + "grad_norm": 14.5, + "learning_rate": 2.957214943640004e-06, + "loss": 1.435111165046692, + "step": 1100 + }, + { + "epoch": 0.3391812865497076, + "grad_norm": 26.625, + "learning_rate": 2.9570036261479754e-06, + "loss": 1.6497091054916382, + "step": 1102 + }, + { + "epoch": 0.3397968605724838, + "grad_norm": 32.5, + "learning_rate": 2.9567917976061203e-06, + "loss": 1.0989763736724854, + "step": 1104 + }, + { + "epoch": 0.3404124345952601, + "grad_norm": 13.25, + "learning_rate": 2.9565794581080035e-06, + "loss": 1.1690188646316528, + "step": 1106 + }, + { + "epoch": 0.34102800861803634, + "grad_norm": 15.75, + "learning_rate": 2.9563666077474167e-06, + "loss": 1.2261661291122437, + "step": 1108 + }, + { + "epoch": 0.34164358264081257, + "grad_norm": 12.375, + "learning_rate": 2.9561532466183753e-06, + "loss": 1.4731149673461914, + "step": 1110 + }, + { + "epoch": 0.3422591566635888, + "grad_norm": 11.4375, + "learning_rate": 2.9559393748151212e-06, + "loss": 1.3430174589157104, + "step": 1112 + }, + { + "epoch": 0.34287473068636504, + "grad_norm": 30.25, + "learning_rate": 2.9557249924321223e-06, + "loss": 1.5253651142120361, + "step": 1114 + }, + { + "epoch": 0.34349030470914127, + "grad_norm": 16.75, + "learning_rate": 2.955510099564071e-06, + "loss": 1.8036619424819946, + "step": 1116 + }, + { + "epoch": 0.3441058787319175, + "grad_norm": 12.625, + "learning_rate": 2.9552946963058858e-06, + "loss": 1.3675833940505981, + "step": 1118 + }, + { + "epoch": 0.34472145275469374, + "grad_norm": 12.8125, + "learning_rate": 2.9550787827527114e-06, + "loss": 1.5371782779693604, + "step": 1120 + }, + { + "epoch": 0.34533702677746997, + "grad_norm": 6.28125, + "learning_rate": 2.9548623589999155e-06, + "loss": 1.0250526666641235, + "step": 1122 + }, + { + "epoch": 0.34595260080024626, + "grad_norm": 17.75, + "learning_rate": 2.954645425143094e-06, + "loss": 1.5854618549346924, + "step": 1124 + }, + { + "epoch": 0.3465681748230225, + "grad_norm": 13.9375, + "learning_rate": 2.954427981278067e-06, + "loss": 1.5529431104660034, + "step": 1126 + }, + { + "epoch": 0.3471837488457987, + "grad_norm": 10.25, + "learning_rate": 2.9542100275008786e-06, + "loss": 1.3933606147766113, + "step": 1128 + }, + { + "epoch": 0.34779932286857496, + "grad_norm": 38.75, + "learning_rate": 2.9539915639078004e-06, + "loss": 1.7668472528457642, + "step": 1130 + }, + { + "epoch": 0.3484148968913512, + "grad_norm": 14.375, + "learning_rate": 2.9537725905953264e-06, + "loss": 1.350250244140625, + "step": 1132 + }, + { + "epoch": 0.3490304709141274, + "grad_norm": 26.625, + "learning_rate": 2.9535531076601794e-06, + "loss": 1.6775567531585693, + "step": 1134 + }, + { + "epoch": 0.34964604493690365, + "grad_norm": 16.875, + "learning_rate": 2.953333115199303e-06, + "loss": 1.5840508937835693, + "step": 1136 + }, + { + "epoch": 0.3502616189596799, + "grad_norm": 10.625, + "learning_rate": 2.9531126133098705e-06, + "loss": 1.3124401569366455, + "step": 1138 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 23.75, + "learning_rate": 2.9528916020892764e-06, + "loss": 1.412221908569336, + "step": 1140 + }, + { + "epoch": 0.35149276700523235, + "grad_norm": 4.71875, + "learning_rate": 2.9526700816351416e-06, + "loss": 1.1349772214889526, + "step": 1142 + }, + { + "epoch": 0.35210834102800864, + "grad_norm": 14.0, + "learning_rate": 2.9524480520453126e-06, + "loss": 1.2706981897354126, + "step": 1144 + }, + { + "epoch": 0.3527239150507849, + "grad_norm": 11.625, + "learning_rate": 2.9522255134178596e-06, + "loss": 1.5382128953933716, + "step": 1146 + }, + { + "epoch": 0.3533394890735611, + "grad_norm": 21.5, + "learning_rate": 2.9520024658510786e-06, + "loss": 1.4682729244232178, + "step": 1148 + }, + { + "epoch": 0.35395506309633734, + "grad_norm": 12.625, + "learning_rate": 2.9517789094434894e-06, + "loss": 1.5460891723632812, + "step": 1150 + }, + { + "epoch": 0.3545706371191136, + "grad_norm": 32.25, + "learning_rate": 2.9515548442938373e-06, + "loss": 1.6807613372802734, + "step": 1152 + }, + { + "epoch": 0.3551862111418898, + "grad_norm": 38.25, + "learning_rate": 2.9513302705010923e-06, + "loss": 1.1830241680145264, + "step": 1154 + }, + { + "epoch": 0.35580178516466604, + "grad_norm": 20.625, + "learning_rate": 2.9511051881644487e-06, + "loss": 1.5960803031921387, + "step": 1156 + }, + { + "epoch": 0.3564173591874423, + "grad_norm": 5.9375, + "learning_rate": 2.950879597383327e-06, + "loss": 1.2252815961837769, + "step": 1158 + }, + { + "epoch": 0.3570329332102185, + "grad_norm": 30.875, + "learning_rate": 2.9506534982573685e-06, + "loss": 1.4243431091308594, + "step": 1160 + }, + { + "epoch": 0.3576485072329948, + "grad_norm": 17.5, + "learning_rate": 2.9504268908864425e-06, + "loss": 1.4747540950775146, + "step": 1162 + }, + { + "epoch": 0.358264081255771, + "grad_norm": 15.5, + "learning_rate": 2.9501997753706424e-06, + "loss": 1.5682637691497803, + "step": 1164 + }, + { + "epoch": 0.35887965527854726, + "grad_norm": 20.375, + "learning_rate": 2.949972151810285e-06, + "loss": 1.194216251373291, + "step": 1166 + }, + { + "epoch": 0.3594952293013235, + "grad_norm": 9.4375, + "learning_rate": 2.9497440203059114e-06, + "loss": 1.379780888557434, + "step": 1168 + }, + { + "epoch": 0.3601108033240997, + "grad_norm": 14.875, + "learning_rate": 2.9495153809582875e-06, + "loss": 1.4812633991241455, + "step": 1170 + }, + { + "epoch": 0.36072637734687596, + "grad_norm": 26.75, + "learning_rate": 2.9492862338684042e-06, + "loss": 1.6413441896438599, + "step": 1172 + }, + { + "epoch": 0.3613419513696522, + "grad_norm": 34.25, + "learning_rate": 2.949056579137476e-06, + "loss": 1.731945276260376, + "step": 1174 + }, + { + "epoch": 0.3619575253924284, + "grad_norm": 12.9375, + "learning_rate": 2.9488264168669418e-06, + "loss": 1.2650394439697266, + "step": 1176 + }, + { + "epoch": 0.36257309941520466, + "grad_norm": 13.5, + "learning_rate": 2.9485957471584633e-06, + "loss": 1.5294060707092285, + "step": 1178 + }, + { + "epoch": 0.3631886734379809, + "grad_norm": 13.9375, + "learning_rate": 2.9483645701139293e-06, + "loss": 1.2630178928375244, + "step": 1180 + }, + { + "epoch": 0.3638042474607572, + "grad_norm": 40.75, + "learning_rate": 2.9481328858354497e-06, + "loss": 1.5602120161056519, + "step": 1182 + }, + { + "epoch": 0.3644198214835334, + "grad_norm": 13.0, + "learning_rate": 2.9479006944253604e-06, + "loss": 1.4180080890655518, + "step": 1184 + }, + { + "epoch": 0.36503539550630965, + "grad_norm": 51.0, + "learning_rate": 2.94766799598622e-06, + "loss": 1.4425501823425293, + "step": 1186 + }, + { + "epoch": 0.3656509695290859, + "grad_norm": 14.625, + "learning_rate": 2.947434790620812e-06, + "loss": 1.3580070734024048, + "step": 1188 + }, + { + "epoch": 0.3662665435518621, + "grad_norm": 14.6875, + "learning_rate": 2.9472010784321433e-06, + "loss": 1.5068962574005127, + "step": 1190 + }, + { + "epoch": 0.36688211757463834, + "grad_norm": 11.375, + "learning_rate": 2.946966859523445e-06, + "loss": 1.1820440292358398, + "step": 1192 + }, + { + "epoch": 0.3674976915974146, + "grad_norm": 8.5625, + "learning_rate": 2.9467321339981725e-06, + "loss": 1.4404610395431519, + "step": 1194 + }, + { + "epoch": 0.3681132656201908, + "grad_norm": 20.375, + "learning_rate": 2.9464969019600027e-06, + "loss": 1.4200704097747803, + "step": 1196 + }, + { + "epoch": 0.36872883964296704, + "grad_norm": 19.375, + "learning_rate": 2.94626116351284e-06, + "loss": 1.927262544631958, + "step": 1198 + }, + { + "epoch": 0.36934441366574333, + "grad_norm": 24.625, + "learning_rate": 2.9460249187608086e-06, + "loss": 1.6129209995269775, + "step": 1200 + }, + { + "epoch": 0.36995998768851956, + "grad_norm": 3.90625, + "learning_rate": 2.945788167808259e-06, + "loss": 1.3959664106369019, + "step": 1202 + }, + { + "epoch": 0.3705755617112958, + "grad_norm": 11.8125, + "learning_rate": 2.945550910759764e-06, + "loss": 1.3280491828918457, + "step": 1204 + }, + { + "epoch": 0.37119113573407203, + "grad_norm": 14.8125, + "learning_rate": 2.9453131477201202e-06, + "loss": 1.5886045694351196, + "step": 1206 + }, + { + "epoch": 0.37180670975684826, + "grad_norm": 12.125, + "learning_rate": 2.9450748787943476e-06, + "loss": 1.714353084564209, + "step": 1208 + }, + { + "epoch": 0.3724222837796245, + "grad_norm": 10.1875, + "learning_rate": 2.944836104087691e-06, + "loss": 1.3882367610931396, + "step": 1210 + }, + { + "epoch": 0.37303785780240073, + "grad_norm": 13.4375, + "learning_rate": 2.9445968237056167e-06, + "loss": 1.5091825723648071, + "step": 1212 + }, + { + "epoch": 0.37365343182517696, + "grad_norm": 23.25, + "learning_rate": 2.9443570377538145e-06, + "loss": 1.0328447818756104, + "step": 1214 + }, + { + "epoch": 0.3742690058479532, + "grad_norm": 15.5625, + "learning_rate": 2.944116746338199e-06, + "loss": 1.6457622051239014, + "step": 1216 + }, + { + "epoch": 0.37488457987072943, + "grad_norm": 11.9375, + "learning_rate": 2.943875949564907e-06, + "loss": 1.0500051975250244, + "step": 1218 + }, + { + "epoch": 0.3755001538935057, + "grad_norm": 4.875, + "learning_rate": 2.9436346475402983e-06, + "loss": 1.0904285907745361, + "step": 1220 + }, + { + "epoch": 0.37611572791628195, + "grad_norm": 50.5, + "learning_rate": 2.9433928403709567e-06, + "loss": 1.5347020626068115, + "step": 1222 + }, + { + "epoch": 0.3767313019390582, + "grad_norm": 47.25, + "learning_rate": 2.943150528163689e-06, + "loss": 1.447998046875, + "step": 1224 + }, + { + "epoch": 0.3773468759618344, + "grad_norm": 34.5, + "learning_rate": 2.9429077110255244e-06, + "loss": 1.3163659572601318, + "step": 1226 + }, + { + "epoch": 0.37796244998461065, + "grad_norm": 14.625, + "learning_rate": 2.942664389063715e-06, + "loss": 1.315096378326416, + "step": 1228 + }, + { + "epoch": 0.3785780240073869, + "grad_norm": 11.3125, + "learning_rate": 2.9424205623857374e-06, + "loss": 1.392237663269043, + "step": 1230 + }, + { + "epoch": 0.3791935980301631, + "grad_norm": 16.375, + "learning_rate": 2.9421762310992895e-06, + "loss": 1.4208223819732666, + "step": 1232 + }, + { + "epoch": 0.37980917205293935, + "grad_norm": 17.375, + "learning_rate": 2.9419313953122932e-06, + "loss": 1.4550400972366333, + "step": 1234 + }, + { + "epoch": 0.3804247460757156, + "grad_norm": 19.125, + "learning_rate": 2.9416860551328915e-06, + "loss": 1.4298524856567383, + "step": 1236 + }, + { + "epoch": 0.38104032009849187, + "grad_norm": 16.625, + "learning_rate": 2.9414402106694528e-06, + "loss": 1.93170166015625, + "step": 1238 + }, + { + "epoch": 0.3816558941212681, + "grad_norm": 14.0625, + "learning_rate": 2.9411938620305663e-06, + "loss": 1.3365933895111084, + "step": 1240 + }, + { + "epoch": 0.38227146814404434, + "grad_norm": 13.9375, + "learning_rate": 2.9409470093250453e-06, + "loss": 1.9269330501556396, + "step": 1242 + }, + { + "epoch": 0.38288704216682057, + "grad_norm": 20.375, + "learning_rate": 2.9406996526619237e-06, + "loss": 1.3893201351165771, + "step": 1244 + }, + { + "epoch": 0.3835026161895968, + "grad_norm": 14.3125, + "learning_rate": 2.94045179215046e-06, + "loss": 1.711153507232666, + "step": 1246 + }, + { + "epoch": 0.38411819021237303, + "grad_norm": 10.375, + "learning_rate": 2.940203427900133e-06, + "loss": 1.2741600275039673, + "step": 1248 + }, + { + "epoch": 0.38473376423514927, + "grad_norm": 22.375, + "learning_rate": 2.9399545600206474e-06, + "loss": 1.5442752838134766, + "step": 1250 + }, + { + "epoch": 0.3853493382579255, + "grad_norm": 19.375, + "learning_rate": 2.939705188621928e-06, + "loss": 1.328803300857544, + "step": 1252 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 8.25, + "learning_rate": 2.939455313814122e-06, + "loss": 1.207406997680664, + "step": 1254 + }, + { + "epoch": 0.38658048630347797, + "grad_norm": 15.5625, + "learning_rate": 2.9392049357075994e-06, + "loss": 1.4009120464324951, + "step": 1256 + }, + { + "epoch": 0.38719606032625425, + "grad_norm": 8.25, + "learning_rate": 2.9389540544129524e-06, + "loss": 1.562011480331421, + "step": 1258 + }, + { + "epoch": 0.3878116343490305, + "grad_norm": 15.875, + "learning_rate": 2.9387026700409965e-06, + "loss": 1.8288936614990234, + "step": 1260 + }, + { + "epoch": 0.3884272083718067, + "grad_norm": 10.9375, + "learning_rate": 2.938450782702767e-06, + "loss": 1.4957469701766968, + "step": 1262 + }, + { + "epoch": 0.38904278239458295, + "grad_norm": 14.1875, + "learning_rate": 2.938198392509524e-06, + "loss": 1.7026026248931885, + "step": 1264 + }, + { + "epoch": 0.3896583564173592, + "grad_norm": 14.25, + "learning_rate": 2.937945499572748e-06, + "loss": 1.2688217163085938, + "step": 1266 + }, + { + "epoch": 0.3902739304401354, + "grad_norm": 7.46875, + "learning_rate": 2.937692104004142e-06, + "loss": 1.181006908416748, + "step": 1268 + }, + { + "epoch": 0.39088950446291165, + "grad_norm": 19.125, + "learning_rate": 2.9374382059156316e-06, + "loss": 1.2825920581817627, + "step": 1270 + }, + { + "epoch": 0.3915050784856879, + "grad_norm": 23.75, + "learning_rate": 2.937183805419363e-06, + "loss": 1.861384630203247, + "step": 1272 + }, + { + "epoch": 0.3921206525084641, + "grad_norm": 19.375, + "learning_rate": 2.9369289026277063e-06, + "loss": 1.5804314613342285, + "step": 1274 + }, + { + "epoch": 0.3927362265312404, + "grad_norm": 8.4375, + "learning_rate": 2.936673497653252e-06, + "loss": 1.4048612117767334, + "step": 1276 + }, + { + "epoch": 0.39335180055401664, + "grad_norm": 32.75, + "learning_rate": 2.936417590608812e-06, + "loss": 1.1659860610961914, + "step": 1278 + }, + { + "epoch": 0.3939673745767929, + "grad_norm": 12.25, + "learning_rate": 2.936161181607422e-06, + "loss": 1.2615526914596558, + "step": 1280 + }, + { + "epoch": 0.3945829485995691, + "grad_norm": 51.75, + "learning_rate": 2.935904270762337e-06, + "loss": 0.9141525626182556, + "step": 1282 + }, + { + "epoch": 0.39519852262234534, + "grad_norm": 7.6875, + "learning_rate": 2.935646858187035e-06, + "loss": 1.1676512956619263, + "step": 1284 + }, + { + "epoch": 0.39581409664512157, + "grad_norm": 23.0, + "learning_rate": 2.935388943995216e-06, + "loss": 1.3372483253479004, + "step": 1286 + }, + { + "epoch": 0.3964296706678978, + "grad_norm": 15.3125, + "learning_rate": 2.935130528300801e-06, + "loss": 1.3295966386795044, + "step": 1288 + }, + { + "epoch": 0.39704524469067404, + "grad_norm": 12.0, + "learning_rate": 2.9348716112179328e-06, + "loss": 1.472252607345581, + "step": 1290 + }, + { + "epoch": 0.39766081871345027, + "grad_norm": 11.875, + "learning_rate": 2.9346121928609734e-06, + "loss": 1.84720778465271, + "step": 1292 + }, + { + "epoch": 0.39827639273622656, + "grad_norm": 16.25, + "learning_rate": 2.93435227334451e-06, + "loss": 1.7415895462036133, + "step": 1294 + }, + { + "epoch": 0.3988919667590028, + "grad_norm": 11.0, + "learning_rate": 2.93409185278335e-06, + "loss": 1.4139404296875, + "step": 1296 + }, + { + "epoch": 0.399507540781779, + "grad_norm": 13.0625, + "learning_rate": 2.9338309312925193e-06, + "loss": 1.4103145599365234, + "step": 1298 + }, + { + "epoch": 0.40012311480455526, + "grad_norm": 13.375, + "learning_rate": 2.9335695089872687e-06, + "loss": 1.3001097440719604, + "step": 1300 + }, + { + "epoch": 0.4007386888273315, + "grad_norm": 7.3125, + "learning_rate": 2.9333075859830684e-06, + "loss": 1.4066133499145508, + "step": 1302 + }, + { + "epoch": 0.4013542628501077, + "grad_norm": 21.375, + "learning_rate": 2.93304516239561e-06, + "loss": 1.3882697820663452, + "step": 1304 + }, + { + "epoch": 0.40196983687288396, + "grad_norm": 17.875, + "learning_rate": 2.932782238340806e-06, + "loss": 1.5596365928649902, + "step": 1306 + }, + { + "epoch": 0.4025854108956602, + "grad_norm": 12.9375, + "learning_rate": 2.932518813934791e-06, + "loss": 1.5969336032867432, + "step": 1308 + }, + { + "epoch": 0.4032009849184364, + "grad_norm": 8.875, + "learning_rate": 2.9322548892939188e-06, + "loss": 1.2696571350097656, + "step": 1310 + }, + { + "epoch": 0.40381655894121266, + "grad_norm": 13.0, + "learning_rate": 2.931990464534767e-06, + "loss": 1.4799418449401855, + "step": 1312 + }, + { + "epoch": 0.40443213296398894, + "grad_norm": 12.5, + "learning_rate": 2.9317255397741303e-06, + "loss": 1.061401605606079, + "step": 1314 + }, + { + "epoch": 0.4050477069867652, + "grad_norm": 4.5, + "learning_rate": 2.9314601151290277e-06, + "loss": 1.3438705205917358, + "step": 1316 + }, + { + "epoch": 0.4056632810095414, + "grad_norm": 59.0, + "learning_rate": 2.9311941907166965e-06, + "loss": 1.691558837890625, + "step": 1318 + }, + { + "epoch": 0.40627885503231764, + "grad_norm": 10.75, + "learning_rate": 2.9309277666545967e-06, + "loss": 1.52768874168396, + "step": 1320 + }, + { + "epoch": 0.4068944290550939, + "grad_norm": 83.0, + "learning_rate": 2.9306608430604075e-06, + "loss": 1.3572287559509277, + "step": 1322 + }, + { + "epoch": 0.4075100030778701, + "grad_norm": 4.8125, + "learning_rate": 2.93039342005203e-06, + "loss": 1.1406067609786987, + "step": 1324 + }, + { + "epoch": 0.40812557710064634, + "grad_norm": 12.125, + "learning_rate": 2.9301254977475843e-06, + "loss": 1.5260097980499268, + "step": 1326 + }, + { + "epoch": 0.4087411511234226, + "grad_norm": 15.25, + "learning_rate": 2.929857076265413e-06, + "loss": 1.1194944381713867, + "step": 1328 + }, + { + "epoch": 0.4093567251461988, + "grad_norm": 8.4375, + "learning_rate": 2.929588155724078e-06, + "loss": 1.281527042388916, + "step": 1330 + }, + { + "epoch": 0.4099722991689751, + "grad_norm": 14.8125, + "learning_rate": 2.929318736242361e-06, + "loss": 1.3978681564331055, + "step": 1332 + }, + { + "epoch": 0.41058787319175133, + "grad_norm": 56.25, + "learning_rate": 2.9290488179392657e-06, + "loss": 1.7001137733459473, + "step": 1334 + }, + { + "epoch": 0.41120344721452756, + "grad_norm": 30.375, + "learning_rate": 2.928778400934015e-06, + "loss": 1.5537958145141602, + "step": 1336 + }, + { + "epoch": 0.4118190212373038, + "grad_norm": 11.3125, + "learning_rate": 2.9285074853460523e-06, + "loss": 1.308958888053894, + "step": 1338 + }, + { + "epoch": 0.41243459526008003, + "grad_norm": 10.5, + "learning_rate": 2.9282360712950418e-06, + "loss": 1.2196264266967773, + "step": 1340 + }, + { + "epoch": 0.41305016928285626, + "grad_norm": 9.9375, + "learning_rate": 2.927964158900867e-06, + "loss": 1.2395238876342773, + "step": 1342 + }, + { + "epoch": 0.4136657433056325, + "grad_norm": 18.375, + "learning_rate": 2.927691748283631e-06, + "loss": 1.388479471206665, + "step": 1344 + }, + { + "epoch": 0.4142813173284087, + "grad_norm": 18.875, + "learning_rate": 2.9274188395636597e-06, + "loss": 1.6272698640823364, + "step": 1346 + }, + { + "epoch": 0.41489689135118496, + "grad_norm": 14.0, + "learning_rate": 2.9271454328614973e-06, + "loss": 1.5498168468475342, + "step": 1348 + }, + { + "epoch": 0.4155124653739612, + "grad_norm": 10.0625, + "learning_rate": 2.9268715282979057e-06, + "loss": 1.6276700496673584, + "step": 1350 + }, + { + "epoch": 0.4161280393967375, + "grad_norm": 13.8125, + "learning_rate": 2.9265971259938705e-06, + "loss": 1.3738837242126465, + "step": 1352 + }, + { + "epoch": 0.4167436134195137, + "grad_norm": 39.5, + "learning_rate": 2.926322226070595e-06, + "loss": 1.1020140647888184, + "step": 1354 + }, + { + "epoch": 0.41735918744228995, + "grad_norm": 11.9375, + "learning_rate": 2.926046828649503e-06, + "loss": 1.3103208541870117, + "step": 1356 + }, + { + "epoch": 0.4179747614650662, + "grad_norm": 22.875, + "learning_rate": 2.9257709338522375e-06, + "loss": 1.6737949848175049, + "step": 1358 + }, + { + "epoch": 0.4185903354878424, + "grad_norm": 24.75, + "learning_rate": 2.925494541800662e-06, + "loss": 1.2139620780944824, + "step": 1360 + }, + { + "epoch": 0.41920590951061865, + "grad_norm": 14.25, + "learning_rate": 2.9252176526168586e-06, + "loss": 1.862136960029602, + "step": 1362 + }, + { + "epoch": 0.4198214835333949, + "grad_norm": 9.9375, + "learning_rate": 2.924940266423131e-06, + "loss": 1.1231043338775635, + "step": 1364 + }, + { + "epoch": 0.4204370575561711, + "grad_norm": 11.25, + "learning_rate": 2.924662383341999e-06, + "loss": 1.3487285375595093, + "step": 1366 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 26.875, + "learning_rate": 2.9243840034962055e-06, + "loss": 1.7209620475769043, + "step": 1368 + }, + { + "epoch": 0.42166820560172363, + "grad_norm": 34.75, + "learning_rate": 2.92410512700871e-06, + "loss": 1.3228155374526978, + "step": 1370 + }, + { + "epoch": 0.42228377962449987, + "grad_norm": 11.0625, + "learning_rate": 2.923825754002693e-06, + "loss": 1.228677749633789, + "step": 1372 + }, + { + "epoch": 0.4228993536472761, + "grad_norm": 19.625, + "learning_rate": 2.923545884601555e-06, + "loss": 1.5254507064819336, + "step": 1374 + }, + { + "epoch": 0.42351492767005233, + "grad_norm": 12.1875, + "learning_rate": 2.9232655189289123e-06, + "loss": 1.4658113718032837, + "step": 1376 + }, + { + "epoch": 0.42413050169282857, + "grad_norm": 13.1875, + "learning_rate": 2.9229846571086044e-06, + "loss": 1.4198139905929565, + "step": 1378 + }, + { + "epoch": 0.4247460757156048, + "grad_norm": 7.84375, + "learning_rate": 2.9227032992646887e-06, + "loss": 1.259503960609436, + "step": 1380 + }, + { + "epoch": 0.42536164973838103, + "grad_norm": 12.1875, + "learning_rate": 2.9224214455214398e-06, + "loss": 1.034414291381836, + "step": 1382 + }, + { + "epoch": 0.42597722376115726, + "grad_norm": 18.75, + "learning_rate": 2.922139096003354e-06, + "loss": 1.646314263343811, + "step": 1384 + }, + { + "epoch": 0.4265927977839335, + "grad_norm": 8.125, + "learning_rate": 2.9218562508351447e-06, + "loss": 1.3071296215057373, + "step": 1386 + }, + { + "epoch": 0.42720837180670973, + "grad_norm": 13.125, + "learning_rate": 2.921572910141745e-06, + "loss": 1.4267911911010742, + "step": 1388 + }, + { + "epoch": 0.427823945829486, + "grad_norm": 6.28125, + "learning_rate": 2.9212890740483074e-06, + "loss": 1.0795435905456543, + "step": 1390 + }, + { + "epoch": 0.42843951985226225, + "grad_norm": 10.25, + "learning_rate": 2.921004742680202e-06, + "loss": 1.2007197141647339, + "step": 1392 + }, + { + "epoch": 0.4290550938750385, + "grad_norm": 15.6875, + "learning_rate": 2.9207199161630184e-06, + "loss": 1.3095588684082031, + "step": 1394 + }, + { + "epoch": 0.4296706678978147, + "grad_norm": 77.0, + "learning_rate": 2.920434594622565e-06, + "loss": 1.3891422748565674, + "step": 1396 + }, + { + "epoch": 0.43028624192059095, + "grad_norm": 10.5, + "learning_rate": 2.9201487781848682e-06, + "loss": 1.5573385953903198, + "step": 1398 + }, + { + "epoch": 0.4309018159433672, + "grad_norm": 15.3125, + "learning_rate": 2.9198624669761748e-06, + "loss": 1.2296578884124756, + "step": 1400 + }, + { + "epoch": 0.4315173899661434, + "grad_norm": 21.0, + "learning_rate": 2.9195756611229465e-06, + "loss": 1.5690207481384277, + "step": 1402 + }, + { + "epoch": 0.43213296398891965, + "grad_norm": 12.0, + "learning_rate": 2.919288360751868e-06, + "loss": 1.4062907695770264, + "step": 1404 + }, + { + "epoch": 0.4327485380116959, + "grad_norm": 3.140625, + "learning_rate": 2.9190005659898386e-06, + "loss": 1.1973246335983276, + "step": 1406 + }, + { + "epoch": 0.43336411203447217, + "grad_norm": 11.8125, + "learning_rate": 2.918712276963979e-06, + "loss": 1.28580904006958, + "step": 1408 + }, + { + "epoch": 0.4339796860572484, + "grad_norm": 11.5625, + "learning_rate": 2.918423493801626e-06, + "loss": 1.411057710647583, + "step": 1410 + }, + { + "epoch": 0.43459526008002464, + "grad_norm": 18.625, + "learning_rate": 2.918134216630335e-06, + "loss": 1.4428560733795166, + "step": 1412 + }, + { + "epoch": 0.43521083410280087, + "grad_norm": 5.6875, + "learning_rate": 2.9178444455778806e-06, + "loss": 1.1413902044296265, + "step": 1414 + }, + { + "epoch": 0.4358264081255771, + "grad_norm": 23.875, + "learning_rate": 2.9175541807722552e-06, + "loss": 1.7097017765045166, + "step": 1416 + }, + { + "epoch": 0.43644198214835334, + "grad_norm": 17.625, + "learning_rate": 2.917263422341668e-06, + "loss": 1.5849788188934326, + "step": 1418 + }, + { + "epoch": 0.43705755617112957, + "grad_norm": 11.375, + "learning_rate": 2.9169721704145496e-06, + "loss": 1.6044025421142578, + "step": 1420 + }, + { + "epoch": 0.4376731301939058, + "grad_norm": 26.0, + "learning_rate": 2.916680425119544e-06, + "loss": 1.6424756050109863, + "step": 1422 + }, + { + "epoch": 0.43828870421668203, + "grad_norm": 13.0625, + "learning_rate": 2.9163881865855165e-06, + "loss": 1.1550233364105225, + "step": 1424 + }, + { + "epoch": 0.43890427823945827, + "grad_norm": 19.875, + "learning_rate": 2.916095454941549e-06, + "loss": 0.9205485582351685, + "step": 1426 + }, + { + "epoch": 0.43951985226223456, + "grad_norm": 30.5, + "learning_rate": 2.915802230316941e-06, + "loss": 1.3150010108947754, + "step": 1428 + }, + { + "epoch": 0.4401354262850108, + "grad_norm": 9.9375, + "learning_rate": 2.9155085128412115e-06, + "loss": 1.126185655593872, + "step": 1430 + }, + { + "epoch": 0.440751000307787, + "grad_norm": 23.5, + "learning_rate": 2.9152143026440945e-06, + "loss": 1.3212380409240723, + "step": 1432 + }, + { + "epoch": 0.44136657433056325, + "grad_norm": 19.25, + "learning_rate": 2.9149195998555434e-06, + "loss": 1.1305532455444336, + "step": 1434 + }, + { + "epoch": 0.4419821483533395, + "grad_norm": 172.0, + "learning_rate": 2.914624404605729e-06, + "loss": 1.5058670043945312, + "step": 1436 + }, + { + "epoch": 0.4425977223761157, + "grad_norm": 13.5, + "learning_rate": 2.914328717025039e-06, + "loss": 1.3187425136566162, + "step": 1438 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 17.25, + "learning_rate": 2.9140325372440786e-06, + "loss": 1.4263267517089844, + "step": 1440 + }, + { + "epoch": 0.4438288704216682, + "grad_norm": 9.125, + "learning_rate": 2.913735865393672e-06, + "loss": 1.4333791732788086, + "step": 1442 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 7.25, + "learning_rate": 2.9134387016048578e-06, + "loss": 1.5287044048309326, + "step": 1444 + }, + { + "epoch": 0.4450600184672207, + "grad_norm": 8.3125, + "learning_rate": 2.9131410460088953e-06, + "loss": 1.2323602437973022, + "step": 1446 + }, + { + "epoch": 0.44567559248999694, + "grad_norm": 7.96875, + "learning_rate": 2.912842898737258e-06, + "loss": 1.1562038660049438, + "step": 1448 + }, + { + "epoch": 0.4462911665127732, + "grad_norm": 24.25, + "learning_rate": 2.9125442599216385e-06, + "loss": 1.52219820022583, + "step": 1450 + }, + { + "epoch": 0.4469067405355494, + "grad_norm": 15.0, + "learning_rate": 2.912245129693946e-06, + "loss": 1.4202659130096436, + "step": 1452 + }, + { + "epoch": 0.44752231455832564, + "grad_norm": 6.53125, + "learning_rate": 2.9119455081863065e-06, + "loss": 1.0215532779693604, + "step": 1454 + }, + { + "epoch": 0.4481378885811019, + "grad_norm": 15.0625, + "learning_rate": 2.9116453955310632e-06, + "loss": 1.053874135017395, + "step": 1456 + }, + { + "epoch": 0.4487534626038781, + "grad_norm": 16.625, + "learning_rate": 2.9113447918607764e-06, + "loss": 1.2909146547317505, + "step": 1458 + }, + { + "epoch": 0.44936903662665434, + "grad_norm": 35.0, + "learning_rate": 2.9110436973082223e-06, + "loss": 1.6827542781829834, + "step": 1460 + }, + { + "epoch": 0.44998461064943057, + "grad_norm": 38.25, + "learning_rate": 2.9107421120063953e-06, + "loss": 1.359308123588562, + "step": 1462 + }, + { + "epoch": 0.45060018467220686, + "grad_norm": 13.9375, + "learning_rate": 2.9104400360885066e-06, + "loss": 1.2988219261169434, + "step": 1464 + }, + { + "epoch": 0.4512157586949831, + "grad_norm": 9.0625, + "learning_rate": 2.9101374696879824e-06, + "loss": 1.3027936220169067, + "step": 1466 + }, + { + "epoch": 0.4518313327177593, + "grad_norm": 6.40625, + "learning_rate": 2.9098344129384667e-06, + "loss": 1.2598680257797241, + "step": 1468 + }, + { + "epoch": 0.45244690674053556, + "grad_norm": 13.375, + "learning_rate": 2.90953086597382e-06, + "loss": 1.4774788618087769, + "step": 1470 + }, + { + "epoch": 0.4530624807633118, + "grad_norm": 12.75, + "learning_rate": 2.9092268289281206e-06, + "loss": 1.5974787473678589, + "step": 1472 + }, + { + "epoch": 0.453678054786088, + "grad_norm": 32.75, + "learning_rate": 2.908922301935661e-06, + "loss": 1.4435534477233887, + "step": 1474 + }, + { + "epoch": 0.45429362880886426, + "grad_norm": 45.5, + "learning_rate": 2.9086172851309508e-06, + "loss": 1.6630170345306396, + "step": 1476 + }, + { + "epoch": 0.4549092028316405, + "grad_norm": 16.625, + "learning_rate": 2.908311778648717e-06, + "loss": 1.562852382659912, + "step": 1478 + }, + { + "epoch": 0.4555247768544167, + "grad_norm": 36.0, + "learning_rate": 2.908005782623902e-06, + "loss": 1.4895464181900024, + "step": 1480 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 12.375, + "learning_rate": 2.907699297191664e-06, + "loss": 1.5021754503250122, + "step": 1482 + }, + { + "epoch": 0.45675592489996925, + "grad_norm": 16.125, + "learning_rate": 2.9073923224873787e-06, + "loss": 1.359191656112671, + "step": 1484 + }, + { + "epoch": 0.4573714989227455, + "grad_norm": 17.125, + "learning_rate": 2.9070848586466364e-06, + "loss": 1.1666719913482666, + "step": 1486 + }, + { + "epoch": 0.4579870729455217, + "grad_norm": 43.25, + "learning_rate": 2.9067769058052452e-06, + "loss": 0.9205150604248047, + "step": 1488 + }, + { + "epoch": 0.45860264696829794, + "grad_norm": 5.40625, + "learning_rate": 2.9064684640992278e-06, + "loss": 0.7707520127296448, + "step": 1490 + }, + { + "epoch": 0.4592182209910742, + "grad_norm": 31.0, + "learning_rate": 2.906159533664823e-06, + "loss": 1.3506240844726562, + "step": 1492 + }, + { + "epoch": 0.4598337950138504, + "grad_norm": 14.3125, + "learning_rate": 2.9058501146384863e-06, + "loss": 1.8203885555267334, + "step": 1494 + }, + { + "epoch": 0.46044936903662664, + "grad_norm": 19.75, + "learning_rate": 2.9055402071568873e-06, + "loss": 1.492109775543213, + "step": 1496 + }, + { + "epoch": 0.4610649430594029, + "grad_norm": 17.375, + "learning_rate": 2.9052298113569134e-06, + "loss": 1.2597657442092896, + "step": 1498 + }, + { + "epoch": 0.4616805170821791, + "grad_norm": 10.25, + "learning_rate": 2.904918927375667e-06, + "loss": 0.9372320175170898, + "step": 1500 + }, + { + "epoch": 0.4622960911049554, + "grad_norm": 19.25, + "learning_rate": 2.904607555350466e-06, + "loss": 1.623645544052124, + "step": 1502 + }, + { + "epoch": 0.46291166512773163, + "grad_norm": 11.5625, + "learning_rate": 2.9042956954188426e-06, + "loss": 1.3719435930252075, + "step": 1504 + }, + { + "epoch": 0.46352723915050786, + "grad_norm": 17.375, + "learning_rate": 2.9039833477185464e-06, + "loss": 2.0057315826416016, + "step": 1506 + }, + { + "epoch": 0.4641428131732841, + "grad_norm": 25.625, + "learning_rate": 2.9036705123875417e-06, + "loss": 1.4058794975280762, + "step": 1508 + }, + { + "epoch": 0.46475838719606033, + "grad_norm": 16.75, + "learning_rate": 2.9033571895640084e-06, + "loss": 1.2668242454528809, + "step": 1510 + }, + { + "epoch": 0.46537396121883656, + "grad_norm": 16.75, + "learning_rate": 2.903043379386342e-06, + "loss": 1.5426902770996094, + "step": 1512 + }, + { + "epoch": 0.4659895352416128, + "grad_norm": 8.3125, + "learning_rate": 2.9027290819931513e-06, + "loss": 0.8808223605155945, + "step": 1514 + }, + { + "epoch": 0.46660510926438903, + "grad_norm": 22.125, + "learning_rate": 2.9024142975232635e-06, + "loss": 0.8790304064750671, + "step": 1516 + }, + { + "epoch": 0.46722068328716526, + "grad_norm": 20.25, + "learning_rate": 2.9020990261157176e-06, + "loss": 1.3890122175216675, + "step": 1518 + }, + { + "epoch": 0.4678362573099415, + "grad_norm": 43.0, + "learning_rate": 2.9017832679097717e-06, + "loss": 1.4353948831558228, + "step": 1520 + }, + { + "epoch": 0.4684518313327178, + "grad_norm": 10.375, + "learning_rate": 2.9014670230448936e-06, + "loss": 1.5785167217254639, + "step": 1522 + }, + { + "epoch": 0.469067405355494, + "grad_norm": 25.5, + "learning_rate": 2.9011502916607713e-06, + "loss": 1.5384750366210938, + "step": 1524 + }, + { + "epoch": 0.46968297937827025, + "grad_norm": 15.1875, + "learning_rate": 2.9008330738973046e-06, + "loss": 1.5118379592895508, + "step": 1526 + }, + { + "epoch": 0.4702985534010465, + "grad_norm": 6.84375, + "learning_rate": 2.9005153698946093e-06, + "loss": 1.3365740776062012, + "step": 1528 + }, + { + "epoch": 0.4709141274238227, + "grad_norm": 12.5625, + "learning_rate": 2.900197179793015e-06, + "loss": 0.9900118112564087, + "step": 1530 + }, + { + "epoch": 0.47152970144659895, + "grad_norm": 11.5, + "learning_rate": 2.899878503733067e-06, + "loss": 1.3724645376205444, + "step": 1532 + }, + { + "epoch": 0.4721452754693752, + "grad_norm": 10.9375, + "learning_rate": 2.899559341855525e-06, + "loss": 1.733636498451233, + "step": 1534 + }, + { + "epoch": 0.4727608494921514, + "grad_norm": 17.0, + "learning_rate": 2.8992396943013624e-06, + "loss": 1.7250124216079712, + "step": 1536 + }, + { + "epoch": 0.47337642351492765, + "grad_norm": 14.375, + "learning_rate": 2.898919561211769e-06, + "loss": 1.8821617364883423, + "step": 1538 + }, + { + "epoch": 0.47399199753770394, + "grad_norm": 13.6875, + "learning_rate": 2.898598942728147e-06, + "loss": 1.5046849250793457, + "step": 1540 + }, + { + "epoch": 0.47460757156048017, + "grad_norm": 18.5, + "learning_rate": 2.8982778389921146e-06, + "loss": 1.4122357368469238, + "step": 1542 + }, + { + "epoch": 0.4752231455832564, + "grad_norm": 9.375, + "learning_rate": 2.8979562501455037e-06, + "loss": 1.287928819656372, + "step": 1544 + }, + { + "epoch": 0.47583871960603263, + "grad_norm": 20.25, + "learning_rate": 2.8976341763303605e-06, + "loss": 1.093557357788086, + "step": 1546 + }, + { + "epoch": 0.47645429362880887, + "grad_norm": 15.5, + "learning_rate": 2.8973116176889447e-06, + "loss": 1.4339439868927002, + "step": 1548 + }, + { + "epoch": 0.4770698676515851, + "grad_norm": 13.4375, + "learning_rate": 2.896988574363731e-06, + "loss": 1.27329683303833, + "step": 1550 + }, + { + "epoch": 0.47768544167436133, + "grad_norm": 7.84375, + "learning_rate": 2.8966650464974084e-06, + "loss": 1.039716124534607, + "step": 1552 + }, + { + "epoch": 0.47830101569713757, + "grad_norm": 13.9375, + "learning_rate": 2.89634103423288e-06, + "loss": 1.5718767642974854, + "step": 1554 + }, + { + "epoch": 0.4789165897199138, + "grad_norm": 17.75, + "learning_rate": 2.896016537713261e-06, + "loss": 1.633246660232544, + "step": 1556 + }, + { + "epoch": 0.47953216374269003, + "grad_norm": 13.1875, + "learning_rate": 2.895691557081883e-06, + "loss": 1.159442663192749, + "step": 1558 + }, + { + "epoch": 0.4801477377654663, + "grad_norm": 13.8125, + "learning_rate": 2.89536609248229e-06, + "loss": 1.199776530265808, + "step": 1560 + }, + { + "epoch": 0.48076331178824255, + "grad_norm": 11.5625, + "learning_rate": 2.89504014405824e-06, + "loss": 1.600058674812317, + "step": 1562 + }, + { + "epoch": 0.4813788858110188, + "grad_norm": 16.0, + "learning_rate": 2.8947137119537048e-06, + "loss": 1.3177971839904785, + "step": 1564 + }, + { + "epoch": 0.481994459833795, + "grad_norm": 4.40625, + "learning_rate": 2.89438679631287e-06, + "loss": 1.278930902481079, + "step": 1566 + }, + { + "epoch": 0.48261003385657125, + "grad_norm": 16.125, + "learning_rate": 2.894059397280134e-06, + "loss": 1.3205333948135376, + "step": 1568 + }, + { + "epoch": 0.4832256078793475, + "grad_norm": 12.5, + "learning_rate": 2.89373151500011e-06, + "loss": 1.354341983795166, + "step": 1570 + }, + { + "epoch": 0.4838411819021237, + "grad_norm": 27.0, + "learning_rate": 2.8934031496176247e-06, + "loss": 1.4841325283050537, + "step": 1572 + }, + { + "epoch": 0.48445675592489995, + "grad_norm": 14.1875, + "learning_rate": 2.893074301277715e-06, + "loss": 1.2359914779663086, + "step": 1574 + }, + { + "epoch": 0.4850723299476762, + "grad_norm": 5.90625, + "learning_rate": 2.8927449701256367e-06, + "loss": 1.1367638111114502, + "step": 1576 + }, + { + "epoch": 0.4856879039704525, + "grad_norm": 14.375, + "learning_rate": 2.892415156306853e-06, + "loss": 1.4632461071014404, + "step": 1578 + }, + { + "epoch": 0.4863034779932287, + "grad_norm": 3.46875, + "learning_rate": 2.8920848599670444e-06, + "loss": 1.1607825756072998, + "step": 1580 + }, + { + "epoch": 0.48691905201600494, + "grad_norm": 13.0, + "learning_rate": 2.8917540812521034e-06, + "loss": 1.3275295495986938, + "step": 1582 + }, + { + "epoch": 0.48753462603878117, + "grad_norm": 13.125, + "learning_rate": 2.891422820308135e-06, + "loss": 1.3891751766204834, + "step": 1584 + }, + { + "epoch": 0.4881502000615574, + "grad_norm": 9.1875, + "learning_rate": 2.8910910772814575e-06, + "loss": 1.415259838104248, + "step": 1586 + }, + { + "epoch": 0.48876577408433364, + "grad_norm": 15.9375, + "learning_rate": 2.890758852318602e-06, + "loss": 1.2171683311462402, + "step": 1588 + }, + { + "epoch": 0.48938134810710987, + "grad_norm": 12.4375, + "learning_rate": 2.890426145566313e-06, + "loss": 1.3115642070770264, + "step": 1590 + }, + { + "epoch": 0.4899969221298861, + "grad_norm": 16.25, + "learning_rate": 2.8900929571715465e-06, + "loss": 1.3981764316558838, + "step": 1592 + }, + { + "epoch": 0.49061249615266234, + "grad_norm": 11.6875, + "learning_rate": 2.8897592872814738e-06, + "loss": 1.0410218238830566, + "step": 1594 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 46.0, + "learning_rate": 2.8894251360434756e-06, + "loss": 0.5647349953651428, + "step": 1596 + }, + { + "epoch": 0.49184364419821486, + "grad_norm": 30.0, + "learning_rate": 2.8890905036051487e-06, + "loss": 1.3878058195114136, + "step": 1598 + }, + { + "epoch": 0.4924592182209911, + "grad_norm": 44.75, + "learning_rate": 2.888755390114299e-06, + "loss": 1.670772910118103, + "step": 1600 + }, + { + "epoch": 0.4930747922437673, + "grad_norm": 27.5, + "learning_rate": 2.8884197957189477e-06, + "loss": 1.9882588386535645, + "step": 1602 + }, + { + "epoch": 0.49369036626654356, + "grad_norm": 15.1875, + "learning_rate": 2.888083720567326e-06, + "loss": 1.4575190544128418, + "step": 1604 + }, + { + "epoch": 0.4943059402893198, + "grad_norm": 23.125, + "learning_rate": 2.8877471648078796e-06, + "loss": 1.4484859704971313, + "step": 1606 + }, + { + "epoch": 0.494921514312096, + "grad_norm": 9.3125, + "learning_rate": 2.887410128589266e-06, + "loss": 1.4162225723266602, + "step": 1608 + }, + { + "epoch": 0.49553708833487226, + "grad_norm": 23.75, + "learning_rate": 2.887072612060353e-06, + "loss": 1.5183629989624023, + "step": 1610 + }, + { + "epoch": 0.4961526623576485, + "grad_norm": 13.6875, + "learning_rate": 2.8867346153702226e-06, + "loss": 1.486363172531128, + "step": 1612 + }, + { + "epoch": 0.4967682363804247, + "grad_norm": 6.65625, + "learning_rate": 2.886396138668169e-06, + "loss": 1.0761302709579468, + "step": 1614 + }, + { + "epoch": 0.497383810403201, + "grad_norm": 9.125, + "learning_rate": 2.8860571821036973e-06, + "loss": 1.2252916097640991, + "step": 1616 + }, + { + "epoch": 0.49799938442597724, + "grad_norm": 16.375, + "learning_rate": 2.885717745826525e-06, + "loss": 1.1279650926589966, + "step": 1618 + }, + { + "epoch": 0.4986149584487535, + "grad_norm": 19.625, + "learning_rate": 2.8853778299865823e-06, + "loss": 1.4863353967666626, + "step": 1620 + }, + { + "epoch": 0.4992305324715297, + "grad_norm": 8.375, + "learning_rate": 2.8850374347340086e-06, + "loss": 1.3660411834716797, + "step": 1622 + }, + { + "epoch": 0.49984610649430594, + "grad_norm": 7.78125, + "learning_rate": 2.884696560219158e-06, + "loss": 1.0915755033493042, + "step": 1624 + }, + { + "epoch": 0.5004616805170822, + "grad_norm": 10.6875, + "learning_rate": 2.8843552065925955e-06, + "loss": 1.3456721305847168, + "step": 1626 + }, + { + "epoch": 0.5010772545398584, + "grad_norm": 18.125, + "learning_rate": 2.884013374005097e-06, + "loss": 1.553337574005127, + "step": 1628 + }, + { + "epoch": 0.5016928285626346, + "grad_norm": 127.5, + "learning_rate": 2.88367106260765e-06, + "loss": 1.7117575407028198, + "step": 1630 + }, + { + "epoch": 0.5023084025854109, + "grad_norm": 8.125, + "learning_rate": 2.8833282725514537e-06, + "loss": 0.8663567304611206, + "step": 1632 + }, + { + "epoch": 0.5029239766081871, + "grad_norm": 19.25, + "learning_rate": 2.88298500398792e-06, + "loss": 1.617782473564148, + "step": 1634 + }, + { + "epoch": 0.5035395506309633, + "grad_norm": 26.875, + "learning_rate": 2.8826412570686696e-06, + "loss": 1.2509739398956299, + "step": 1636 + }, + { + "epoch": 0.5041551246537396, + "grad_norm": 22.625, + "learning_rate": 2.8822970319455376e-06, + "loss": 1.4614055156707764, + "step": 1638 + }, + { + "epoch": 0.5047706986765158, + "grad_norm": 20.375, + "learning_rate": 2.881952328770567e-06, + "loss": 1.1225662231445312, + "step": 1640 + }, + { + "epoch": 0.505386272699292, + "grad_norm": 13.25, + "learning_rate": 2.881607147696014e-06, + "loss": 1.4158776998519897, + "step": 1642 + }, + { + "epoch": 0.5060018467220684, + "grad_norm": 28.875, + "learning_rate": 2.881261488874346e-06, + "loss": 1.7590508460998535, + "step": 1644 + }, + { + "epoch": 0.5066174207448446, + "grad_norm": 10.3125, + "learning_rate": 2.8809153524582406e-06, + "loss": 0.5588256120681763, + "step": 1646 + }, + { + "epoch": 0.5072329947676208, + "grad_norm": 11.6875, + "learning_rate": 2.8805687386005873e-06, + "loss": 1.0516297817230225, + "step": 1648 + }, + { + "epoch": 0.5078485687903971, + "grad_norm": 9.5625, + "learning_rate": 2.8802216474544842e-06, + "loss": 1.1188182830810547, + "step": 1650 + }, + { + "epoch": 0.5084641428131733, + "grad_norm": 9.4375, + "learning_rate": 2.8798740791732435e-06, + "loss": 1.3351898193359375, + "step": 1652 + }, + { + "epoch": 0.5090797168359495, + "grad_norm": 14.9375, + "learning_rate": 2.8795260339103864e-06, + "loss": 1.552315592765808, + "step": 1654 + }, + { + "epoch": 0.5096952908587258, + "grad_norm": 8.625, + "learning_rate": 2.879177511819643e-06, + "loss": 1.0963990688323975, + "step": 1656 + }, + { + "epoch": 0.510310864881502, + "grad_norm": 15.1875, + "learning_rate": 2.878828513054958e-06, + "loss": 1.716416358947754, + "step": 1658 + }, + { + "epoch": 0.5109264389042782, + "grad_norm": 8.1875, + "learning_rate": 2.8784790377704833e-06, + "loss": 1.1489286422729492, + "step": 1660 + }, + { + "epoch": 0.5115420129270545, + "grad_norm": 74.5, + "learning_rate": 2.8781290861205835e-06, + "loss": 0.5450893640518188, + "step": 1662 + }, + { + "epoch": 0.5121575869498307, + "grad_norm": 13.5625, + "learning_rate": 2.8777786582598325e-06, + "loss": 1.4440956115722656, + "step": 1664 + }, + { + "epoch": 0.512773160972607, + "grad_norm": 24.75, + "learning_rate": 2.877427754343014e-06, + "loss": 1.051446795463562, + "step": 1666 + }, + { + "epoch": 0.5133887349953832, + "grad_norm": 24.875, + "learning_rate": 2.8770763745251223e-06, + "loss": 1.3044072389602661, + "step": 1668 + }, + { + "epoch": 0.5140043090181594, + "grad_norm": 15.0625, + "learning_rate": 2.8767245189613643e-06, + "loss": 1.0021661520004272, + "step": 1670 + }, + { + "epoch": 0.5146198830409356, + "grad_norm": 18.375, + "learning_rate": 2.876372187807153e-06, + "loss": 1.3740590810775757, + "step": 1672 + }, + { + "epoch": 0.5152354570637119, + "grad_norm": 20.25, + "learning_rate": 2.8760193812181143e-06, + "loss": 1.4852182865142822, + "step": 1674 + }, + { + "epoch": 0.5158510310864881, + "grad_norm": 28.875, + "learning_rate": 2.875666099350083e-06, + "loss": 1.6808966398239136, + "step": 1676 + }, + { + "epoch": 0.5164666051092643, + "grad_norm": 20.5, + "learning_rate": 2.8753123423591046e-06, + "loss": 1.5063765048980713, + "step": 1678 + }, + { + "epoch": 0.5170821791320406, + "grad_norm": 15.75, + "learning_rate": 2.8749581104014334e-06, + "loss": 1.543807029724121, + "step": 1680 + }, + { + "epoch": 0.5176977531548169, + "grad_norm": 15.5, + "learning_rate": 2.874603403633535e-06, + "loss": 1.5597282648086548, + "step": 1682 + }, + { + "epoch": 0.5183133271775932, + "grad_norm": 14.3125, + "learning_rate": 2.874248222212082e-06, + "loss": 1.5003395080566406, + "step": 1684 + }, + { + "epoch": 0.5189289012003694, + "grad_norm": 13.8125, + "learning_rate": 2.87389256629396e-06, + "loss": 1.4587888717651367, + "step": 1686 + }, + { + "epoch": 0.5195444752231456, + "grad_norm": 32.5, + "learning_rate": 2.873536436036262e-06, + "loss": 1.307799220085144, + "step": 1688 + }, + { + "epoch": 0.5201600492459219, + "grad_norm": 11.25, + "learning_rate": 2.873179831596292e-06, + "loss": 1.3242509365081787, + "step": 1690 + }, + { + "epoch": 0.5207756232686981, + "grad_norm": 7.5625, + "learning_rate": 2.872822753131561e-06, + "loss": 1.322249412536621, + "step": 1692 + }, + { + "epoch": 0.5213911972914743, + "grad_norm": 10.25, + "learning_rate": 2.8724652007997922e-06, + "loss": 1.420886754989624, + "step": 1694 + }, + { + "epoch": 0.5220067713142506, + "grad_norm": 21.625, + "learning_rate": 2.8721071747589165e-06, + "loss": 1.4671920537948608, + "step": 1696 + }, + { + "epoch": 0.5226223453370268, + "grad_norm": 15.1875, + "learning_rate": 2.8717486751670743e-06, + "loss": 1.6293821334838867, + "step": 1698 + }, + { + "epoch": 0.523237919359803, + "grad_norm": 19.25, + "learning_rate": 2.871389702182616e-06, + "loss": 1.1151347160339355, + "step": 1700 + }, + { + "epoch": 0.5238534933825792, + "grad_norm": 11.9375, + "learning_rate": 2.871030255964099e-06, + "loss": 1.3921657800674438, + "step": 1702 + }, + { + "epoch": 0.5244690674053555, + "grad_norm": 10.0, + "learning_rate": 2.8706703366702926e-06, + "loss": 1.331470251083374, + "step": 1704 + }, + { + "epoch": 0.5250846414281317, + "grad_norm": 7.65625, + "learning_rate": 2.870309944460172e-06, + "loss": 1.0913875102996826, + "step": 1706 + }, + { + "epoch": 0.525700215450908, + "grad_norm": 4.71875, + "learning_rate": 2.869949079492924e-06, + "loss": 1.2790942192077637, + "step": 1708 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 30.0, + "learning_rate": 2.8695877419279436e-06, + "loss": 1.7259317636489868, + "step": 1710 + }, + { + "epoch": 0.5269313634964604, + "grad_norm": 10.625, + "learning_rate": 2.869225931924832e-06, + "loss": 1.1891872882843018, + "step": 1712 + }, + { + "epoch": 0.5275469375192366, + "grad_norm": 9.75, + "learning_rate": 2.868863649643403e-06, + "loss": 1.1908620595932007, + "step": 1714 + }, + { + "epoch": 0.5281625115420129, + "grad_norm": 29.0, + "learning_rate": 2.8685008952436762e-06, + "loss": 1.4761006832122803, + "step": 1716 + }, + { + "epoch": 0.5287780855647891, + "grad_norm": 6.40625, + "learning_rate": 2.8681376688858812e-06, + "loss": 1.137245774269104, + "step": 1718 + }, + { + "epoch": 0.5293936595875655, + "grad_norm": 10.8125, + "learning_rate": 2.867773970730455e-06, + "loss": 1.0170716047286987, + "step": 1720 + }, + { + "epoch": 0.5300092336103417, + "grad_norm": 10.1875, + "learning_rate": 2.867409800938043e-06, + "loss": 1.353341817855835, + "step": 1722 + }, + { + "epoch": 0.5306248076331179, + "grad_norm": 16.625, + "learning_rate": 2.8670451596695006e-06, + "loss": 1.406203031539917, + "step": 1724 + }, + { + "epoch": 0.5312403816558942, + "grad_norm": 8.625, + "learning_rate": 2.8666800470858897e-06, + "loss": 1.1297376155853271, + "step": 1726 + }, + { + "epoch": 0.5318559556786704, + "grad_norm": 11.125, + "learning_rate": 2.866314463348481e-06, + "loss": 1.3886916637420654, + "step": 1728 + }, + { + "epoch": 0.5324715297014466, + "grad_norm": 13.6875, + "learning_rate": 2.865948408618753e-06, + "loss": 1.3519071340560913, + "step": 1730 + }, + { + "epoch": 0.5330871037242229, + "grad_norm": 18.25, + "learning_rate": 2.8655818830583925e-06, + "loss": 1.651322603225708, + "step": 1732 + }, + { + "epoch": 0.5337026777469991, + "grad_norm": 16.75, + "learning_rate": 2.865214886829295e-06, + "loss": 1.5197107791900635, + "step": 1734 + }, + { + "epoch": 0.5343182517697753, + "grad_norm": 29.875, + "learning_rate": 2.864847420093562e-06, + "loss": 1.8645293712615967, + "step": 1736 + }, + { + "epoch": 0.5349338257925516, + "grad_norm": 19.625, + "learning_rate": 2.8644794830135047e-06, + "loss": 1.5438506603240967, + "step": 1738 + }, + { + "epoch": 0.5355493998153278, + "grad_norm": 6.875, + "learning_rate": 2.864111075751641e-06, + "loss": 1.1475858688354492, + "step": 1740 + }, + { + "epoch": 0.536164973838104, + "grad_norm": 21.0, + "learning_rate": 2.8637421984706977e-06, + "loss": 1.403162956237793, + "step": 1742 + }, + { + "epoch": 0.5367805478608803, + "grad_norm": 7.09375, + "learning_rate": 2.8633728513336076e-06, + "loss": 1.1231191158294678, + "step": 1744 + }, + { + "epoch": 0.5373961218836565, + "grad_norm": 11.625, + "learning_rate": 2.863003034503511e-06, + "loss": 1.4450881481170654, + "step": 1746 + }, + { + "epoch": 0.5380116959064327, + "grad_norm": 10.0, + "learning_rate": 2.8626327481437573e-06, + "loss": 1.3785327672958374, + "step": 1748 + }, + { + "epoch": 0.538627269929209, + "grad_norm": 13.375, + "learning_rate": 2.862261992417903e-06, + "loss": 1.3406766653060913, + "step": 1750 + }, + { + "epoch": 0.5392428439519852, + "grad_norm": 9.5625, + "learning_rate": 2.8618907674897106e-06, + "loss": 1.3030524253845215, + "step": 1752 + }, + { + "epoch": 0.5398584179747614, + "grad_norm": 7.84375, + "learning_rate": 2.86151907352315e-06, + "loss": 1.221339225769043, + "step": 1754 + }, + { + "epoch": 0.5404739919975377, + "grad_norm": 26.0, + "learning_rate": 2.8611469106824e-06, + "loss": 1.056274175643921, + "step": 1756 + }, + { + "epoch": 0.541089566020314, + "grad_norm": 19.75, + "learning_rate": 2.8607742791318442e-06, + "loss": 1.3449528217315674, + "step": 1758 + }, + { + "epoch": 0.5417051400430902, + "grad_norm": 6.34375, + "learning_rate": 2.8604011790360755e-06, + "loss": 1.1858000755310059, + "step": 1760 + }, + { + "epoch": 0.5423207140658665, + "grad_norm": 86.0, + "learning_rate": 2.860027610559892e-06, + "loss": 1.5040273666381836, + "step": 1762 + }, + { + "epoch": 0.5429362880886427, + "grad_norm": 31.5, + "learning_rate": 2.8596535738682998e-06, + "loss": 1.4095792770385742, + "step": 1764 + }, + { + "epoch": 0.5435518621114189, + "grad_norm": 55.75, + "learning_rate": 2.8592790691265104e-06, + "loss": 1.7419880628585815, + "step": 1766 + }, + { + "epoch": 0.5441674361341952, + "grad_norm": 15.3125, + "learning_rate": 2.858904096499944e-06, + "loss": 1.3096810579299927, + "step": 1768 + }, + { + "epoch": 0.5447830101569714, + "grad_norm": 11.8125, + "learning_rate": 2.858528656154226e-06, + "loss": 1.6850093603134155, + "step": 1770 + }, + { + "epoch": 0.5453985841797476, + "grad_norm": 10.375, + "learning_rate": 2.8581527482551887e-06, + "loss": 1.2738856077194214, + "step": 1772 + }, + { + "epoch": 0.5460141582025239, + "grad_norm": 30.5, + "learning_rate": 2.8577763729688717e-06, + "loss": 1.3164045810699463, + "step": 1774 + }, + { + "epoch": 0.5466297322253001, + "grad_norm": 19.375, + "learning_rate": 2.857399530461519e-06, + "loss": 1.4970653057098389, + "step": 1776 + }, + { + "epoch": 0.5472453062480763, + "grad_norm": 13.5, + "learning_rate": 2.857022220899584e-06, + "loss": 1.4046273231506348, + "step": 1778 + }, + { + "epoch": 0.5478608802708526, + "grad_norm": 8.5, + "learning_rate": 2.856644444449724e-06, + "loss": 1.539980411529541, + "step": 1780 + }, + { + "epoch": 0.5484764542936288, + "grad_norm": 23.25, + "learning_rate": 2.8562662012788028e-06, + "loss": 1.7872920036315918, + "step": 1782 + }, + { + "epoch": 0.549092028316405, + "grad_norm": 49.5, + "learning_rate": 2.855887491553892e-06, + "loss": 1.6001696586608887, + "step": 1784 + }, + { + "epoch": 0.5497076023391813, + "grad_norm": 13.1875, + "learning_rate": 2.8555083154422666e-06, + "loss": 1.1596770286560059, + "step": 1786 + }, + { + "epoch": 0.5503231763619575, + "grad_norm": 36.75, + "learning_rate": 2.8551286731114104e-06, + "loss": 1.4008393287658691, + "step": 1788 + }, + { + "epoch": 0.5509387503847337, + "grad_norm": 25.25, + "learning_rate": 2.8547485647290116e-06, + "loss": 1.474470853805542, + "step": 1790 + }, + { + "epoch": 0.55155432440751, + "grad_norm": 8.625, + "learning_rate": 2.8543679904629644e-06, + "loss": 1.4998549222946167, + "step": 1792 + }, + { + "epoch": 0.5521698984302862, + "grad_norm": 11.4375, + "learning_rate": 2.8539869504813684e-06, + "loss": 1.033094048500061, + "step": 1794 + }, + { + "epoch": 0.5527854724530625, + "grad_norm": 8.6875, + "learning_rate": 2.8536054449525304e-06, + "loss": 1.3654334545135498, + "step": 1796 + }, + { + "epoch": 0.5534010464758388, + "grad_norm": 19.125, + "learning_rate": 2.853223474044961e-06, + "loss": 1.45066499710083, + "step": 1798 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 12.1875, + "learning_rate": 2.852841037927377e-06, + "loss": 1.2047758102416992, + "step": 1800 + }, + { + "epoch": 0.5546321945213912, + "grad_norm": 14.625, + "learning_rate": 2.8524581367687023e-06, + "loss": 1.1410934925079346, + "step": 1802 + }, + { + "epoch": 0.5552477685441675, + "grad_norm": 24.375, + "learning_rate": 2.8520747707380634e-06, + "loss": 1.5290203094482422, + "step": 1804 + }, + { + "epoch": 0.5558633425669437, + "grad_norm": 20.5, + "learning_rate": 2.8516909400047937e-06, + "loss": 1.005765438079834, + "step": 1806 + }, + { + "epoch": 0.5564789165897199, + "grad_norm": 22.25, + "learning_rate": 2.851306644738432e-06, + "loss": 1.7779279947280884, + "step": 1808 + }, + { + "epoch": 0.5570944906124962, + "grad_norm": 8.0, + "learning_rate": 2.850921885108722e-06, + "loss": 1.135983943939209, + "step": 1810 + }, + { + "epoch": 0.5577100646352724, + "grad_norm": 20.375, + "learning_rate": 2.850536661285612e-06, + "loss": 1.7957868576049805, + "step": 1812 + }, + { + "epoch": 0.5583256386580486, + "grad_norm": 27.875, + "learning_rate": 2.8501509734392566e-06, + "loss": 1.8340908288955688, + "step": 1814 + }, + { + "epoch": 0.5589412126808249, + "grad_norm": 15.1875, + "learning_rate": 2.8497648217400137e-06, + "loss": 1.5594115257263184, + "step": 1816 + }, + { + "epoch": 0.5595567867036011, + "grad_norm": 15.4375, + "learning_rate": 2.849378206358447e-06, + "loss": 1.1051418781280518, + "step": 1818 + }, + { + "epoch": 0.5601723607263773, + "grad_norm": 12.5, + "learning_rate": 2.8489911274653263e-06, + "loss": 1.3022823333740234, + "step": 1820 + }, + { + "epoch": 0.5607879347491536, + "grad_norm": 33.0, + "learning_rate": 2.848603585231623e-06, + "loss": 1.4590941667556763, + "step": 1822 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 6.75, + "learning_rate": 2.848215579828516e-06, + "loss": 1.4862349033355713, + "step": 1824 + }, + { + "epoch": 0.562019082794706, + "grad_norm": 15.3125, + "learning_rate": 2.8478271114273873e-06, + "loss": 1.1151635646820068, + "step": 1826 + }, + { + "epoch": 0.5626346568174823, + "grad_norm": 24.375, + "learning_rate": 2.8474381801998244e-06, + "loss": 1.2278783321380615, + "step": 1828 + }, + { + "epoch": 0.5632502308402585, + "grad_norm": 23.375, + "learning_rate": 2.847048786317618e-06, + "loss": 1.7876145839691162, + "step": 1830 + }, + { + "epoch": 0.5638658048630347, + "grad_norm": 17.0, + "learning_rate": 2.846658929952764e-06, + "loss": 1.65370512008667, + "step": 1832 + }, + { + "epoch": 0.5644813788858111, + "grad_norm": 12.6875, + "learning_rate": 2.8462686112774625e-06, + "loss": 1.4286746978759766, + "step": 1834 + }, + { + "epoch": 0.5650969529085873, + "grad_norm": 9.5, + "learning_rate": 2.845877830464118e-06, + "loss": 1.5145810842514038, + "step": 1836 + }, + { + "epoch": 0.5657125269313635, + "grad_norm": 20.875, + "learning_rate": 2.845486587685338e-06, + "loss": 1.6489734649658203, + "step": 1838 + }, + { + "epoch": 0.5663281009541398, + "grad_norm": 11.9375, + "learning_rate": 2.8450948831139355e-06, + "loss": 1.3738142251968384, + "step": 1840 + }, + { + "epoch": 0.566943674976916, + "grad_norm": 14.5625, + "learning_rate": 2.8447027169229277e-06, + "loss": 1.5936473608016968, + "step": 1842 + }, + { + "epoch": 0.5675592489996922, + "grad_norm": 74.0, + "learning_rate": 2.8443100892855328e-06, + "loss": 1.0497071743011475, + "step": 1844 + }, + { + "epoch": 0.5681748230224685, + "grad_norm": 7.65625, + "learning_rate": 2.843917000375177e-06, + "loss": 1.1069320440292358, + "step": 1846 + }, + { + "epoch": 0.5687903970452447, + "grad_norm": 24.625, + "learning_rate": 2.843523450365486e-06, + "loss": 1.5907843112945557, + "step": 1848 + }, + { + "epoch": 0.5694059710680209, + "grad_norm": 9.0, + "learning_rate": 2.8431294394302937e-06, + "loss": 1.447683334350586, + "step": 1850 + }, + { + "epoch": 0.5700215450907972, + "grad_norm": 19.375, + "learning_rate": 2.842734967743633e-06, + "loss": 1.3191251754760742, + "step": 1852 + }, + { + "epoch": 0.5706371191135734, + "grad_norm": 13.9375, + "learning_rate": 2.8423400354797437e-06, + "loss": 1.2646541595458984, + "step": 1854 + }, + { + "epoch": 0.5712526931363496, + "grad_norm": 25.0, + "learning_rate": 2.841944642813067e-06, + "loss": 0.9847139120101929, + "step": 1856 + }, + { + "epoch": 0.5718682671591259, + "grad_norm": 24.625, + "learning_rate": 2.84154878991825e-06, + "loss": 1.350081443786621, + "step": 1858 + }, + { + "epoch": 0.5724838411819021, + "grad_norm": 79.5, + "learning_rate": 2.841152476970139e-06, + "loss": 1.20438551902771, + "step": 1860 + }, + { + "epoch": 0.5730994152046783, + "grad_norm": 10.4375, + "learning_rate": 2.8407557041437875e-06, + "loss": 1.325690507888794, + "step": 1862 + }, + { + "epoch": 0.5737149892274546, + "grad_norm": 19.875, + "learning_rate": 2.84035847161445e-06, + "loss": 1.2209234237670898, + "step": 1864 + }, + { + "epoch": 0.5743305632502308, + "grad_norm": 7.03125, + "learning_rate": 2.8399607795575845e-06, + "loss": 1.2860186100006104, + "step": 1866 + }, + { + "epoch": 0.574946137273007, + "grad_norm": 15.0625, + "learning_rate": 2.8395626281488528e-06, + "loss": 1.4151506423950195, + "step": 1868 + }, + { + "epoch": 0.5755617112957833, + "grad_norm": 8.1875, + "learning_rate": 2.8391640175641177e-06, + "loss": 1.2648940086364746, + "step": 1870 + }, + { + "epoch": 0.5761772853185596, + "grad_norm": 26.75, + "learning_rate": 2.838764947979447e-06, + "loss": 0.8969897627830505, + "step": 1872 + }, + { + "epoch": 0.5767928593413358, + "grad_norm": 17.375, + "learning_rate": 2.838365419571109e-06, + "loss": 1.4495514631271362, + "step": 1874 + }, + { + "epoch": 0.5774084333641121, + "grad_norm": 4.0625, + "learning_rate": 2.8379654325155772e-06, + "loss": 1.1686997413635254, + "step": 1876 + }, + { + "epoch": 0.5780240073868883, + "grad_norm": 16.625, + "learning_rate": 2.837564986989525e-06, + "loss": 1.185441255569458, + "step": 1878 + }, + { + "epoch": 0.5786395814096645, + "grad_norm": 10.0, + "learning_rate": 2.8371640831698305e-06, + "loss": 1.539408564567566, + "step": 1880 + }, + { + "epoch": 0.5792551554324408, + "grad_norm": 16.375, + "learning_rate": 2.8367627212335734e-06, + "loss": 1.452717661857605, + "step": 1882 + }, + { + "epoch": 0.579870729455217, + "grad_norm": 20.0, + "learning_rate": 2.8363609013580353e-06, + "loss": 1.2913964986801147, + "step": 1884 + }, + { + "epoch": 0.5804863034779932, + "grad_norm": 7.125, + "learning_rate": 2.8359586237207013e-06, + "loss": 1.2374382019042969, + "step": 1886 + }, + { + "epoch": 0.5811018775007695, + "grad_norm": 26.625, + "learning_rate": 2.8355558884992565e-06, + "loss": 1.564466953277588, + "step": 1888 + }, + { + "epoch": 0.5817174515235457, + "grad_norm": 18.25, + "learning_rate": 2.8351526958715914e-06, + "loss": 1.572069525718689, + "step": 1890 + }, + { + "epoch": 0.5823330255463219, + "grad_norm": 41.5, + "learning_rate": 2.834749046015794e-06, + "loss": 1.0248215198516846, + "step": 1892 + }, + { + "epoch": 0.5829485995690982, + "grad_norm": 6.03125, + "learning_rate": 2.8343449391101594e-06, + "loss": 1.573186993598938, + "step": 1894 + }, + { + "epoch": 0.5835641735918744, + "grad_norm": 26.125, + "learning_rate": 2.8339403753331814e-06, + "loss": 1.384692907333374, + "step": 1896 + }, + { + "epoch": 0.5841797476146506, + "grad_norm": 15.6875, + "learning_rate": 2.833535354863556e-06, + "loss": 1.307197093963623, + "step": 1898 + }, + { + "epoch": 0.5847953216374269, + "grad_norm": 5.46875, + "learning_rate": 2.8331298778801806e-06, + "loss": 1.4879049062728882, + "step": 1900 + }, + { + "epoch": 0.5854108956602031, + "grad_norm": 20.5, + "learning_rate": 2.8327239445621555e-06, + "loss": 1.6862337589263916, + "step": 1902 + }, + { + "epoch": 0.5860264696829793, + "grad_norm": 24.125, + "learning_rate": 2.8323175550887824e-06, + "loss": 1.6703479290008545, + "step": 1904 + }, + { + "epoch": 0.5866420437057556, + "grad_norm": 15.1875, + "learning_rate": 2.831910709639563e-06, + "loss": 1.492964744567871, + "step": 1906 + }, + { + "epoch": 0.5872576177285319, + "grad_norm": 13.1875, + "learning_rate": 2.8315034083942028e-06, + "loss": 1.3600826263427734, + "step": 1908 + }, + { + "epoch": 0.5878731917513081, + "grad_norm": 8.0625, + "learning_rate": 2.8310956515326053e-06, + "loss": 1.158477544784546, + "step": 1910 + }, + { + "epoch": 0.5884887657740844, + "grad_norm": 14.6875, + "learning_rate": 2.8306874392348786e-06, + "loss": 1.2859324216842651, + "step": 1912 + }, + { + "epoch": 0.5891043397968606, + "grad_norm": 9.5, + "learning_rate": 2.8302787716813304e-06, + "loss": 1.4909124374389648, + "step": 1914 + }, + { + "epoch": 0.5897199138196368, + "grad_norm": 16.125, + "learning_rate": 2.8298696490524687e-06, + "loss": 1.2859081029891968, + "step": 1916 + }, + { + "epoch": 0.5903354878424131, + "grad_norm": 10.25, + "learning_rate": 2.8294600715290046e-06, + "loss": 1.3993711471557617, + "step": 1918 + }, + { + "epoch": 0.5909510618651893, + "grad_norm": 20.375, + "learning_rate": 2.8290500392918485e-06, + "loss": 0.9705616235733032, + "step": 1920 + }, + { + "epoch": 0.5915666358879655, + "grad_norm": 12.1875, + "learning_rate": 2.8286395525221118e-06, + "loss": 1.364678144454956, + "step": 1922 + }, + { + "epoch": 0.5921822099107418, + "grad_norm": 40.75, + "learning_rate": 2.8282286114011074e-06, + "loss": 1.29304838180542, + "step": 1924 + }, + { + "epoch": 0.592797783933518, + "grad_norm": 7.375, + "learning_rate": 2.8278172161103485e-06, + "loss": 1.2006807327270508, + "step": 1926 + }, + { + "epoch": 0.5934133579562942, + "grad_norm": 9.75, + "learning_rate": 2.8274053668315483e-06, + "loss": 1.387573480606079, + "step": 1928 + }, + { + "epoch": 0.5940289319790705, + "grad_norm": 7.15625, + "learning_rate": 2.8269930637466216e-06, + "loss": 1.365392804145813, + "step": 1930 + }, + { + "epoch": 0.5946445060018467, + "grad_norm": 11.875, + "learning_rate": 2.8265803070376824e-06, + "loss": 1.5575824975967407, + "step": 1932 + }, + { + "epoch": 0.5952600800246229, + "grad_norm": 20.5, + "learning_rate": 2.826167096887047e-06, + "loss": 1.1728448867797852, + "step": 1934 + }, + { + "epoch": 0.5958756540473992, + "grad_norm": 11.8125, + "learning_rate": 2.8257534334772303e-06, + "loss": 1.4417364597320557, + "step": 1936 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 25.125, + "learning_rate": 2.8253393169909474e-06, + "loss": 1.6214367151260376, + "step": 1938 + }, + { + "epoch": 0.5971068020929516, + "grad_norm": 38.5, + "learning_rate": 2.824924747611115e-06, + "loss": 1.4998779296875, + "step": 1940 + }, + { + "epoch": 0.5977223761157279, + "grad_norm": 7.90625, + "learning_rate": 2.824509725520848e-06, + "loss": 1.2933369874954224, + "step": 1942 + }, + { + "epoch": 0.5983379501385041, + "grad_norm": 11.1875, + "learning_rate": 2.8240942509034626e-06, + "loss": 1.4675788879394531, + "step": 1944 + }, + { + "epoch": 0.5989535241612804, + "grad_norm": 9.9375, + "learning_rate": 2.823678323942474e-06, + "loss": 1.1986143589019775, + "step": 1946 + }, + { + "epoch": 0.5995690981840567, + "grad_norm": 8.4375, + "learning_rate": 2.8232619448215984e-06, + "loss": 1.014643669128418, + "step": 1948 + }, + { + "epoch": 0.6001846722068329, + "grad_norm": 14.375, + "learning_rate": 2.82284511372475e-06, + "loss": 1.5362460613250732, + "step": 1950 + }, + { + "epoch": 0.6008002462296091, + "grad_norm": 10.0625, + "learning_rate": 2.822427830836044e-06, + "loss": 1.085378646850586, + "step": 1952 + }, + { + "epoch": 0.6014158202523854, + "grad_norm": 70.0, + "learning_rate": 2.8220100963397945e-06, + "loss": 1.327484369277954, + "step": 1954 + }, + { + "epoch": 0.6020313942751616, + "grad_norm": 14.625, + "learning_rate": 2.821591910420516e-06, + "loss": 1.308484435081482, + "step": 1956 + }, + { + "epoch": 0.6026469682979378, + "grad_norm": 7.6875, + "learning_rate": 2.821173273262921e-06, + "loss": 1.0947763919830322, + "step": 1958 + }, + { + "epoch": 0.6032625423207141, + "grad_norm": 23.625, + "learning_rate": 2.8207541850519226e-06, + "loss": 1.7331056594848633, + "step": 1960 + }, + { + "epoch": 0.6038781163434903, + "grad_norm": 11.6875, + "learning_rate": 2.8203346459726315e-06, + "loss": 1.389496088027954, + "step": 1962 + }, + { + "epoch": 0.6044936903662665, + "grad_norm": 22.625, + "learning_rate": 2.81991465621036e-06, + "loss": 1.372626543045044, + "step": 1964 + }, + { + "epoch": 0.6051092643890428, + "grad_norm": 18.25, + "learning_rate": 2.8194942159506163e-06, + "loss": 1.157492995262146, + "step": 1966 + }, + { + "epoch": 0.605724838411819, + "grad_norm": 22.0, + "learning_rate": 2.819073325379111e-06, + "loss": 0.7561789155006409, + "step": 1968 + }, + { + "epoch": 0.6063404124345952, + "grad_norm": 25.25, + "learning_rate": 2.8186519846817515e-06, + "loss": 1.4574605226516724, + "step": 1970 + }, + { + "epoch": 0.6069559864573715, + "grad_norm": 10.3125, + "learning_rate": 2.818230194044644e-06, + "loss": 1.4265029430389404, + "step": 1972 + }, + { + "epoch": 0.6075715604801477, + "grad_norm": 6.96875, + "learning_rate": 2.817807953654094e-06, + "loss": 0.9847813844680786, + "step": 1974 + }, + { + "epoch": 0.6081871345029239, + "grad_norm": 8.8125, + "learning_rate": 2.817385263696606e-06, + "loss": 1.3704838752746582, + "step": 1976 + }, + { + "epoch": 0.6088027085257002, + "grad_norm": 29.125, + "learning_rate": 2.816962124358883e-06, + "loss": 0.8931136727333069, + "step": 1978 + }, + { + "epoch": 0.6094182825484764, + "grad_norm": 33.25, + "learning_rate": 2.8165385358278245e-06, + "loss": 1.4192783832550049, + "step": 1980 + }, + { + "epoch": 0.6100338565712526, + "grad_norm": 16.25, + "learning_rate": 2.8161144982905313e-06, + "loss": 1.2977051734924316, + "step": 1982 + }, + { + "epoch": 0.610649430594029, + "grad_norm": 18.125, + "learning_rate": 2.8156900119343013e-06, + "loss": 1.8155004978179932, + "step": 1984 + }, + { + "epoch": 0.6112650046168052, + "grad_norm": 21.125, + "learning_rate": 2.81526507694663e-06, + "loss": 1.7881275415420532, + "step": 1986 + }, + { + "epoch": 0.6118805786395815, + "grad_norm": 14.0, + "learning_rate": 2.8148396935152125e-06, + "loss": 1.238133192062378, + "step": 1988 + }, + { + "epoch": 0.6124961526623577, + "grad_norm": 33.75, + "learning_rate": 2.81441386182794e-06, + "loss": 1.4487230777740479, + "step": 1990 + }, + { + "epoch": 0.6131117266851339, + "grad_norm": 7.09375, + "learning_rate": 2.813987582072904e-06, + "loss": 1.2874596118927002, + "step": 1992 + }, + { + "epoch": 0.6137273007079102, + "grad_norm": 11.4375, + "learning_rate": 2.813560854438392e-06, + "loss": 1.3983826637268066, + "step": 1994 + }, + { + "epoch": 0.6143428747306864, + "grad_norm": 22.75, + "learning_rate": 2.8131336791128914e-06, + "loss": 1.0365115404129028, + "step": 1996 + }, + { + "epoch": 0.6149584487534626, + "grad_norm": 38.0, + "learning_rate": 2.8127060562850835e-06, + "loss": 1.5444605350494385, + "step": 1998 + }, + { + "epoch": 0.6155740227762388, + "grad_norm": 15.8125, + "learning_rate": 2.812277986143852e-06, + "loss": 1.8046175241470337, + "step": 2000 + }, + { + "epoch": 0.6161895967990151, + "grad_norm": 25.125, + "learning_rate": 2.8118494688782747e-06, + "loss": 1.093684196472168, + "step": 2002 + }, + { + "epoch": 0.6168051708217913, + "grad_norm": 15.25, + "learning_rate": 2.8114205046776295e-06, + "loss": 1.3421696424484253, + "step": 2004 + }, + { + "epoch": 0.6174207448445675, + "grad_norm": 13.3125, + "learning_rate": 2.810991093731389e-06, + "loss": 1.4425666332244873, + "step": 2006 + }, + { + "epoch": 0.6180363188673438, + "grad_norm": 14.0625, + "learning_rate": 2.810561236229225e-06, + "loss": 1.2806977033615112, + "step": 2008 + }, + { + "epoch": 0.61865189289012, + "grad_norm": 6.9375, + "learning_rate": 2.8101309323610063e-06, + "loss": 1.0246402025222778, + "step": 2010 + }, + { + "epoch": 0.6192674669128962, + "grad_norm": 11.1875, + "learning_rate": 2.8097001823167988e-06, + "loss": 1.3080570697784424, + "step": 2012 + }, + { + "epoch": 0.6198830409356725, + "grad_norm": 18.375, + "learning_rate": 2.809268986286864e-06, + "loss": 1.5274906158447266, + "step": 2014 + }, + { + "epoch": 0.6204986149584487, + "grad_norm": 50.25, + "learning_rate": 2.8088373444616635e-06, + "loss": 1.5145537853240967, + "step": 2016 + }, + { + "epoch": 0.621114188981225, + "grad_norm": 11.5625, + "learning_rate": 2.808405257031853e-06, + "loss": 1.186488389968872, + "step": 2018 + }, + { + "epoch": 0.6217297630040012, + "grad_norm": 11.6875, + "learning_rate": 2.807972724188286e-06, + "loss": 1.3994492292404175, + "step": 2020 + }, + { + "epoch": 0.6223453370267775, + "grad_norm": 27.875, + "learning_rate": 2.8075397461220128e-06, + "loss": 1.7313969135284424, + "step": 2022 + }, + { + "epoch": 0.6229609110495538, + "grad_norm": 17.125, + "learning_rate": 2.80710632302428e-06, + "loss": 0.9865247011184692, + "step": 2024 + }, + { + "epoch": 0.62357648507233, + "grad_norm": 5.9375, + "learning_rate": 2.806672455086532e-06, + "loss": 1.3501904010772705, + "step": 2026 + }, + { + "epoch": 0.6241920590951062, + "grad_norm": 12.125, + "learning_rate": 2.8062381425004084e-06, + "loss": 1.665205717086792, + "step": 2028 + }, + { + "epoch": 0.6248076331178825, + "grad_norm": 24.625, + "learning_rate": 2.805803385457745e-06, + "loss": 1.8587546348571777, + "step": 2030 + }, + { + "epoch": 0.6254232071406587, + "grad_norm": 28.625, + "learning_rate": 2.8053681841505746e-06, + "loss": 1.045028567314148, + "step": 2032 + }, + { + "epoch": 0.6260387811634349, + "grad_norm": 39.75, + "learning_rate": 2.804932538771127e-06, + "loss": 1.2187459468841553, + "step": 2034 + }, + { + "epoch": 0.6266543551862112, + "grad_norm": 18.75, + "learning_rate": 2.804496449511826e-06, + "loss": 1.2937474250793457, + "step": 2036 + }, + { + "epoch": 0.6272699292089874, + "grad_norm": 44.25, + "learning_rate": 2.8040599165652944e-06, + "loss": 1.2768559455871582, + "step": 2038 + }, + { + "epoch": 0.6278855032317636, + "grad_norm": 29.25, + "learning_rate": 2.8036229401243473e-06, + "loss": 1.0333044528961182, + "step": 2040 + }, + { + "epoch": 0.6285010772545399, + "grad_norm": 9.4375, + "learning_rate": 2.8031855203819993e-06, + "loss": 0.9537710547447205, + "step": 2042 + }, + { + "epoch": 0.6291166512773161, + "grad_norm": 8.375, + "learning_rate": 2.8027476575314575e-06, + "loss": 1.1429424285888672, + "step": 2044 + }, + { + "epoch": 0.6297322253000923, + "grad_norm": 38.0, + "learning_rate": 2.8023093517661286e-06, + "loss": 1.6575042009353638, + "step": 2046 + }, + { + "epoch": 0.6303477993228686, + "grad_norm": 8.1875, + "learning_rate": 2.8018706032796115e-06, + "loss": 1.3889267444610596, + "step": 2048 + }, + { + "epoch": 0.6309633733456448, + "grad_norm": 17.5, + "learning_rate": 2.801431412265702e-06, + "loss": 1.4309298992156982, + "step": 2050 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 11.75, + "learning_rate": 2.8009917789183904e-06, + "loss": 1.4273048639297485, + "step": 2052 + }, + { + "epoch": 0.6321945213911973, + "grad_norm": 12.625, + "learning_rate": 2.8005517034318654e-06, + "loss": 1.0454750061035156, + "step": 2054 + }, + { + "epoch": 0.6328100954139735, + "grad_norm": 14.375, + "learning_rate": 2.8001111860005067e-06, + "loss": 1.2234904766082764, + "step": 2056 + }, + { + "epoch": 0.6334256694367497, + "grad_norm": 9.75, + "learning_rate": 2.799670226818893e-06, + "loss": 1.093738079071045, + "step": 2058 + }, + { + "epoch": 0.6340412434595261, + "grad_norm": 22.0, + "learning_rate": 2.799228826081796e-06, + "loss": 1.7311053276062012, + "step": 2060 + }, + { + "epoch": 0.6346568174823023, + "grad_norm": 21.625, + "learning_rate": 2.7987869839841817e-06, + "loss": 1.5796515941619873, + "step": 2062 + }, + { + "epoch": 0.6352723915050785, + "grad_norm": 9.0, + "learning_rate": 2.7983447007212133e-06, + "loss": 1.5824609994888306, + "step": 2064 + }, + { + "epoch": 0.6358879655278548, + "grad_norm": 5.71875, + "learning_rate": 2.7979019764882487e-06, + "loss": 1.1825772523880005, + "step": 2066 + }, + { + "epoch": 0.636503539550631, + "grad_norm": 27.0, + "learning_rate": 2.7974588114808382e-06, + "loss": 1.2671689987182617, + "step": 2068 + }, + { + "epoch": 0.6371191135734072, + "grad_norm": 15.125, + "learning_rate": 2.797015205894729e-06, + "loss": 1.1446564197540283, + "step": 2070 + }, + { + "epoch": 0.6377346875961835, + "grad_norm": 11.8125, + "learning_rate": 2.7965711599258618e-06, + "loss": 1.421542763710022, + "step": 2072 + }, + { + "epoch": 0.6383502616189597, + "grad_norm": 17.625, + "learning_rate": 2.7961266737703725e-06, + "loss": 1.7223405838012695, + "step": 2074 + }, + { + "epoch": 0.6389658356417359, + "grad_norm": 11.125, + "learning_rate": 2.795681747624591e-06, + "loss": 1.7312476634979248, + "step": 2076 + }, + { + "epoch": 0.6395814096645122, + "grad_norm": 432.0, + "learning_rate": 2.795236381685042e-06, + "loss": 1.6300084590911865, + "step": 2078 + }, + { + "epoch": 0.6401969836872884, + "grad_norm": 13.1875, + "learning_rate": 2.7947905761484434e-06, + "loss": 1.3182811737060547, + "step": 2080 + }, + { + "epoch": 0.6408125577100646, + "grad_norm": 32.5, + "learning_rate": 2.7943443312117096e-06, + "loss": 1.3044389486312866, + "step": 2082 + }, + { + "epoch": 0.6414281317328409, + "grad_norm": 16.75, + "learning_rate": 2.793897647071946e-06, + "loss": 1.5463333129882812, + "step": 2084 + }, + { + "epoch": 0.6420437057556171, + "grad_norm": 29.5, + "learning_rate": 2.7934505239264535e-06, + "loss": 1.4204747676849365, + "step": 2086 + }, + { + "epoch": 0.6426592797783933, + "grad_norm": 14.75, + "learning_rate": 2.793002961972728e-06, + "loss": 1.5265976190567017, + "step": 2088 + }, + { + "epoch": 0.6432748538011696, + "grad_norm": 7.84375, + "learning_rate": 2.792554961408457e-06, + "loss": 1.242282509803772, + "step": 2090 + }, + { + "epoch": 0.6438904278239458, + "grad_norm": 15.875, + "learning_rate": 2.792106522431523e-06, + "loss": 1.3729312419891357, + "step": 2092 + }, + { + "epoch": 0.644506001846722, + "grad_norm": 14.5, + "learning_rate": 2.7916576452400033e-06, + "loss": 1.8193776607513428, + "step": 2094 + }, + { + "epoch": 0.6451215758694983, + "grad_norm": 20.75, + "learning_rate": 2.7912083300321656e-06, + "loss": 1.2827033996582031, + "step": 2096 + }, + { + "epoch": 0.6457371498922746, + "grad_norm": 14.0625, + "learning_rate": 2.7907585770064747e-06, + "loss": 1.1103391647338867, + "step": 2098 + }, + { + "epoch": 0.6463527239150508, + "grad_norm": 9.5625, + "learning_rate": 2.7903083863615856e-06, + "loss": 1.3382441997528076, + "step": 2100 + }, + { + "epoch": 0.6469682979378271, + "grad_norm": 20.625, + "learning_rate": 2.789857758296349e-06, + "loss": 1.7949204444885254, + "step": 2102 + }, + { + "epoch": 0.6475838719606033, + "grad_norm": 16.875, + "learning_rate": 2.789406693009807e-06, + "loss": 1.326945424079895, + "step": 2104 + }, + { + "epoch": 0.6481994459833795, + "grad_norm": 11.9375, + "learning_rate": 2.7889551907011965e-06, + "loss": 1.5753204822540283, + "step": 2106 + }, + { + "epoch": 0.6488150200061558, + "grad_norm": 20.5, + "learning_rate": 2.788503251569946e-06, + "loss": 1.9633748531341553, + "step": 2108 + }, + { + "epoch": 0.649430594028932, + "grad_norm": 25.25, + "learning_rate": 2.7880508758156777e-06, + "loss": 1.1748063564300537, + "step": 2110 + }, + { + "epoch": 0.6500461680517082, + "grad_norm": 110.0, + "learning_rate": 2.7875980636382068e-06, + "loss": 1.2353019714355469, + "step": 2112 + }, + { + "epoch": 0.6506617420744845, + "grad_norm": 40.0, + "learning_rate": 2.78714481523754e-06, + "loss": 1.6957015991210938, + "step": 2114 + }, + { + "epoch": 0.6512773160972607, + "grad_norm": 20.5, + "learning_rate": 2.7866911308138785e-06, + "loss": 1.7818708419799805, + "step": 2116 + }, + { + "epoch": 0.6518928901200369, + "grad_norm": 10.0, + "learning_rate": 2.786237010567615e-06, + "loss": 1.3317992687225342, + "step": 2118 + }, + { + "epoch": 0.6525084641428132, + "grad_norm": 27.375, + "learning_rate": 2.7857824546993356e-06, + "loss": 1.1547877788543701, + "step": 2120 + }, + { + "epoch": 0.6531240381655894, + "grad_norm": 8.375, + "learning_rate": 2.7853274634098166e-06, + "loss": 1.246095895767212, + "step": 2122 + }, + { + "epoch": 0.6537396121883656, + "grad_norm": 15.75, + "learning_rate": 2.7848720369000297e-06, + "loss": 1.4210995435714722, + "step": 2124 + }, + { + "epoch": 0.6543551862111419, + "grad_norm": 26.5, + "learning_rate": 2.7844161753711363e-06, + "loss": 1.2631009817123413, + "step": 2126 + }, + { + "epoch": 0.6549707602339181, + "grad_norm": 12.8125, + "learning_rate": 2.7839598790244913e-06, + "loss": 1.5142743587493896, + "step": 2128 + }, + { + "epoch": 0.6555863342566943, + "grad_norm": 42.0, + "learning_rate": 2.783503148061642e-06, + "loss": 1.3284528255462646, + "step": 2130 + }, + { + "epoch": 0.6562019082794706, + "grad_norm": 22.625, + "learning_rate": 2.7830459826843256e-06, + "loss": 1.7355668544769287, + "step": 2132 + }, + { + "epoch": 0.6568174823022468, + "grad_norm": 8.3125, + "learning_rate": 2.782588383094474e-06, + "loss": 1.3677023649215698, + "step": 2134 + }, + { + "epoch": 0.6574330563250231, + "grad_norm": 8.0, + "learning_rate": 2.7821303494942085e-06, + "loss": 1.4090447425842285, + "step": 2136 + }, + { + "epoch": 0.6580486303477994, + "grad_norm": 17.25, + "learning_rate": 2.7816718820858432e-06, + "loss": 1.2605023384094238, + "step": 2138 + }, + { + "epoch": 0.6586642043705756, + "grad_norm": 63.75, + "learning_rate": 2.7812129810718836e-06, + "loss": 1.1370813846588135, + "step": 2140 + }, + { + "epoch": 0.6592797783933518, + "grad_norm": 10.5, + "learning_rate": 2.780753646655028e-06, + "loss": 1.4284716844558716, + "step": 2142 + }, + { + "epoch": 0.6598953524161281, + "grad_norm": 16.75, + "learning_rate": 2.780293879038163e-06, + "loss": 1.4087693691253662, + "step": 2144 + }, + { + "epoch": 0.6605109264389043, + "grad_norm": 72.0, + "learning_rate": 2.7798336784243695e-06, + "loss": 1.7108557224273682, + "step": 2146 + }, + { + "epoch": 0.6611265004616805, + "grad_norm": 87.0, + "learning_rate": 2.7793730450169186e-06, + "loss": 1.2445833683013916, + "step": 2148 + }, + { + "epoch": 0.6617420744844568, + "grad_norm": 34.75, + "learning_rate": 2.778911979019273e-06, + "loss": 1.3835158348083496, + "step": 2150 + }, + { + "epoch": 0.662357648507233, + "grad_norm": 20.25, + "learning_rate": 2.778450480635086e-06, + "loss": 1.3453741073608398, + "step": 2152 + }, + { + "epoch": 0.6629732225300092, + "grad_norm": 19.375, + "learning_rate": 2.777988550068201e-06, + "loss": 1.3057246208190918, + "step": 2154 + }, + { + "epoch": 0.6635887965527855, + "grad_norm": 20.125, + "learning_rate": 2.7775261875226544e-06, + "loss": 1.776896357536316, + "step": 2156 + }, + { + "epoch": 0.6642043705755617, + "grad_norm": 10.6875, + "learning_rate": 2.7770633932026714e-06, + "loss": 1.359185814857483, + "step": 2158 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 24.125, + "learning_rate": 2.776600167312669e-06, + "loss": 1.4824891090393066, + "step": 2160 + }, + { + "epoch": 0.6654355186211142, + "grad_norm": 11.375, + "learning_rate": 2.776136510057255e-06, + "loss": 1.4170936346054077, + "step": 2162 + }, + { + "epoch": 0.6660510926438904, + "grad_norm": 19.25, + "learning_rate": 2.7756724216412274e-06, + "loss": 1.7150704860687256, + "step": 2164 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 26.0, + "learning_rate": 2.7752079022695735e-06, + "loss": 1.2592456340789795, + "step": 2166 + }, + { + "epoch": 0.6672822406894429, + "grad_norm": 10.1875, + "learning_rate": 2.7747429521474738e-06, + "loss": 1.4872198104858398, + "step": 2168 + }, + { + "epoch": 0.6678978147122191, + "grad_norm": 21.625, + "learning_rate": 2.7742775714802955e-06, + "loss": 1.5821186304092407, + "step": 2170 + }, + { + "epoch": 0.6685133887349953, + "grad_norm": 15.6875, + "learning_rate": 2.7738117604735985e-06, + "loss": 1.4031760692596436, + "step": 2172 + }, + { + "epoch": 0.6691289627577717, + "grad_norm": 41.0, + "learning_rate": 2.7733455193331332e-06, + "loss": 1.5786303281784058, + "step": 2174 + }, + { + "epoch": 0.6697445367805479, + "grad_norm": 24.125, + "learning_rate": 2.7728788482648364e-06, + "loss": 1.2885444164276123, + "step": 2176 + }, + { + "epoch": 0.6703601108033241, + "grad_norm": 12.8125, + "learning_rate": 2.7724117474748393e-06, + "loss": 1.0977085828781128, + "step": 2178 + }, + { + "epoch": 0.6709756848261004, + "grad_norm": 26.875, + "learning_rate": 2.7719442171694602e-06, + "loss": 1.7864042520523071, + "step": 2180 + }, + { + "epoch": 0.6715912588488766, + "grad_norm": 16.5, + "learning_rate": 2.7714762575552083e-06, + "loss": 1.787111759185791, + "step": 2182 + }, + { + "epoch": 0.6722068328716528, + "grad_norm": 33.0, + "learning_rate": 2.7710078688387807e-06, + "loss": 1.441255807876587, + "step": 2184 + }, + { + "epoch": 0.6728224068944291, + "grad_norm": 13.5, + "learning_rate": 2.770539051227066e-06, + "loss": 1.3002700805664062, + "step": 2186 + }, + { + "epoch": 0.6734379809172053, + "grad_norm": 25.25, + "learning_rate": 2.770069804927141e-06, + "loss": 1.2706061601638794, + "step": 2188 + }, + { + "epoch": 0.6740535549399815, + "grad_norm": 13.9375, + "learning_rate": 2.7696001301462732e-06, + "loss": 1.1968492269515991, + "step": 2190 + }, + { + "epoch": 0.6746691289627578, + "grad_norm": 19.0, + "learning_rate": 2.769130027091918e-06, + "loss": 1.6952558755874634, + "step": 2192 + }, + { + "epoch": 0.675284702985534, + "grad_norm": 8.625, + "learning_rate": 2.76865949597172e-06, + "loss": 1.7061266899108887, + "step": 2194 + }, + { + "epoch": 0.6759002770083102, + "grad_norm": 17.75, + "learning_rate": 2.768188536993514e-06, + "loss": 0.9394193887710571, + "step": 2196 + }, + { + "epoch": 0.6765158510310865, + "grad_norm": 8.9375, + "learning_rate": 2.7677171503653236e-06, + "loss": 0.793316125869751, + "step": 2198 + }, + { + "epoch": 0.6771314250538627, + "grad_norm": 13.4375, + "learning_rate": 2.7672453362953588e-06, + "loss": 1.0395151376724243, + "step": 2200 + }, + { + "epoch": 0.6777469990766389, + "grad_norm": 28.625, + "learning_rate": 2.766773094992023e-06, + "loss": 1.7974693775177002, + "step": 2202 + }, + { + "epoch": 0.6783625730994152, + "grad_norm": 11.5625, + "learning_rate": 2.766300426663904e-06, + "loss": 0.9771690368652344, + "step": 2204 + }, + { + "epoch": 0.6789781471221914, + "grad_norm": 22.25, + "learning_rate": 2.76582733151978e-06, + "loss": 0.8614020347595215, + "step": 2206 + }, + { + "epoch": 0.6795937211449676, + "grad_norm": 1184.0, + "learning_rate": 2.7653538097686183e-06, + "loss": 1.854684829711914, + "step": 2208 + }, + { + "epoch": 0.6802092951677439, + "grad_norm": 13.25, + "learning_rate": 2.7648798616195734e-06, + "loss": 1.3579025268554688, + "step": 2210 + }, + { + "epoch": 0.6808248691905202, + "grad_norm": 9.9375, + "learning_rate": 2.7644054872819902e-06, + "loss": 0.8971392512321472, + "step": 2212 + }, + { + "epoch": 0.6814404432132964, + "grad_norm": 6.71875, + "learning_rate": 2.7639306869653982e-06, + "loss": 1.313225269317627, + "step": 2214 + }, + { + "epoch": 0.6820560172360727, + "grad_norm": 19.125, + "learning_rate": 2.7634554608795185e-06, + "loss": 1.3202639818191528, + "step": 2216 + }, + { + "epoch": 0.6826715912588489, + "grad_norm": 14.25, + "learning_rate": 2.762979809234259e-06, + "loss": 1.181537389755249, + "step": 2218 + }, + { + "epoch": 0.6832871652816251, + "grad_norm": 4.65625, + "learning_rate": 2.7625037322397156e-06, + "loss": 1.23629629611969, + "step": 2220 + }, + { + "epoch": 0.6839027393044014, + "grad_norm": 41.75, + "learning_rate": 2.7620272301061716e-06, + "loss": 1.233704686164856, + "step": 2222 + }, + { + "epoch": 0.6845183133271776, + "grad_norm": 11.9375, + "learning_rate": 2.7615503030440984e-06, + "loss": 1.2978665828704834, + "step": 2224 + }, + { + "epoch": 0.6851338873499538, + "grad_norm": 13.75, + "learning_rate": 2.761072951264156e-06, + "loss": 1.2603421211242676, + "step": 2226 + }, + { + "epoch": 0.6857494613727301, + "grad_norm": 28.0, + "learning_rate": 2.7605951749771914e-06, + "loss": 1.3707977533340454, + "step": 2228 + }, + { + "epoch": 0.6863650353955063, + "grad_norm": 15.9375, + "learning_rate": 2.7601169743942375e-06, + "loss": 1.8459405899047852, + "step": 2230 + }, + { + "epoch": 0.6869806094182825, + "grad_norm": 19.125, + "learning_rate": 2.7596383497265174e-06, + "loss": 1.42882239818573, + "step": 2232 + }, + { + "epoch": 0.6875961834410588, + "grad_norm": 28.375, + "learning_rate": 2.7591593011854395e-06, + "loss": 1.8060115575790405, + "step": 2234 + }, + { + "epoch": 0.688211757463835, + "grad_norm": 32.75, + "learning_rate": 2.758679828982601e-06, + "loss": 1.7955055236816406, + "step": 2236 + }, + { + "epoch": 0.6888273314866112, + "grad_norm": 15.3125, + "learning_rate": 2.758199933329784e-06, + "loss": 1.348275899887085, + "step": 2238 + }, + { + "epoch": 0.6894429055093875, + "grad_norm": 13.25, + "learning_rate": 2.7577196144389592e-06, + "loss": 1.5242661237716675, + "step": 2240 + }, + { + "epoch": 0.6900584795321637, + "grad_norm": 12.875, + "learning_rate": 2.7572388725222848e-06, + "loss": 1.3932478427886963, + "step": 2242 + }, + { + "epoch": 0.6906740535549399, + "grad_norm": 14.5625, + "learning_rate": 2.7567577077921046e-06, + "loss": 1.2572059631347656, + "step": 2244 + }, + { + "epoch": 0.6912896275777162, + "grad_norm": 18.0, + "learning_rate": 2.7562761204609494e-06, + "loss": 1.7579888105392456, + "step": 2246 + }, + { + "epoch": 0.6919052016004925, + "grad_norm": 22.25, + "learning_rate": 2.7557941107415375e-06, + "loss": 1.524322271347046, + "step": 2248 + }, + { + "epoch": 0.6925207756232687, + "grad_norm": 9.9375, + "learning_rate": 2.755311678846773e-06, + "loss": 1.2153962850570679, + "step": 2250 + }, + { + "epoch": 0.693136349646045, + "grad_norm": 11.1875, + "learning_rate": 2.7548288249897455e-06, + "loss": 1.6535332202911377, + "step": 2252 + }, + { + "epoch": 0.6937519236688212, + "grad_norm": 13.4375, + "learning_rate": 2.7543455493837334e-06, + "loss": 0.6955275535583496, + "step": 2254 + }, + { + "epoch": 0.6943674976915974, + "grad_norm": 5.6875, + "learning_rate": 2.7538618522422e-06, + "loss": 1.3415193557739258, + "step": 2256 + }, + { + "epoch": 0.6949830717143737, + "grad_norm": 10.3125, + "learning_rate": 2.7533777337787945e-06, + "loss": 1.3508849143981934, + "step": 2258 + }, + { + "epoch": 0.6955986457371499, + "grad_norm": 13.5, + "learning_rate": 2.752893194207352e-06, + "loss": 1.2617613077163696, + "step": 2260 + }, + { + "epoch": 0.6962142197599261, + "grad_norm": 17.75, + "learning_rate": 2.7524082337418948e-06, + "loss": 1.5909432172775269, + "step": 2262 + }, + { + "epoch": 0.6968297937827024, + "grad_norm": 16.0, + "learning_rate": 2.751922852596631e-06, + "loss": 1.5903215408325195, + "step": 2264 + }, + { + "epoch": 0.6974453678054786, + "grad_norm": 6.5, + "learning_rate": 2.751437050985954e-06, + "loss": 1.2537171840667725, + "step": 2266 + }, + { + "epoch": 0.6980609418282548, + "grad_norm": 23.375, + "learning_rate": 2.7509508291244417e-06, + "loss": 1.4879339933395386, + "step": 2268 + }, + { + "epoch": 0.6986765158510311, + "grad_norm": 14.375, + "learning_rate": 2.75046418722686e-06, + "loss": 1.5560357570648193, + "step": 2270 + }, + { + "epoch": 0.6992920898738073, + "grad_norm": 12.75, + "learning_rate": 2.749977125508158e-06, + "loss": 1.4228450059890747, + "step": 2272 + }, + { + "epoch": 0.6999076638965835, + "grad_norm": 143.0, + "learning_rate": 2.7494896441834726e-06, + "loss": 1.4056365489959717, + "step": 2274 + }, + { + "epoch": 0.7005232379193598, + "grad_norm": 3.40625, + "learning_rate": 2.749001743468125e-06, + "loss": 1.3732974529266357, + "step": 2276 + }, + { + "epoch": 0.701138811942136, + "grad_norm": 16.875, + "learning_rate": 2.7485134235776207e-06, + "loss": 1.351379156112671, + "step": 2278 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.46875, + "learning_rate": 2.7480246847276512e-06, + "loss": 1.1557962894439697, + "step": 2280 + }, + { + "epoch": 0.7023699599876885, + "grad_norm": 36.25, + "learning_rate": 2.747535527134093e-06, + "loss": 1.2525272369384766, + "step": 2282 + }, + { + "epoch": 0.7029855340104647, + "grad_norm": 16.25, + "learning_rate": 2.747045951013008e-06, + "loss": 0.9319900274276733, + "step": 2284 + }, + { + "epoch": 0.703601108033241, + "grad_norm": 12.3125, + "learning_rate": 2.7465559565806423e-06, + "loss": 1.4514952898025513, + "step": 2286 + }, + { + "epoch": 0.7042166820560173, + "grad_norm": 9.0, + "learning_rate": 2.7460655440534277e-06, + "loss": 1.169830322265625, + "step": 2288 + }, + { + "epoch": 0.7048322560787935, + "grad_norm": 6.125, + "learning_rate": 2.7455747136479778e-06, + "loss": 1.2618813514709473, + "step": 2290 + }, + { + "epoch": 0.7054478301015698, + "grad_norm": 2.96875, + "learning_rate": 2.745083465581096e-06, + "loss": 1.0544602870941162, + "step": 2292 + }, + { + "epoch": 0.706063404124346, + "grad_norm": 8.8125, + "learning_rate": 2.744591800069765e-06, + "loss": 1.2584667205810547, + "step": 2294 + }, + { + "epoch": 0.7066789781471222, + "grad_norm": 12.625, + "learning_rate": 2.7440997173311546e-06, + "loss": 1.459695816040039, + "step": 2296 + }, + { + "epoch": 0.7072945521698984, + "grad_norm": 8.0, + "learning_rate": 2.7436072175826177e-06, + "loss": 1.2529194355010986, + "step": 2298 + }, + { + "epoch": 0.7079101261926747, + "grad_norm": 10.75, + "learning_rate": 2.7431143010416932e-06, + "loss": 1.2173200845718384, + "step": 2300 + }, + { + "epoch": 0.7085257002154509, + "grad_norm": 24.0, + "learning_rate": 2.7426209679261024e-06, + "loss": 1.2202057838439941, + "step": 2302 + }, + { + "epoch": 0.7091412742382271, + "grad_norm": 15.0, + "learning_rate": 2.7421272184537516e-06, + "loss": 1.6685733795166016, + "step": 2304 + }, + { + "epoch": 0.7097568482610034, + "grad_norm": 38.0, + "learning_rate": 2.7416330528427285e-06, + "loss": 1.4149043560028076, + "step": 2306 + }, + { + "epoch": 0.7103724222837796, + "grad_norm": 12.4375, + "learning_rate": 2.7411384713113094e-06, + "loss": 1.2093353271484375, + "step": 2308 + }, + { + "epoch": 0.7109879963065558, + "grad_norm": 10.4375, + "learning_rate": 2.740643474077949e-06, + "loss": 1.1467716693878174, + "step": 2310 + }, + { + "epoch": 0.7116035703293321, + "grad_norm": 25.625, + "learning_rate": 2.74014806136129e-06, + "loss": 1.4263098239898682, + "step": 2312 + }, + { + "epoch": 0.7122191443521083, + "grad_norm": 12.6875, + "learning_rate": 2.739652233380156e-06, + "loss": 1.4533731937408447, + "step": 2314 + }, + { + "epoch": 0.7128347183748845, + "grad_norm": 7.375, + "learning_rate": 2.7391559903535543e-06, + "loss": 1.3134911060333252, + "step": 2316 + }, + { + "epoch": 0.7134502923976608, + "grad_norm": 11.9375, + "learning_rate": 2.7386593325006774e-06, + "loss": 1.3923784494400024, + "step": 2318 + }, + { + "epoch": 0.714065866420437, + "grad_norm": 18.875, + "learning_rate": 2.7381622600408983e-06, + "loss": 1.280435562133789, + "step": 2320 + }, + { + "epoch": 0.7146814404432132, + "grad_norm": 15.75, + "learning_rate": 2.737664773193776e-06, + "loss": 0.7519451379776001, + "step": 2322 + }, + { + "epoch": 0.7152970144659896, + "grad_norm": 15.625, + "learning_rate": 2.7371668721790487e-06, + "loss": 1.4741475582122803, + "step": 2324 + }, + { + "epoch": 0.7159125884887658, + "grad_norm": 13.5625, + "learning_rate": 2.7366685572166416e-06, + "loss": 1.3647998571395874, + "step": 2326 + }, + { + "epoch": 0.716528162511542, + "grad_norm": 19.375, + "learning_rate": 2.736169828526661e-06, + "loss": 1.6258673667907715, + "step": 2328 + }, + { + "epoch": 0.7171437365343183, + "grad_norm": 16.25, + "learning_rate": 2.7356706863293943e-06, + "loss": 1.5511900186538696, + "step": 2330 + }, + { + "epoch": 0.7177593105570945, + "grad_norm": 10.3125, + "learning_rate": 2.7351711308453158e-06, + "loss": 1.2366533279418945, + "step": 2332 + }, + { + "epoch": 0.7183748845798708, + "grad_norm": 7.84375, + "learning_rate": 2.734671162295077e-06, + "loss": 1.3121178150177002, + "step": 2334 + }, + { + "epoch": 0.718990458602647, + "grad_norm": 12.5625, + "learning_rate": 2.7341707808995167e-06, + "loss": 1.1641346216201782, + "step": 2336 + }, + { + "epoch": 0.7196060326254232, + "grad_norm": 12.5625, + "learning_rate": 2.733669986879653e-06, + "loss": 1.4536982774734497, + "step": 2338 + }, + { + "epoch": 0.7202216066481995, + "grad_norm": 22.5, + "learning_rate": 2.733168780456687e-06, + "loss": 1.1714286804199219, + "step": 2340 + }, + { + "epoch": 0.7208371806709757, + "grad_norm": 33.25, + "learning_rate": 2.732667161852003e-06, + "loss": 1.4636425971984863, + "step": 2342 + }, + { + "epoch": 0.7214527546937519, + "grad_norm": 12.5625, + "learning_rate": 2.732165131287165e-06, + "loss": 1.734552025794983, + "step": 2344 + }, + { + "epoch": 0.7220683287165282, + "grad_norm": 16.5, + "learning_rate": 2.731662688983922e-06, + "loss": 1.1435258388519287, + "step": 2346 + }, + { + "epoch": 0.7226839027393044, + "grad_norm": 11.5, + "learning_rate": 2.731159835164203e-06, + "loss": 1.2161524295806885, + "step": 2348 + }, + { + "epoch": 0.7232994767620806, + "grad_norm": 14.25, + "learning_rate": 2.7306565700501187e-06, + "loss": 1.2991399765014648, + "step": 2350 + }, + { + "epoch": 0.7239150507848569, + "grad_norm": 18.125, + "learning_rate": 2.730152893863962e-06, + "loss": 1.5045785903930664, + "step": 2352 + }, + { + "epoch": 0.7245306248076331, + "grad_norm": 19.75, + "learning_rate": 2.7296488068282075e-06, + "loss": 1.297149658203125, + "step": 2354 + }, + { + "epoch": 0.7251461988304093, + "grad_norm": 33.25, + "learning_rate": 2.7291443091655106e-06, + "loss": 1.441976547241211, + "step": 2356 + }, + { + "epoch": 0.7257617728531855, + "grad_norm": 22.625, + "learning_rate": 2.728639401098709e-06, + "loss": 1.2886719703674316, + "step": 2358 + }, + { + "epoch": 0.7263773468759618, + "grad_norm": 30.25, + "learning_rate": 2.7281340828508204e-06, + "loss": 1.936105489730835, + "step": 2360 + }, + { + "epoch": 0.7269929208987381, + "grad_norm": 12.125, + "learning_rate": 2.7276283546450453e-06, + "loss": 1.4851388931274414, + "step": 2362 + }, + { + "epoch": 0.7276084949215144, + "grad_norm": 15.875, + "learning_rate": 2.727122216704764e-06, + "loss": 1.3581610918045044, + "step": 2364 + }, + { + "epoch": 0.7282240689442906, + "grad_norm": 14.6875, + "learning_rate": 2.7266156692535384e-06, + "loss": 1.2150791883468628, + "step": 2366 + }, + { + "epoch": 0.7288396429670668, + "grad_norm": 13.4375, + "learning_rate": 2.7261087125151103e-06, + "loss": 1.5836424827575684, + "step": 2368 + }, + { + "epoch": 0.7294552169898431, + "grad_norm": 9.6875, + "learning_rate": 2.7256013467134044e-06, + "loss": 1.5009336471557617, + "step": 2370 + }, + { + "epoch": 0.7300707910126193, + "grad_norm": 43.25, + "learning_rate": 2.725093572072524e-06, + "loss": 1.7373287677764893, + "step": 2372 + }, + { + "epoch": 0.7306863650353955, + "grad_norm": 12.4375, + "learning_rate": 2.7245853888167537e-06, + "loss": 1.2735278606414795, + "step": 2374 + }, + { + "epoch": 0.7313019390581718, + "grad_norm": 32.75, + "learning_rate": 2.724076797170559e-06, + "loss": 1.5648930072784424, + "step": 2376 + }, + { + "epoch": 0.731917513080948, + "grad_norm": 18.0, + "learning_rate": 2.723567797358585e-06, + "loss": 1.8062489032745361, + "step": 2378 + }, + { + "epoch": 0.7325330871037242, + "grad_norm": 15.0, + "learning_rate": 2.7230583896056573e-06, + "loss": 1.4886645078659058, + "step": 2380 + }, + { + "epoch": 0.7331486611265005, + "grad_norm": 11.625, + "learning_rate": 2.7225485741367827e-06, + "loss": 1.6303420066833496, + "step": 2382 + }, + { + "epoch": 0.7337642351492767, + "grad_norm": 15.1875, + "learning_rate": 2.7220383511771466e-06, + "loss": 1.212205410003662, + "step": 2384 + }, + { + "epoch": 0.7343798091720529, + "grad_norm": 14.125, + "learning_rate": 2.7215277209521153e-06, + "loss": 1.199798345565796, + "step": 2386 + }, + { + "epoch": 0.7349953831948292, + "grad_norm": 14.0625, + "learning_rate": 2.721016683687235e-06, + "loss": 1.6477723121643066, + "step": 2388 + }, + { + "epoch": 0.7356109572176054, + "grad_norm": 36.25, + "learning_rate": 2.7205052396082316e-06, + "loss": 1.1711366176605225, + "step": 2390 + }, + { + "epoch": 0.7362265312403816, + "grad_norm": 9.1875, + "learning_rate": 2.7199933889410095e-06, + "loss": 1.4491968154907227, + "step": 2392 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 8.4375, + "learning_rate": 2.7194811319116537e-06, + "loss": 1.3231985569000244, + "step": 2394 + }, + { + "epoch": 0.7374576792859341, + "grad_norm": 17.375, + "learning_rate": 2.718968468746431e-06, + "loss": 1.2740378379821777, + "step": 2396 + }, + { + "epoch": 0.7380732533087103, + "grad_norm": 15.25, + "learning_rate": 2.7184553996717827e-06, + "loss": 1.203467607498169, + "step": 2398 + }, + { + "epoch": 0.7386888273314867, + "grad_norm": 12.4375, + "learning_rate": 2.7179419249143326e-06, + "loss": 1.423843264579773, + "step": 2400 + }, + { + "epoch": 0.7393044013542629, + "grad_norm": 13.125, + "learning_rate": 2.7174280447008843e-06, + "loss": 1.42924165725708, + "step": 2402 + }, + { + "epoch": 0.7399199753770391, + "grad_norm": 12.875, + "learning_rate": 2.7169137592584177e-06, + "loss": 1.3557947874069214, + "step": 2404 + }, + { + "epoch": 0.7405355493998154, + "grad_norm": 23.875, + "learning_rate": 2.7163990688140948e-06, + "loss": 1.376276969909668, + "step": 2406 + }, + { + "epoch": 0.7411511234225916, + "grad_norm": 7.90625, + "learning_rate": 2.7158839735952536e-06, + "loss": 1.2827298641204834, + "step": 2408 + }, + { + "epoch": 0.7417666974453678, + "grad_norm": 14.875, + "learning_rate": 2.715368473829413e-06, + "loss": 1.4431157112121582, + "step": 2410 + }, + { + "epoch": 0.7423822714681441, + "grad_norm": 8.5625, + "learning_rate": 2.71485256974427e-06, + "loss": 1.05255126953125, + "step": 2412 + }, + { + "epoch": 0.7429978454909203, + "grad_norm": 12.625, + "learning_rate": 2.7143362615676994e-06, + "loss": 1.1840324401855469, + "step": 2414 + }, + { + "epoch": 0.7436134195136965, + "grad_norm": 16.75, + "learning_rate": 2.7138195495277556e-06, + "loss": 1.7737140655517578, + "step": 2416 + }, + { + "epoch": 0.7442289935364728, + "grad_norm": 5.8125, + "learning_rate": 2.7133024338526705e-06, + "loss": 1.3097442388534546, + "step": 2418 + }, + { + "epoch": 0.744844567559249, + "grad_norm": 15.75, + "learning_rate": 2.7127849147708544e-06, + "loss": 1.3597081899642944, + "step": 2420 + }, + { + "epoch": 0.7454601415820252, + "grad_norm": 7.875, + "learning_rate": 2.712266992510897e-06, + "loss": 1.2132012844085693, + "step": 2422 + }, + { + "epoch": 0.7460757156048015, + "grad_norm": 9.5, + "learning_rate": 2.7117486673015647e-06, + "loss": 1.2576617002487183, + "step": 2424 + }, + { + "epoch": 0.7466912896275777, + "grad_norm": 17.25, + "learning_rate": 2.7112299393718024e-06, + "loss": 0.8877116441726685, + "step": 2426 + }, + { + "epoch": 0.7473068636503539, + "grad_norm": 12.875, + "learning_rate": 2.710710808950733e-06, + "loss": 1.40200936794281, + "step": 2428 + }, + { + "epoch": 0.7479224376731302, + "grad_norm": 23.25, + "learning_rate": 2.710191276267656e-06, + "loss": 0.8947640061378479, + "step": 2430 + }, + { + "epoch": 0.7485380116959064, + "grad_norm": 12.6875, + "learning_rate": 2.7096713415520514e-06, + "loss": 1.2542033195495605, + "step": 2432 + }, + { + "epoch": 0.7491535857186826, + "grad_norm": 13.3125, + "learning_rate": 2.709151005033573e-06, + "loss": 1.3930459022521973, + "step": 2434 + }, + { + "epoch": 0.7497691597414589, + "grad_norm": 15.5625, + "learning_rate": 2.7086302669420553e-06, + "loss": 1.506596326828003, + "step": 2436 + }, + { + "epoch": 0.7503847337642352, + "grad_norm": 12.0, + "learning_rate": 2.708109127507509e-06, + "loss": 1.3037588596343994, + "step": 2438 + }, + { + "epoch": 0.7510003077870114, + "grad_norm": 10.3125, + "learning_rate": 2.707587586960121e-06, + "loss": 1.2608420848846436, + "step": 2440 + }, + { + "epoch": 0.7516158818097877, + "grad_norm": 34.0, + "learning_rate": 2.7070656455302567e-06, + "loss": 1.4738867282867432, + "step": 2442 + }, + { + "epoch": 0.7522314558325639, + "grad_norm": 21.875, + "learning_rate": 2.706543303448459e-06, + "loss": 1.5918917655944824, + "step": 2444 + }, + { + "epoch": 0.7528470298553401, + "grad_norm": 8.9375, + "learning_rate": 2.706020560945446e-06, + "loss": 1.3004255294799805, + "step": 2446 + }, + { + "epoch": 0.7534626038781164, + "grad_norm": 12.0, + "learning_rate": 2.705497418252114e-06, + "loss": 1.5478678941726685, + "step": 2448 + }, + { + "epoch": 0.7540781779008926, + "grad_norm": 3.25, + "learning_rate": 2.7049738755995356e-06, + "loss": 1.1431105136871338, + "step": 2450 + }, + { + "epoch": 0.7546937519236688, + "grad_norm": 11.375, + "learning_rate": 2.704449933218961e-06, + "loss": 1.0184783935546875, + "step": 2452 + }, + { + "epoch": 0.7553093259464451, + "grad_norm": 33.75, + "learning_rate": 2.7039255913418157e-06, + "loss": 1.7602791786193848, + "step": 2454 + }, + { + "epoch": 0.7559248999692213, + "grad_norm": 5.96875, + "learning_rate": 2.7034008501997013e-06, + "loss": 1.3575026988983154, + "step": 2456 + }, + { + "epoch": 0.7565404739919975, + "grad_norm": 10.0, + "learning_rate": 2.7028757100243973e-06, + "loss": 1.474034309387207, + "step": 2458 + }, + { + "epoch": 0.7571560480147738, + "grad_norm": 55.0, + "learning_rate": 2.702350171047859e-06, + "loss": 1.3285274505615234, + "step": 2460 + }, + { + "epoch": 0.75777162203755, + "grad_norm": 19.875, + "learning_rate": 2.701824233502217e-06, + "loss": 1.4691517353057861, + "step": 2462 + }, + { + "epoch": 0.7583871960603262, + "grad_norm": 36.5, + "learning_rate": 2.7012978976197793e-06, + "loss": 1.464407205581665, + "step": 2464 + }, + { + "epoch": 0.7590027700831025, + "grad_norm": 15.0, + "learning_rate": 2.7007711636330273e-06, + "loss": 1.5255820751190186, + "step": 2466 + }, + { + "epoch": 0.7596183441058787, + "grad_norm": 9.375, + "learning_rate": 2.7002440317746224e-06, + "loss": 1.279646396636963, + "step": 2468 + }, + { + "epoch": 0.7602339181286549, + "grad_norm": 6.21875, + "learning_rate": 2.699716502277397e-06, + "loss": 1.1852023601531982, + "step": 2470 + }, + { + "epoch": 0.7608494921514312, + "grad_norm": 23.5, + "learning_rate": 2.6991885753743632e-06, + "loss": 1.4758989810943604, + "step": 2472 + }, + { + "epoch": 0.7614650661742074, + "grad_norm": 20.875, + "learning_rate": 2.698660251298706e-06, + "loss": 1.7962994575500488, + "step": 2474 + }, + { + "epoch": 0.7620806401969837, + "grad_norm": 12.125, + "learning_rate": 2.698131530283788e-06, + "loss": 1.674941062927246, + "step": 2476 + }, + { + "epoch": 0.76269621421976, + "grad_norm": 20.625, + "learning_rate": 2.697602412563144e-06, + "loss": 1.3710949420928955, + "step": 2478 + }, + { + "epoch": 0.7633117882425362, + "grad_norm": 13.0, + "learning_rate": 2.697072898370487e-06, + "loss": 1.4410480260849, + "step": 2480 + }, + { + "epoch": 0.7639273622653124, + "grad_norm": 25.875, + "learning_rate": 2.696542987939704e-06, + "loss": 1.4642608165740967, + "step": 2482 + }, + { + "epoch": 0.7645429362880887, + "grad_norm": 81.5, + "learning_rate": 2.6960126815048573e-06, + "loss": 1.811263084411621, + "step": 2484 + }, + { + "epoch": 0.7651585103108649, + "grad_norm": 13.25, + "learning_rate": 2.6954819793001828e-06, + "loss": 1.1713266372680664, + "step": 2486 + }, + { + "epoch": 0.7657740843336411, + "grad_norm": 16.375, + "learning_rate": 2.694950881560094e-06, + "loss": 1.1104718446731567, + "step": 2488 + }, + { + "epoch": 0.7663896583564174, + "grad_norm": 28.375, + "learning_rate": 2.6944193885191753e-06, + "loss": 1.6377125978469849, + "step": 2490 + }, + { + "epoch": 0.7670052323791936, + "grad_norm": 12.125, + "learning_rate": 2.693887500412189e-06, + "loss": 1.28836190700531, + "step": 2492 + }, + { + "epoch": 0.7676208064019698, + "grad_norm": 9.875, + "learning_rate": 2.6933552174740704e-06, + "loss": 0.921204686164856, + "step": 2494 + }, + { + "epoch": 0.7682363804247461, + "grad_norm": 15.8125, + "learning_rate": 2.6928225399399296e-06, + "loss": 1.48775315284729, + "step": 2496 + }, + { + "epoch": 0.7688519544475223, + "grad_norm": 16.5, + "learning_rate": 2.692289468045051e-06, + "loss": 1.3238601684570312, + "step": 2498 + }, + { + "epoch": 0.7694675284702985, + "grad_norm": 4.71875, + "learning_rate": 2.6917560020248935e-06, + "loss": 1.343148946762085, + "step": 2500 + }, + { + "epoch": 0.7700831024930748, + "grad_norm": 40.0, + "learning_rate": 2.6912221421150883e-06, + "loss": 1.518383264541626, + "step": 2502 + }, + { + "epoch": 0.770698676515851, + "grad_norm": 11.0625, + "learning_rate": 2.6906878885514435e-06, + "loss": 1.2683169841766357, + "step": 2504 + }, + { + "epoch": 0.7713142505386272, + "grad_norm": 6.09375, + "learning_rate": 2.6901532415699378e-06, + "loss": 1.1903431415557861, + "step": 2506 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 89.0, + "learning_rate": 2.6896182014067273e-06, + "loss": 1.6317634582519531, + "step": 2508 + }, + { + "epoch": 0.7725453985841797, + "grad_norm": 10.4375, + "learning_rate": 2.689082768298138e-06, + "loss": 1.2749760150909424, + "step": 2510 + }, + { + "epoch": 0.7731609726069559, + "grad_norm": 36.75, + "learning_rate": 2.688546942480673e-06, + "loss": 1.9662134647369385, + "step": 2512 + }, + { + "epoch": 0.7737765466297323, + "grad_norm": 11.5, + "learning_rate": 2.688010724191006e-06, + "loss": 1.454052209854126, + "step": 2514 + }, + { + "epoch": 0.7743921206525085, + "grad_norm": 15.875, + "learning_rate": 2.687474113665985e-06, + "loss": 1.078743577003479, + "step": 2516 + }, + { + "epoch": 0.7750076946752847, + "grad_norm": 50.25, + "learning_rate": 2.686937111142633e-06, + "loss": 1.247643232345581, + "step": 2518 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 10.0625, + "learning_rate": 2.6863997168581427e-06, + "loss": 1.3732048273086548, + "step": 2520 + }, + { + "epoch": 0.7762388427208372, + "grad_norm": 11.625, + "learning_rate": 2.685861931049884e-06, + "loss": 1.6267473697662354, + "step": 2522 + }, + { + "epoch": 0.7768544167436134, + "grad_norm": 12.5, + "learning_rate": 2.6853237539553947e-06, + "loss": 1.3458107709884644, + "step": 2524 + }, + { + "epoch": 0.7774699907663897, + "grad_norm": 10.0, + "learning_rate": 2.684785185812391e-06, + "loss": 1.3616960048675537, + "step": 2526 + }, + { + "epoch": 0.7780855647891659, + "grad_norm": 6.125, + "learning_rate": 2.684246226858758e-06, + "loss": 1.3410570621490479, + "step": 2528 + }, + { + "epoch": 0.7787011388119421, + "grad_norm": 28.875, + "learning_rate": 2.6837068773325537e-06, + "loss": 1.5064880847930908, + "step": 2530 + }, + { + "epoch": 0.7793167128347184, + "grad_norm": 10.0625, + "learning_rate": 2.68316713747201e-06, + "loss": 1.109811782836914, + "step": 2532 + }, + { + "epoch": 0.7799322868574946, + "grad_norm": 19.0, + "learning_rate": 2.6826270075155315e-06, + "loss": 1.1550883054733276, + "step": 2534 + }, + { + "epoch": 0.7805478608802708, + "grad_norm": 4.28125, + "learning_rate": 2.682086487701693e-06, + "loss": 1.033223032951355, + "step": 2536 + }, + { + "epoch": 0.7811634349030471, + "grad_norm": 12.5, + "learning_rate": 2.6815455782692434e-06, + "loss": 1.2696874141693115, + "step": 2538 + }, + { + "epoch": 0.7817790089258233, + "grad_norm": 19.0, + "learning_rate": 2.681004279457102e-06, + "loss": 0.9384381771087646, + "step": 2540 + }, + { + "epoch": 0.7823945829485995, + "grad_norm": 6.53125, + "learning_rate": 2.6804625915043623e-06, + "loss": 1.2728493213653564, + "step": 2542 + }, + { + "epoch": 0.7830101569713758, + "grad_norm": 15.4375, + "learning_rate": 2.679920514650288e-06, + "loss": 0.9392402172088623, + "step": 2544 + }, + { + "epoch": 0.783625730994152, + "grad_norm": 10.75, + "learning_rate": 2.679378049134315e-06, + "loss": 1.66042160987854, + "step": 2546 + }, + { + "epoch": 0.7842413050169282, + "grad_norm": 9.8125, + "learning_rate": 2.678835195196051e-06, + "loss": 1.1751291751861572, + "step": 2548 + }, + { + "epoch": 0.7848568790397045, + "grad_norm": 13.9375, + "learning_rate": 2.6782919530752756e-06, + "loss": 1.453952431678772, + "step": 2550 + }, + { + "epoch": 0.7854724530624808, + "grad_norm": 12.1875, + "learning_rate": 2.6777483230119388e-06, + "loss": 1.568293809890747, + "step": 2552 + }, + { + "epoch": 0.786088027085257, + "grad_norm": 17.75, + "learning_rate": 2.6772043052461632e-06, + "loss": 1.2449113130569458, + "step": 2554 + }, + { + "epoch": 0.7867036011080333, + "grad_norm": 71.5, + "learning_rate": 2.676659900018242e-06, + "loss": 1.2687745094299316, + "step": 2556 + }, + { + "epoch": 0.7873191751308095, + "grad_norm": 106.5, + "learning_rate": 2.6761151075686386e-06, + "loss": 1.4777586460113525, + "step": 2558 + }, + { + "epoch": 0.7879347491535857, + "grad_norm": 8.5625, + "learning_rate": 2.6755699281379897e-06, + "loss": 1.2233953475952148, + "step": 2560 + }, + { + "epoch": 0.788550323176362, + "grad_norm": 12.6875, + "learning_rate": 2.6750243619671015e-06, + "loss": 1.5392688512802124, + "step": 2562 + }, + { + "epoch": 0.7891658971991382, + "grad_norm": 26.25, + "learning_rate": 2.6744784092969506e-06, + "loss": 1.1792702674865723, + "step": 2564 + }, + { + "epoch": 0.7897814712219144, + "grad_norm": 13.3125, + "learning_rate": 2.673932070368686e-06, + "loss": 1.4379987716674805, + "step": 2566 + }, + { + "epoch": 0.7903970452446907, + "grad_norm": 5.0625, + "learning_rate": 2.6733853454236242e-06, + "loss": 1.1217145919799805, + "step": 2568 + }, + { + "epoch": 0.7910126192674669, + "grad_norm": 12.0, + "learning_rate": 2.6728382347032564e-06, + "loss": 1.3270764350891113, + "step": 2570 + }, + { + "epoch": 0.7916281932902431, + "grad_norm": 16.625, + "learning_rate": 2.672290738449241e-06, + "loss": 1.4277535676956177, + "step": 2572 + }, + { + "epoch": 0.7922437673130194, + "grad_norm": 96.0, + "learning_rate": 2.6717428569034083e-06, + "loss": 1.6692981719970703, + "step": 2574 + }, + { + "epoch": 0.7928593413357956, + "grad_norm": 16.625, + "learning_rate": 2.6711945903077576e-06, + "loss": 1.504457712173462, + "step": 2576 + }, + { + "epoch": 0.7934749153585718, + "grad_norm": 17.0, + "learning_rate": 2.6706459389044587e-06, + "loss": 1.1550219058990479, + "step": 2578 + }, + { + "epoch": 0.7940904893813481, + "grad_norm": 12.625, + "learning_rate": 2.670096902935852e-06, + "loss": 0.9650784134864807, + "step": 2580 + }, + { + "epoch": 0.7947060634041243, + "grad_norm": 9.375, + "learning_rate": 2.6695474826444473e-06, + "loss": 1.3110347986221313, + "step": 2582 + }, + { + "epoch": 0.7953216374269005, + "grad_norm": 8.4375, + "learning_rate": 2.6689976782729238e-06, + "loss": 1.3138242959976196, + "step": 2584 + }, + { + "epoch": 0.7959372114496768, + "grad_norm": 19.75, + "learning_rate": 2.6684474900641317e-06, + "loss": 1.6382602453231812, + "step": 2586 + }, + { + "epoch": 0.7965527854724531, + "grad_norm": 12.9375, + "learning_rate": 2.6678969182610885e-06, + "loss": 1.4878780841827393, + "step": 2588 + }, + { + "epoch": 0.7971683594952294, + "grad_norm": 21.625, + "learning_rate": 2.667345963106984e-06, + "loss": 1.1575593948364258, + "step": 2590 + }, + { + "epoch": 0.7977839335180056, + "grad_norm": 19.0, + "learning_rate": 2.6667946248451737e-06, + "loss": 1.3297533988952637, + "step": 2592 + }, + { + "epoch": 0.7983995075407818, + "grad_norm": 44.25, + "learning_rate": 2.6662429037191855e-06, + "loss": 1.3626294136047363, + "step": 2594 + }, + { + "epoch": 0.799015081563558, + "grad_norm": 18.375, + "learning_rate": 2.6656907999727156e-06, + "loss": 1.394080638885498, + "step": 2596 + }, + { + "epoch": 0.7996306555863343, + "grad_norm": 58.75, + "learning_rate": 2.665138313849628e-06, + "loss": 1.375070571899414, + "step": 2598 + }, + { + "epoch": 0.8002462296091105, + "grad_norm": 15.0, + "learning_rate": 2.664585445593957e-06, + "loss": 1.5231802463531494, + "step": 2600 + }, + { + "epoch": 0.8008618036318867, + "grad_norm": 7.53125, + "learning_rate": 2.664032195449905e-06, + "loss": 1.1144105195999146, + "step": 2602 + }, + { + "epoch": 0.801477377654663, + "grad_norm": 5.40625, + "learning_rate": 2.6634785636618434e-06, + "loss": 1.070705771446228, + "step": 2604 + }, + { + "epoch": 0.8020929516774392, + "grad_norm": 16.875, + "learning_rate": 2.6629245504743108e-06, + "loss": 1.2115540504455566, + "step": 2606 + }, + { + "epoch": 0.8027085257002154, + "grad_norm": 8.25, + "learning_rate": 2.662370156132017e-06, + "loss": 1.3473308086395264, + "step": 2608 + }, + { + "epoch": 0.8033240997229917, + "grad_norm": 16.625, + "learning_rate": 2.6618153808798385e-06, + "loss": 1.4530270099639893, + "step": 2610 + }, + { + "epoch": 0.8039396737457679, + "grad_norm": 22.75, + "learning_rate": 2.661260224962819e-06, + "loss": 0.9463323354721069, + "step": 2612 + }, + { + "epoch": 0.8045552477685441, + "grad_norm": 4.625, + "learning_rate": 2.6607046886261728e-06, + "loss": 1.0715365409851074, + "step": 2614 + }, + { + "epoch": 0.8051708217913204, + "grad_norm": 11.3125, + "learning_rate": 2.66014877211528e-06, + "loss": 1.1417533159255981, + "step": 2616 + }, + { + "epoch": 0.8057863958140966, + "grad_norm": 12.625, + "learning_rate": 2.65959247567569e-06, + "loss": 1.3943442106246948, + "step": 2618 + }, + { + "epoch": 0.8064019698368728, + "grad_norm": 11.5625, + "learning_rate": 2.6590357995531195e-06, + "loss": 1.2495189905166626, + "step": 2620 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 11.6875, + "learning_rate": 2.658478743993453e-06, + "loss": 1.047597885131836, + "step": 2622 + }, + { + "epoch": 0.8076331178824253, + "grad_norm": 7.9375, + "learning_rate": 2.657921309242743e-06, + "loss": 1.1524721384048462, + "step": 2624 + }, + { + "epoch": 0.8082486919052017, + "grad_norm": 9.875, + "learning_rate": 2.6573634955472074e-06, + "loss": 1.3182554244995117, + "step": 2626 + }, + { + "epoch": 0.8088642659279779, + "grad_norm": 13.1875, + "learning_rate": 2.656805303153235e-06, + "loss": 1.3932991027832031, + "step": 2628 + }, + { + "epoch": 0.8094798399507541, + "grad_norm": 10.0625, + "learning_rate": 2.65624673230738e-06, + "loss": 0.956896185874939, + "step": 2630 + }, + { + "epoch": 0.8100954139735304, + "grad_norm": 9.8125, + "learning_rate": 2.6556877832563627e-06, + "loss": 1.201633334159851, + "step": 2632 + }, + { + "epoch": 0.8107109879963066, + "grad_norm": 17.25, + "learning_rate": 2.6551284562470716e-06, + "loss": 1.1008045673370361, + "step": 2634 + }, + { + "epoch": 0.8113265620190828, + "grad_norm": 11.0625, + "learning_rate": 2.6545687515265633e-06, + "loss": 1.2820231914520264, + "step": 2636 + }, + { + "epoch": 0.811942136041859, + "grad_norm": 20.875, + "learning_rate": 2.6540086693420585e-06, + "loss": 1.4971567392349243, + "step": 2638 + }, + { + "epoch": 0.8125577100646353, + "grad_norm": 19.875, + "learning_rate": 2.653448209940947e-06, + "loss": 1.4473721981048584, + "step": 2640 + }, + { + "epoch": 0.8131732840874115, + "grad_norm": 10.3125, + "learning_rate": 2.652887373570784e-06, + "loss": 1.492368221282959, + "step": 2642 + }, + { + "epoch": 0.8137888581101878, + "grad_norm": 12.1875, + "learning_rate": 2.6523261604792924e-06, + "loss": 1.4002925157546997, + "step": 2644 + }, + { + "epoch": 0.814404432132964, + "grad_norm": 7.96875, + "learning_rate": 2.65176457091436e-06, + "loss": 1.4026880264282227, + "step": 2646 + }, + { + "epoch": 0.8150200061557402, + "grad_norm": 9.5625, + "learning_rate": 2.651202605124041e-06, + "loss": 1.2172515392303467, + "step": 2648 + }, + { + "epoch": 0.8156355801785165, + "grad_norm": 22.625, + "learning_rate": 2.6506402633565574e-06, + "loss": 1.5221962928771973, + "step": 2650 + }, + { + "epoch": 0.8162511542012927, + "grad_norm": 9.5625, + "learning_rate": 2.650077545860295e-06, + "loss": 1.2790491580963135, + "step": 2652 + }, + { + "epoch": 0.8168667282240689, + "grad_norm": 15.0625, + "learning_rate": 2.6495144528838083e-06, + "loss": 1.5245963335037231, + "step": 2654 + }, + { + "epoch": 0.8174823022468451, + "grad_norm": 15.5, + "learning_rate": 2.6489509846758146e-06, + "loss": 1.5136122703552246, + "step": 2656 + }, + { + "epoch": 0.8180978762696214, + "grad_norm": 18.75, + "learning_rate": 2.6483871414851997e-06, + "loss": 1.0351097583770752, + "step": 2658 + }, + { + "epoch": 0.8187134502923976, + "grad_norm": 11.3125, + "learning_rate": 2.647822923561013e-06, + "loss": 1.3081021308898926, + "step": 2660 + }, + { + "epoch": 0.8193290243151738, + "grad_norm": 15.25, + "learning_rate": 2.6472583311524704e-06, + "loss": 1.4934989213943481, + "step": 2662 + }, + { + "epoch": 0.8199445983379502, + "grad_norm": 28.25, + "learning_rate": 2.646693364508953e-06, + "loss": 1.2610775232315063, + "step": 2664 + }, + { + "epoch": 0.8205601723607264, + "grad_norm": 18.875, + "learning_rate": 2.6461280238800076e-06, + "loss": 1.1182706356048584, + "step": 2666 + }, + { + "epoch": 0.8211757463835027, + "grad_norm": 10.0, + "learning_rate": 2.645562309515345e-06, + "loss": 1.1223533153533936, + "step": 2668 + }, + { + "epoch": 0.8217913204062789, + "grad_norm": 18.0, + "learning_rate": 2.644996221664843e-06, + "loss": 1.2998898029327393, + "step": 2670 + }, + { + "epoch": 0.8224068944290551, + "grad_norm": 11.4375, + "learning_rate": 2.644429760578542e-06, + "loss": 1.4623315334320068, + "step": 2672 + }, + { + "epoch": 0.8230224684518314, + "grad_norm": 14.1875, + "learning_rate": 2.6438629265066496e-06, + "loss": 1.1372411251068115, + "step": 2674 + }, + { + "epoch": 0.8236380424746076, + "grad_norm": 13.0625, + "learning_rate": 2.643295719699536e-06, + "loss": 1.5036474466323853, + "step": 2676 + }, + { + "epoch": 0.8242536164973838, + "grad_norm": 7.8125, + "learning_rate": 2.642728140407738e-06, + "loss": 1.2718346118927002, + "step": 2678 + }, + { + "epoch": 0.8248691905201601, + "grad_norm": 8.8125, + "learning_rate": 2.642160188881955e-06, + "loss": 1.0505506992340088, + "step": 2680 + }, + { + "epoch": 0.8254847645429363, + "grad_norm": 69.0, + "learning_rate": 2.6415918653730535e-06, + "loss": 1.3011550903320312, + "step": 2682 + }, + { + "epoch": 0.8261003385657125, + "grad_norm": 16.0, + "learning_rate": 2.641023170132062e-06, + "loss": 1.2865910530090332, + "step": 2684 + }, + { + "epoch": 0.8267159125884888, + "grad_norm": 15.3125, + "learning_rate": 2.6404541034101723e-06, + "loss": 1.4186267852783203, + "step": 2686 + }, + { + "epoch": 0.827331486611265, + "grad_norm": 4.9375, + "learning_rate": 2.639884665458744e-06, + "loss": 1.345642328262329, + "step": 2688 + }, + { + "epoch": 0.8279470606340412, + "grad_norm": 17.5, + "learning_rate": 2.6393148565292973e-06, + "loss": 1.4285130500793457, + "step": 2690 + }, + { + "epoch": 0.8285626346568175, + "grad_norm": 12.0, + "learning_rate": 2.638744676873517e-06, + "loss": 1.0859196186065674, + "step": 2692 + }, + { + "epoch": 0.8291782086795937, + "grad_norm": 22.125, + "learning_rate": 2.6381741267432527e-06, + "loss": 1.5866267681121826, + "step": 2694 + }, + { + "epoch": 0.8297937827023699, + "grad_norm": 20.125, + "learning_rate": 2.6376032063905177e-06, + "loss": 1.4759366512298584, + "step": 2696 + }, + { + "epoch": 0.8304093567251462, + "grad_norm": 13.1875, + "learning_rate": 2.6370319160674883e-06, + "loss": 1.1637474298477173, + "step": 2698 + }, + { + "epoch": 0.8310249307479224, + "grad_norm": 10.5, + "learning_rate": 2.6364602560265027e-06, + "loss": 1.2510435581207275, + "step": 2700 + }, + { + "epoch": 0.8316405047706987, + "grad_norm": 10.4375, + "learning_rate": 2.6358882265200637e-06, + "loss": 1.4966708421707153, + "step": 2702 + }, + { + "epoch": 0.832256078793475, + "grad_norm": 3.359375, + "learning_rate": 2.6353158278008395e-06, + "loss": 1.2599449157714844, + "step": 2704 + }, + { + "epoch": 0.8328716528162512, + "grad_norm": 15.75, + "learning_rate": 2.6347430601216575e-06, + "loss": 1.3064113855361938, + "step": 2706 + }, + { + "epoch": 0.8334872268390274, + "grad_norm": 33.25, + "learning_rate": 2.63416992373551e-06, + "loss": 1.6911900043487549, + "step": 2708 + }, + { + "epoch": 0.8341028008618037, + "grad_norm": 21.0, + "learning_rate": 2.6335964188955523e-06, + "loss": 1.5358657836914062, + "step": 2710 + }, + { + "epoch": 0.8347183748845799, + "grad_norm": 22.625, + "learning_rate": 2.6330225458551026e-06, + "loss": 1.5055537223815918, + "step": 2712 + }, + { + "epoch": 0.8353339489073561, + "grad_norm": 4.28125, + "learning_rate": 2.6324483048676403e-06, + "loss": 1.00602126121521, + "step": 2714 + }, + { + "epoch": 0.8359495229301324, + "grad_norm": 9.3125, + "learning_rate": 2.631873696186809e-06, + "loss": 1.2267096042633057, + "step": 2716 + }, + { + "epoch": 0.8365650969529086, + "grad_norm": 22.25, + "learning_rate": 2.6312987200664134e-06, + "loss": 1.7715106010437012, + "step": 2718 + }, + { + "epoch": 0.8371806709756848, + "grad_norm": 14.0, + "learning_rate": 2.6307233767604223e-06, + "loss": 1.4620671272277832, + "step": 2720 + }, + { + "epoch": 0.8377962449984611, + "grad_norm": 4.90625, + "learning_rate": 2.6301476665229644e-06, + "loss": 1.3678604364395142, + "step": 2722 + }, + { + "epoch": 0.8384118190212373, + "grad_norm": 15.9375, + "learning_rate": 2.629571589608332e-06, + "loss": 1.2953975200653076, + "step": 2724 + }, + { + "epoch": 0.8390273930440135, + "grad_norm": 6.3125, + "learning_rate": 2.628995146270979e-06, + "loss": 1.2497246265411377, + "step": 2726 + }, + { + "epoch": 0.8396429670667898, + "grad_norm": 13.0, + "learning_rate": 2.6284183367655206e-06, + "loss": 1.2648379802703857, + "step": 2728 + }, + { + "epoch": 0.840258541089566, + "grad_norm": 9.3125, + "learning_rate": 2.627841161346734e-06, + "loss": 1.284532070159912, + "step": 2730 + }, + { + "epoch": 0.8408741151123422, + "grad_norm": 10.125, + "learning_rate": 2.6272636202695597e-06, + "loss": 1.401597261428833, + "step": 2732 + }, + { + "epoch": 0.8414896891351185, + "grad_norm": 28.75, + "learning_rate": 2.626685713789097e-06, + "loss": 1.6952519416809082, + "step": 2734 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 18.875, + "learning_rate": 2.626107442160608e-06, + "loss": 1.7030467987060547, + "step": 2736 + }, + { + "epoch": 0.8427208371806709, + "grad_norm": 13.1875, + "learning_rate": 2.6255288056395165e-06, + "loss": 1.5869030952453613, + "step": 2738 + }, + { + "epoch": 0.8433364112034473, + "grad_norm": 5.09375, + "learning_rate": 2.6249498044814064e-06, + "loss": 1.4417071342468262, + "step": 2740 + }, + { + "epoch": 0.8439519852262235, + "grad_norm": 12.875, + "learning_rate": 2.6243704389420225e-06, + "loss": 1.567571759223938, + "step": 2742 + }, + { + "epoch": 0.8445675592489997, + "grad_norm": 34.0, + "learning_rate": 2.623790709277273e-06, + "loss": 1.6155643463134766, + "step": 2744 + }, + { + "epoch": 0.845183133271776, + "grad_norm": 38.5, + "learning_rate": 2.623210615743224e-06, + "loss": 1.5020222663879395, + "step": 2746 + }, + { + "epoch": 0.8457987072945522, + "grad_norm": 21.625, + "learning_rate": 2.6226301585961033e-06, + "loss": 1.3307198286056519, + "step": 2748 + }, + { + "epoch": 0.8464142813173284, + "grad_norm": 24.0, + "learning_rate": 2.622049338092301e-06, + "loss": 1.7647689580917358, + "step": 2750 + }, + { + "epoch": 0.8470298553401047, + "grad_norm": 42.5, + "learning_rate": 2.621468154488364e-06, + "loss": 1.4326261281967163, + "step": 2752 + }, + { + "epoch": 0.8476454293628809, + "grad_norm": 7.46875, + "learning_rate": 2.6208866080410026e-06, + "loss": 1.283789873123169, + "step": 2754 + }, + { + "epoch": 0.8482610033856571, + "grad_norm": 11.5, + "learning_rate": 2.6203046990070875e-06, + "loss": 0.9671059250831604, + "step": 2756 + }, + { + "epoch": 0.8488765774084334, + "grad_norm": 18.75, + "learning_rate": 2.6197224276436474e-06, + "loss": 1.3419411182403564, + "step": 2758 + }, + { + "epoch": 0.8494921514312096, + "grad_norm": 10.6875, + "learning_rate": 2.619139794207873e-06, + "loss": 1.4022338390350342, + "step": 2760 + }, + { + "epoch": 0.8501077254539858, + "grad_norm": 19.25, + "learning_rate": 2.618556798957113e-06, + "loss": 1.330336570739746, + "step": 2762 + }, + { + "epoch": 0.8507232994767621, + "grad_norm": 11.1875, + "learning_rate": 2.6179734421488785e-06, + "loss": 1.348820686340332, + "step": 2764 + }, + { + "epoch": 0.8513388734995383, + "grad_norm": 6.34375, + "learning_rate": 2.6173897240408385e-06, + "loss": 1.0717982053756714, + "step": 2766 + }, + { + "epoch": 0.8519544475223145, + "grad_norm": 21.0, + "learning_rate": 2.616805644890821e-06, + "loss": 1.4334378242492676, + "step": 2768 + }, + { + "epoch": 0.8525700215450908, + "grad_norm": 12.125, + "learning_rate": 2.6162212049568155e-06, + "loss": 1.236671805381775, + "step": 2770 + }, + { + "epoch": 0.853185595567867, + "grad_norm": 22.375, + "learning_rate": 2.6156364044969694e-06, + "loss": 1.4810919761657715, + "step": 2772 + }, + { + "epoch": 0.8538011695906432, + "grad_norm": 13.5, + "learning_rate": 2.61505124376959e-06, + "loss": 1.5031414031982422, + "step": 2774 + }, + { + "epoch": 0.8544167436134195, + "grad_norm": 13.25, + "learning_rate": 2.614465723033143e-06, + "loss": 1.3797107934951782, + "step": 2776 + }, + { + "epoch": 0.8550323176361958, + "grad_norm": 7.59375, + "learning_rate": 2.6138798425462537e-06, + "loss": 1.4212639331817627, + "step": 2778 + }, + { + "epoch": 0.855647891658972, + "grad_norm": 38.0, + "learning_rate": 2.613293602567706e-06, + "loss": 1.4438972473144531, + "step": 2780 + }, + { + "epoch": 0.8562634656817483, + "grad_norm": 20.625, + "learning_rate": 2.6127070033564434e-06, + "loss": 1.578815221786499, + "step": 2782 + }, + { + "epoch": 0.8568790397045245, + "grad_norm": 4.3125, + "learning_rate": 2.6121200451715675e-06, + "loss": 1.0285112857818604, + "step": 2784 + }, + { + "epoch": 0.8574946137273007, + "grad_norm": 13.4375, + "learning_rate": 2.6115327282723372e-06, + "loss": 1.0916186571121216, + "step": 2786 + }, + { + "epoch": 0.858110187750077, + "grad_norm": 9.375, + "learning_rate": 2.6109450529181725e-06, + "loss": 1.2124738693237305, + "step": 2788 + }, + { + "epoch": 0.8587257617728532, + "grad_norm": 15.4375, + "learning_rate": 2.610357019368649e-06, + "loss": 1.2717947959899902, + "step": 2790 + }, + { + "epoch": 0.8593413357956294, + "grad_norm": 14.125, + "learning_rate": 2.6097686278835026e-06, + "loss": 0.9262675642967224, + "step": 2792 + }, + { + "epoch": 0.8599569098184057, + "grad_norm": 67.5, + "learning_rate": 2.6091798787226263e-06, + "loss": 1.0638432502746582, + "step": 2794 + }, + { + "epoch": 0.8605724838411819, + "grad_norm": 16.5, + "learning_rate": 2.6085907721460716e-06, + "loss": 1.2468934059143066, + "step": 2796 + }, + { + "epoch": 0.8611880578639581, + "grad_norm": 13.125, + "learning_rate": 2.6080013084140466e-06, + "loss": 1.290550708770752, + "step": 2798 + }, + { + "epoch": 0.8618036318867344, + "grad_norm": 16.25, + "learning_rate": 2.60741148778692e-06, + "loss": 1.3275871276855469, + "step": 2800 + }, + { + "epoch": 0.8624192059095106, + "grad_norm": 13.5, + "learning_rate": 2.606821310525213e-06, + "loss": 0.9118420481681824, + "step": 2802 + }, + { + "epoch": 0.8630347799322868, + "grad_norm": 12.4375, + "learning_rate": 2.606230776889611e-06, + "loss": 1.3373104333877563, + "step": 2804 + }, + { + "epoch": 0.8636503539550631, + "grad_norm": 18.75, + "learning_rate": 2.605639887140952e-06, + "loss": 1.4756443500518799, + "step": 2806 + }, + { + "epoch": 0.8642659279778393, + "grad_norm": 19.25, + "learning_rate": 2.605048641540232e-06, + "loss": 1.3395130634307861, + "step": 2808 + }, + { + "epoch": 0.8648815020006155, + "grad_norm": 6.09375, + "learning_rate": 2.6044570403486055e-06, + "loss": 1.3719476461410522, + "step": 2810 + }, + { + "epoch": 0.8654970760233918, + "grad_norm": 7.84375, + "learning_rate": 2.6038650838273833e-06, + "loss": 1.368028998374939, + "step": 2812 + }, + { + "epoch": 0.866112650046168, + "grad_norm": 20.5, + "learning_rate": 2.6032727722380332e-06, + "loss": 1.309047818183899, + "step": 2814 + }, + { + "epoch": 0.8667282240689443, + "grad_norm": 12.0, + "learning_rate": 2.602680105842181e-06, + "loss": 1.3907105922698975, + "step": 2816 + }, + { + "epoch": 0.8673437980917206, + "grad_norm": 15.1875, + "learning_rate": 2.6020870849016066e-06, + "loss": 1.6243841648101807, + "step": 2818 + }, + { + "epoch": 0.8679593721144968, + "grad_norm": 37.25, + "learning_rate": 2.6014937096782484e-06, + "loss": 1.253342628479004, + "step": 2820 + }, + { + "epoch": 0.868574946137273, + "grad_norm": 17.25, + "learning_rate": 2.6008999804342017e-06, + "loss": 1.4394148588180542, + "step": 2822 + }, + { + "epoch": 0.8691905201600493, + "grad_norm": 12.5625, + "learning_rate": 2.6003058974317166e-06, + "loss": 1.3138532638549805, + "step": 2824 + }, + { + "epoch": 0.8698060941828255, + "grad_norm": 5.25, + "learning_rate": 2.5997114609332e-06, + "loss": 1.1129381656646729, + "step": 2826 + }, + { + "epoch": 0.8704216682056017, + "grad_norm": 10.0, + "learning_rate": 2.5991166712012163e-06, + "loss": 1.2607460021972656, + "step": 2828 + }, + { + "epoch": 0.871037242228378, + "grad_norm": 14.375, + "learning_rate": 2.5985215284984843e-06, + "loss": 1.262549638748169, + "step": 2830 + }, + { + "epoch": 0.8716528162511542, + "grad_norm": 15.5, + "learning_rate": 2.5979260330878776e-06, + "loss": 1.0864108800888062, + "step": 2832 + }, + { + "epoch": 0.8722683902739304, + "grad_norm": 5.28125, + "learning_rate": 2.59733018523243e-06, + "loss": 1.2294626235961914, + "step": 2834 + }, + { + "epoch": 0.8728839642967067, + "grad_norm": 12.5625, + "learning_rate": 2.596733985195327e-06, + "loss": 1.5669360160827637, + "step": 2836 + }, + { + "epoch": 0.8734995383194829, + "grad_norm": 54.5, + "learning_rate": 2.5961374332399104e-06, + "loss": 1.681774616241455, + "step": 2838 + }, + { + "epoch": 0.8741151123422591, + "grad_norm": 13.5625, + "learning_rate": 2.595540529629678e-06, + "loss": 1.4134490489959717, + "step": 2840 + }, + { + "epoch": 0.8747306863650354, + "grad_norm": 22.0, + "learning_rate": 2.594943274628283e-06, + "loss": 1.5794594287872314, + "step": 2842 + }, + { + "epoch": 0.8753462603878116, + "grad_norm": 16.125, + "learning_rate": 2.5943456684995334e-06, + "loss": 1.5964782238006592, + "step": 2844 + }, + { + "epoch": 0.8759618344105878, + "grad_norm": 20.75, + "learning_rate": 2.5937477115073933e-06, + "loss": 1.819379210472107, + "step": 2846 + }, + { + "epoch": 0.8765774084333641, + "grad_norm": 14.9375, + "learning_rate": 2.5931494039159797e-06, + "loss": 1.4165709018707275, + "step": 2848 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 15.875, + "learning_rate": 2.5925507459895673e-06, + "loss": 1.6482594013214111, + "step": 2850 + }, + { + "epoch": 0.8778085564789165, + "grad_norm": 40.0, + "learning_rate": 2.5919517379925825e-06, + "loss": 1.3646832704544067, + "step": 2852 + }, + { + "epoch": 0.8784241305016929, + "grad_norm": 15.3125, + "learning_rate": 2.5913523801896083e-06, + "loss": 1.118886947631836, + "step": 2854 + }, + { + "epoch": 0.8790397045244691, + "grad_norm": 15.3125, + "learning_rate": 2.5907526728453826e-06, + "loss": 1.0934598445892334, + "step": 2856 + }, + { + "epoch": 0.8796552785472453, + "grad_norm": 7.0, + "learning_rate": 2.5901526162247956e-06, + "loss": 1.2138285636901855, + "step": 2858 + }, + { + "epoch": 0.8802708525700216, + "grad_norm": 19.0, + "learning_rate": 2.5895522105928932e-06, + "loss": 1.4065654277801514, + "step": 2860 + }, + { + "epoch": 0.8808864265927978, + "grad_norm": 84.0, + "learning_rate": 2.5889514562148764e-06, + "loss": 1.5109992027282715, + "step": 2862 + }, + { + "epoch": 0.881502000615574, + "grad_norm": 12.375, + "learning_rate": 2.5883503533560976e-06, + "loss": 1.4605238437652588, + "step": 2864 + }, + { + "epoch": 0.8821175746383503, + "grad_norm": 49.5, + "learning_rate": 2.5877489022820653e-06, + "loss": 0.7409439086914062, + "step": 2866 + }, + { + "epoch": 0.8827331486611265, + "grad_norm": 14.0, + "learning_rate": 2.5871471032584412e-06, + "loss": 1.4429962635040283, + "step": 2868 + }, + { + "epoch": 0.8833487226839027, + "grad_norm": 23.25, + "learning_rate": 2.586544956551041e-06, + "loss": 1.6522072553634644, + "step": 2870 + }, + { + "epoch": 0.883964296706679, + "grad_norm": 7.25, + "learning_rate": 2.5859424624258324e-06, + "loss": 1.1676018238067627, + "step": 2872 + }, + { + "epoch": 0.8845798707294552, + "grad_norm": 7.71875, + "learning_rate": 2.585339621148939e-06, + "loss": 1.1003012657165527, + "step": 2874 + }, + { + "epoch": 0.8851954447522314, + "grad_norm": 9.875, + "learning_rate": 2.5847364329866354e-06, + "loss": 1.3532074689865112, + "step": 2876 + }, + { + "epoch": 0.8858110187750077, + "grad_norm": 13.0, + "learning_rate": 2.5841328982053518e-06, + "loss": 1.2843401432037354, + "step": 2878 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 10.1875, + "learning_rate": 2.5835290170716688e-06, + "loss": 1.2896305322647095, + "step": 2880 + }, + { + "epoch": 0.8870421668205601, + "grad_norm": 17.75, + "learning_rate": 2.5829247898523217e-06, + "loss": 1.3048279285430908, + "step": 2882 + }, + { + "epoch": 0.8876577408433364, + "grad_norm": 13.0, + "learning_rate": 2.5823202168141993e-06, + "loss": 1.1085158586502075, + "step": 2884 + }, + { + "epoch": 0.8882733148661126, + "grad_norm": 9.0625, + "learning_rate": 2.5817152982243413e-06, + "loss": 1.4354221820831299, + "step": 2886 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 15.75, + "learning_rate": 2.581110034349942e-06, + "loss": 1.0920836925506592, + "step": 2888 + }, + { + "epoch": 0.8895044629116651, + "grad_norm": 29.625, + "learning_rate": 2.5805044254583456e-06, + "loss": 0.6879758834838867, + "step": 2890 + }, + { + "epoch": 0.8901200369344414, + "grad_norm": 11.125, + "learning_rate": 2.5798984718170507e-06, + "loss": 1.3846540451049805, + "step": 2892 + }, + { + "epoch": 0.8907356109572176, + "grad_norm": 33.5, + "learning_rate": 2.579292173693709e-06, + "loss": 1.2994391918182373, + "step": 2894 + }, + { + "epoch": 0.8913511849799939, + "grad_norm": 13.3125, + "learning_rate": 2.5786855313561216e-06, + "loss": 1.164937973022461, + "step": 2896 + }, + { + "epoch": 0.8919667590027701, + "grad_norm": 23.0, + "learning_rate": 2.5780785450722434e-06, + "loss": 1.4573490619659424, + "step": 2898 + }, + { + "epoch": 0.8925823330255463, + "grad_norm": 22.875, + "learning_rate": 2.5774712151101814e-06, + "loss": 1.6126925945281982, + "step": 2900 + }, + { + "epoch": 0.8931979070483226, + "grad_norm": 21.625, + "learning_rate": 2.576863541738193e-06, + "loss": 1.1551085710525513, + "step": 2902 + }, + { + "epoch": 0.8938134810710988, + "grad_norm": 20.0, + "learning_rate": 2.5762555252246896e-06, + "loss": 1.3950368165969849, + "step": 2904 + }, + { + "epoch": 0.894429055093875, + "grad_norm": 13.4375, + "learning_rate": 2.5756471658382325e-06, + "loss": 1.4055218696594238, + "step": 2906 + }, + { + "epoch": 0.8950446291166513, + "grad_norm": 11.0, + "learning_rate": 2.5750384638475337e-06, + "loss": 1.2864396572113037, + "step": 2908 + }, + { + "epoch": 0.8956602031394275, + "grad_norm": 20.25, + "learning_rate": 2.5744294195214584e-06, + "loss": 1.6634044647216797, + "step": 2910 + }, + { + "epoch": 0.8962757771622037, + "grad_norm": 12.6875, + "learning_rate": 2.573820033129022e-06, + "loss": 1.3990838527679443, + "step": 2912 + }, + { + "epoch": 0.89689135118498, + "grad_norm": 15.75, + "learning_rate": 2.5732103049393908e-06, + "loss": 1.301020860671997, + "step": 2914 + }, + { + "epoch": 0.8975069252077562, + "grad_norm": 41.5, + "learning_rate": 2.572600235221883e-06, + "loss": 1.5635402202606201, + "step": 2916 + }, + { + "epoch": 0.8981224992305324, + "grad_norm": 8.625, + "learning_rate": 2.571989824245967e-06, + "loss": 1.3051490783691406, + "step": 2918 + }, + { + "epoch": 0.8987380732533087, + "grad_norm": 12.6875, + "learning_rate": 2.571379072281262e-06, + "loss": 1.5666720867156982, + "step": 2920 + }, + { + "epoch": 0.8993536472760849, + "grad_norm": 15.6875, + "learning_rate": 2.5707679795975377e-06, + "loss": 1.0976910591125488, + "step": 2922 + }, + { + "epoch": 0.8999692212988611, + "grad_norm": 16.875, + "learning_rate": 2.5701565464647146e-06, + "loss": 1.0979533195495605, + "step": 2924 + }, + { + "epoch": 0.9005847953216374, + "grad_norm": 12.75, + "learning_rate": 2.5695447731528634e-06, + "loss": 1.7441458702087402, + "step": 2926 + }, + { + "epoch": 0.9012003693444137, + "grad_norm": 7.71875, + "learning_rate": 2.5689326599322043e-06, + "loss": 1.4589111804962158, + "step": 2928 + }, + { + "epoch": 0.90181594336719, + "grad_norm": 10.0625, + "learning_rate": 2.5683202070731097e-06, + "loss": 1.434112310409546, + "step": 2930 + }, + { + "epoch": 0.9024315173899662, + "grad_norm": 29.0, + "learning_rate": 2.5677074148460995e-06, + "loss": 1.6956961154937744, + "step": 2932 + }, + { + "epoch": 0.9030470914127424, + "grad_norm": 27.125, + "learning_rate": 2.5670942835218465e-06, + "loss": 1.1124253273010254, + "step": 2934 + }, + { + "epoch": 0.9036626654355187, + "grad_norm": 7.21875, + "learning_rate": 2.5664808133711695e-06, + "loss": 1.3332182168960571, + "step": 2936 + }, + { + "epoch": 0.9042782394582949, + "grad_norm": 12.75, + "learning_rate": 2.5658670046650395e-06, + "loss": 1.623244285583496, + "step": 2938 + }, + { + "epoch": 0.9048938134810711, + "grad_norm": 8.4375, + "learning_rate": 2.565252857674578e-06, + "loss": 1.2698122262954712, + "step": 2940 + }, + { + "epoch": 0.9055093875038474, + "grad_norm": 12.3125, + "learning_rate": 2.564638372671052e-06, + "loss": 1.6054935455322266, + "step": 2942 + }, + { + "epoch": 0.9061249615266236, + "grad_norm": 31.25, + "learning_rate": 2.564023549925882e-06, + "loss": 1.413541555404663, + "step": 2944 + }, + { + "epoch": 0.9067405355493998, + "grad_norm": 20.875, + "learning_rate": 2.563408389710636e-06, + "loss": 1.2901825904846191, + "step": 2946 + }, + { + "epoch": 0.907356109572176, + "grad_norm": 17.25, + "learning_rate": 2.5627928922970294e-06, + "loss": 1.7496740818023682, + "step": 2948 + }, + { + "epoch": 0.9079716835949523, + "grad_norm": 14.1875, + "learning_rate": 2.562177057956929e-06, + "loss": 1.3308651447296143, + "step": 2950 + }, + { + "epoch": 0.9085872576177285, + "grad_norm": 12.6875, + "learning_rate": 2.5615608869623505e-06, + "loss": 0.8857111930847168, + "step": 2952 + }, + { + "epoch": 0.9092028316405047, + "grad_norm": 11.5, + "learning_rate": 2.5609443795854557e-06, + "loss": 0.7823306322097778, + "step": 2954 + }, + { + "epoch": 0.909818405663281, + "grad_norm": 37.5, + "learning_rate": 2.560327536098558e-06, + "loss": 1.6251142024993896, + "step": 2956 + }, + { + "epoch": 0.9104339796860572, + "grad_norm": 8.0, + "learning_rate": 2.5597103567741162e-06, + "loss": 1.3983535766601562, + "step": 2958 + }, + { + "epoch": 0.9110495537088334, + "grad_norm": 17.5, + "learning_rate": 2.5590928418847415e-06, + "loss": 1.1519997119903564, + "step": 2960 + }, + { + "epoch": 0.9116651277316097, + "grad_norm": 21.625, + "learning_rate": 2.5584749917031887e-06, + "loss": 1.6490130424499512, + "step": 2962 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 14.3125, + "learning_rate": 2.557856806502364e-06, + "loss": 1.4404358863830566, + "step": 2964 + }, + { + "epoch": 0.9128962757771623, + "grad_norm": 19.75, + "learning_rate": 2.5572382865553203e-06, + "loss": 1.3353450298309326, + "step": 2966 + }, + { + "epoch": 0.9135118497999385, + "grad_norm": 15.75, + "learning_rate": 2.5566194321352584e-06, + "loss": 1.310957670211792, + "step": 2968 + }, + { + "epoch": 0.9141274238227147, + "grad_norm": 12.375, + "learning_rate": 2.5560002435155283e-06, + "loss": 1.3766510486602783, + "step": 2970 + }, + { + "epoch": 0.914742997845491, + "grad_norm": 31.125, + "learning_rate": 2.5553807209696237e-06, + "loss": 1.389007329940796, + "step": 2972 + }, + { + "epoch": 0.9153585718682672, + "grad_norm": 38.5, + "learning_rate": 2.554760864771191e-06, + "loss": 1.6427724361419678, + "step": 2974 + }, + { + "epoch": 0.9159741458910434, + "grad_norm": 5.125, + "learning_rate": 2.5541406751940193e-06, + "loss": 1.3378264904022217, + "step": 2976 + }, + { + "epoch": 0.9165897199138197, + "grad_norm": 32.5, + "learning_rate": 2.5535201525120492e-06, + "loss": 1.6025424003601074, + "step": 2978 + }, + { + "epoch": 0.9172052939365959, + "grad_norm": 7.28125, + "learning_rate": 2.5528992969993648e-06, + "loss": 1.0566959381103516, + "step": 2980 + }, + { + "epoch": 0.9178208679593721, + "grad_norm": 8.9375, + "learning_rate": 2.5522781089301983e-06, + "loss": 1.0162148475646973, + "step": 2982 + }, + { + "epoch": 0.9184364419821484, + "grad_norm": 6.1875, + "learning_rate": 2.551656588578931e-06, + "loss": 1.179545283317566, + "step": 2984 + }, + { + "epoch": 0.9190520160049246, + "grad_norm": 5.375, + "learning_rate": 2.551034736220087e-06, + "loss": 1.2015800476074219, + "step": 2986 + }, + { + "epoch": 0.9196675900277008, + "grad_norm": 14.75, + "learning_rate": 2.5504125521283416e-06, + "loss": 1.3361399173736572, + "step": 2988 + }, + { + "epoch": 0.920283164050477, + "grad_norm": 11.5625, + "learning_rate": 2.5497900365785116e-06, + "loss": 1.6196482181549072, + "step": 2990 + }, + { + "epoch": 0.9208987380732533, + "grad_norm": 12.875, + "learning_rate": 2.549167189845565e-06, + "loss": 1.4826675653457642, + "step": 2992 + }, + { + "epoch": 0.9215143120960295, + "grad_norm": 10.875, + "learning_rate": 2.5485440122046133e-06, + "loss": 0.8327569365501404, + "step": 2994 + }, + { + "epoch": 0.9221298861188058, + "grad_norm": 18.75, + "learning_rate": 2.5479205039309135e-06, + "loss": 1.507612943649292, + "step": 2996 + }, + { + "epoch": 0.922745460141582, + "grad_norm": 52.75, + "learning_rate": 2.547296665299872e-06, + "loss": 1.764232873916626, + "step": 2998 + }, + { + "epoch": 0.9233610341643582, + "grad_norm": 21.25, + "learning_rate": 2.546672496587037e-06, + "loss": 1.5592687129974365, + "step": 3000 + }, + { + "epoch": 0.9239766081871345, + "grad_norm": 6.21875, + "learning_rate": 2.5460479980681062e-06, + "loss": 1.2681667804718018, + "step": 3002 + }, + { + "epoch": 0.9245921822099108, + "grad_norm": 12.25, + "learning_rate": 2.5454231700189204e-06, + "loss": 1.4879635572433472, + "step": 3004 + }, + { + "epoch": 0.925207756232687, + "grad_norm": 5.84375, + "learning_rate": 2.5447980127154673e-06, + "loss": 1.446776032447815, + "step": 3006 + }, + { + "epoch": 0.9258233302554633, + "grad_norm": 25.625, + "learning_rate": 2.544172526433879e-06, + "loss": 1.4826745986938477, + "step": 3008 + }, + { + "epoch": 0.9264389042782395, + "grad_norm": 11.25, + "learning_rate": 2.543546711450434e-06, + "loss": 1.2050938606262207, + "step": 3010 + }, + { + "epoch": 0.9270544783010157, + "grad_norm": 4.40625, + "learning_rate": 2.542920568041555e-06, + "loss": 1.1081424951553345, + "step": 3012 + }, + { + "epoch": 0.927670052323792, + "grad_norm": 19.625, + "learning_rate": 2.542294096483811e-06, + "loss": 1.2566344738006592, + "step": 3014 + }, + { + "epoch": 0.9282856263465682, + "grad_norm": 7.1875, + "learning_rate": 2.5416672970539154e-06, + "loss": 1.2727718353271484, + "step": 3016 + }, + { + "epoch": 0.9289012003693444, + "grad_norm": 16.75, + "learning_rate": 2.541040170028725e-06, + "loss": 1.3702088594436646, + "step": 3018 + }, + { + "epoch": 0.9295167743921207, + "grad_norm": 12.25, + "learning_rate": 2.5404127156852436e-06, + "loss": 1.037917137145996, + "step": 3020 + }, + { + "epoch": 0.9301323484148969, + "grad_norm": 7.03125, + "learning_rate": 2.539784934300618e-06, + "loss": 1.2113661766052246, + "step": 3022 + }, + { + "epoch": 0.9307479224376731, + "grad_norm": 8.6875, + "learning_rate": 2.53915682615214e-06, + "loss": 1.376421332359314, + "step": 3024 + }, + { + "epoch": 0.9313634964604494, + "grad_norm": 34.75, + "learning_rate": 2.5385283915172454e-06, + "loss": 1.5175917148590088, + "step": 3026 + }, + { + "epoch": 0.9319790704832256, + "grad_norm": 161.0, + "learning_rate": 2.5378996306735157e-06, + "loss": 1.3562586307525635, + "step": 3028 + }, + { + "epoch": 0.9325946445060018, + "grad_norm": 11.6875, + "learning_rate": 2.5372705438986742e-06, + "loss": 1.3766993284225464, + "step": 3030 + }, + { + "epoch": 0.9332102185287781, + "grad_norm": 13.3125, + "learning_rate": 2.5366411314705884e-06, + "loss": 1.4237804412841797, + "step": 3032 + }, + { + "epoch": 0.9338257925515543, + "grad_norm": 17.75, + "learning_rate": 2.5360113936672727e-06, + "loss": 1.5633426904678345, + "step": 3034 + }, + { + "epoch": 0.9344413665743305, + "grad_norm": 10.5, + "learning_rate": 2.5353813307668818e-06, + "loss": 1.4739643335342407, + "step": 3036 + }, + { + "epoch": 0.9350569405971068, + "grad_norm": 12.9375, + "learning_rate": 2.534750943047715e-06, + "loss": 1.355829119682312, + "step": 3038 + }, + { + "epoch": 0.935672514619883, + "grad_norm": 28.625, + "learning_rate": 2.534120230788216e-06, + "loss": 1.4471814632415771, + "step": 3040 + }, + { + "epoch": 0.9362880886426593, + "grad_norm": 10.625, + "learning_rate": 2.53348919426697e-06, + "loss": 1.619966983795166, + "step": 3042 + }, + { + "epoch": 0.9369036626654356, + "grad_norm": 8.4375, + "learning_rate": 2.5328578337627076e-06, + "loss": 1.022241234779358, + "step": 3044 + }, + { + "epoch": 0.9375192366882118, + "grad_norm": 10.1875, + "learning_rate": 2.5322261495543017e-06, + "loss": 1.1702439785003662, + "step": 3046 + }, + { + "epoch": 0.938134810710988, + "grad_norm": 10.875, + "learning_rate": 2.531594141920766e-06, + "loss": 1.3634662628173828, + "step": 3048 + }, + { + "epoch": 0.9387503847337643, + "grad_norm": 15.1875, + "learning_rate": 2.530961811141261e-06, + "loss": 1.2916343212127686, + "step": 3050 + }, + { + "epoch": 0.9393659587565405, + "grad_norm": 20.5, + "learning_rate": 2.5303291574950877e-06, + "loss": 1.716345191001892, + "step": 3052 + }, + { + "epoch": 0.9399815327793167, + "grad_norm": 19.875, + "learning_rate": 2.529696181261689e-06, + "loss": 1.5666673183441162, + "step": 3054 + }, + { + "epoch": 0.940597106802093, + "grad_norm": 17.625, + "learning_rate": 2.5290628827206517e-06, + "loss": 1.3357521295547485, + "step": 3056 + }, + { + "epoch": 0.9412126808248692, + "grad_norm": 29.0, + "learning_rate": 2.528429262151705e-06, + "loss": 1.4472358226776123, + "step": 3058 + }, + { + "epoch": 0.9418282548476454, + "grad_norm": 13.875, + "learning_rate": 2.5277953198347186e-06, + "loss": 1.5934563875198364, + "step": 3060 + }, + { + "epoch": 0.9424438288704217, + "grad_norm": 18.375, + "learning_rate": 2.5271610560497068e-06, + "loss": 1.1851208209991455, + "step": 3062 + }, + { + "epoch": 0.9430594028931979, + "grad_norm": 9.9375, + "learning_rate": 2.526526471076824e-06, + "loss": 1.8103055953979492, + "step": 3064 + }, + { + "epoch": 0.9436749769159741, + "grad_norm": 21.125, + "learning_rate": 2.5258915651963673e-06, + "loss": 1.5106641054153442, + "step": 3066 + }, + { + "epoch": 0.9442905509387504, + "grad_norm": 11.8125, + "learning_rate": 2.5252563386887754e-06, + "loss": 1.179125189781189, + "step": 3068 + }, + { + "epoch": 0.9449061249615266, + "grad_norm": 10.8125, + "learning_rate": 2.5246207918346286e-06, + "loss": 1.4003870487213135, + "step": 3070 + }, + { + "epoch": 0.9455216989843028, + "grad_norm": 47.75, + "learning_rate": 2.523984924914648e-06, + "loss": 1.1455589532852173, + "step": 3072 + }, + { + "epoch": 0.9461372730070791, + "grad_norm": 10.375, + "learning_rate": 2.523348738209698e-06, + "loss": 1.1926007270812988, + "step": 3074 + }, + { + "epoch": 0.9467528470298553, + "grad_norm": 10.0, + "learning_rate": 2.5227122320007817e-06, + "loss": 1.4721944332122803, + "step": 3076 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 13.25, + "learning_rate": 2.5220754065690455e-06, + "loss": 1.2761927843093872, + "step": 3078 + }, + { + "epoch": 0.9479839950754079, + "grad_norm": 15.125, + "learning_rate": 2.5214382621957754e-06, + "loss": 1.6848111152648926, + "step": 3080 + }, + { + "epoch": 0.9485995690981841, + "grad_norm": 26.5, + "learning_rate": 2.520800799162399e-06, + "loss": 1.3228620290756226, + "step": 3082 + }, + { + "epoch": 0.9492151431209603, + "grad_norm": 16.5, + "learning_rate": 2.5201630177504848e-06, + "loss": 0.9865208864212036, + "step": 3084 + }, + { + "epoch": 0.9498307171437366, + "grad_norm": 5.53125, + "learning_rate": 2.519524918241741e-06, + "loss": 1.5489556789398193, + "step": 3086 + }, + { + "epoch": 0.9504462911665128, + "grad_norm": 20.125, + "learning_rate": 2.5188865009180176e-06, + "loss": 1.191929578781128, + "step": 3088 + }, + { + "epoch": 0.951061865189289, + "grad_norm": 4.8125, + "learning_rate": 2.5182477660613033e-06, + "loss": 1.1938698291778564, + "step": 3090 + }, + { + "epoch": 0.9516774392120653, + "grad_norm": 28.25, + "learning_rate": 2.517608713953729e-06, + "loss": 1.7811293601989746, + "step": 3092 + }, + { + "epoch": 0.9522930132348415, + "grad_norm": 12.125, + "learning_rate": 2.5169693448775642e-06, + "loss": 1.4118529558181763, + "step": 3094 + }, + { + "epoch": 0.9529085872576177, + "grad_norm": 30.75, + "learning_rate": 2.5163296591152186e-06, + "loss": 1.3781766891479492, + "step": 3096 + }, + { + "epoch": 0.953524161280394, + "grad_norm": 15.6875, + "learning_rate": 2.515689656949243e-06, + "loss": 1.4856109619140625, + "step": 3098 + }, + { + "epoch": 0.9541397353031702, + "grad_norm": 24.875, + "learning_rate": 2.5150493386623265e-06, + "loss": 1.1463229656219482, + "step": 3100 + }, + { + "epoch": 0.9547553093259464, + "grad_norm": 10.1875, + "learning_rate": 2.5144087045372987e-06, + "loss": 1.2381125688552856, + "step": 3102 + }, + { + "epoch": 0.9553708833487227, + "grad_norm": 78.0, + "learning_rate": 2.513767754857128e-06, + "loss": 1.2273215055465698, + "step": 3104 + }, + { + "epoch": 0.9559864573714989, + "grad_norm": 7.03125, + "learning_rate": 2.5131264899049225e-06, + "loss": 1.1484160423278809, + "step": 3106 + }, + { + "epoch": 0.9566020313942751, + "grad_norm": 117.5, + "learning_rate": 2.512484909963931e-06, + "loss": 1.3532674312591553, + "step": 3108 + }, + { + "epoch": 0.9572176054170514, + "grad_norm": 17.0, + "learning_rate": 2.5118430153175384e-06, + "loss": 1.0924203395843506, + "step": 3110 + }, + { + "epoch": 0.9578331794398276, + "grad_norm": 36.25, + "learning_rate": 2.511200806249272e-06, + "loss": 1.5392255783081055, + "step": 3112 + }, + { + "epoch": 0.9584487534626038, + "grad_norm": 17.0, + "learning_rate": 2.5105582830427948e-06, + "loss": 1.3981271982192993, + "step": 3114 + }, + { + "epoch": 0.9590643274853801, + "grad_norm": 10.1875, + "learning_rate": 2.50991544598191e-06, + "loss": 1.2499940395355225, + "step": 3116 + }, + { + "epoch": 0.9596799015081564, + "grad_norm": 11.25, + "learning_rate": 2.509272295350561e-06, + "loss": 1.3273930549621582, + "step": 3118 + }, + { + "epoch": 0.9602954755309326, + "grad_norm": 12.6875, + "learning_rate": 2.5086288314328267e-06, + "loss": 1.0946621894836426, + "step": 3120 + }, + { + "epoch": 0.9609110495537089, + "grad_norm": 8.8125, + "learning_rate": 2.5079850545129265e-06, + "loss": 1.2586936950683594, + "step": 3122 + }, + { + "epoch": 0.9615266235764851, + "grad_norm": 12.3125, + "learning_rate": 2.5073409648752176e-06, + "loss": 1.404738426208496, + "step": 3124 + }, + { + "epoch": 0.9621421975992613, + "grad_norm": 32.75, + "learning_rate": 2.5066965628041944e-06, + "loss": 1.5974805355072021, + "step": 3126 + }, + { + "epoch": 0.9627577716220376, + "grad_norm": 16.875, + "learning_rate": 2.5060518485844903e-06, + "loss": 1.1815767288208008, + "step": 3128 + }, + { + "epoch": 0.9633733456448138, + "grad_norm": 26.125, + "learning_rate": 2.505406822500877e-06, + "loss": 0.96288001537323, + "step": 3130 + }, + { + "epoch": 0.96398891966759, + "grad_norm": 5.96875, + "learning_rate": 2.504761484838262e-06, + "loss": 1.0400323867797852, + "step": 3132 + }, + { + "epoch": 0.9646044936903663, + "grad_norm": 21.625, + "learning_rate": 2.5041158358816925e-06, + "loss": 1.462847113609314, + "step": 3134 + }, + { + "epoch": 0.9652200677131425, + "grad_norm": 14.75, + "learning_rate": 2.5034698759163528e-06, + "loss": 1.285290241241455, + "step": 3136 + }, + { + "epoch": 0.9658356417359187, + "grad_norm": 16.75, + "learning_rate": 2.502823605227563e-06, + "loss": 1.515703797340393, + "step": 3138 + }, + { + "epoch": 0.966451215758695, + "grad_norm": 16.25, + "learning_rate": 2.5021770241007826e-06, + "loss": 1.472865343093872, + "step": 3140 + }, + { + "epoch": 0.9670667897814712, + "grad_norm": 13.5625, + "learning_rate": 2.501530132821607e-06, + "loss": 1.6380629539489746, + "step": 3142 + }, + { + "epoch": 0.9676823638042474, + "grad_norm": 11.875, + "learning_rate": 2.5008829316757685e-06, + "loss": 1.2624460458755493, + "step": 3144 + }, + { + "epoch": 0.9682979378270237, + "grad_norm": 9.8125, + "learning_rate": 2.5002354209491364e-06, + "loss": 1.141180157661438, + "step": 3146 + }, + { + "epoch": 0.9689135118497999, + "grad_norm": 15.875, + "learning_rate": 2.4995876009277176e-06, + "loss": 1.371462106704712, + "step": 3148 + }, + { + "epoch": 0.9695290858725761, + "grad_norm": 20.5, + "learning_rate": 2.4989394718976542e-06, + "loss": 1.5800580978393555, + "step": 3150 + }, + { + "epoch": 0.9701446598953524, + "grad_norm": 17.0, + "learning_rate": 2.4982910341452274e-06, + "loss": 1.5933725833892822, + "step": 3152 + }, + { + "epoch": 0.9707602339181286, + "grad_norm": 13.25, + "learning_rate": 2.4976422879568497e-06, + "loss": 1.7934191226959229, + "step": 3154 + }, + { + "epoch": 0.971375807940905, + "grad_norm": 16.125, + "learning_rate": 2.496993233619076e-06, + "loss": 1.5261006355285645, + "step": 3156 + }, + { + "epoch": 0.9719913819636812, + "grad_norm": 7.46875, + "learning_rate": 2.4963438714185924e-06, + "loss": 1.2210710048675537, + "step": 3158 + }, + { + "epoch": 0.9726069559864574, + "grad_norm": 7.84375, + "learning_rate": 2.4956942016422243e-06, + "loss": 1.0445549488067627, + "step": 3160 + }, + { + "epoch": 0.9732225300092336, + "grad_norm": 14.0, + "learning_rate": 2.4950442245769304e-06, + "loss": 1.2202482223510742, + "step": 3162 + }, + { + "epoch": 0.9738381040320099, + "grad_norm": 24.75, + "learning_rate": 2.494393940509807e-06, + "loss": 1.2303857803344727, + "step": 3164 + }, + { + "epoch": 0.9744536780547861, + "grad_norm": 9.8125, + "learning_rate": 2.493743349728085e-06, + "loss": 1.519113540649414, + "step": 3166 + }, + { + "epoch": 0.9750692520775623, + "grad_norm": 14.125, + "learning_rate": 2.4930924525191317e-06, + "loss": 1.6462736129760742, + "step": 3168 + }, + { + "epoch": 0.9756848261003386, + "grad_norm": 21.5, + "learning_rate": 2.4924412491704485e-06, + "loss": 1.3465995788574219, + "step": 3170 + }, + { + "epoch": 0.9763004001231148, + "grad_norm": 16.25, + "learning_rate": 2.491789739969673e-06, + "loss": 1.1416728496551514, + "step": 3172 + }, + { + "epoch": 0.976915974145891, + "grad_norm": 35.0, + "learning_rate": 2.4911379252045775e-06, + "loss": 1.6752818822860718, + "step": 3174 + }, + { + "epoch": 0.9775315481686673, + "grad_norm": 15.0625, + "learning_rate": 2.490485805163069e-06, + "loss": 1.5092238187789917, + "step": 3176 + }, + { + "epoch": 0.9781471221914435, + "grad_norm": 12.3125, + "learning_rate": 2.48983338013319e-06, + "loss": 1.4850773811340332, + "step": 3178 + }, + { + "epoch": 0.9787626962142197, + "grad_norm": 20.0, + "learning_rate": 2.4891806504031175e-06, + "loss": 1.7530406713485718, + "step": 3180 + }, + { + "epoch": 0.979378270236996, + "grad_norm": 16.375, + "learning_rate": 2.488527616261163e-06, + "loss": 2.0001983642578125, + "step": 3182 + }, + { + "epoch": 0.9799938442597722, + "grad_norm": 35.25, + "learning_rate": 2.487874277995771e-06, + "loss": 1.9470124244689941, + "step": 3184 + }, + { + "epoch": 0.9806094182825484, + "grad_norm": 30.875, + "learning_rate": 2.4872206358955244e-06, + "loss": 1.3628416061401367, + "step": 3186 + }, + { + "epoch": 0.9812249923053247, + "grad_norm": 25.75, + "learning_rate": 2.486566690249136e-06, + "loss": 1.5063343048095703, + "step": 3188 + }, + { + "epoch": 0.9818405663281009, + "grad_norm": 4.5, + "learning_rate": 2.485912441345454e-06, + "loss": 1.3452836275100708, + "step": 3190 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 17.125, + "learning_rate": 2.4852578894734615e-06, + "loss": 0.7489191889762878, + "step": 3192 + }, + { + "epoch": 0.9830717143736535, + "grad_norm": 27.5, + "learning_rate": 2.484603034922275e-06, + "loss": 1.5885820388793945, + "step": 3194 + }, + { + "epoch": 0.9836872883964297, + "grad_norm": 4.75, + "learning_rate": 2.4839478779811445e-06, + "loss": 1.1795306205749512, + "step": 3196 + }, + { + "epoch": 0.984302862419206, + "grad_norm": 8.5, + "learning_rate": 2.483292418939454e-06, + "loss": 1.0942715406417847, + "step": 3198 + }, + { + "epoch": 0.9849184364419822, + "grad_norm": 11.5, + "learning_rate": 2.4826366580867188e-06, + "loss": 1.322864294052124, + "step": 3200 + }, + { + "epoch": 0.9855340104647584, + "grad_norm": 14.0, + "learning_rate": 2.4819805957125903e-06, + "loss": 1.2085504531860352, + "step": 3202 + }, + { + "epoch": 0.9861495844875346, + "grad_norm": 44.5, + "learning_rate": 2.4813242321068525e-06, + "loss": 0.9992244839668274, + "step": 3204 + }, + { + "epoch": 0.9867651585103109, + "grad_norm": 19.875, + "learning_rate": 2.480667567559421e-06, + "loss": 1.217922568321228, + "step": 3206 + }, + { + "epoch": 0.9873807325330871, + "grad_norm": 15.25, + "learning_rate": 2.4800106023603457e-06, + "loss": 1.1221704483032227, + "step": 3208 + }, + { + "epoch": 0.9879963065558633, + "grad_norm": 13.25, + "learning_rate": 2.479353336799809e-06, + "loss": 1.4771157503128052, + "step": 3210 + }, + { + "epoch": 0.9886118805786396, + "grad_norm": 15.625, + "learning_rate": 2.478695771168126e-06, + "loss": 1.4712547063827515, + "step": 3212 + }, + { + "epoch": 0.9892274546014158, + "grad_norm": 5.90625, + "learning_rate": 2.478037905755744e-06, + "loss": 1.2736375331878662, + "step": 3214 + }, + { + "epoch": 0.989843028624192, + "grad_norm": 20.125, + "learning_rate": 2.477379740853242e-06, + "loss": 1.5006616115570068, + "step": 3216 + }, + { + "epoch": 0.9904586026469683, + "grad_norm": 20.0, + "learning_rate": 2.4767212767513344e-06, + "loss": 1.2370715141296387, + "step": 3218 + }, + { + "epoch": 0.9910741766697445, + "grad_norm": 10.875, + "learning_rate": 2.4760625137408635e-06, + "loss": 1.1793454885482788, + "step": 3220 + }, + { + "epoch": 0.9916897506925207, + "grad_norm": 26.625, + "learning_rate": 2.475403452112806e-06, + "loss": 1.6577112674713135, + "step": 3222 + }, + { + "epoch": 0.992305324715297, + "grad_norm": 20.625, + "learning_rate": 2.474744092158271e-06, + "loss": 1.3132104873657227, + "step": 3224 + }, + { + "epoch": 0.9929208987380732, + "grad_norm": 15.1875, + "learning_rate": 2.4740844341684984e-06, + "loss": 1.5861358642578125, + "step": 3226 + }, + { + "epoch": 0.9935364727608494, + "grad_norm": 25.125, + "learning_rate": 2.473424478434859e-06, + "loss": 1.5777220726013184, + "step": 3228 + }, + { + "epoch": 0.9941520467836257, + "grad_norm": 21.75, + "learning_rate": 2.4727642252488566e-06, + "loss": 1.5983284711837769, + "step": 3230 + }, + { + "epoch": 0.994767620806402, + "grad_norm": 15.75, + "learning_rate": 2.472103674902126e-06, + "loss": 1.368581771850586, + "step": 3232 + }, + { + "epoch": 0.9953831948291783, + "grad_norm": 7.71875, + "learning_rate": 2.4714428276864327e-06, + "loss": 0.8675891160964966, + "step": 3234 + }, + { + "epoch": 0.9959987688519545, + "grad_norm": 7.21875, + "learning_rate": 2.4707816838936743e-06, + "loss": 0.9874320030212402, + "step": 3236 + }, + { + "epoch": 0.9966143428747307, + "grad_norm": 14.4375, + "learning_rate": 2.470120243815878e-06, + "loss": 0.9434853792190552, + "step": 3238 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 15.125, + "learning_rate": 2.4694585077452024e-06, + "loss": 0.8188871145248413, + "step": 3240 + }, + { + "epoch": 0.9978454909202832, + "grad_norm": 8.125, + "learning_rate": 2.4687964759739384e-06, + "loss": 1.2960641384124756, + "step": 3242 + }, + { + "epoch": 0.9984610649430594, + "grad_norm": 8.4375, + "learning_rate": 2.468134148794504e-06, + "loss": 1.224454641342163, + "step": 3244 + }, + { + "epoch": 0.9990766389658357, + "grad_norm": 26.5, + "learning_rate": 2.467471526499453e-06, + "loss": 1.3921452760696411, + "step": 3246 + }, + { + "epoch": 0.9996922129886119, + "grad_norm": 13.0, + "learning_rate": 2.4668086093814634e-06, + "loss": 1.4095790386199951, + "step": 3248 + }, + { + "epoch": 1.0003077870113881, + "grad_norm": 5.9375, + "learning_rate": 2.4661453977333482e-06, + "loss": 1.392961025238037, + "step": 3250 + }, + { + "epoch": 1.0009233610341643, + "grad_norm": 32.0, + "learning_rate": 2.4654818918480477e-06, + "loss": 1.4555277824401855, + "step": 3252 + }, + { + "epoch": 1.0015389350569406, + "grad_norm": 15.625, + "learning_rate": 2.464818092018635e-06, + "loss": 1.2271881103515625, + "step": 3254 + }, + { + "epoch": 1.0021545090797168, + "grad_norm": 17.0, + "learning_rate": 2.4641539985383088e-06, + "loss": 1.4368352890014648, + "step": 3256 + }, + { + "epoch": 1.002770083102493, + "grad_norm": 23.25, + "learning_rate": 2.463489611700402e-06, + "loss": 1.69225013256073, + "step": 3258 + }, + { + "epoch": 1.0033856571252693, + "grad_norm": 8.25, + "learning_rate": 2.4628249317983737e-06, + "loss": 1.1970336437225342, + "step": 3260 + }, + { + "epoch": 1.0040012311480455, + "grad_norm": 23.625, + "learning_rate": 2.4621599591258145e-06, + "loss": 0.747069239616394, + "step": 3262 + }, + { + "epoch": 1.0046168051708217, + "grad_norm": 18.875, + "learning_rate": 2.461494693976443e-06, + "loss": 1.5445091724395752, + "step": 3264 + }, + { + "epoch": 1.005232379193598, + "grad_norm": 3.515625, + "learning_rate": 2.4608291366441085e-06, + "loss": 1.2502844333648682, + "step": 3266 + }, + { + "epoch": 1.0058479532163742, + "grad_norm": 8.3125, + "learning_rate": 2.4601632874227873e-06, + "loss": 1.341188669204712, + "step": 3268 + }, + { + "epoch": 1.0064635272391504, + "grad_norm": 15.375, + "learning_rate": 2.459497146606587e-06, + "loss": 1.2452143430709839, + "step": 3270 + }, + { + "epoch": 1.0070791012619267, + "grad_norm": 20.0, + "learning_rate": 2.4588307144897412e-06, + "loss": 1.5691194534301758, + "step": 3272 + }, + { + "epoch": 1.007694675284703, + "grad_norm": 15.6875, + "learning_rate": 2.458163991366615e-06, + "loss": 1.076235294342041, + "step": 3274 + }, + { + "epoch": 1.0083102493074791, + "grad_norm": 9.625, + "learning_rate": 2.4574969775317e-06, + "loss": 1.206156849861145, + "step": 3276 + }, + { + "epoch": 1.0089258233302554, + "grad_norm": 20.25, + "learning_rate": 2.456829673279618e-06, + "loss": 1.520322561264038, + "step": 3278 + }, + { + "epoch": 1.0095413973530316, + "grad_norm": 20.375, + "learning_rate": 2.456162078905118e-06, + "loss": 1.2282922267913818, + "step": 3280 + }, + { + "epoch": 1.0101569713758078, + "grad_norm": 13.875, + "learning_rate": 2.4554941947030754e-06, + "loss": 1.8445767164230347, + "step": 3282 + }, + { + "epoch": 1.010772545398584, + "grad_norm": 8.6875, + "learning_rate": 2.454826020968497e-06, + "loss": 1.3538954257965088, + "step": 3284 + }, + { + "epoch": 1.0113881194213603, + "grad_norm": 12.1875, + "learning_rate": 2.4541575579965167e-06, + "loss": 1.2854328155517578, + "step": 3286 + }, + { + "epoch": 1.0120036934441368, + "grad_norm": 4.40625, + "learning_rate": 2.4534888060823927e-06, + "loss": 1.1238691806793213, + "step": 3288 + }, + { + "epoch": 1.012619267466913, + "grad_norm": 18.0, + "learning_rate": 2.4528197655215153e-06, + "loss": 1.1130653619766235, + "step": 3290 + }, + { + "epoch": 1.0132348414896892, + "grad_norm": 11.1875, + "learning_rate": 2.4521504366094e-06, + "loss": 1.1687474250793457, + "step": 3292 + }, + { + "epoch": 1.0138504155124655, + "grad_norm": 15.5625, + "learning_rate": 2.4514808196416907e-06, + "loss": 1.5649261474609375, + "step": 3294 + }, + { + "epoch": 1.0144659895352417, + "grad_norm": 36.25, + "learning_rate": 2.4508109149141577e-06, + "loss": 1.2250068187713623, + "step": 3296 + }, + { + "epoch": 1.015081563558018, + "grad_norm": 13.1875, + "learning_rate": 2.4501407227226984e-06, + "loss": 1.4038679599761963, + "step": 3298 + }, + { + "epoch": 1.0156971375807942, + "grad_norm": 4.5625, + "learning_rate": 2.449470243363338e-06, + "loss": 1.0873544216156006, + "step": 3300 + }, + { + "epoch": 1.0163127116035704, + "grad_norm": 14.0625, + "learning_rate": 2.448799477132227e-06, + "loss": 1.562752366065979, + "step": 3302 + }, + { + "epoch": 1.0169282856263466, + "grad_norm": 5.75, + "learning_rate": 2.448128424325645e-06, + "loss": 1.3743469715118408, + "step": 3304 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 26.25, + "learning_rate": 2.4474570852399953e-06, + "loss": 1.6857503652572632, + "step": 3306 + }, + { + "epoch": 1.018159433671899, + "grad_norm": 24.625, + "learning_rate": 2.4467854601718094e-06, + "loss": 1.5393667221069336, + "step": 3308 + }, + { + "epoch": 1.0187750076946753, + "grad_norm": 8.5625, + "learning_rate": 2.446113549417747e-06, + "loss": 1.216958999633789, + "step": 3310 + }, + { + "epoch": 1.0193905817174516, + "grad_norm": 6.1875, + "learning_rate": 2.4454413532745893e-06, + "loss": 1.2533522844314575, + "step": 3312 + }, + { + "epoch": 1.0200061557402278, + "grad_norm": 11.8125, + "learning_rate": 2.444768872039247e-06, + "loss": 1.1868257522583008, + "step": 3314 + }, + { + "epoch": 1.020621729763004, + "grad_norm": 8.375, + "learning_rate": 2.444096106008756e-06, + "loss": 1.022985816001892, + "step": 3316 + }, + { + "epoch": 1.0212373037857803, + "grad_norm": 30.625, + "learning_rate": 2.443423055480277e-06, + "loss": 1.6255028247833252, + "step": 3318 + }, + { + "epoch": 1.0218528778085565, + "grad_norm": 21.125, + "learning_rate": 2.4427497207510983e-06, + "loss": 1.4674348831176758, + "step": 3320 + }, + { + "epoch": 1.0224684518313327, + "grad_norm": 16.125, + "learning_rate": 2.4420761021186323e-06, + "loss": 1.4604638814926147, + "step": 3322 + }, + { + "epoch": 1.023084025854109, + "grad_norm": 17.375, + "learning_rate": 2.4414021998804167e-06, + "loss": 1.3937731981277466, + "step": 3324 + }, + { + "epoch": 1.0236995998768852, + "grad_norm": 12.625, + "learning_rate": 2.4407280143341155e-06, + "loss": 1.5467102527618408, + "step": 3326 + }, + { + "epoch": 1.0243151738996614, + "grad_norm": 16.875, + "learning_rate": 2.440053545777517e-06, + "loss": 1.1986671686172485, + "step": 3328 + }, + { + "epoch": 1.0249307479224377, + "grad_norm": 8.1875, + "learning_rate": 2.4393787945085343e-06, + "loss": 1.1440664529800415, + "step": 3330 + }, + { + "epoch": 1.025546321945214, + "grad_norm": 15.625, + "learning_rate": 2.4387037608252063e-06, + "loss": 1.2961888313293457, + "step": 3332 + }, + { + "epoch": 1.0261618959679901, + "grad_norm": 14.5, + "learning_rate": 2.4380284450256955e-06, + "loss": 0.9457817673683167, + "step": 3334 + }, + { + "epoch": 1.0267774699907664, + "grad_norm": 23.375, + "learning_rate": 2.437352847408291e-06, + "loss": 1.6424400806427002, + "step": 3336 + }, + { + "epoch": 1.0273930440135426, + "grad_norm": 32.5, + "learning_rate": 2.436676968271404e-06, + "loss": 1.4064046144485474, + "step": 3338 + }, + { + "epoch": 1.0280086180363188, + "grad_norm": 65.5, + "learning_rate": 2.436000807913571e-06, + "loss": 1.326033592224121, + "step": 3340 + }, + { + "epoch": 1.028624192059095, + "grad_norm": 21.625, + "learning_rate": 2.4353243666334535e-06, + "loss": 1.7490839958190918, + "step": 3342 + }, + { + "epoch": 1.0292397660818713, + "grad_norm": 19.75, + "learning_rate": 2.4346476447298353e-06, + "loss": 1.5640172958374023, + "step": 3344 + }, + { + "epoch": 1.0298553401046475, + "grad_norm": 18.125, + "learning_rate": 2.433970642501626e-06, + "loss": 1.1779654026031494, + "step": 3346 + }, + { + "epoch": 1.0304709141274238, + "grad_norm": 4.65625, + "learning_rate": 2.433293360247859e-06, + "loss": 1.087789535522461, + "step": 3348 + }, + { + "epoch": 1.0310864881502, + "grad_norm": 23.25, + "learning_rate": 2.4326157982676892e-06, + "loss": 1.2343831062316895, + "step": 3350 + }, + { + "epoch": 1.0317020621729762, + "grad_norm": 32.25, + "learning_rate": 2.4319379568603974e-06, + "loss": 1.4259557723999023, + "step": 3352 + }, + { + "epoch": 1.0323176361957525, + "grad_norm": 16.0, + "learning_rate": 2.431259836325386e-06, + "loss": 1.3478798866271973, + "step": 3354 + }, + { + "epoch": 1.0329332102185287, + "grad_norm": 10.0, + "learning_rate": 2.4305814369621833e-06, + "loss": 1.4005303382873535, + "step": 3356 + }, + { + "epoch": 1.033548784241305, + "grad_norm": 31.75, + "learning_rate": 2.429902759070437e-06, + "loss": 1.5675277709960938, + "step": 3358 + }, + { + "epoch": 1.0341643582640812, + "grad_norm": 9.75, + "learning_rate": 2.4292238029499213e-06, + "loss": 1.10936439037323, + "step": 3360 + }, + { + "epoch": 1.0347799322868574, + "grad_norm": 29.375, + "learning_rate": 2.428544568900532e-06, + "loss": 1.9242689609527588, + "step": 3362 + }, + { + "epoch": 1.0353955063096338, + "grad_norm": 14.625, + "learning_rate": 2.427865057222287e-06, + "loss": 1.5668294429779053, + "step": 3364 + }, + { + "epoch": 1.03601108033241, + "grad_norm": 15.6875, + "learning_rate": 2.427185268215328e-06, + "loss": 1.1507887840270996, + "step": 3366 + }, + { + "epoch": 1.0366266543551863, + "grad_norm": 11.25, + "learning_rate": 2.4265052021799166e-06, + "loss": 1.597625494003296, + "step": 3368 + }, + { + "epoch": 1.0372422283779625, + "grad_norm": 14.5625, + "learning_rate": 2.4258248594164414e-06, + "loss": 1.2777953147888184, + "step": 3370 + }, + { + "epoch": 1.0378578024007388, + "grad_norm": 5.90625, + "learning_rate": 2.4251442402254084e-06, + "loss": 1.0946611166000366, + "step": 3372 + }, + { + "epoch": 1.038473376423515, + "grad_norm": 4.4375, + "learning_rate": 2.4244633449074495e-06, + "loss": 1.0733752250671387, + "step": 3374 + }, + { + "epoch": 1.0390889504462912, + "grad_norm": 47.0, + "learning_rate": 2.423782173763317e-06, + "loss": 1.3776036500930786, + "step": 3376 + }, + { + "epoch": 1.0397045244690675, + "grad_norm": 11.8125, + "learning_rate": 2.423100727093883e-06, + "loss": 1.0430665016174316, + "step": 3378 + }, + { + "epoch": 1.0403200984918437, + "grad_norm": 7.6875, + "learning_rate": 2.4224190052001455e-06, + "loss": 1.343729019165039, + "step": 3380 + }, + { + "epoch": 1.04093567251462, + "grad_norm": 27.125, + "learning_rate": 2.4217370083832206e-06, + "loss": 1.4689085483551025, + "step": 3382 + }, + { + "epoch": 1.0415512465373962, + "grad_norm": 3.875, + "learning_rate": 2.421054736944347e-06, + "loss": 1.1416873931884766, + "step": 3384 + }, + { + "epoch": 1.0421668205601724, + "grad_norm": 8.4375, + "learning_rate": 2.4203721911848854e-06, + "loss": 0.9953557252883911, + "step": 3386 + }, + { + "epoch": 1.0427823945829486, + "grad_norm": 12.125, + "learning_rate": 2.419689371406317e-06, + "loss": 1.7007834911346436, + "step": 3388 + }, + { + "epoch": 1.0433979686057249, + "grad_norm": 5.0625, + "learning_rate": 2.419006277910243e-06, + "loss": 1.4867901802062988, + "step": 3390 + }, + { + "epoch": 1.044013542628501, + "grad_norm": 10.8125, + "learning_rate": 2.4183229109983885e-06, + "loss": 1.3887057304382324, + "step": 3392 + }, + { + "epoch": 1.0446291166512773, + "grad_norm": 10.4375, + "learning_rate": 2.417639270972596e-06, + "loss": 1.2495818138122559, + "step": 3394 + }, + { + "epoch": 1.0452446906740536, + "grad_norm": 21.125, + "learning_rate": 2.4169553581348307e-06, + "loss": 1.646527647972107, + "step": 3396 + }, + { + "epoch": 1.0458602646968298, + "grad_norm": 13.1875, + "learning_rate": 2.416271172787177e-06, + "loss": 1.0748255252838135, + "step": 3398 + }, + { + "epoch": 1.046475838719606, + "grad_norm": 16.375, + "learning_rate": 2.415586715231842e-06, + "loss": 1.2074308395385742, + "step": 3400 + }, + { + "epoch": 1.0470914127423823, + "grad_norm": 18.5, + "learning_rate": 2.414901985771149e-06, + "loss": 1.6861954927444458, + "step": 3402 + }, + { + "epoch": 1.0477069867651585, + "grad_norm": 6.75, + "learning_rate": 2.414216984707545e-06, + "loss": 1.1067943572998047, + "step": 3404 + }, + { + "epoch": 1.0483225607879347, + "grad_norm": 26.875, + "learning_rate": 2.4135317123435956e-06, + "loss": 1.556877613067627, + "step": 3406 + }, + { + "epoch": 1.048938134810711, + "grad_norm": 44.5, + "learning_rate": 2.412846168981987e-06, + "loss": 1.095926284790039, + "step": 3408 + }, + { + "epoch": 1.0495537088334872, + "grad_norm": 15.6875, + "learning_rate": 2.412160354925525e-06, + "loss": 1.4250001907348633, + "step": 3410 + }, + { + "epoch": 1.0501692828562634, + "grad_norm": 2.875, + "learning_rate": 2.411474270477132e-06, + "loss": 0.9531576633453369, + "step": 3412 + }, + { + "epoch": 1.0507848568790397, + "grad_norm": 53.0, + "learning_rate": 2.410787915939854e-06, + "loss": 0.9053239822387695, + "step": 3414 + }, + { + "epoch": 1.051400430901816, + "grad_norm": 21.625, + "learning_rate": 2.4101012916168544e-06, + "loss": 1.9192160367965698, + "step": 3416 + }, + { + "epoch": 1.0520160049245921, + "grad_norm": 14.875, + "learning_rate": 2.4094143978114163e-06, + "loss": 1.3932238817214966, + "step": 3418 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 3.71875, + "learning_rate": 2.4087272348269403e-06, + "loss": 1.208620309829712, + "step": 3420 + }, + { + "epoch": 1.0532471529701446, + "grad_norm": 18.5, + "learning_rate": 2.408039802966949e-06, + "loss": 1.367321252822876, + "step": 3422 + }, + { + "epoch": 1.0538627269929208, + "grad_norm": 5.34375, + "learning_rate": 2.4073521025350797e-06, + "loss": 1.345115065574646, + "step": 3424 + }, + { + "epoch": 1.054478301015697, + "grad_norm": 17.25, + "learning_rate": 2.406664133835092e-06, + "loss": 1.4480860233306885, + "step": 3426 + }, + { + "epoch": 1.0550938750384733, + "grad_norm": 7.625, + "learning_rate": 2.405975897170862e-06, + "loss": 1.298915147781372, + "step": 3428 + }, + { + "epoch": 1.0557094490612495, + "grad_norm": 22.625, + "learning_rate": 2.4052873928463838e-06, + "loss": 1.5393712520599365, + "step": 3430 + }, + { + "epoch": 1.0563250230840258, + "grad_norm": 29.375, + "learning_rate": 2.4045986211657718e-06, + "loss": 0.9303069114685059, + "step": 3432 + }, + { + "epoch": 1.056940597106802, + "grad_norm": 25.625, + "learning_rate": 2.4039095824332567e-06, + "loss": 1.3752050399780273, + "step": 3434 + }, + { + "epoch": 1.0575561711295782, + "grad_norm": 13.9375, + "learning_rate": 2.4032202769531878e-06, + "loss": 0.9744688272476196, + "step": 3436 + }, + { + "epoch": 1.0581717451523547, + "grad_norm": 9.4375, + "learning_rate": 2.4025307050300317e-06, + "loss": 1.7089290618896484, + "step": 3438 + }, + { + "epoch": 1.058787319175131, + "grad_norm": 16.625, + "learning_rate": 2.401840866968373e-06, + "loss": 1.3734301328659058, + "step": 3440 + }, + { + "epoch": 1.0594028931979071, + "grad_norm": 3.53125, + "learning_rate": 2.4011507630729158e-06, + "loss": 1.1722290515899658, + "step": 3442 + }, + { + "epoch": 1.0600184672206834, + "grad_norm": 12.5625, + "learning_rate": 2.4004603936484778e-06, + "loss": 1.180796504020691, + "step": 3444 + }, + { + "epoch": 1.0606340412434596, + "grad_norm": 21.75, + "learning_rate": 2.399769758999996e-06, + "loss": 1.2893471717834473, + "step": 3446 + }, + { + "epoch": 1.0612496152662358, + "grad_norm": 20.75, + "learning_rate": 2.3990788594325256e-06, + "loss": 1.4683547019958496, + "step": 3448 + }, + { + "epoch": 1.061865189289012, + "grad_norm": 16.375, + "learning_rate": 2.3983876952512377e-06, + "loss": 1.2302544116973877, + "step": 3450 + }, + { + "epoch": 1.0624807633117883, + "grad_norm": 4.09375, + "learning_rate": 2.397696266761419e-06, + "loss": 1.1894465684890747, + "step": 3452 + }, + { + "epoch": 1.0630963373345645, + "grad_norm": 109.0, + "learning_rate": 2.397004574268475e-06, + "loss": 1.4176809787750244, + "step": 3454 + }, + { + "epoch": 1.0637119113573408, + "grad_norm": 24.25, + "learning_rate": 2.396312618077928e-06, + "loss": 1.4848954677581787, + "step": 3456 + }, + { + "epoch": 1.064327485380117, + "grad_norm": 15.5625, + "learning_rate": 2.395620398495414e-06, + "loss": 1.4666824340820312, + "step": 3458 + }, + { + "epoch": 1.0649430594028932, + "grad_norm": 27.25, + "learning_rate": 2.394927915826689e-06, + "loss": 1.942247986793518, + "step": 3460 + }, + { + "epoch": 1.0655586334256695, + "grad_norm": 17.75, + "learning_rate": 2.394235170377622e-06, + "loss": 1.3470737934112549, + "step": 3462 + }, + { + "epoch": 1.0661742074484457, + "grad_norm": 38.25, + "learning_rate": 2.3935421624542e-06, + "loss": 1.3554115295410156, + "step": 3464 + }, + { + "epoch": 1.066789781471222, + "grad_norm": 17.125, + "learning_rate": 2.3928488923625255e-06, + "loss": 1.2180290222167969, + "step": 3466 + }, + { + "epoch": 1.0674053554939982, + "grad_norm": 23.625, + "learning_rate": 2.392155360408817e-06, + "loss": 1.5887868404388428, + "step": 3468 + }, + { + "epoch": 1.0680209295167744, + "grad_norm": 12.9375, + "learning_rate": 2.391461566899407e-06, + "loss": 1.319965124130249, + "step": 3470 + }, + { + "epoch": 1.0686365035395506, + "grad_norm": 15.375, + "learning_rate": 2.3907675121407463e-06, + "loss": 1.2301232814788818, + "step": 3472 + }, + { + "epoch": 1.0692520775623269, + "grad_norm": 13.125, + "learning_rate": 2.3900731964393996e-06, + "loss": 1.1541111469268799, + "step": 3474 + }, + { + "epoch": 1.069867651585103, + "grad_norm": 3.40625, + "learning_rate": 2.3893786201020466e-06, + "loss": 1.2411861419677734, + "step": 3476 + }, + { + "epoch": 1.0704832256078793, + "grad_norm": 12.875, + "learning_rate": 2.3886837834354815e-06, + "loss": 1.3128635883331299, + "step": 3478 + }, + { + "epoch": 1.0710987996306556, + "grad_norm": 8.6875, + "learning_rate": 2.387988686746616e-06, + "loss": 1.594773292541504, + "step": 3480 + }, + { + "epoch": 1.0717143736534318, + "grad_norm": 12.25, + "learning_rate": 2.3872933303424746e-06, + "loss": 1.1944515705108643, + "step": 3482 + }, + { + "epoch": 1.072329947676208, + "grad_norm": 24.625, + "learning_rate": 2.386597714530197e-06, + "loss": 1.2775322198867798, + "step": 3484 + }, + { + "epoch": 1.0729455216989843, + "grad_norm": 12.9375, + "learning_rate": 2.385901839617037e-06, + "loss": 1.2027215957641602, + "step": 3486 + }, + { + "epoch": 1.0735610957217605, + "grad_norm": 7.6875, + "learning_rate": 2.3852057059103642e-06, + "loss": 1.2455811500549316, + "step": 3488 + }, + { + "epoch": 1.0741766697445367, + "grad_norm": 14.375, + "learning_rate": 2.3845093137176605e-06, + "loss": 1.3259642124176025, + "step": 3490 + }, + { + "epoch": 1.074792243767313, + "grad_norm": 12.625, + "learning_rate": 2.383812663346524e-06, + "loss": 1.031188726425171, + "step": 3492 + }, + { + "epoch": 1.0754078177900892, + "grad_norm": 12.8125, + "learning_rate": 2.3831157551046664e-06, + "loss": 1.3350074291229248, + "step": 3494 + }, + { + "epoch": 1.0760233918128654, + "grad_norm": 9.5625, + "learning_rate": 2.3824185892999113e-06, + "loss": 1.3674886226654053, + "step": 3496 + }, + { + "epoch": 1.0766389658356417, + "grad_norm": 7.71875, + "learning_rate": 2.3817211662401994e-06, + "loss": 1.2225689888000488, + "step": 3498 + }, + { + "epoch": 1.077254539858418, + "grad_norm": 9.75, + "learning_rate": 2.381023486233582e-06, + "loss": 1.1854305267333984, + "step": 3500 + }, + { + "epoch": 1.0778701138811941, + "grad_norm": 10.0, + "learning_rate": 2.380325549588226e-06, + "loss": 1.411360502243042, + "step": 3502 + }, + { + "epoch": 1.0784856879039704, + "grad_norm": 8.9375, + "learning_rate": 2.3796273566124105e-06, + "loss": 1.5251002311706543, + "step": 3504 + }, + { + "epoch": 1.0791012619267466, + "grad_norm": 12.9375, + "learning_rate": 2.378928907614528e-06, + "loss": 1.8434513807296753, + "step": 3506 + }, + { + "epoch": 1.0797168359495228, + "grad_norm": 17.25, + "learning_rate": 2.3782302029030837e-06, + "loss": 1.564579963684082, + "step": 3508 + }, + { + "epoch": 1.080332409972299, + "grad_norm": 20.875, + "learning_rate": 2.3775312427866972e-06, + "loss": 1.0563139915466309, + "step": 3510 + }, + { + "epoch": 1.0809479839950753, + "grad_norm": 13.5625, + "learning_rate": 2.3768320275740995e-06, + "loss": 1.7783114910125732, + "step": 3512 + }, + { + "epoch": 1.0815635580178515, + "grad_norm": 51.25, + "learning_rate": 2.3761325575741357e-06, + "loss": 1.170400619506836, + "step": 3514 + }, + { + "epoch": 1.082179132040628, + "grad_norm": 8.0625, + "learning_rate": 2.375432833095761e-06, + "loss": 1.2846317291259766, + "step": 3516 + }, + { + "epoch": 1.0827947060634042, + "grad_norm": 13.8125, + "learning_rate": 2.374732854448045e-06, + "loss": 1.447777271270752, + "step": 3518 + }, + { + "epoch": 1.0834102800861805, + "grad_norm": 13.5, + "learning_rate": 2.3740326219401694e-06, + "loss": 1.1701204776763916, + "step": 3520 + }, + { + "epoch": 1.0840258541089567, + "grad_norm": 11.9375, + "learning_rate": 2.3733321358814276e-06, + "loss": 1.0754103660583496, + "step": 3522 + }, + { + "epoch": 1.084641428131733, + "grad_norm": 13.9375, + "learning_rate": 2.3726313965812255e-06, + "loss": 1.5020666122436523, + "step": 3524 + }, + { + "epoch": 1.0852570021545092, + "grad_norm": 14.3125, + "learning_rate": 2.3719304043490795e-06, + "loss": 1.2469940185546875, + "step": 3526 + }, + { + "epoch": 1.0858725761772854, + "grad_norm": 10.375, + "learning_rate": 2.3712291594946197e-06, + "loss": 1.41176176071167, + "step": 3528 + }, + { + "epoch": 1.0864881502000616, + "grad_norm": 13.125, + "learning_rate": 2.370527662327586e-06, + "loss": 1.514477014541626, + "step": 3530 + }, + { + "epoch": 1.0871037242228379, + "grad_norm": 22.875, + "learning_rate": 2.369825913157831e-06, + "loss": 1.0701708793640137, + "step": 3532 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 35.5, + "learning_rate": 2.3691239122953174e-06, + "loss": 1.7896881103515625, + "step": 3534 + }, + { + "epoch": 1.0883348722683903, + "grad_norm": 16.375, + "learning_rate": 2.3684216600501205e-06, + "loss": 1.608804702758789, + "step": 3536 + }, + { + "epoch": 1.0889504462911666, + "grad_norm": 9.8125, + "learning_rate": 2.3677191567324254e-06, + "loss": 0.8485841155052185, + "step": 3538 + }, + { + "epoch": 1.0895660203139428, + "grad_norm": 17.125, + "learning_rate": 2.3670164026525296e-06, + "loss": 1.7242555618286133, + "step": 3540 + }, + { + "epoch": 1.090181594336719, + "grad_norm": 4.5, + "learning_rate": 2.3663133981208388e-06, + "loss": 0.7557381987571716, + "step": 3542 + }, + { + "epoch": 1.0907971683594953, + "grad_norm": 16.25, + "learning_rate": 2.3656101434478724e-06, + "loss": 1.5691182613372803, + "step": 3544 + }, + { + "epoch": 1.0914127423822715, + "grad_norm": 11.0, + "learning_rate": 2.3649066389442577e-06, + "loss": 1.3667467832565308, + "step": 3546 + }, + { + "epoch": 1.0920283164050477, + "grad_norm": 29.25, + "learning_rate": 2.3642028849207337e-06, + "loss": 0.8939248323440552, + "step": 3548 + }, + { + "epoch": 1.092643890427824, + "grad_norm": 6.46875, + "learning_rate": 2.3634988816881503e-06, + "loss": 1.3205374479293823, + "step": 3550 + }, + { + "epoch": 1.0932594644506002, + "grad_norm": 22.625, + "learning_rate": 2.362794629557465e-06, + "loss": 1.3853535652160645, + "step": 3552 + }, + { + "epoch": 1.0938750384733764, + "grad_norm": 13.8125, + "learning_rate": 2.3620901288397484e-06, + "loss": 1.7651689052581787, + "step": 3554 + }, + { + "epoch": 1.0944906124961526, + "grad_norm": 26.625, + "learning_rate": 2.3613853798461783e-06, + "loss": 1.0660011768341064, + "step": 3556 + }, + { + "epoch": 1.0951061865189289, + "grad_norm": 28.875, + "learning_rate": 2.360680382888043e-06, + "loss": 1.4802651405334473, + "step": 3558 + }, + { + "epoch": 1.0957217605417051, + "grad_norm": 28.125, + "learning_rate": 2.3599751382767406e-06, + "loss": 2.1089253425598145, + "step": 3560 + }, + { + "epoch": 1.0963373345644813, + "grad_norm": 16.75, + "learning_rate": 2.359269646323779e-06, + "loss": 1.5472657680511475, + "step": 3562 + }, + { + "epoch": 1.0969529085872576, + "grad_norm": 9.875, + "learning_rate": 2.3585639073407743e-06, + "loss": 0.8907637000083923, + "step": 3564 + }, + { + "epoch": 1.0975684826100338, + "grad_norm": 22.125, + "learning_rate": 2.3578579216394523e-06, + "loss": 1.1718206405639648, + "step": 3566 + }, + { + "epoch": 1.09818405663281, + "grad_norm": 30.5, + "learning_rate": 2.357151689531647e-06, + "loss": 0.9224869608879089, + "step": 3568 + }, + { + "epoch": 1.0987996306555863, + "grad_norm": 11.9375, + "learning_rate": 2.356445211329304e-06, + "loss": 1.2034261226654053, + "step": 3570 + }, + { + "epoch": 1.0994152046783625, + "grad_norm": 18.625, + "learning_rate": 2.355738487344473e-06, + "loss": 1.4323277473449707, + "step": 3572 + }, + { + "epoch": 1.1000307787011387, + "grad_norm": 8.875, + "learning_rate": 2.355031517889317e-06, + "loss": 0.9144423007965088, + "step": 3574 + }, + { + "epoch": 1.100646352723915, + "grad_norm": 20.625, + "learning_rate": 2.3543243032761033e-06, + "loss": 1.5986289978027344, + "step": 3576 + }, + { + "epoch": 1.1012619267466912, + "grad_norm": 10.0625, + "learning_rate": 2.353616843817211e-06, + "loss": 1.3675158023834229, + "step": 3578 + }, + { + "epoch": 1.1018775007694674, + "grad_norm": 9.4375, + "learning_rate": 2.352909139825124e-06, + "loss": 1.145599126815796, + "step": 3580 + }, + { + "epoch": 1.1024930747922437, + "grad_norm": 19.25, + "learning_rate": 2.352201191612438e-06, + "loss": 1.3675510883331299, + "step": 3582 + }, + { + "epoch": 1.10310864881502, + "grad_norm": 10.125, + "learning_rate": 2.351492999491853e-06, + "loss": 1.4357773065567017, + "step": 3584 + }, + { + "epoch": 1.1037242228377961, + "grad_norm": 13.5625, + "learning_rate": 2.350784563776179e-06, + "loss": 0.8242530822753906, + "step": 3586 + }, + { + "epoch": 1.1043397968605726, + "grad_norm": 8.9375, + "learning_rate": 2.350075884778333e-06, + "loss": 1.350933313369751, + "step": 3588 + }, + { + "epoch": 1.1049553708833488, + "grad_norm": 8.6875, + "learning_rate": 2.349366962811339e-06, + "loss": 1.3204925060272217, + "step": 3590 + }, + { + "epoch": 1.105570944906125, + "grad_norm": 12.625, + "learning_rate": 2.348657798188328e-06, + "loss": 1.407926321029663, + "step": 3592 + }, + { + "epoch": 1.1061865189289013, + "grad_norm": 9.0625, + "learning_rate": 2.3479483912225396e-06, + "loss": 1.3072185516357422, + "step": 3594 + }, + { + "epoch": 1.1068020929516775, + "grad_norm": 103.5, + "learning_rate": 2.3472387422273202e-06, + "loss": 1.3156657218933105, + "step": 3596 + }, + { + "epoch": 1.1074176669744538, + "grad_norm": 4.125, + "learning_rate": 2.346528851516122e-06, + "loss": 1.2110552787780762, + "step": 3598 + }, + { + "epoch": 1.10803324099723, + "grad_norm": 13.6875, + "learning_rate": 2.3458187194025036e-06, + "loss": 0.8570442199707031, + "step": 3600 + }, + { + "epoch": 1.1086488150200062, + "grad_norm": 10.9375, + "learning_rate": 2.3451083462001325e-06, + "loss": 1.2895807027816772, + "step": 3602 + }, + { + "epoch": 1.1092643890427825, + "grad_norm": 11.5, + "learning_rate": 2.3443977322227804e-06, + "loss": 1.4392380714416504, + "step": 3604 + }, + { + "epoch": 1.1098799630655587, + "grad_norm": 16.0, + "learning_rate": 2.3436868777843278e-06, + "loss": 1.277353286743164, + "step": 3606 + }, + { + "epoch": 1.110495537088335, + "grad_norm": 17.0, + "learning_rate": 2.3429757831987573e-06, + "loss": 1.3384151458740234, + "step": 3608 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 6.15625, + "learning_rate": 2.3422644487801633e-06, + "loss": 1.3099740743637085, + "step": 3610 + }, + { + "epoch": 1.1117266851338874, + "grad_norm": 104.5, + "learning_rate": 2.3415528748427407e-06, + "loss": 1.686471700668335, + "step": 3612 + }, + { + "epoch": 1.1123422591566636, + "grad_norm": 35.25, + "learning_rate": 2.3408410617007943e-06, + "loss": 1.1620298624038696, + "step": 3614 + }, + { + "epoch": 1.1129578331794399, + "grad_norm": 9.6875, + "learning_rate": 2.3401290096687307e-06, + "loss": 0.7311451435089111, + "step": 3616 + }, + { + "epoch": 1.113573407202216, + "grad_norm": 19.125, + "learning_rate": 2.3394167190610658e-06, + "loss": 1.417179822921753, + "step": 3618 + }, + { + "epoch": 1.1141889812249923, + "grad_norm": 7.84375, + "learning_rate": 2.338704190192418e-06, + "loss": 1.3925962448120117, + "step": 3620 + }, + { + "epoch": 1.1148045552477686, + "grad_norm": 27.125, + "learning_rate": 2.3379914233775135e-06, + "loss": 1.372132658958435, + "step": 3622 + }, + { + "epoch": 1.1154201292705448, + "grad_norm": 43.25, + "learning_rate": 2.337278418931181e-06, + "loss": 1.3415924310684204, + "step": 3624 + }, + { + "epoch": 1.116035703293321, + "grad_norm": 16.0, + "learning_rate": 2.3365651771683563e-06, + "loss": 1.215116024017334, + "step": 3626 + }, + { + "epoch": 1.1166512773160973, + "grad_norm": 9.4375, + "learning_rate": 2.335851698404078e-06, + "loss": 1.2907482385635376, + "step": 3628 + }, + { + "epoch": 1.1172668513388735, + "grad_norm": 10.5625, + "learning_rate": 2.3351379829534917e-06, + "loss": 1.1975575685501099, + "step": 3630 + }, + { + "epoch": 1.1178824253616497, + "grad_norm": 13.375, + "learning_rate": 2.3344240311318454e-06, + "loss": 1.269575834274292, + "step": 3632 + }, + { + "epoch": 1.118497999384426, + "grad_norm": 54.5, + "learning_rate": 2.333709843254493e-06, + "loss": 1.30656898021698, + "step": 3634 + }, + { + "epoch": 1.1191135734072022, + "grad_norm": 17.125, + "learning_rate": 2.3329954196368916e-06, + "loss": 1.7163310050964355, + "step": 3636 + }, + { + "epoch": 1.1197291474299784, + "grad_norm": 31.0, + "learning_rate": 2.3322807605946033e-06, + "loss": 0.9354555606842041, + "step": 3638 + }, + { + "epoch": 1.1203447214527547, + "grad_norm": 22.75, + "learning_rate": 2.3315658664432933e-06, + "loss": 1.4401531219482422, + "step": 3640 + }, + { + "epoch": 1.1209602954755309, + "grad_norm": 12.0, + "learning_rate": 2.330850737498732e-06, + "loss": 1.307511568069458, + "step": 3642 + }, + { + "epoch": 1.1215758694983071, + "grad_norm": 22.25, + "learning_rate": 2.330135374076792e-06, + "loss": 1.3069522380828857, + "step": 3644 + }, + { + "epoch": 1.1221914435210834, + "grad_norm": 4.5, + "learning_rate": 2.3294197764934507e-06, + "loss": 1.1047172546386719, + "step": 3646 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 9.1875, + "learning_rate": 2.3287039450647888e-06, + "loss": 1.3423113822937012, + "step": 3648 + }, + { + "epoch": 1.1234225915666358, + "grad_norm": 14.4375, + "learning_rate": 2.3279878801069884e-06, + "loss": 1.4939258098602295, + "step": 3650 + }, + { + "epoch": 1.124038165589412, + "grad_norm": 19.125, + "learning_rate": 2.3272715819363368e-06, + "loss": 0.857292652130127, + "step": 3652 + }, + { + "epoch": 1.1246537396121883, + "grad_norm": 5.875, + "learning_rate": 2.326555050869224e-06, + "loss": 1.2719879150390625, + "step": 3654 + }, + { + "epoch": 1.1252693136349645, + "grad_norm": 15.9375, + "learning_rate": 2.3258382872221423e-06, + "loss": 1.321458339691162, + "step": 3656 + }, + { + "epoch": 1.1258848876577408, + "grad_norm": 19.375, + "learning_rate": 2.3251212913116876e-06, + "loss": 1.5195269584655762, + "step": 3658 + }, + { + "epoch": 1.126500461680517, + "grad_norm": 12.0625, + "learning_rate": 2.3244040634545574e-06, + "loss": 1.1064283847808838, + "step": 3660 + }, + { + "epoch": 1.1271160357032932, + "grad_norm": 18.875, + "learning_rate": 2.3236866039675514e-06, + "loss": 1.3086144924163818, + "step": 3662 + }, + { + "epoch": 1.1277316097260695, + "grad_norm": 19.75, + "learning_rate": 2.322968913167574e-06, + "loss": 1.4040865898132324, + "step": 3664 + }, + { + "epoch": 1.1283471837488457, + "grad_norm": 19.25, + "learning_rate": 2.322250991371628e-06, + "loss": 1.2450525760650635, + "step": 3666 + }, + { + "epoch": 1.128962757771622, + "grad_norm": 10.625, + "learning_rate": 2.321532838896822e-06, + "loss": 0.8933705687522888, + "step": 3668 + }, + { + "epoch": 1.1295783317943984, + "grad_norm": 26.125, + "learning_rate": 2.320814456060363e-06, + "loss": 1.368276596069336, + "step": 3670 + }, + { + "epoch": 1.1301939058171746, + "grad_norm": 16.875, + "learning_rate": 2.3200958431795637e-06, + "loss": 1.0339691638946533, + "step": 3672 + }, + { + "epoch": 1.1308094798399508, + "grad_norm": 11.4375, + "learning_rate": 2.319377000571835e-06, + "loss": 1.3817522525787354, + "step": 3674 + }, + { + "epoch": 1.131425053862727, + "grad_norm": 10.125, + "learning_rate": 2.3186579285546903e-06, + "loss": 1.1731239557266235, + "step": 3676 + }, + { + "epoch": 1.1320406278855033, + "grad_norm": 26.875, + "learning_rate": 2.3179386274457446e-06, + "loss": 1.2372663021087646, + "step": 3678 + }, + { + "epoch": 1.1326562019082795, + "grad_norm": 18.25, + "learning_rate": 2.3172190975627146e-06, + "loss": 1.3585962057113647, + "step": 3680 + }, + { + "epoch": 1.1332717759310558, + "grad_norm": 22.375, + "learning_rate": 2.316499339223417e-06, + "loss": 1.6335867643356323, + "step": 3682 + }, + { + "epoch": 1.133887349953832, + "grad_norm": 23.25, + "learning_rate": 2.3157793527457697e-06, + "loss": 1.7519006729125977, + "step": 3684 + }, + { + "epoch": 1.1345029239766082, + "grad_norm": 17.875, + "learning_rate": 2.3150591384477923e-06, + "loss": 1.8296453952789307, + "step": 3686 + }, + { + "epoch": 1.1351184979993845, + "grad_norm": 22.75, + "learning_rate": 2.314338696647603e-06, + "loss": 0.7551788091659546, + "step": 3688 + }, + { + "epoch": 1.1357340720221607, + "grad_norm": 86.0, + "learning_rate": 2.3136180276634238e-06, + "loss": 1.4087929725646973, + "step": 3690 + }, + { + "epoch": 1.136349646044937, + "grad_norm": 8.3125, + "learning_rate": 2.3128971318135732e-06, + "loss": 1.5488674640655518, + "step": 3692 + }, + { + "epoch": 1.1369652200677132, + "grad_norm": 13.25, + "learning_rate": 2.3121760094164727e-06, + "loss": 1.1042814254760742, + "step": 3694 + }, + { + "epoch": 1.1375807940904894, + "grad_norm": 16.375, + "learning_rate": 2.3114546607906422e-06, + "loss": 1.7696267366409302, + "step": 3696 + }, + { + "epoch": 1.1381963681132656, + "grad_norm": 15.5, + "learning_rate": 2.3107330862547025e-06, + "loss": 1.6327096223831177, + "step": 3698 + }, + { + "epoch": 1.1388119421360419, + "grad_norm": 21.375, + "learning_rate": 2.3100112861273747e-06, + "loss": 1.5142444372177124, + "step": 3700 + }, + { + "epoch": 1.139427516158818, + "grad_norm": 14.5, + "learning_rate": 2.3092892607274777e-06, + "loss": 1.1201368570327759, + "step": 3702 + }, + { + "epoch": 1.1400430901815943, + "grad_norm": 18.125, + "learning_rate": 2.3085670103739305e-06, + "loss": 1.3404749631881714, + "step": 3704 + }, + { + "epoch": 1.1406586642043706, + "grad_norm": 7.96875, + "learning_rate": 2.3078445353857537e-06, + "loss": 1.3205509185791016, + "step": 3706 + }, + { + "epoch": 1.1412742382271468, + "grad_norm": 24.0, + "learning_rate": 2.307121836082063e-06, + "loss": 1.7567417621612549, + "step": 3708 + }, + { + "epoch": 1.141889812249923, + "grad_norm": 24.375, + "learning_rate": 2.3063989127820773e-06, + "loss": 1.647988200187683, + "step": 3710 + }, + { + "epoch": 1.1425053862726993, + "grad_norm": 9.125, + "learning_rate": 2.3056757658051123e-06, + "loss": 1.558417558670044, + "step": 3712 + }, + { + "epoch": 1.1431209602954755, + "grad_norm": 10.125, + "learning_rate": 2.304952395470583e-06, + "loss": 1.4553438425064087, + "step": 3714 + }, + { + "epoch": 1.1437365343182517, + "grad_norm": 12.125, + "learning_rate": 2.3042288020980025e-06, + "loss": 1.2713730335235596, + "step": 3716 + }, + { + "epoch": 1.144352108341028, + "grad_norm": 14.5, + "learning_rate": 2.3035049860069827e-06, + "loss": 1.8827029466629028, + "step": 3718 + }, + { + "epoch": 1.1449676823638042, + "grad_norm": 12.625, + "learning_rate": 2.302780947517234e-06, + "loss": 1.3714704513549805, + "step": 3720 + }, + { + "epoch": 1.1455832563865804, + "grad_norm": 8.8125, + "learning_rate": 2.3020566869485657e-06, + "loss": 1.372882604598999, + "step": 3722 + }, + { + "epoch": 1.1461988304093567, + "grad_norm": 48.0, + "learning_rate": 2.301332204620883e-06, + "loss": 1.7148422002792358, + "step": 3724 + }, + { + "epoch": 1.146814404432133, + "grad_norm": 10.9375, + "learning_rate": 2.300607500854193e-06, + "loss": 1.164696455001831, + "step": 3726 + }, + { + "epoch": 1.1474299784549091, + "grad_norm": 9.0625, + "learning_rate": 2.2998825759685964e-06, + "loss": 1.1495816707611084, + "step": 3728 + }, + { + "epoch": 1.1480455524776854, + "grad_norm": 30.25, + "learning_rate": 2.2991574302842934e-06, + "loss": 1.2931642532348633, + "step": 3730 + }, + { + "epoch": 1.1486611265004616, + "grad_norm": 15.375, + "learning_rate": 2.298432064121582e-06, + "loss": 1.1962416172027588, + "step": 3732 + }, + { + "epoch": 1.1492767005232378, + "grad_norm": 18.875, + "learning_rate": 2.297706477800858e-06, + "loss": 1.4143441915512085, + "step": 3734 + }, + { + "epoch": 1.1498922745460143, + "grad_norm": 19.5, + "learning_rate": 2.296980671642612e-06, + "loss": 1.5150713920593262, + "step": 3736 + }, + { + "epoch": 1.1505078485687905, + "grad_norm": 15.5, + "learning_rate": 2.296254645967435e-06, + "loss": 1.386347770690918, + "step": 3738 + }, + { + "epoch": 1.1511234225915667, + "grad_norm": 23.0, + "learning_rate": 2.2955284010960128e-06, + "loss": 1.5230674743652344, + "step": 3740 + }, + { + "epoch": 1.151738996614343, + "grad_norm": 7.625, + "learning_rate": 2.294801937349129e-06, + "loss": 1.0187252759933472, + "step": 3742 + }, + { + "epoch": 1.1523545706371192, + "grad_norm": 16.75, + "learning_rate": 2.294075255047662e-06, + "loss": 0.9895851016044617, + "step": 3744 + }, + { + "epoch": 1.1529701446598954, + "grad_norm": 20.5, + "learning_rate": 2.29334835451259e-06, + "loss": 1.5257803201675415, + "step": 3746 + }, + { + "epoch": 1.1535857186826717, + "grad_norm": 6.46875, + "learning_rate": 2.292621236064985e-06, + "loss": 1.4674441814422607, + "step": 3748 + }, + { + "epoch": 1.154201292705448, + "grad_norm": 39.0, + "learning_rate": 2.291893900026016e-06, + "loss": 1.4346339702606201, + "step": 3750 + }, + { + "epoch": 1.1548168667282241, + "grad_norm": 14.625, + "learning_rate": 2.2911663467169485e-06, + "loss": 1.461643934249878, + "step": 3752 + }, + { + "epoch": 1.1554324407510004, + "grad_norm": 11.8125, + "learning_rate": 2.2904385764591426e-06, + "loss": 1.651922583580017, + "step": 3754 + }, + { + "epoch": 1.1560480147737766, + "grad_norm": 9.4375, + "learning_rate": 2.289710589574057e-06, + "loss": 1.1730600595474243, + "step": 3756 + }, + { + "epoch": 1.1566635887965528, + "grad_norm": 17.25, + "learning_rate": 2.2889823863832433e-06, + "loss": 1.525307536125183, + "step": 3758 + }, + { + "epoch": 1.157279162819329, + "grad_norm": 12.5625, + "learning_rate": 2.2882539672083495e-06, + "loss": 1.4266164302825928, + "step": 3760 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 17.0, + "learning_rate": 2.2875253323711195e-06, + "loss": 1.7639350891113281, + "step": 3762 + }, + { + "epoch": 1.1585103108648815, + "grad_norm": 8.375, + "learning_rate": 2.286796482193392e-06, + "loss": 1.4397724866867065, + "step": 3764 + }, + { + "epoch": 1.1591258848876578, + "grad_norm": 12.0, + "learning_rate": 2.286067416997101e-06, + "loss": 1.2925528287887573, + "step": 3766 + }, + { + "epoch": 1.159741458910434, + "grad_norm": 22.75, + "learning_rate": 2.2853381371042762e-06, + "loss": 1.3446506261825562, + "step": 3768 + }, + { + "epoch": 1.1603570329332102, + "grad_norm": 23.75, + "learning_rate": 2.2846086428370396e-06, + "loss": 1.6084314584732056, + "step": 3770 + }, + { + "epoch": 1.1609726069559865, + "grad_norm": 10.3125, + "learning_rate": 2.283878934517611e-06, + "loss": 1.509368896484375, + "step": 3772 + }, + { + "epoch": 1.1615881809787627, + "grad_norm": 21.75, + "learning_rate": 2.2831490124683035e-06, + "loss": 1.6360807418823242, + "step": 3774 + }, + { + "epoch": 1.162203755001539, + "grad_norm": 13.9375, + "learning_rate": 2.2824188770115244e-06, + "loss": 1.5900465250015259, + "step": 3776 + }, + { + "epoch": 1.1628193290243152, + "grad_norm": 8.0625, + "learning_rate": 2.2816885284697742e-06, + "loss": 1.228595495223999, + "step": 3778 + }, + { + "epoch": 1.1634349030470914, + "grad_norm": 19.375, + "learning_rate": 2.2809579671656504e-06, + "loss": 1.2203516960144043, + "step": 3780 + }, + { + "epoch": 1.1640504770698676, + "grad_norm": 24.5, + "learning_rate": 2.280227193421841e-06, + "loss": 1.2803552150726318, + "step": 3782 + }, + { + "epoch": 1.1646660510926439, + "grad_norm": 38.75, + "learning_rate": 2.2794962075611312e-06, + "loss": 1.5093402862548828, + "step": 3784 + }, + { + "epoch": 1.16528162511542, + "grad_norm": 19.5, + "learning_rate": 2.278765009906398e-06, + "loss": 1.3051506280899048, + "step": 3786 + }, + { + "epoch": 1.1658971991381963, + "grad_norm": 6.0625, + "learning_rate": 2.2780336007806107e-06, + "loss": 1.3691089153289795, + "step": 3788 + }, + { + "epoch": 1.1665127731609726, + "grad_norm": 15.375, + "learning_rate": 2.2773019805068355e-06, + "loss": 1.620005488395691, + "step": 3790 + }, + { + "epoch": 1.1671283471837488, + "grad_norm": 20.75, + "learning_rate": 2.276570149408229e-06, + "loss": 1.3951692581176758, + "step": 3792 + }, + { + "epoch": 1.167743921206525, + "grad_norm": 16.5, + "learning_rate": 2.2758381078080425e-06, + "loss": 0.8126804828643799, + "step": 3794 + }, + { + "epoch": 1.1683594952293013, + "grad_norm": 9.625, + "learning_rate": 2.2751058560296183e-06, + "loss": 1.2357356548309326, + "step": 3796 + }, + { + "epoch": 1.1689750692520775, + "grad_norm": 12.75, + "learning_rate": 2.2743733943963937e-06, + "loss": 1.2538156509399414, + "step": 3798 + }, + { + "epoch": 1.1695906432748537, + "grad_norm": 16.5, + "learning_rate": 2.2736407232318983e-06, + "loss": 1.3101701736450195, + "step": 3800 + }, + { + "epoch": 1.17020621729763, + "grad_norm": 9.9375, + "learning_rate": 2.2729078428597526e-06, + "loss": 1.4193048477172852, + "step": 3802 + }, + { + "epoch": 1.1708217913204062, + "grad_norm": 14.375, + "learning_rate": 2.272174753603672e-06, + "loss": 1.587726354598999, + "step": 3804 + }, + { + "epoch": 1.1714373653431824, + "grad_norm": 19.25, + "learning_rate": 2.271441455787462e-06, + "loss": 1.8314234018325806, + "step": 3806 + }, + { + "epoch": 1.1720529393659587, + "grad_norm": 16.875, + "learning_rate": 2.2707079497350205e-06, + "loss": 1.4434683322906494, + "step": 3808 + }, + { + "epoch": 1.172668513388735, + "grad_norm": 78.0, + "learning_rate": 2.26997423577034e-06, + "loss": 0.9510101079940796, + "step": 3810 + }, + { + "epoch": 1.1732840874115111, + "grad_norm": 16.5, + "learning_rate": 2.269240314217501e-06, + "loss": 1.235957145690918, + "step": 3812 + }, + { + "epoch": 1.1738996614342874, + "grad_norm": 13.4375, + "learning_rate": 2.2685061854006778e-06, + "loss": 1.0225133895874023, + "step": 3814 + }, + { + "epoch": 1.1745152354570636, + "grad_norm": 8.8125, + "learning_rate": 2.2677718496441363e-06, + "loss": 1.081822395324707, + "step": 3816 + }, + { + "epoch": 1.1751308094798398, + "grad_norm": 31.5, + "learning_rate": 2.267037307272234e-06, + "loss": 1.5427581071853638, + "step": 3818 + }, + { + "epoch": 1.175746383502616, + "grad_norm": 16.125, + "learning_rate": 2.2663025586094177e-06, + "loss": 1.3339014053344727, + "step": 3820 + }, + { + "epoch": 1.1763619575253925, + "grad_norm": 11.8125, + "learning_rate": 2.2655676039802275e-06, + "loss": 1.3625433444976807, + "step": 3822 + }, + { + "epoch": 1.1769775315481688, + "grad_norm": 21.5, + "learning_rate": 2.2648324437092943e-06, + "loss": 1.2665989398956299, + "step": 3824 + }, + { + "epoch": 1.177593105570945, + "grad_norm": 12.3125, + "learning_rate": 2.264097078121338e-06, + "loss": 1.31890869140625, + "step": 3826 + }, + { + "epoch": 1.1782086795937212, + "grad_norm": 25.625, + "learning_rate": 2.263361507541171e-06, + "loss": 0.8552164435386658, + "step": 3828 + }, + { + "epoch": 1.1788242536164975, + "grad_norm": 25.125, + "learning_rate": 2.2626257322936967e-06, + "loss": 1.3524929285049438, + "step": 3830 + }, + { + "epoch": 1.1794398276392737, + "grad_norm": 20.5, + "learning_rate": 2.2618897527039055e-06, + "loss": 1.2687150239944458, + "step": 3832 + }, + { + "epoch": 1.18005540166205, + "grad_norm": 29.875, + "learning_rate": 2.2611535690968824e-06, + "loss": 1.767914056777954, + "step": 3834 + }, + { + "epoch": 1.1806709756848262, + "grad_norm": 22.875, + "learning_rate": 2.2604171817978e-06, + "loss": 1.393390417098999, + "step": 3836 + }, + { + "epoch": 1.1812865497076024, + "grad_norm": 6.28125, + "learning_rate": 2.259680591131921e-06, + "loss": 1.4195334911346436, + "step": 3838 + }, + { + "epoch": 1.1819021237303786, + "grad_norm": 15.4375, + "learning_rate": 2.2589437974245994e-06, + "loss": 1.593935251235962, + "step": 3840 + }, + { + "epoch": 1.1825176977531549, + "grad_norm": 12.6875, + "learning_rate": 2.2582068010012767e-06, + "loss": 1.3784018754959106, + "step": 3842 + }, + { + "epoch": 1.183133271775931, + "grad_norm": 10.0625, + "learning_rate": 2.257469602187485e-06, + "loss": 1.3572193384170532, + "step": 3844 + }, + { + "epoch": 1.1837488457987073, + "grad_norm": 4.09375, + "learning_rate": 2.2567322013088466e-06, + "loss": 1.1336941719055176, + "step": 3846 + }, + { + "epoch": 1.1843644198214835, + "grad_norm": 15.75, + "learning_rate": 2.2559945986910728e-06, + "loss": 0.8909260034561157, + "step": 3848 + }, + { + "epoch": 1.1849799938442598, + "grad_norm": 20.375, + "learning_rate": 2.255256794659962e-06, + "loss": 0.9847655296325684, + "step": 3850 + }, + { + "epoch": 1.185595567867036, + "grad_norm": 21.875, + "learning_rate": 2.2545187895414036e-06, + "loss": 1.426526427268982, + "step": 3852 + }, + { + "epoch": 1.1862111418898122, + "grad_norm": 14.25, + "learning_rate": 2.253780583661376e-06, + "loss": 1.4254299402236938, + "step": 3854 + }, + { + "epoch": 1.1868267159125885, + "grad_norm": 21.125, + "learning_rate": 2.2530421773459455e-06, + "loss": 1.7375472784042358, + "step": 3856 + }, + { + "epoch": 1.1874422899353647, + "grad_norm": 16.75, + "learning_rate": 2.2523035709212657e-06, + "loss": 1.397571086883545, + "step": 3858 + }, + { + "epoch": 1.188057863958141, + "grad_norm": 20.625, + "learning_rate": 2.2515647647135815e-06, + "loss": 1.0399236679077148, + "step": 3860 + }, + { + "epoch": 1.1886734379809172, + "grad_norm": 12.3125, + "learning_rate": 2.2508257590492237e-06, + "loss": 0.8612368702888489, + "step": 3862 + }, + { + "epoch": 1.1892890120036934, + "grad_norm": 19.875, + "learning_rate": 2.250086554254612e-06, + "loss": 1.3931348323822021, + "step": 3864 + }, + { + "epoch": 1.1899045860264696, + "grad_norm": 33.5, + "learning_rate": 2.2493471506562544e-06, + "loss": 1.150251865386963, + "step": 3866 + }, + { + "epoch": 1.1905201600492459, + "grad_norm": 21.75, + "learning_rate": 2.248607548580746e-06, + "loss": 0.958466112613678, + "step": 3868 + }, + { + "epoch": 1.1911357340720221, + "grad_norm": 15.25, + "learning_rate": 2.247867748354769e-06, + "loss": 1.409304141998291, + "step": 3870 + }, + { + "epoch": 1.1917513080947983, + "grad_norm": 27.875, + "learning_rate": 2.2471277503050955e-06, + "loss": 1.5489780902862549, + "step": 3872 + }, + { + "epoch": 1.1923668821175746, + "grad_norm": 11.9375, + "learning_rate": 2.246387554758583e-06, + "loss": 1.063664197921753, + "step": 3874 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 12.6875, + "learning_rate": 2.2456471620421762e-06, + "loss": 1.606360673904419, + "step": 3876 + }, + { + "epoch": 1.193598030163127, + "grad_norm": 17.875, + "learning_rate": 2.244906572482908e-06, + "loss": 1.8889983892440796, + "step": 3878 + }, + { + "epoch": 1.1942136041859033, + "grad_norm": 12.3125, + "learning_rate": 2.244165786407898e-06, + "loss": 1.4994919300079346, + "step": 3880 + }, + { + "epoch": 1.1948291782086795, + "grad_norm": 7.21875, + "learning_rate": 2.2434248041443507e-06, + "loss": 1.1762011051177979, + "step": 3882 + }, + { + "epoch": 1.1954447522314557, + "grad_norm": 31.375, + "learning_rate": 2.24268362601956e-06, + "loss": 1.547112226486206, + "step": 3884 + }, + { + "epoch": 1.196060326254232, + "grad_norm": 14.375, + "learning_rate": 2.241942252360905e-06, + "loss": 1.0352790355682373, + "step": 3886 + }, + { + "epoch": 1.1966759002770084, + "grad_norm": 20.5, + "learning_rate": 2.2412006834958517e-06, + "loss": 1.4299671649932861, + "step": 3888 + }, + { + "epoch": 1.1972914742997847, + "grad_norm": 18.75, + "learning_rate": 2.2404589197519507e-06, + "loss": 1.6509742736816406, + "step": 3890 + }, + { + "epoch": 1.197907048322561, + "grad_norm": 15.75, + "learning_rate": 2.239716961456841e-06, + "loss": 1.3373303413391113, + "step": 3892 + }, + { + "epoch": 1.1985226223453371, + "grad_norm": 118.0, + "learning_rate": 2.238974808938246e-06, + "loss": 1.3991224765777588, + "step": 3894 + }, + { + "epoch": 1.1991381963681134, + "grad_norm": 14.3125, + "learning_rate": 2.2382324625239757e-06, + "loss": 1.0333058834075928, + "step": 3896 + }, + { + "epoch": 1.1997537703908896, + "grad_norm": 14.75, + "learning_rate": 2.2374899225419247e-06, + "loss": 1.4212753772735596, + "step": 3898 + }, + { + "epoch": 1.2003693444136658, + "grad_norm": 10.25, + "learning_rate": 2.2367471893200744e-06, + "loss": 1.3892631530761719, + "step": 3900 + }, + { + "epoch": 1.200984918436442, + "grad_norm": 294.0, + "learning_rate": 2.23600426318649e-06, + "loss": 1.336948037147522, + "step": 3902 + }, + { + "epoch": 1.2016004924592183, + "grad_norm": 12.4375, + "learning_rate": 2.235261144469324e-06, + "loss": 1.2685046195983887, + "step": 3904 + }, + { + "epoch": 1.2022160664819945, + "grad_norm": 32.0, + "learning_rate": 2.2345178334968125e-06, + "loss": 1.7446030378341675, + "step": 3906 + }, + { + "epoch": 1.2028316405047708, + "grad_norm": 73.0, + "learning_rate": 2.233774330597276e-06, + "loss": 1.4774670600891113, + "step": 3908 + }, + { + "epoch": 1.203447214527547, + "grad_norm": 7.90625, + "learning_rate": 2.233030636099121e-06, + "loss": 1.3360192775726318, + "step": 3910 + }, + { + "epoch": 1.2040627885503232, + "grad_norm": 8.5, + "learning_rate": 2.232286750330839e-06, + "loss": 1.2566142082214355, + "step": 3912 + }, + { + "epoch": 1.2046783625730995, + "grad_norm": 8.875, + "learning_rate": 2.2315426736210037e-06, + "loss": 0.8111532926559448, + "step": 3914 + }, + { + "epoch": 1.2052939365958757, + "grad_norm": 13.8125, + "learning_rate": 2.230798406298276e-06, + "loss": 1.1176693439483643, + "step": 3916 + }, + { + "epoch": 1.205909510618652, + "grad_norm": 19.625, + "learning_rate": 2.2300539486913985e-06, + "loss": 1.320571780204773, + "step": 3918 + }, + { + "epoch": 1.2065250846414282, + "grad_norm": 17.0, + "learning_rate": 2.2293093011292006e-06, + "loss": 1.564365029335022, + "step": 3920 + }, + { + "epoch": 1.2071406586642044, + "grad_norm": 11.375, + "learning_rate": 2.228564463940592e-06, + "loss": 1.2905172109603882, + "step": 3922 + }, + { + "epoch": 1.2077562326869806, + "grad_norm": 9.8125, + "learning_rate": 2.2278194374545697e-06, + "loss": 1.1235172748565674, + "step": 3924 + }, + { + "epoch": 1.2083718067097569, + "grad_norm": 15.3125, + "learning_rate": 2.227074222000212e-06, + "loss": 1.4910166263580322, + "step": 3926 + }, + { + "epoch": 1.208987380732533, + "grad_norm": 37.0, + "learning_rate": 2.226328817906681e-06, + "loss": 1.5519063472747803, + "step": 3928 + }, + { + "epoch": 1.2096029547553093, + "grad_norm": 13.5, + "learning_rate": 2.225583225503224e-06, + "loss": 1.3468170166015625, + "step": 3930 + }, + { + "epoch": 1.2102185287780856, + "grad_norm": 18.25, + "learning_rate": 2.2248374451191687e-06, + "loss": 1.344160556793213, + "step": 3932 + }, + { + "epoch": 1.2108341028008618, + "grad_norm": 13.375, + "learning_rate": 2.2240914770839273e-06, + "loss": 1.3389885425567627, + "step": 3934 + }, + { + "epoch": 1.211449676823638, + "grad_norm": 15.125, + "learning_rate": 2.223345321726995e-06, + "loss": 1.783838152885437, + "step": 3936 + }, + { + "epoch": 1.2120652508464143, + "grad_norm": 20.25, + "learning_rate": 2.2225989793779502e-06, + "loss": 1.6713354587554932, + "step": 3938 + }, + { + "epoch": 1.2126808248691905, + "grad_norm": 14.0625, + "learning_rate": 2.2218524503664514e-06, + "loss": 1.3737525939941406, + "step": 3940 + }, + { + "epoch": 1.2132963988919667, + "grad_norm": 10.75, + "learning_rate": 2.221105735022243e-06, + "loss": 1.1370043754577637, + "step": 3942 + }, + { + "epoch": 1.213911972914743, + "grad_norm": 13.3125, + "learning_rate": 2.2203588336751496e-06, + "loss": 1.3710174560546875, + "step": 3944 + }, + { + "epoch": 1.2145275469375192, + "grad_norm": 14.125, + "learning_rate": 2.2196117466550774e-06, + "loss": 1.5901612043380737, + "step": 3946 + }, + { + "epoch": 1.2151431209602954, + "grad_norm": 5.375, + "learning_rate": 2.2188644742920173e-06, + "loss": 1.0958540439605713, + "step": 3948 + }, + { + "epoch": 1.2157586949830717, + "grad_norm": 9.0625, + "learning_rate": 2.2181170169160385e-06, + "loss": 1.2019283771514893, + "step": 3950 + }, + { + "epoch": 1.2163742690058479, + "grad_norm": 4.71875, + "learning_rate": 2.217369374857296e-06, + "loss": 1.2523112297058105, + "step": 3952 + }, + { + "epoch": 1.2169898430286241, + "grad_norm": 19.0, + "learning_rate": 2.216621548446021e-06, + "loss": 1.3643252849578857, + "step": 3954 + }, + { + "epoch": 1.2176054170514004, + "grad_norm": 12.1875, + "learning_rate": 2.2158735380125325e-06, + "loss": 1.606905221939087, + "step": 3956 + }, + { + "epoch": 1.2182209910741766, + "grad_norm": 17.75, + "learning_rate": 2.2151253438872263e-06, + "loss": 1.2848656177520752, + "step": 3958 + }, + { + "epoch": 1.2188365650969528, + "grad_norm": 19.25, + "learning_rate": 2.2143769664005797e-06, + "loss": 1.4402281045913696, + "step": 3960 + }, + { + "epoch": 1.219452139119729, + "grad_norm": 19.25, + "learning_rate": 2.2136284058831533e-06, + "loss": 1.4613749980926514, + "step": 3962 + }, + { + "epoch": 1.2200677131425053, + "grad_norm": 8.875, + "learning_rate": 2.2128796626655855e-06, + "loss": 1.257155179977417, + "step": 3964 + }, + { + "epoch": 1.2206832871652815, + "grad_norm": 14.6875, + "learning_rate": 2.212130737078599e-06, + "loss": 1.0831825733184814, + "step": 3966 + }, + { + "epoch": 1.2212988611880577, + "grad_norm": 15.9375, + "learning_rate": 2.211381629452994e-06, + "loss": 1.5734920501708984, + "step": 3968 + }, + { + "epoch": 1.221914435210834, + "grad_norm": 17.25, + "learning_rate": 2.2106323401196528e-06, + "loss": 1.361938238143921, + "step": 3970 + }, + { + "epoch": 1.2225300092336104, + "grad_norm": 15.0625, + "learning_rate": 2.2098828694095356e-06, + "loss": 1.3760902881622314, + "step": 3972 + }, + { + "epoch": 1.2231455832563867, + "grad_norm": 13.4375, + "learning_rate": 2.209133217653687e-06, + "loss": 0.9740235805511475, + "step": 3974 + }, + { + "epoch": 1.223761157279163, + "grad_norm": 22.0, + "learning_rate": 2.2083833851832277e-06, + "loss": 1.5925462245941162, + "step": 3976 + }, + { + "epoch": 1.2243767313019391, + "grad_norm": 12.0625, + "learning_rate": 2.2076333723293588e-06, + "loss": 1.377173900604248, + "step": 3978 + }, + { + "epoch": 1.2249923053247154, + "grad_norm": 8.5, + "learning_rate": 2.206883179423364e-06, + "loss": 1.2487232685089111, + "step": 3980 + }, + { + "epoch": 1.2256078793474916, + "grad_norm": 5.40625, + "learning_rate": 2.2061328067966016e-06, + "loss": 1.1240956783294678, + "step": 3982 + }, + { + "epoch": 1.2262234533702678, + "grad_norm": 13.9375, + "learning_rate": 2.2053822547805145e-06, + "loss": 1.1868245601654053, + "step": 3984 + }, + { + "epoch": 1.226839027393044, + "grad_norm": 18.625, + "learning_rate": 2.2046315237066213e-06, + "loss": 1.6078002452850342, + "step": 3986 + }, + { + "epoch": 1.2274546014158203, + "grad_norm": 11.6875, + "learning_rate": 2.2038806139065206e-06, + "loss": 1.2998437881469727, + "step": 3988 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 41.5, + "learning_rate": 2.2031295257118905e-06, + "loss": 2.041577100753784, + "step": 3990 + }, + { + "epoch": 1.2286857494613728, + "grad_norm": 40.5, + "learning_rate": 2.202378259454488e-06, + "loss": 1.0328714847564697, + "step": 3992 + }, + { + "epoch": 1.229301323484149, + "grad_norm": 7.8125, + "learning_rate": 2.201626815466147e-06, + "loss": 1.3742680549621582, + "step": 3994 + }, + { + "epoch": 1.2299168975069252, + "grad_norm": 10.375, + "learning_rate": 2.2008751940787817e-06, + "loss": 1.1253302097320557, + "step": 3996 + }, + { + "epoch": 1.2305324715297015, + "grad_norm": 15.25, + "learning_rate": 2.2001233956243846e-06, + "loss": 1.2740788459777832, + "step": 3998 + }, + { + "epoch": 1.2311480455524777, + "grad_norm": 18.75, + "learning_rate": 2.1993714204350257e-06, + "loss": 1.2912187576293945, + "step": 4000 + }, + { + "epoch": 1.231763619575254, + "grad_norm": 17.125, + "learning_rate": 2.198619268842853e-06, + "loss": 1.438889980316162, + "step": 4002 + }, + { + "epoch": 1.2323791935980302, + "grad_norm": 13.125, + "learning_rate": 2.1978669411800936e-06, + "loss": 1.3957526683807373, + "step": 4004 + }, + { + "epoch": 1.2329947676208064, + "grad_norm": 16.625, + "learning_rate": 2.1971144377790504e-06, + "loss": 1.3101451396942139, + "step": 4006 + }, + { + "epoch": 1.2336103416435826, + "grad_norm": 26.5, + "learning_rate": 2.196361758972105e-06, + "loss": 1.486053705215454, + "step": 4008 + }, + { + "epoch": 1.2342259156663589, + "grad_norm": 11.4375, + "learning_rate": 2.1956089050917174e-06, + "loss": 1.1454870700836182, + "step": 4010 + }, + { + "epoch": 1.234841489689135, + "grad_norm": 25.75, + "learning_rate": 2.1948558764704234e-06, + "loss": 1.5119868516921997, + "step": 4012 + }, + { + "epoch": 1.2354570637119113, + "grad_norm": 17.75, + "learning_rate": 2.1941026734408368e-06, + "loss": 1.6028649806976318, + "step": 4014 + }, + { + "epoch": 1.2360726377346876, + "grad_norm": 11.5625, + "learning_rate": 2.1933492963356486e-06, + "loss": 1.7800073623657227, + "step": 4016 + }, + { + "epoch": 1.2366882117574638, + "grad_norm": 10.4375, + "learning_rate": 2.192595745487625e-06, + "loss": 1.2908351421356201, + "step": 4018 + }, + { + "epoch": 1.23730378578024, + "grad_norm": 7.96875, + "learning_rate": 2.1918420212296126e-06, + "loss": 1.0685988664627075, + "step": 4020 + }, + { + "epoch": 1.2379193598030163, + "grad_norm": 11.125, + "learning_rate": 2.1910881238945293e-06, + "loss": 1.470991611480713, + "step": 4022 + }, + { + "epoch": 1.2385349338257925, + "grad_norm": 9.1875, + "learning_rate": 2.190334053815375e-06, + "loss": 1.4521594047546387, + "step": 4024 + }, + { + "epoch": 1.2391505078485687, + "grad_norm": 14.4375, + "learning_rate": 2.189579811325222e-06, + "loss": 1.1041626930236816, + "step": 4026 + }, + { + "epoch": 1.239766081871345, + "grad_norm": 16.0, + "learning_rate": 2.18882539675722e-06, + "loss": 1.468430519104004, + "step": 4028 + }, + { + "epoch": 1.2403816558941212, + "grad_norm": 24.25, + "learning_rate": 2.1880708104445954e-06, + "loss": 0.9561784267425537, + "step": 4030 + }, + { + "epoch": 1.2409972299168974, + "grad_norm": 20.75, + "learning_rate": 2.1873160527206505e-06, + "loss": 1.477291464805603, + "step": 4032 + }, + { + "epoch": 1.2416128039396737, + "grad_norm": 36.0, + "learning_rate": 2.18656112391876e-06, + "loss": 1.5140433311462402, + "step": 4034 + }, + { + "epoch": 1.24222837796245, + "grad_norm": 9.9375, + "learning_rate": 2.18580602437238e-06, + "loss": 1.5981786251068115, + "step": 4036 + }, + { + "epoch": 1.2428439519852263, + "grad_norm": 21.375, + "learning_rate": 2.1850507544150368e-06, + "loss": 1.4360090494155884, + "step": 4038 + }, + { + "epoch": 1.2434595260080026, + "grad_norm": 8.25, + "learning_rate": 2.184295314380335e-06, + "loss": 1.2938246726989746, + "step": 4040 + }, + { + "epoch": 1.2440751000307788, + "grad_norm": 11.625, + "learning_rate": 2.1835397046019524e-06, + "loss": 1.2391458749771118, + "step": 4042 + }, + { + "epoch": 1.244690674053555, + "grad_norm": 50.5, + "learning_rate": 2.1827839254136436e-06, + "loss": 1.4162282943725586, + "step": 4044 + }, + { + "epoch": 1.2453062480763313, + "grad_norm": 15.3125, + "learning_rate": 2.1820279771492364e-06, + "loss": 1.5083057880401611, + "step": 4046 + }, + { + "epoch": 1.2459218220991075, + "grad_norm": 238.0, + "learning_rate": 2.1812718601426346e-06, + "loss": 1.6508371829986572, + "step": 4048 + }, + { + "epoch": 1.2465373961218837, + "grad_norm": 13.375, + "learning_rate": 2.1805155747278157e-06, + "loss": 1.3943655490875244, + "step": 4050 + }, + { + "epoch": 1.24715297014466, + "grad_norm": 10.625, + "learning_rate": 2.1797591212388317e-06, + "loss": 1.657834768295288, + "step": 4052 + }, + { + "epoch": 1.2477685441674362, + "grad_norm": 30.125, + "learning_rate": 2.17900250000981e-06, + "loss": 1.6758224964141846, + "step": 4054 + }, + { + "epoch": 1.2483841181902124, + "grad_norm": 15.375, + "learning_rate": 2.1782457113749485e-06, + "loss": 1.800303339958191, + "step": 4056 + }, + { + "epoch": 1.2489996922129887, + "grad_norm": 25.75, + "learning_rate": 2.177488755668525e-06, + "loss": 1.2303414344787598, + "step": 4058 + }, + { + "epoch": 1.249615266235765, + "grad_norm": 22.75, + "learning_rate": 2.176731633224885e-06, + "loss": 1.1429264545440674, + "step": 4060 + }, + { + "epoch": 1.2502308402585411, + "grad_norm": 7.09375, + "learning_rate": 2.1759743443784515e-06, + "loss": 1.3129208087921143, + "step": 4062 + }, + { + "epoch": 1.2508464142813174, + "grad_norm": 14.5, + "learning_rate": 2.1752168894637197e-06, + "loss": 1.12176513671875, + "step": 4064 + }, + { + "epoch": 1.2514619883040936, + "grad_norm": 29.75, + "learning_rate": 2.1744592688152588e-06, + "loss": 1.3381752967834473, + "step": 4066 + }, + { + "epoch": 1.2520775623268698, + "grad_norm": 4.59375, + "learning_rate": 2.1737014827677097e-06, + "loss": 0.7888558506965637, + "step": 4068 + }, + { + "epoch": 1.252693136349646, + "grad_norm": 7.84375, + "learning_rate": 2.1729435316557878e-06, + "loss": 1.3463082313537598, + "step": 4070 + }, + { + "epoch": 1.2533087103724223, + "grad_norm": 11.875, + "learning_rate": 2.1721854158142814e-06, + "loss": 1.5913687944412231, + "step": 4072 + }, + { + "epoch": 1.2539242843951985, + "grad_norm": 15.5, + "learning_rate": 2.17142713557805e-06, + "loss": 1.1627197265625, + "step": 4074 + }, + { + "epoch": 1.2545398584179748, + "grad_norm": 8.0, + "learning_rate": 2.170668691282029e-06, + "loss": 0.9327364563941956, + "step": 4076 + }, + { + "epoch": 1.255155432440751, + "grad_norm": 9.0625, + "learning_rate": 2.1699100832612216e-06, + "loss": 0.8239724636077881, + "step": 4078 + }, + { + "epoch": 1.2557710064635272, + "grad_norm": 57.5, + "learning_rate": 2.1691513118507074e-06, + "loss": 1.562792181968689, + "step": 4080 + }, + { + "epoch": 1.2563865804863035, + "grad_norm": 23.375, + "learning_rate": 2.1683923773856368e-06, + "loss": 1.2528904676437378, + "step": 4082 + }, + { + "epoch": 1.2570021545090797, + "grad_norm": 10.75, + "learning_rate": 2.16763328020123e-06, + "loss": 1.369584560394287, + "step": 4084 + }, + { + "epoch": 1.257617728531856, + "grad_norm": 59.0, + "learning_rate": 2.1668740206327837e-06, + "loss": 0.8014663457870483, + "step": 4086 + }, + { + "epoch": 1.2582333025546322, + "grad_norm": 49.25, + "learning_rate": 2.1661145990156617e-06, + "loss": 0.9348541498184204, + "step": 4088 + }, + { + "epoch": 1.2588488765774084, + "grad_norm": 39.25, + "learning_rate": 2.1653550156853026e-06, + "loss": 1.7951834201812744, + "step": 4090 + }, + { + "epoch": 1.2594644506001846, + "grad_norm": 15.9375, + "learning_rate": 2.1645952709772147e-06, + "loss": 1.2039365768432617, + "step": 4092 + }, + { + "epoch": 1.2600800246229609, + "grad_norm": 27.0, + "learning_rate": 2.1638353652269784e-06, + "loss": 1.275390386581421, + "step": 4094 + }, + { + "epoch": 1.260695598645737, + "grad_norm": 11.375, + "learning_rate": 2.163075298770245e-06, + "loss": 1.6113206148147583, + "step": 4096 + }, + { + "epoch": 1.2613111726685133, + "grad_norm": 21.0, + "learning_rate": 2.1623150719427364e-06, + "loss": 1.623373031616211, + "step": 4098 + }, + { + "epoch": 1.2619267466912896, + "grad_norm": 31.75, + "learning_rate": 2.1615546850802454e-06, + "loss": 1.2032837867736816, + "step": 4100 + }, + { + "epoch": 1.2625423207140658, + "grad_norm": 8.875, + "learning_rate": 2.1607941385186364e-06, + "loss": 1.058774471282959, + "step": 4102 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 39.0, + "learning_rate": 2.160033432593843e-06, + "loss": 1.4389801025390625, + "step": 4104 + }, + { + "epoch": 1.2637734687596183, + "grad_norm": 12.75, + "learning_rate": 2.1592725676418705e-06, + "loss": 1.7803723812103271, + "step": 4106 + }, + { + "epoch": 1.2643890427823945, + "grad_norm": 19.875, + "learning_rate": 2.1585115439987935e-06, + "loss": 1.6537094116210938, + "step": 4108 + }, + { + "epoch": 1.2650046168051707, + "grad_norm": 7.90625, + "learning_rate": 2.1577503620007577e-06, + "loss": 1.5434420108795166, + "step": 4110 + }, + { + "epoch": 1.265620190827947, + "grad_norm": 8.3125, + "learning_rate": 2.1569890219839776e-06, + "loss": 1.4224700927734375, + "step": 4112 + }, + { + "epoch": 1.2662357648507232, + "grad_norm": 13.375, + "learning_rate": 2.156227524284737e-06, + "loss": 1.266633152961731, + "step": 4114 + }, + { + "epoch": 1.2668513388734994, + "grad_norm": 10.4375, + "learning_rate": 2.1554658692393915e-06, + "loss": 0.9019656181335449, + "step": 4116 + }, + { + "epoch": 1.2674669128962757, + "grad_norm": 11.1875, + "learning_rate": 2.1547040571843644e-06, + "loss": 1.0631968975067139, + "step": 4118 + }, + { + "epoch": 1.268082486919052, + "grad_norm": 12.0625, + "learning_rate": 2.1539420884561497e-06, + "loss": 1.1509294509887695, + "step": 4120 + }, + { + "epoch": 1.2686980609418281, + "grad_norm": 16.875, + "learning_rate": 2.153179963391309e-06, + "loss": 1.1578824520111084, + "step": 4122 + }, + { + "epoch": 1.2693136349646044, + "grad_norm": 9.3125, + "learning_rate": 2.152417682326474e-06, + "loss": 0.6979641318321228, + "step": 4124 + }, + { + "epoch": 1.2699292089873806, + "grad_norm": 24.75, + "learning_rate": 2.1516552455983456e-06, + "loss": 1.3547711372375488, + "step": 4126 + }, + { + "epoch": 1.270544783010157, + "grad_norm": 21.25, + "learning_rate": 2.150892653543693e-06, + "loss": 1.553722858428955, + "step": 4128 + }, + { + "epoch": 1.2711603570329333, + "grad_norm": 7.625, + "learning_rate": 2.150129906499353e-06, + "loss": 1.1862623691558838, + "step": 4130 + }, + { + "epoch": 1.2717759310557095, + "grad_norm": 15.5625, + "learning_rate": 2.1493670048022324e-06, + "loss": 0.948223888874054, + "step": 4132 + }, + { + "epoch": 1.2723915050784858, + "grad_norm": 17.875, + "learning_rate": 2.148603948789307e-06, + "loss": 1.3155548572540283, + "step": 4134 + }, + { + "epoch": 1.273007079101262, + "grad_norm": 10.0, + "learning_rate": 2.1478407387976172e-06, + "loss": 1.2378668785095215, + "step": 4136 + }, + { + "epoch": 1.2736226531240382, + "grad_norm": 17.75, + "learning_rate": 2.147077375164275e-06, + "loss": 1.4044148921966553, + "step": 4138 + }, + { + "epoch": 1.2742382271468145, + "grad_norm": 13.375, + "learning_rate": 2.146313858226459e-06, + "loss": 1.5460163354873657, + "step": 4140 + }, + { + "epoch": 1.2748538011695907, + "grad_norm": 14.875, + "learning_rate": 2.1455501883214155e-06, + "loss": 1.4281752109527588, + "step": 4142 + }, + { + "epoch": 1.275469375192367, + "grad_norm": 12.6875, + "learning_rate": 2.144786365786458e-06, + "loss": 1.5017046928405762, + "step": 4144 + }, + { + "epoch": 1.2760849492151431, + "grad_norm": 12.625, + "learning_rate": 2.1440223909589686e-06, + "loss": 1.4624148607254028, + "step": 4146 + }, + { + "epoch": 1.2767005232379194, + "grad_norm": 6.90625, + "learning_rate": 2.1432582641763956e-06, + "loss": 1.1030162572860718, + "step": 4148 + }, + { + "epoch": 1.2773160972606956, + "grad_norm": 18.375, + "learning_rate": 2.1424939857762535e-06, + "loss": 1.322242259979248, + "step": 4150 + }, + { + "epoch": 1.2779316712834718, + "grad_norm": 19.5, + "learning_rate": 2.1417295560961258e-06, + "loss": 1.0702810287475586, + "step": 4152 + }, + { + "epoch": 1.278547245306248, + "grad_norm": 18.875, + "learning_rate": 2.140964975473663e-06, + "loss": 1.2559038400650024, + "step": 4154 + }, + { + "epoch": 1.2791628193290243, + "grad_norm": 23.125, + "learning_rate": 2.1402002442465792e-06, + "loss": 1.339484453201294, + "step": 4156 + }, + { + "epoch": 1.2797783933518005, + "grad_norm": 20.375, + "learning_rate": 2.139435362752658e-06, + "loss": 1.4479165077209473, + "step": 4158 + }, + { + "epoch": 1.2803939673745768, + "grad_norm": 14.0625, + "learning_rate": 2.138670331329749e-06, + "loss": 0.7785294055938721, + "step": 4160 + }, + { + "epoch": 1.281009541397353, + "grad_norm": 5.25, + "learning_rate": 2.137905150315767e-06, + "loss": 1.0688307285308838, + "step": 4162 + }, + { + "epoch": 1.2816251154201292, + "grad_norm": 86.0, + "learning_rate": 2.1371398200486937e-06, + "loss": 0.8996104598045349, + "step": 4164 + }, + { + "epoch": 1.2822406894429055, + "grad_norm": 13.1875, + "learning_rate": 2.1363743408665754e-06, + "loss": 1.5372533798217773, + "step": 4166 + }, + { + "epoch": 1.2828562634656817, + "grad_norm": 9.6875, + "learning_rate": 2.135608713107525e-06, + "loss": 1.2518038749694824, + "step": 4168 + }, + { + "epoch": 1.283471837488458, + "grad_norm": 16.375, + "learning_rate": 2.1348429371097226e-06, + "loss": 1.4110199213027954, + "step": 4170 + }, + { + "epoch": 1.2840874115112342, + "grad_norm": 13.5625, + "learning_rate": 2.134077013211412e-06, + "loss": 0.7723270654678345, + "step": 4172 + }, + { + "epoch": 1.2847029855340104, + "grad_norm": 13.6875, + "learning_rate": 2.1333109417509017e-06, + "loss": 1.4334893226623535, + "step": 4174 + }, + { + "epoch": 1.2853185595567866, + "grad_norm": 30.0, + "learning_rate": 2.132544723066567e-06, + "loss": 1.5801007747650146, + "step": 4176 + }, + { + "epoch": 1.2859341335795629, + "grad_norm": 14.6875, + "learning_rate": 2.131778357496847e-06, + "loss": 1.2111811637878418, + "step": 4178 + }, + { + "epoch": 1.286549707602339, + "grad_norm": 10.5625, + "learning_rate": 2.131011845380247e-06, + "loss": 1.333784818649292, + "step": 4180 + }, + { + "epoch": 1.2871652816251153, + "grad_norm": 13.5625, + "learning_rate": 2.1302451870553363e-06, + "loss": 1.223961591720581, + "step": 4182 + }, + { + "epoch": 1.2877808556478916, + "grad_norm": 11.8125, + "learning_rate": 2.129478382860748e-06, + "loss": 1.148566722869873, + "step": 4184 + }, + { + "epoch": 1.288396429670668, + "grad_norm": 6.03125, + "learning_rate": 2.128711433135181e-06, + "loss": 1.365966558456421, + "step": 4186 + }, + { + "epoch": 1.2890120036934443, + "grad_norm": 14.875, + "learning_rate": 2.127944338217398e-06, + "loss": 1.2503944635391235, + "step": 4188 + }, + { + "epoch": 1.2896275777162205, + "grad_norm": 19.75, + "learning_rate": 2.127177098446225e-06, + "loss": 1.1578048467636108, + "step": 4190 + }, + { + "epoch": 1.2902431517389967, + "grad_norm": 15.5625, + "learning_rate": 2.126409714160553e-06, + "loss": 1.0331218242645264, + "step": 4192 + }, + { + "epoch": 1.290858725761773, + "grad_norm": 6.15625, + "learning_rate": 2.1256421856993367e-06, + "loss": 0.9528582096099854, + "step": 4194 + }, + { + "epoch": 1.2914742997845492, + "grad_norm": 61.0, + "learning_rate": 2.124874513401594e-06, + "loss": 1.3459211587905884, + "step": 4196 + }, + { + "epoch": 1.2920898738073254, + "grad_norm": 14.625, + "learning_rate": 2.1241066976064076e-06, + "loss": 1.7501788139343262, + "step": 4198 + }, + { + "epoch": 1.2927054478301017, + "grad_norm": 9.4375, + "learning_rate": 2.1233387386529216e-06, + "loss": 1.2650032043457031, + "step": 4200 + }, + { + "epoch": 1.293321021852878, + "grad_norm": 7.6875, + "learning_rate": 2.122570636880344e-06, + "loss": 1.304715871810913, + "step": 4202 + }, + { + "epoch": 1.2939365958756541, + "grad_norm": 10.25, + "learning_rate": 2.1218023926279474e-06, + "loss": 1.2947635650634766, + "step": 4204 + }, + { + "epoch": 1.2945521698984304, + "grad_norm": 6.09375, + "learning_rate": 2.1210340062350656e-06, + "loss": 1.1739459037780762, + "step": 4206 + }, + { + "epoch": 1.2951677439212066, + "grad_norm": 7.21875, + "learning_rate": 2.120265478041095e-06, + "loss": 1.3887667655944824, + "step": 4208 + }, + { + "epoch": 1.2957833179439828, + "grad_norm": 17.0, + "learning_rate": 2.119496808385497e-06, + "loss": 1.4132437705993652, + "step": 4210 + }, + { + "epoch": 1.296398891966759, + "grad_norm": 12.4375, + "learning_rate": 2.1187279976077927e-06, + "loss": 1.393709421157837, + "step": 4212 + }, + { + "epoch": 1.2970144659895353, + "grad_norm": 29.625, + "learning_rate": 2.1179590460475666e-06, + "loss": 1.6518043279647827, + "step": 4214 + }, + { + "epoch": 1.2976300400123115, + "grad_norm": 7.6875, + "learning_rate": 2.1171899540444667e-06, + "loss": 1.0748077630996704, + "step": 4216 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 44.25, + "learning_rate": 2.1164207219382007e-06, + "loss": 1.0568442344665527, + "step": 4218 + }, + { + "epoch": 1.298861188057864, + "grad_norm": 23.75, + "learning_rate": 2.1156513500685388e-06, + "loss": 0.8065166473388672, + "step": 4220 + }, + { + "epoch": 1.2994767620806402, + "grad_norm": 16.5, + "learning_rate": 2.114881838775315e-06, + "loss": 1.3090413808822632, + "step": 4222 + }, + { + "epoch": 1.3000923361034165, + "grad_norm": 45.0, + "learning_rate": 2.114112188398423e-06, + "loss": 1.681335687637329, + "step": 4224 + }, + { + "epoch": 1.3007079101261927, + "grad_norm": 85.0, + "learning_rate": 2.113342399277817e-06, + "loss": 0.8728981018066406, + "step": 4226 + }, + { + "epoch": 1.301323484148969, + "grad_norm": 21.75, + "learning_rate": 2.1125724717535147e-06, + "loss": 1.6157889366149902, + "step": 4228 + }, + { + "epoch": 1.3019390581717452, + "grad_norm": 21.625, + "learning_rate": 2.111802406165594e-06, + "loss": 1.3912405967712402, + "step": 4230 + }, + { + "epoch": 1.3025546321945214, + "grad_norm": 12.9375, + "learning_rate": 2.111032202854194e-06, + "loss": 1.4589062929153442, + "step": 4232 + }, + { + "epoch": 1.3031702062172976, + "grad_norm": 19.0, + "learning_rate": 2.110261862159513e-06, + "loss": 1.3329696655273438, + "step": 4234 + }, + { + "epoch": 1.3037857802400739, + "grad_norm": 13.0, + "learning_rate": 2.1094913844218126e-06, + "loss": 1.2835090160369873, + "step": 4236 + }, + { + "epoch": 1.30440135426285, + "grad_norm": 14.375, + "learning_rate": 2.1087207699814135e-06, + "loss": 1.3457741737365723, + "step": 4238 + }, + { + "epoch": 1.3050169282856263, + "grad_norm": 17.875, + "learning_rate": 2.1079500191786973e-06, + "loss": 1.2177116870880127, + "step": 4240 + }, + { + "epoch": 1.3056325023084026, + "grad_norm": 13.25, + "learning_rate": 2.1071791323541047e-06, + "loss": 1.3747305870056152, + "step": 4242 + }, + { + "epoch": 1.3062480763311788, + "grad_norm": 10.4375, + "learning_rate": 2.1064081098481374e-06, + "loss": 1.4916646480560303, + "step": 4244 + }, + { + "epoch": 1.306863650353955, + "grad_norm": 24.375, + "learning_rate": 2.1056369520013582e-06, + "loss": 1.630761742591858, + "step": 4246 + }, + { + "epoch": 1.3074792243767313, + "grad_norm": 52.75, + "learning_rate": 2.104865659154387e-06, + "loss": 0.7215687036514282, + "step": 4248 + }, + { + "epoch": 1.3080947983995075, + "grad_norm": 35.25, + "learning_rate": 2.1040942316479046e-06, + "loss": 1.8062270879745483, + "step": 4250 + }, + { + "epoch": 1.3087103724222837, + "grad_norm": 24.25, + "learning_rate": 2.1033226698226526e-06, + "loss": 1.3249752521514893, + "step": 4252 + }, + { + "epoch": 1.30932594644506, + "grad_norm": 23.625, + "learning_rate": 2.10255097401943e-06, + "loss": 1.3274242877960205, + "step": 4254 + }, + { + "epoch": 1.3099415204678362, + "grad_norm": 10.3125, + "learning_rate": 2.1017791445790953e-06, + "loss": 1.582962989807129, + "step": 4256 + }, + { + "epoch": 1.3105570944906124, + "grad_norm": 17.625, + "learning_rate": 2.101007181842568e-06, + "loss": 1.228154182434082, + "step": 4258 + }, + { + "epoch": 1.3111726685133887, + "grad_norm": 8.625, + "learning_rate": 2.1002350861508234e-06, + "loss": 1.2410879135131836, + "step": 4260 + }, + { + "epoch": 1.3117882425361649, + "grad_norm": 7.34375, + "learning_rate": 2.099462857844897e-06, + "loss": 1.2221719026565552, + "step": 4262 + }, + { + "epoch": 1.3124038165589411, + "grad_norm": 13.0, + "learning_rate": 2.0986904972658837e-06, + "loss": 1.1706748008728027, + "step": 4264 + }, + { + "epoch": 1.3130193905817173, + "grad_norm": 18.0, + "learning_rate": 2.0979180047549363e-06, + "loss": 1.4403222799301147, + "step": 4266 + }, + { + "epoch": 1.3136349646044936, + "grad_norm": 35.25, + "learning_rate": 2.097145380653265e-06, + "loss": 1.600974440574646, + "step": 4268 + }, + { + "epoch": 1.3142505386272698, + "grad_norm": 6.59375, + "learning_rate": 2.0963726253021393e-06, + "loss": 1.3970341682434082, + "step": 4270 + }, + { + "epoch": 1.314866112650046, + "grad_norm": 16.0, + "learning_rate": 2.095599739042885e-06, + "loss": 0.7584080696105957, + "step": 4272 + }, + { + "epoch": 1.3154816866728223, + "grad_norm": 11.3125, + "learning_rate": 2.094826722216888e-06, + "loss": 1.0941411256790161, + "step": 4274 + }, + { + "epoch": 1.3160972606955985, + "grad_norm": 141.0, + "learning_rate": 2.0940535751655897e-06, + "loss": 0.9738726615905762, + "step": 4276 + }, + { + "epoch": 1.3167128347183747, + "grad_norm": 8.125, + "learning_rate": 2.0932802982304915e-06, + "loss": 1.0021553039550781, + "step": 4278 + }, + { + "epoch": 1.3173284087411512, + "grad_norm": 33.25, + "learning_rate": 2.0925068917531495e-06, + "loss": 1.660839319229126, + "step": 4280 + }, + { + "epoch": 1.3179439827639274, + "grad_norm": 90.0, + "learning_rate": 2.091733356075179e-06, + "loss": 1.2870283126831055, + "step": 4282 + }, + { + "epoch": 1.3185595567867037, + "grad_norm": 14.125, + "learning_rate": 2.0909596915382504e-06, + "loss": 1.5686073303222656, + "step": 4284 + }, + { + "epoch": 1.31917513080948, + "grad_norm": 11.625, + "learning_rate": 2.0901858984840935e-06, + "loss": 1.1524231433868408, + "step": 4286 + }, + { + "epoch": 1.3197907048322561, + "grad_norm": 20.75, + "learning_rate": 2.0894119772544927e-06, + "loss": 1.5431840419769287, + "step": 4288 + }, + { + "epoch": 1.3204062788550324, + "grad_norm": 13.3125, + "learning_rate": 2.0886379281912903e-06, + "loss": 1.4432337284088135, + "step": 4290 + }, + { + "epoch": 1.3210218528778086, + "grad_norm": 37.25, + "learning_rate": 2.0878637516363846e-06, + "loss": 1.8126132488250732, + "step": 4292 + }, + { + "epoch": 1.3216374269005848, + "grad_norm": 5.8125, + "learning_rate": 2.0870894479317306e-06, + "loss": 0.8514418601989746, + "step": 4294 + }, + { + "epoch": 1.322253000923361, + "grad_norm": 9.8125, + "learning_rate": 2.086315017419338e-06, + "loss": 1.323155403137207, + "step": 4296 + }, + { + "epoch": 1.3228685749461373, + "grad_norm": 10.75, + "learning_rate": 2.0855404604412754e-06, + "loss": 1.3291047811508179, + "step": 4298 + }, + { + "epoch": 1.3234841489689135, + "grad_norm": 14.75, + "learning_rate": 2.084765777339664e-06, + "loss": 0.9919479489326477, + "step": 4300 + }, + { + "epoch": 1.3240997229916898, + "grad_norm": 13.625, + "learning_rate": 2.083990968456683e-06, + "loss": 1.2738001346588135, + "step": 4302 + }, + { + "epoch": 1.324715297014466, + "grad_norm": 10.125, + "learning_rate": 2.0832160341345657e-06, + "loss": 0.8954488039016724, + "step": 4304 + }, + { + "epoch": 1.3253308710372422, + "grad_norm": 10.625, + "learning_rate": 2.082440974715603e-06, + "loss": 1.3220200538635254, + "step": 4306 + }, + { + "epoch": 1.3259464450600185, + "grad_norm": 5.5, + "learning_rate": 2.0816657905421376e-06, + "loss": 1.226555347442627, + "step": 4308 + }, + { + "epoch": 1.3265620190827947, + "grad_norm": 16.625, + "learning_rate": 2.0808904819565703e-06, + "loss": 1.5791137218475342, + "step": 4310 + }, + { + "epoch": 1.327177593105571, + "grad_norm": 34.0, + "learning_rate": 2.0801150493013557e-06, + "loss": 1.5559651851654053, + "step": 4312 + }, + { + "epoch": 1.3277931671283472, + "grad_norm": 14.5625, + "learning_rate": 2.0793394929190026e-06, + "loss": 1.4778270721435547, + "step": 4314 + }, + { + "epoch": 1.3284087411511234, + "grad_norm": 13.5, + "learning_rate": 2.078563813152076e-06, + "loss": 0.5933064222335815, + "step": 4316 + }, + { + "epoch": 1.3290243151738996, + "grad_norm": 5.125, + "learning_rate": 2.0777880103431946e-06, + "loss": 1.1889657974243164, + "step": 4318 + }, + { + "epoch": 1.3296398891966759, + "grad_norm": 26.625, + "learning_rate": 2.077012084835031e-06, + "loss": 1.584001898765564, + "step": 4320 + }, + { + "epoch": 1.330255463219452, + "grad_norm": 14.9375, + "learning_rate": 2.0762360369703122e-06, + "loss": 1.5891938209533691, + "step": 4322 + }, + { + "epoch": 1.3308710372422283, + "grad_norm": 13.5625, + "learning_rate": 2.0754598670918193e-06, + "loss": 1.6458081007003784, + "step": 4324 + }, + { + "epoch": 1.3314866112650046, + "grad_norm": 26.75, + "learning_rate": 2.0746835755423883e-06, + "loss": 1.5826940536499023, + "step": 4326 + }, + { + "epoch": 1.3321021852877808, + "grad_norm": 12.875, + "learning_rate": 2.0739071626649074e-06, + "loss": 1.4658899307250977, + "step": 4328 + }, + { + "epoch": 1.332717759310557, + "grad_norm": 20.625, + "learning_rate": 2.073130628802319e-06, + "loss": 1.2064714431762695, + "step": 4330 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 4.8125, + "learning_rate": 2.07235397429762e-06, + "loss": 1.3495519161224365, + "step": 4332 + }, + { + "epoch": 1.3339489073561095, + "grad_norm": 9.4375, + "learning_rate": 2.071577199493858e-06, + "loss": 1.3448805809020996, + "step": 4334 + }, + { + "epoch": 1.334564481378886, + "grad_norm": 22.625, + "learning_rate": 2.0708003047341366e-06, + "loss": 1.2070777416229248, + "step": 4336 + }, + { + "epoch": 1.3351800554016622, + "grad_norm": 11.3125, + "learning_rate": 2.07002329036161e-06, + "loss": 1.0715398788452148, + "step": 4338 + }, + { + "epoch": 1.3357956294244384, + "grad_norm": 10.8125, + "learning_rate": 2.069246156719487e-06, + "loss": 1.1646559238433838, + "step": 4340 + }, + { + "epoch": 1.3364112034472146, + "grad_norm": 15.9375, + "learning_rate": 2.068468904151028e-06, + "loss": 0.6743322014808655, + "step": 4342 + }, + { + "epoch": 1.3370267774699909, + "grad_norm": 87.5, + "learning_rate": 2.067691532999548e-06, + "loss": 0.7150102853775024, + "step": 4344 + }, + { + "epoch": 1.3376423514927671, + "grad_norm": 9.1875, + "learning_rate": 2.0669140436084105e-06, + "loss": 1.5677911043167114, + "step": 4346 + }, + { + "epoch": 1.3382579255155433, + "grad_norm": 13.0, + "learning_rate": 2.066136436321035e-06, + "loss": 1.2293686866760254, + "step": 4348 + }, + { + "epoch": 1.3388734995383196, + "grad_norm": 11.6875, + "learning_rate": 2.0653587114808902e-06, + "loss": 1.3133034706115723, + "step": 4350 + }, + { + "epoch": 1.3394890735610958, + "grad_norm": 23.875, + "learning_rate": 2.064580869431499e-06, + "loss": 1.7450048923492432, + "step": 4352 + }, + { + "epoch": 1.340104647583872, + "grad_norm": 9.625, + "learning_rate": 2.063802910516435e-06, + "loss": 0.7626341581344604, + "step": 4354 + }, + { + "epoch": 1.3407202216066483, + "grad_norm": 10.8125, + "learning_rate": 2.0630248350793238e-06, + "loss": 1.1981799602508545, + "step": 4356 + }, + { + "epoch": 1.3413357956294245, + "grad_norm": 15.0625, + "learning_rate": 2.0622466434638414e-06, + "loss": 1.2713027000427246, + "step": 4358 + }, + { + "epoch": 1.3419513696522007, + "grad_norm": 16.125, + "learning_rate": 2.0614683360137164e-06, + "loss": 1.393301010131836, + "step": 4360 + }, + { + "epoch": 1.342566943674977, + "grad_norm": 13.9375, + "learning_rate": 2.060689913072728e-06, + "loss": 1.3362078666687012, + "step": 4362 + }, + { + "epoch": 1.3431825176977532, + "grad_norm": 21.75, + "learning_rate": 2.0599113749847066e-06, + "loss": 1.6755115985870361, + "step": 4364 + }, + { + "epoch": 1.3437980917205294, + "grad_norm": 20.875, + "learning_rate": 2.059132722093533e-06, + "loss": 1.3301327228546143, + "step": 4366 + }, + { + "epoch": 1.3444136657433057, + "grad_norm": 13.875, + "learning_rate": 2.0583539547431407e-06, + "loss": 1.7222154140472412, + "step": 4368 + }, + { + "epoch": 1.345029239766082, + "grad_norm": 8.6875, + "learning_rate": 2.0575750732775097e-06, + "loss": 1.095766544342041, + "step": 4370 + }, + { + "epoch": 1.3456448137888581, + "grad_norm": 13.5625, + "learning_rate": 2.056796078040675e-06, + "loss": 1.230293869972229, + "step": 4372 + }, + { + "epoch": 1.3462603878116344, + "grad_norm": 20.75, + "learning_rate": 2.0560169693767174e-06, + "loss": 1.4030444622039795, + "step": 4374 + }, + { + "epoch": 1.3468759618344106, + "grad_norm": 12.875, + "learning_rate": 2.0552377476297716e-06, + "loss": 1.573530912399292, + "step": 4376 + }, + { + "epoch": 1.3474915358571868, + "grad_norm": 15.25, + "learning_rate": 2.0544584131440212e-06, + "loss": 1.604763150215149, + "step": 4378 + }, + { + "epoch": 1.348107109879963, + "grad_norm": 8.5, + "learning_rate": 2.053678966263698e-06, + "loss": 1.346675992012024, + "step": 4380 + }, + { + "epoch": 1.3487226839027393, + "grad_norm": 14.9375, + "learning_rate": 2.052899407333085e-06, + "loss": 1.5418078899383545, + "step": 4382 + }, + { + "epoch": 1.3493382579255155, + "grad_norm": 97.0, + "learning_rate": 2.052119736696514e-06, + "loss": 1.6574347019195557, + "step": 4384 + }, + { + "epoch": 1.3499538319482918, + "grad_norm": 17.625, + "learning_rate": 2.0513399546983677e-06, + "loss": 1.5158668756484985, + "step": 4386 + }, + { + "epoch": 1.350569405971068, + "grad_norm": 12.5, + "learning_rate": 2.050560061683075e-06, + "loss": 1.090888261795044, + "step": 4388 + }, + { + "epoch": 1.3511849799938442, + "grad_norm": 114.5, + "learning_rate": 2.049780057995116e-06, + "loss": 1.3082555532455444, + "step": 4390 + }, + { + "epoch": 1.3518005540166205, + "grad_norm": 6.6875, + "learning_rate": 2.04899994397902e-06, + "loss": 1.070220947265625, + "step": 4392 + }, + { + "epoch": 1.3524161280393967, + "grad_norm": 11.5, + "learning_rate": 2.048219719979363e-06, + "loss": 1.3319404125213623, + "step": 4394 + }, + { + "epoch": 1.353031702062173, + "grad_norm": 15.6875, + "learning_rate": 2.0474393863407724e-06, + "loss": 1.5495846271514893, + "step": 4396 + }, + { + "epoch": 1.3536472760849492, + "grad_norm": 17.625, + "learning_rate": 2.046658943407921e-06, + "loss": 1.338994026184082, + "step": 4398 + }, + { + "epoch": 1.3542628501077254, + "grad_norm": 18.5, + "learning_rate": 2.045878391525532e-06, + "loss": 1.5177886486053467, + "step": 4400 + }, + { + "epoch": 1.3548784241305016, + "grad_norm": 3.25, + "learning_rate": 2.045097731038376e-06, + "loss": 1.3279091119766235, + "step": 4402 + }, + { + "epoch": 1.3554939981532779, + "grad_norm": 29.625, + "learning_rate": 2.0443169622912717e-06, + "loss": 1.4942060708999634, + "step": 4404 + }, + { + "epoch": 1.356109572176054, + "grad_norm": 12.625, + "learning_rate": 2.043536085629086e-06, + "loss": 1.3356786966323853, + "step": 4406 + }, + { + "epoch": 1.3567251461988303, + "grad_norm": 9.0, + "learning_rate": 2.0427551013967314e-06, + "loss": 1.1315312385559082, + "step": 4408 + }, + { + "epoch": 1.3573407202216066, + "grad_norm": 14.1875, + "learning_rate": 2.0419740099391717e-06, + "loss": 1.504289150238037, + "step": 4410 + }, + { + "epoch": 1.3579562942443828, + "grad_norm": 17.625, + "learning_rate": 2.041192811601414e-06, + "loss": 1.5331953763961792, + "step": 4412 + }, + { + "epoch": 1.358571868267159, + "grad_norm": 19.625, + "learning_rate": 2.0404115067285157e-06, + "loss": 1.4719030857086182, + "step": 4414 + }, + { + "epoch": 1.3591874422899353, + "grad_norm": 11.0625, + "learning_rate": 2.0396300956655794e-06, + "loss": 1.0315923690795898, + "step": 4416 + }, + { + "epoch": 1.3598030163127115, + "grad_norm": 16.125, + "learning_rate": 2.038848578757756e-06, + "loss": 1.3465533256530762, + "step": 4418 + }, + { + "epoch": 1.3604185903354877, + "grad_norm": 21.125, + "learning_rate": 2.0380669563502418e-06, + "loss": 1.7399990558624268, + "step": 4420 + }, + { + "epoch": 1.361034164358264, + "grad_norm": 17.25, + "learning_rate": 2.03728522878828e-06, + "loss": 1.406071424484253, + "step": 4422 + }, + { + "epoch": 1.3616497383810402, + "grad_norm": 10.5, + "learning_rate": 2.036503396417162e-06, + "loss": 1.495185375213623, + "step": 4424 + }, + { + "epoch": 1.3622653124038164, + "grad_norm": 10.75, + "learning_rate": 2.0357214595822224e-06, + "loss": 1.3955354690551758, + "step": 4426 + }, + { + "epoch": 1.3628808864265927, + "grad_norm": 12.375, + "learning_rate": 2.0349394186288443e-06, + "loss": 1.2213549613952637, + "step": 4428 + }, + { + "epoch": 1.3634964604493691, + "grad_norm": 11.8125, + "learning_rate": 2.034157273902456e-06, + "loss": 1.4318450689315796, + "step": 4430 + }, + { + "epoch": 1.3641120344721454, + "grad_norm": 14.4375, + "learning_rate": 2.0333750257485317e-06, + "loss": 1.503476619720459, + "step": 4432 + }, + { + "epoch": 1.3647276084949216, + "grad_norm": 12.1875, + "learning_rate": 2.0325926745125914e-06, + "loss": 1.298352599143982, + "step": 4434 + }, + { + "epoch": 1.3653431825176978, + "grad_norm": 14.6875, + "learning_rate": 2.0318102205402003e-06, + "loss": 1.4269036054611206, + "step": 4436 + }, + { + "epoch": 1.365958756540474, + "grad_norm": 27.75, + "learning_rate": 2.03102766417697e-06, + "loss": 1.390794038772583, + "step": 4438 + }, + { + "epoch": 1.3665743305632503, + "grad_norm": 5.84375, + "learning_rate": 2.0302450057685555e-06, + "loss": 1.1889770030975342, + "step": 4440 + }, + { + "epoch": 1.3671899045860265, + "grad_norm": 8.75, + "learning_rate": 2.0294622456606585e-06, + "loss": 1.3410286903381348, + "step": 4442 + }, + { + "epoch": 1.3678054786088027, + "grad_norm": 10.5, + "learning_rate": 2.0286793841990247e-06, + "loss": 1.4344102144241333, + "step": 4444 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 33.5, + "learning_rate": 2.027896421729446e-06, + "loss": 1.5609376430511475, + "step": 4446 + }, + { + "epoch": 1.3690366266543552, + "grad_norm": 25.5, + "learning_rate": 2.0271133585977562e-06, + "loss": 1.7404444217681885, + "step": 4448 + }, + { + "epoch": 1.3696522006771314, + "grad_norm": 33.5, + "learning_rate": 2.026330195149836e-06, + "loss": 1.5617144107818604, + "step": 4450 + }, + { + "epoch": 1.3702677746999077, + "grad_norm": 17.375, + "learning_rate": 2.02554693173161e-06, + "loss": 1.167197585105896, + "step": 4452 + }, + { + "epoch": 1.370883348722684, + "grad_norm": 20.125, + "learning_rate": 2.0247635686890457e-06, + "loss": 1.384680986404419, + "step": 4454 + }, + { + "epoch": 1.3714989227454601, + "grad_norm": 55.75, + "learning_rate": 2.0239801063681557e-06, + "loss": 1.807742714881897, + "step": 4456 + }, + { + "epoch": 1.3721144967682364, + "grad_norm": 16.375, + "learning_rate": 2.023196545114996e-06, + "loss": 1.3313524723052979, + "step": 4458 + }, + { + "epoch": 1.3727300707910126, + "grad_norm": 20.25, + "learning_rate": 2.0224128852756677e-06, + "loss": 1.5227307081222534, + "step": 4460 + }, + { + "epoch": 1.3733456448137888, + "grad_norm": 24.75, + "learning_rate": 2.0216291271963127e-06, + "loss": 0.921663224697113, + "step": 4462 + }, + { + "epoch": 1.373961218836565, + "grad_norm": 16.75, + "learning_rate": 2.020845271223119e-06, + "loss": 1.6640578508377075, + "step": 4464 + }, + { + "epoch": 1.3745767928593413, + "grad_norm": 77.0, + "learning_rate": 2.020061317702316e-06, + "loss": 0.973523736000061, + "step": 4466 + }, + { + "epoch": 1.3751923668821175, + "grad_norm": 18.75, + "learning_rate": 2.019277266980177e-06, + "loss": 1.3174777030944824, + "step": 4468 + }, + { + "epoch": 1.3758079409048938, + "grad_norm": 29.5, + "learning_rate": 2.0184931194030174e-06, + "loss": 1.2604511976242065, + "step": 4470 + }, + { + "epoch": 1.37642351492767, + "grad_norm": 17.875, + "learning_rate": 2.0177088753171976e-06, + "loss": 1.442176103591919, + "step": 4472 + }, + { + "epoch": 1.3770390889504462, + "grad_norm": 32.0, + "learning_rate": 2.0169245350691186e-06, + "loss": 1.22340989112854, + "step": 4474 + }, + { + "epoch": 1.3776546629732225, + "grad_norm": 25.625, + "learning_rate": 2.0161400990052236e-06, + "loss": 1.3502919673919678, + "step": 4476 + }, + { + "epoch": 1.3782702369959987, + "grad_norm": 13.5625, + "learning_rate": 2.015355567472e-06, + "loss": 1.2456159591674805, + "step": 4478 + }, + { + "epoch": 1.378885811018775, + "grad_norm": 7.09375, + "learning_rate": 2.0145709408159754e-06, + "loss": 1.300649642944336, + "step": 4480 + }, + { + "epoch": 1.3795013850415512, + "grad_norm": 13.9375, + "learning_rate": 2.0137862193837205e-06, + "loss": 1.518198847770691, + "step": 4482 + }, + { + "epoch": 1.3801169590643274, + "grad_norm": 14.8125, + "learning_rate": 2.013001403521848e-06, + "loss": 1.3599984645843506, + "step": 4484 + }, + { + "epoch": 1.3807325330871036, + "grad_norm": 16.125, + "learning_rate": 2.012216493577012e-06, + "loss": 1.7173393964767456, + "step": 4486 + }, + { + "epoch": 1.38134810710988, + "grad_norm": 11.6875, + "learning_rate": 2.011431489895907e-06, + "loss": 1.530381679534912, + "step": 4488 + }, + { + "epoch": 1.3819636811326563, + "grad_norm": 15.5, + "learning_rate": 2.010646392825272e-06, + "loss": 1.6565570831298828, + "step": 4490 + }, + { + "epoch": 1.3825792551554326, + "grad_norm": 9.625, + "learning_rate": 2.009861202711883e-06, + "loss": 1.581078290939331, + "step": 4492 + }, + { + "epoch": 1.3831948291782088, + "grad_norm": 19.125, + "learning_rate": 2.009075919902561e-06, + "loss": 1.5048232078552246, + "step": 4494 + }, + { + "epoch": 1.383810403200985, + "grad_norm": 11.25, + "learning_rate": 2.0082905447441658e-06, + "loss": 1.4397077560424805, + "step": 4496 + }, + { + "epoch": 1.3844259772237613, + "grad_norm": 13.8125, + "learning_rate": 2.007505077583599e-06, + "loss": 1.2956914901733398, + "step": 4498 + }, + { + "epoch": 1.3850415512465375, + "grad_norm": 27.625, + "learning_rate": 2.0067195187678015e-06, + "loss": 1.5892021656036377, + "step": 4500 + }, + { + "epoch": 1.3856571252693137, + "grad_norm": 10.5, + "learning_rate": 2.005933868643756e-06, + "loss": 1.0231890678405762, + "step": 4502 + }, + { + "epoch": 1.38627269929209, + "grad_norm": 9.5, + "learning_rate": 2.0051481275584847e-06, + "loss": 1.1836471557617188, + "step": 4504 + }, + { + "epoch": 1.3868882733148662, + "grad_norm": 14.5625, + "learning_rate": 2.004362295859051e-06, + "loss": 1.558052897453308, + "step": 4506 + }, + { + "epoch": 1.3875038473376424, + "grad_norm": 16.125, + "learning_rate": 2.0035763738925575e-06, + "loss": 1.3589587211608887, + "step": 4508 + }, + { + "epoch": 1.3881194213604187, + "grad_norm": 21.625, + "learning_rate": 2.002790362006146e-06, + "loss": 1.8019932508468628, + "step": 4510 + }, + { + "epoch": 1.388734995383195, + "grad_norm": 26.25, + "learning_rate": 2.0020042605469997e-06, + "loss": 1.527815580368042, + "step": 4512 + }, + { + "epoch": 1.3893505694059711, + "grad_norm": 17.25, + "learning_rate": 2.00121806986234e-06, + "loss": 1.4716898202896118, + "step": 4514 + }, + { + "epoch": 1.3899661434287474, + "grad_norm": 12.1875, + "learning_rate": 2.000431790299429e-06, + "loss": 1.12889564037323, + "step": 4516 + }, + { + "epoch": 1.3905817174515236, + "grad_norm": 39.0, + "learning_rate": 1.9996454222055665e-06, + "loss": 1.1350713968276978, + "step": 4518 + }, + { + "epoch": 1.3911972914742998, + "grad_norm": 23.125, + "learning_rate": 1.998858965928093e-06, + "loss": 1.5621490478515625, + "step": 4520 + }, + { + "epoch": 1.391812865497076, + "grad_norm": 27.875, + "learning_rate": 1.9980724218143855e-06, + "loss": 1.580711841583252, + "step": 4522 + }, + { + "epoch": 1.3924284395198523, + "grad_norm": 32.5, + "learning_rate": 1.997285790211864e-06, + "loss": 1.6474636793136597, + "step": 4524 + }, + { + "epoch": 1.3930440135426285, + "grad_norm": 48.75, + "learning_rate": 1.996499071467982e-06, + "loss": 1.2069621086120605, + "step": 4526 + }, + { + "epoch": 1.3936595875654048, + "grad_norm": 5.9375, + "learning_rate": 1.9957122659302354e-06, + "loss": 1.103395700454712, + "step": 4528 + }, + { + "epoch": 1.394275161588181, + "grad_norm": 42.5, + "learning_rate": 1.994925373946157e-06, + "loss": 1.1344338655471802, + "step": 4530 + }, + { + "epoch": 1.3948907356109572, + "grad_norm": 8.3125, + "learning_rate": 1.994138395863318e-06, + "loss": 1.0106946229934692, + "step": 4532 + }, + { + "epoch": 1.3955063096337335, + "grad_norm": 14.25, + "learning_rate": 1.9933513320293267e-06, + "loss": 1.0307326316833496, + "step": 4534 + }, + { + "epoch": 1.3961218836565097, + "grad_norm": 21.875, + "learning_rate": 1.992564182791832e-06, + "loss": 1.3763636350631714, + "step": 4536 + }, + { + "epoch": 1.396737457679286, + "grad_norm": 13.5, + "learning_rate": 1.9917769484985157e-06, + "loss": 1.2347264289855957, + "step": 4538 + }, + { + "epoch": 1.3973530317020622, + "grad_norm": 17.0, + "learning_rate": 1.990989629497103e-06, + "loss": 1.7017972469329834, + "step": 4540 + }, + { + "epoch": 1.3979686057248384, + "grad_norm": 18.0, + "learning_rate": 1.9902022261353515e-06, + "loss": 1.6707942485809326, + "step": 4542 + }, + { + "epoch": 1.3985841797476146, + "grad_norm": 12.0625, + "learning_rate": 1.989414738761059e-06, + "loss": 1.6330020427703857, + "step": 4544 + }, + { + "epoch": 1.3991997537703909, + "grad_norm": 11.0625, + "learning_rate": 1.9886271677220603e-06, + "loss": 1.3158289194107056, + "step": 4546 + }, + { + "epoch": 1.399815327793167, + "grad_norm": 19.625, + "learning_rate": 1.9878395133662248e-06, + "loss": 1.3446176052093506, + "step": 4548 + }, + { + "epoch": 1.4004309018159433, + "grad_norm": 14.875, + "learning_rate": 1.987051776041462e-06, + "loss": 1.260849118232727, + "step": 4550 + }, + { + "epoch": 1.4010464758387196, + "grad_norm": 14.4375, + "learning_rate": 1.986263956095715e-06, + "loss": 1.2888896465301514, + "step": 4552 + }, + { + "epoch": 1.4016620498614958, + "grad_norm": 12.5, + "learning_rate": 1.9854760538769655e-06, + "loss": 1.490917444229126, + "step": 4554 + }, + { + "epoch": 1.402277623884272, + "grad_norm": 23.875, + "learning_rate": 1.9846880697332307e-06, + "loss": 1.6391327381134033, + "step": 4556 + }, + { + "epoch": 1.4028931979070483, + "grad_norm": 97.0, + "learning_rate": 1.9839000040125647e-06, + "loss": 1.3920652866363525, + "step": 4558 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 19.625, + "learning_rate": 1.9831118570630555e-06, + "loss": 1.3244240283966064, + "step": 4560 + }, + { + "epoch": 1.4041243459526007, + "grad_norm": 9.625, + "learning_rate": 1.98232362923283e-06, + "loss": 1.2185591459274292, + "step": 4562 + }, + { + "epoch": 1.404739919975377, + "grad_norm": 16.125, + "learning_rate": 1.9815353208700483e-06, + "loss": 1.27888822555542, + "step": 4564 + }, + { + "epoch": 1.4053554939981532, + "grad_norm": 15.3125, + "learning_rate": 1.9807469323229084e-06, + "loss": 1.0425304174423218, + "step": 4566 + }, + { + "epoch": 1.4059710680209294, + "grad_norm": 57.25, + "learning_rate": 1.9799584639396406e-06, + "loss": 1.788959264755249, + "step": 4568 + }, + { + "epoch": 1.4065866420437056, + "grad_norm": 13.75, + "learning_rate": 1.979169916068514e-06, + "loss": 1.2876644134521484, + "step": 4570 + }, + { + "epoch": 1.4072022160664819, + "grad_norm": 47.75, + "learning_rate": 1.9783812890578297e-06, + "loss": 1.4198131561279297, + "step": 4572 + }, + { + "epoch": 1.4078177900892581, + "grad_norm": 24.625, + "learning_rate": 1.9775925832559266e-06, + "loss": 1.0596034526824951, + "step": 4574 + }, + { + "epoch": 1.4084333641120343, + "grad_norm": 9.4375, + "learning_rate": 1.976803799011176e-06, + "loss": 1.3388042449951172, + "step": 4576 + }, + { + "epoch": 1.4090489381348106, + "grad_norm": 8.9375, + "learning_rate": 1.976014936671984e-06, + "loss": 1.1961338520050049, + "step": 4578 + }, + { + "epoch": 1.4096645121575868, + "grad_norm": 16.75, + "learning_rate": 1.9752259965867944e-06, + "loss": 1.3246583938598633, + "step": 4580 + }, + { + "epoch": 1.4102800861803633, + "grad_norm": 16.75, + "learning_rate": 1.97443697910408e-06, + "loss": 1.446965217590332, + "step": 4582 + }, + { + "epoch": 1.4108956602031395, + "grad_norm": 12.8125, + "learning_rate": 1.973647884572354e-06, + "loss": 1.5359292030334473, + "step": 4584 + }, + { + "epoch": 1.4115112342259157, + "grad_norm": 8.6875, + "learning_rate": 1.9728587133401577e-06, + "loss": 1.2705938816070557, + "step": 4586 + }, + { + "epoch": 1.412126808248692, + "grad_norm": 10.125, + "learning_rate": 1.9720694657560695e-06, + "loss": 1.2938408851623535, + "step": 4588 + }, + { + "epoch": 1.4127423822714682, + "grad_norm": 14.5625, + "learning_rate": 1.9712801421687013e-06, + "loss": 1.3553228378295898, + "step": 4590 + }, + { + "epoch": 1.4133579562942444, + "grad_norm": 8.9375, + "learning_rate": 1.970490742926699e-06, + "loss": 1.1003388166427612, + "step": 4592 + }, + { + "epoch": 1.4139735303170207, + "grad_norm": 12.75, + "learning_rate": 1.9697012683787397e-06, + "loss": 1.5267784595489502, + "step": 4594 + }, + { + "epoch": 1.414589104339797, + "grad_norm": 10.4375, + "learning_rate": 1.9689117188735365e-06, + "loss": 1.6333833932876587, + "step": 4596 + }, + { + "epoch": 1.4152046783625731, + "grad_norm": 13.8125, + "learning_rate": 1.9681220947598328e-06, + "loss": 1.5597987174987793, + "step": 4598 + }, + { + "epoch": 1.4158202523853494, + "grad_norm": 27.0, + "learning_rate": 1.9673323963864084e-06, + "loss": 1.5046354532241821, + "step": 4600 + }, + { + "epoch": 1.4164358264081256, + "grad_norm": 20.25, + "learning_rate": 1.9665426241020727e-06, + "loss": 0.9578072428703308, + "step": 4602 + }, + { + "epoch": 1.4170514004309018, + "grad_norm": 8.4375, + "learning_rate": 1.9657527782556687e-06, + "loss": 1.2864506244659424, + "step": 4604 + }, + { + "epoch": 1.417666974453678, + "grad_norm": 27.875, + "learning_rate": 1.964962859196073e-06, + "loss": 1.6020686626434326, + "step": 4606 + }, + { + "epoch": 1.4182825484764543, + "grad_norm": 15.3125, + "learning_rate": 1.964172867272194e-06, + "loss": 1.1308155059814453, + "step": 4608 + }, + { + "epoch": 1.4188981224992305, + "grad_norm": 18.75, + "learning_rate": 1.963382802832972e-06, + "loss": 1.6528303623199463, + "step": 4610 + }, + { + "epoch": 1.4195136965220068, + "grad_norm": 7.84375, + "learning_rate": 1.962592666227378e-06, + "loss": 1.3737719058990479, + "step": 4612 + }, + { + "epoch": 1.420129270544783, + "grad_norm": 14.125, + "learning_rate": 1.9618024578044174e-06, + "loss": 0.8094227910041809, + "step": 4614 + }, + { + "epoch": 1.4207448445675592, + "grad_norm": 20.875, + "learning_rate": 1.9610121779131256e-06, + "loss": 1.9535927772521973, + "step": 4616 + }, + { + "epoch": 1.4213604185903355, + "grad_norm": 20.0, + "learning_rate": 1.9602218269025713e-06, + "loss": 1.4262498617172241, + "step": 4618 + }, + { + "epoch": 1.4219759926131117, + "grad_norm": 12.375, + "learning_rate": 1.9594314051218526e-06, + "loss": 1.1243507862091064, + "step": 4620 + }, + { + "epoch": 1.422591566635888, + "grad_norm": 35.0, + "learning_rate": 1.9586409129200992e-06, + "loss": 1.2531572580337524, + "step": 4622 + }, + { + "epoch": 1.4232071406586642, + "grad_norm": 14.5625, + "learning_rate": 1.9578503506464738e-06, + "loss": 1.5430424213409424, + "step": 4624 + }, + { + "epoch": 1.4238227146814404, + "grad_norm": 17.5, + "learning_rate": 1.957059718650167e-06, + "loss": 1.426865577697754, + "step": 4626 + }, + { + "epoch": 1.4244382887042166, + "grad_norm": 13.1875, + "learning_rate": 1.956269017280403e-06, + "loss": 1.37661612033844, + "step": 4628 + }, + { + "epoch": 1.4250538627269929, + "grad_norm": 24.875, + "learning_rate": 1.955478246886435e-06, + "loss": 1.147402286529541, + "step": 4630 + }, + { + "epoch": 1.425669436749769, + "grad_norm": 48.25, + "learning_rate": 1.9546874078175476e-06, + "loss": 1.189788579940796, + "step": 4632 + }, + { + "epoch": 1.4262850107725453, + "grad_norm": 22.0, + "learning_rate": 1.9538965004230553e-06, + "loss": 1.5144860744476318, + "step": 4634 + }, + { + "epoch": 1.4269005847953216, + "grad_norm": 16.5, + "learning_rate": 1.9531055250523026e-06, + "loss": 1.277520775794983, + "step": 4636 + }, + { + "epoch": 1.427516158818098, + "grad_norm": 12.875, + "learning_rate": 1.952314482054663e-06, + "loss": 1.2806596755981445, + "step": 4638 + }, + { + "epoch": 1.4281317328408742, + "grad_norm": 17.0, + "learning_rate": 1.9515233717795435e-06, + "loss": 1.0641429424285889, + "step": 4640 + }, + { + "epoch": 1.4287473068636505, + "grad_norm": 16.875, + "learning_rate": 1.950732194576377e-06, + "loss": 1.2826216220855713, + "step": 4642 + }, + { + "epoch": 1.4293628808864267, + "grad_norm": 9.6875, + "learning_rate": 1.9499409507946277e-06, + "loss": 1.0223610401153564, + "step": 4644 + }, + { + "epoch": 1.429978454909203, + "grad_norm": 108.5, + "learning_rate": 1.9491496407837886e-06, + "loss": 1.5407743453979492, + "step": 4646 + }, + { + "epoch": 1.4305940289319792, + "grad_norm": 13.0625, + "learning_rate": 1.9483582648933827e-06, + "loss": 1.4629024267196655, + "step": 4648 + }, + { + "epoch": 1.4312096029547554, + "grad_norm": 13.625, + "learning_rate": 1.9475668234729606e-06, + "loss": 1.2143301963806152, + "step": 4650 + }, + { + "epoch": 1.4318251769775316, + "grad_norm": 11.25, + "learning_rate": 1.9467753168721047e-06, + "loss": 1.543229103088379, + "step": 4652 + }, + { + "epoch": 1.4324407510003079, + "grad_norm": 14.125, + "learning_rate": 1.9459837454404227e-06, + "loss": 1.3003244400024414, + "step": 4654 + }, + { + "epoch": 1.433056325023084, + "grad_norm": 6.28125, + "learning_rate": 1.9451921095275534e-06, + "loss": 1.0961573123931885, + "step": 4656 + }, + { + "epoch": 1.4336718990458603, + "grad_norm": 9.125, + "learning_rate": 1.944400409483163e-06, + "loss": 1.4426017999649048, + "step": 4658 + }, + { + "epoch": 1.4342874730686366, + "grad_norm": 11.6875, + "learning_rate": 1.9436086456569463e-06, + "loss": 1.3466172218322754, + "step": 4660 + }, + { + "epoch": 1.4349030470914128, + "grad_norm": 11.5625, + "learning_rate": 1.9428168183986265e-06, + "loss": 1.3730425834655762, + "step": 4662 + }, + { + "epoch": 1.435518621114189, + "grad_norm": 13.0625, + "learning_rate": 1.942024928057955e-06, + "loss": 1.225844144821167, + "step": 4664 + }, + { + "epoch": 1.4361341951369653, + "grad_norm": 10.9375, + "learning_rate": 1.9412329749847094e-06, + "loss": 1.5529940128326416, + "step": 4666 + }, + { + "epoch": 1.4367497691597415, + "grad_norm": 24.375, + "learning_rate": 1.9404409595286978e-06, + "loss": 1.7074967622756958, + "step": 4668 + }, + { + "epoch": 1.4373653431825177, + "grad_norm": 11.375, + "learning_rate": 1.9396488820397535e-06, + "loss": 1.468512773513794, + "step": 4670 + }, + { + "epoch": 1.437980917205294, + "grad_norm": 9.75, + "learning_rate": 1.938856742867738e-06, + "loss": 1.344146490097046, + "step": 4672 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 49.5, + "learning_rate": 1.93806454236254e-06, + "loss": 0.8190611600875854, + "step": 4674 + }, + { + "epoch": 1.4392120652508464, + "grad_norm": 22.25, + "learning_rate": 1.9372722808740756e-06, + "loss": 1.4838563203811646, + "step": 4676 + }, + { + "epoch": 1.4398276392736227, + "grad_norm": 4.65625, + "learning_rate": 1.936479958752288e-06, + "loss": 0.9019955396652222, + "step": 4678 + }, + { + "epoch": 1.440443213296399, + "grad_norm": 10.0, + "learning_rate": 1.935687576347146e-06, + "loss": 1.3104298114776611, + "step": 4680 + }, + { + "epoch": 1.4410587873191751, + "grad_norm": 19.125, + "learning_rate": 1.9348951340086463e-06, + "loss": 1.061185359954834, + "step": 4682 + }, + { + "epoch": 1.4416743613419514, + "grad_norm": 28.5, + "learning_rate": 1.934102632086811e-06, + "loss": 1.4017858505249023, + "step": 4684 + }, + { + "epoch": 1.4422899353647276, + "grad_norm": 9.6875, + "learning_rate": 1.933310070931691e-06, + "loss": 1.3744781017303467, + "step": 4686 + }, + { + "epoch": 1.4429055093875038, + "grad_norm": 8.6875, + "learning_rate": 1.9325174508933594e-06, + "loss": 1.2082910537719727, + "step": 4688 + }, + { + "epoch": 1.44352108341028, + "grad_norm": 26.5, + "learning_rate": 1.9317247723219176e-06, + "loss": 1.7057225704193115, + "step": 4690 + }, + { + "epoch": 1.4441366574330563, + "grad_norm": 16.375, + "learning_rate": 1.9309320355674933e-06, + "loss": 1.4047242403030396, + "step": 4692 + }, + { + "epoch": 1.4447522314558325, + "grad_norm": 9.0625, + "learning_rate": 1.930139240980239e-06, + "loss": 1.20115327835083, + "step": 4694 + }, + { + "epoch": 1.4453678054786088, + "grad_norm": 28.875, + "learning_rate": 1.929346388910333e-06, + "loss": 1.047264814376831, + "step": 4696 + }, + { + "epoch": 1.445983379501385, + "grad_norm": 13.6875, + "learning_rate": 1.928553479707979e-06, + "loss": 1.3328430652618408, + "step": 4698 + }, + { + "epoch": 1.4465989535241612, + "grad_norm": 17.25, + "learning_rate": 1.9277605137234057e-06, + "loss": 1.4096957445144653, + "step": 4700 + }, + { + "epoch": 1.4472145275469375, + "grad_norm": 14.9375, + "learning_rate": 1.9269674913068676e-06, + "loss": 1.224859595298767, + "step": 4702 + }, + { + "epoch": 1.4478301015697137, + "grad_norm": 19.25, + "learning_rate": 1.9261744128086427e-06, + "loss": 1.1249094009399414, + "step": 4704 + }, + { + "epoch": 1.44844567559249, + "grad_norm": 12.875, + "learning_rate": 1.925381278579036e-06, + "loss": 1.0842483043670654, + "step": 4706 + }, + { + "epoch": 1.4490612496152662, + "grad_norm": 15.0625, + "learning_rate": 1.9245880889683744e-06, + "loss": 1.2176549434661865, + "step": 4708 + }, + { + "epoch": 1.4496768236380424, + "grad_norm": 18.125, + "learning_rate": 1.9237948443270115e-06, + "loss": 1.6450581550598145, + "step": 4710 + }, + { + "epoch": 1.4502923976608186, + "grad_norm": 7.5625, + "learning_rate": 1.9230015450053236e-06, + "loss": 1.1668829917907715, + "step": 4712 + }, + { + "epoch": 1.4509079716835949, + "grad_norm": 11.5, + "learning_rate": 1.9222081913537135e-06, + "loss": 1.2747738361358643, + "step": 4714 + }, + { + "epoch": 1.451523545706371, + "grad_norm": 7.65625, + "learning_rate": 1.9214147837226045e-06, + "loss": 1.2668383121490479, + "step": 4716 + }, + { + "epoch": 1.4521391197291473, + "grad_norm": 41.25, + "learning_rate": 1.920621322462447e-06, + "loss": 1.328049898147583, + "step": 4718 + }, + { + "epoch": 1.4527546937519236, + "grad_norm": 20.75, + "learning_rate": 1.9198278079237127e-06, + "loss": 1.444204330444336, + "step": 4720 + }, + { + "epoch": 1.4533702677746998, + "grad_norm": 11.8125, + "learning_rate": 1.9190342404568996e-06, + "loss": 1.454003095626831, + "step": 4722 + }, + { + "epoch": 1.453985841797476, + "grad_norm": 18.0, + "learning_rate": 1.918240620412525e-06, + "loss": 1.5295004844665527, + "step": 4724 + }, + { + "epoch": 1.4546014158202523, + "grad_norm": 14.8125, + "learning_rate": 1.917446948141134e-06, + "loss": 1.567083477973938, + "step": 4726 + }, + { + "epoch": 1.4552169898430285, + "grad_norm": 13.3125, + "learning_rate": 1.9166532239932906e-06, + "loss": 1.5100464820861816, + "step": 4728 + }, + { + "epoch": 1.4558325638658047, + "grad_norm": 11.625, + "learning_rate": 1.915859448319586e-06, + "loss": 1.5772287845611572, + "step": 4730 + }, + { + "epoch": 1.4564481378885812, + "grad_norm": 19.25, + "learning_rate": 1.9150656214706295e-06, + "loss": 1.397694706916809, + "step": 4732 + }, + { + "epoch": 1.4570637119113574, + "grad_norm": 7.90625, + "learning_rate": 1.9142717437970564e-06, + "loss": 1.155037760734558, + "step": 4734 + }, + { + "epoch": 1.4576792859341337, + "grad_norm": 33.5, + "learning_rate": 1.913477815649523e-06, + "loss": 1.1763298511505127, + "step": 4736 + }, + { + "epoch": 1.4582948599569099, + "grad_norm": 17.375, + "learning_rate": 1.912683837378709e-06, + "loss": 0.5639537572860718, + "step": 4738 + }, + { + "epoch": 1.4589104339796861, + "grad_norm": 5.375, + "learning_rate": 1.9118898093353146e-06, + "loss": 0.9941329956054688, + "step": 4740 + }, + { + "epoch": 1.4595260080024623, + "grad_norm": 14.625, + "learning_rate": 1.9110957318700634e-06, + "loss": 1.6984761953353882, + "step": 4742 + }, + { + "epoch": 1.4601415820252386, + "grad_norm": 29.125, + "learning_rate": 1.9103016053337004e-06, + "loss": 1.4265990257263184, + "step": 4744 + }, + { + "epoch": 1.4607571560480148, + "grad_norm": 51.25, + "learning_rate": 1.909507430076992e-06, + "loss": 1.5186126232147217, + "step": 4746 + }, + { + "epoch": 1.461372730070791, + "grad_norm": 29.625, + "learning_rate": 1.908713206450727e-06, + "loss": 0.9814944267272949, + "step": 4748 + }, + { + "epoch": 1.4619883040935673, + "grad_norm": 16.5, + "learning_rate": 1.9079189348057132e-06, + "loss": 1.252488613128662, + "step": 4750 + }, + { + "epoch": 1.4626038781163435, + "grad_norm": 13.0, + "learning_rate": 1.907124615492783e-06, + "loss": 1.610899806022644, + "step": 4752 + }, + { + "epoch": 1.4632194521391197, + "grad_norm": 20.75, + "learning_rate": 1.9063302488627872e-06, + "loss": 1.6237657070159912, + "step": 4754 + }, + { + "epoch": 1.463835026161896, + "grad_norm": 13.75, + "learning_rate": 1.9055358352665995e-06, + "loss": 1.5839223861694336, + "step": 4756 + }, + { + "epoch": 1.4644506001846722, + "grad_norm": 22.25, + "learning_rate": 1.9047413750551118e-06, + "loss": 1.3604011535644531, + "step": 4758 + }, + { + "epoch": 1.4650661742074484, + "grad_norm": 36.5, + "learning_rate": 1.9039468685792391e-06, + "loss": 1.3691078424453735, + "step": 4760 + }, + { + "epoch": 1.4656817482302247, + "grad_norm": 6.90625, + "learning_rate": 1.9031523161899152e-06, + "loss": 1.2302625179290771, + "step": 4762 + }, + { + "epoch": 1.466297322253001, + "grad_norm": 16.875, + "learning_rate": 1.9023577182380954e-06, + "loss": 0.8660012483596802, + "step": 4764 + }, + { + "epoch": 1.4669128962757771, + "grad_norm": 23.875, + "learning_rate": 1.9015630750747544e-06, + "loss": 1.0083507299423218, + "step": 4766 + }, + { + "epoch": 1.4675284702985534, + "grad_norm": 7.96875, + "learning_rate": 1.9007683870508866e-06, + "loss": 1.3119466304779053, + "step": 4768 + }, + { + "epoch": 1.4681440443213296, + "grad_norm": 19.75, + "learning_rate": 1.899973654517507e-06, + "loss": 1.646362543106079, + "step": 4770 + }, + { + "epoch": 1.4687596183441058, + "grad_norm": 24.625, + "learning_rate": 1.8991788778256505e-06, + "loss": 1.4080818891525269, + "step": 4772 + }, + { + "epoch": 1.469375192366882, + "grad_norm": 23.625, + "learning_rate": 1.898384057326369e-06, + "loss": 1.5112760066986084, + "step": 4774 + }, + { + "epoch": 1.4699907663896583, + "grad_norm": 10.75, + "learning_rate": 1.8975891933707373e-06, + "loss": 1.438962459564209, + "step": 4776 + }, + { + "epoch": 1.4706063404124345, + "grad_norm": 14.3125, + "learning_rate": 1.8967942863098472e-06, + "loss": 1.0983920097351074, + "step": 4778 + }, + { + "epoch": 1.4712219144352108, + "grad_norm": 10.6875, + "learning_rate": 1.8959993364948096e-06, + "loss": 1.042116403579712, + "step": 4780 + }, + { + "epoch": 1.471837488457987, + "grad_norm": 10.5, + "learning_rate": 1.8952043442767555e-06, + "loss": 1.5508229732513428, + "step": 4782 + }, + { + "epoch": 1.4724530624807632, + "grad_norm": 15.75, + "learning_rate": 1.8944093100068334e-06, + "loss": 1.648895263671875, + "step": 4784 + }, + { + "epoch": 1.4730686365035395, + "grad_norm": 23.625, + "learning_rate": 1.8936142340362116e-06, + "loss": 1.737051010131836, + "step": 4786 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 19.875, + "learning_rate": 1.8928191167160749e-06, + "loss": 1.876301884651184, + "step": 4788 + }, + { + "epoch": 1.4742997845490922, + "grad_norm": 6.8125, + "learning_rate": 1.8920239583976283e-06, + "loss": 1.2666301727294922, + "step": 4790 + }, + { + "epoch": 1.4749153585718684, + "grad_norm": 12.0625, + "learning_rate": 1.891228759432094e-06, + "loss": 1.2881522178649902, + "step": 4792 + }, + { + "epoch": 1.4755309325946446, + "grad_norm": 7.75, + "learning_rate": 1.890433520170712e-06, + "loss": 1.2567453384399414, + "step": 4794 + }, + { + "epoch": 1.4761465066174209, + "grad_norm": 12.625, + "learning_rate": 1.8896382409647403e-06, + "loss": 1.103519320487976, + "step": 4796 + }, + { + "epoch": 1.476762080640197, + "grad_norm": 16.25, + "learning_rate": 1.8888429221654555e-06, + "loss": 1.3396915197372437, + "step": 4798 + }, + { + "epoch": 1.4773776546629733, + "grad_norm": 53.5, + "learning_rate": 1.8880475641241502e-06, + "loss": 1.0970816612243652, + "step": 4800 + }, + { + "epoch": 1.4779932286857496, + "grad_norm": 18.75, + "learning_rate": 1.8872521671921347e-06, + "loss": 1.3329362869262695, + "step": 4802 + }, + { + "epoch": 1.4786088027085258, + "grad_norm": 13.1875, + "learning_rate": 1.8864567317207377e-06, + "loss": 1.5142507553100586, + "step": 4804 + }, + { + "epoch": 1.479224376731302, + "grad_norm": 19.125, + "learning_rate": 1.8856612580613028e-06, + "loss": 1.3657267093658447, + "step": 4806 + }, + { + "epoch": 1.4798399507540783, + "grad_norm": 26.125, + "learning_rate": 1.8848657465651925e-06, + "loss": 0.9717738628387451, + "step": 4808 + }, + { + "epoch": 1.4804555247768545, + "grad_norm": 17.0, + "learning_rate": 1.884070197583784e-06, + "loss": 1.6565666198730469, + "step": 4810 + }, + { + "epoch": 1.4810710987996307, + "grad_norm": 11.6875, + "learning_rate": 1.883274611468474e-06, + "loss": 1.247531533241272, + "step": 4812 + }, + { + "epoch": 1.481686672822407, + "grad_norm": 44.75, + "learning_rate": 1.8824789885706721e-06, + "loss": 1.4119795560836792, + "step": 4814 + }, + { + "epoch": 1.4823022468451832, + "grad_norm": 13.125, + "learning_rate": 1.8816833292418075e-06, + "loss": 1.174653172492981, + "step": 4816 + }, + { + "epoch": 1.4829178208679594, + "grad_norm": 138.0, + "learning_rate": 1.8808876338333218e-06, + "loss": 1.3747670650482178, + "step": 4818 + }, + { + "epoch": 1.4835333948907357, + "grad_norm": 16.625, + "learning_rate": 1.8800919026966764e-06, + "loss": 1.3020074367523193, + "step": 4820 + }, + { + "epoch": 1.484148968913512, + "grad_norm": 6.15625, + "learning_rate": 1.8792961361833448e-06, + "loss": 1.279649257659912, + "step": 4822 + }, + { + "epoch": 1.4847645429362881, + "grad_norm": 30.875, + "learning_rate": 1.8785003346448202e-06, + "loss": 1.3550713062286377, + "step": 4824 + }, + { + "epoch": 1.4853801169590644, + "grad_norm": 10.625, + "learning_rate": 1.8777044984326075e-06, + "loss": 1.080082893371582, + "step": 4826 + }, + { + "epoch": 1.4859956909818406, + "grad_norm": 10.0, + "learning_rate": 1.8769086278982287e-06, + "loss": 1.4219310283660889, + "step": 4828 + }, + { + "epoch": 1.4866112650046168, + "grad_norm": 7.90625, + "learning_rate": 1.8761127233932209e-06, + "loss": 1.1008557081222534, + "step": 4830 + }, + { + "epoch": 1.487226839027393, + "grad_norm": 5.9375, + "learning_rate": 1.8753167852691365e-06, + "loss": 1.295001745223999, + "step": 4832 + }, + { + "epoch": 1.4878424130501693, + "grad_norm": 16.5, + "learning_rate": 1.8745208138775416e-06, + "loss": 1.5518990755081177, + "step": 4834 + }, + { + "epoch": 1.4884579870729455, + "grad_norm": 7.21875, + "learning_rate": 1.8737248095700179e-06, + "loss": 1.103243112564087, + "step": 4836 + }, + { + "epoch": 1.4890735610957218, + "grad_norm": 25.5, + "learning_rate": 1.8729287726981617e-06, + "loss": 1.256454348564148, + "step": 4838 + }, + { + "epoch": 1.489689135118498, + "grad_norm": 14.3125, + "learning_rate": 1.8721327036135826e-06, + "loss": 1.3482131958007812, + "step": 4840 + }, + { + "epoch": 1.4903047091412742, + "grad_norm": 8.0, + "learning_rate": 1.8713366026679064e-06, + "loss": 1.1732832193374634, + "step": 4842 + }, + { + "epoch": 1.4909202831640505, + "grad_norm": 19.5, + "learning_rate": 1.8705404702127706e-06, + "loss": 0.6500099897384644, + "step": 4844 + }, + { + "epoch": 1.4915358571868267, + "grad_norm": 10.125, + "learning_rate": 1.8697443065998284e-06, + "loss": 1.0080634355545044, + "step": 4846 + }, + { + "epoch": 1.492151431209603, + "grad_norm": 14.375, + "learning_rate": 1.868948112180746e-06, + "loss": 1.523182988166809, + "step": 4848 + }, + { + "epoch": 1.4927670052323792, + "grad_norm": 28.875, + "learning_rate": 1.8681518873072036e-06, + "loss": 1.9081377983093262, + "step": 4850 + }, + { + "epoch": 1.4933825792551554, + "grad_norm": 30.0, + "learning_rate": 1.867355632330895e-06, + "loss": 1.7250075340270996, + "step": 4852 + }, + { + "epoch": 1.4939981532779316, + "grad_norm": 12.5, + "learning_rate": 1.8665593476035259e-06, + "loss": 1.2684253454208374, + "step": 4854 + }, + { + "epoch": 1.4946137273007079, + "grad_norm": 16.75, + "learning_rate": 1.8657630334768171e-06, + "loss": 1.582967758178711, + "step": 4856 + }, + { + "epoch": 1.495229301323484, + "grad_norm": 23.5, + "learning_rate": 1.8649666903025013e-06, + "loss": 1.17526113986969, + "step": 4858 + }, + { + "epoch": 1.4958448753462603, + "grad_norm": 14.5, + "learning_rate": 1.8641703184323235e-06, + "loss": 1.4988826513290405, + "step": 4860 + }, + { + "epoch": 1.4964604493690365, + "grad_norm": 7.6875, + "learning_rate": 1.863373918218043e-06, + "loss": 1.3049447536468506, + "step": 4862 + }, + { + "epoch": 1.4970760233918128, + "grad_norm": 20.125, + "learning_rate": 1.8625774900114303e-06, + "loss": 1.086333990097046, + "step": 4864 + }, + { + "epoch": 1.497691597414589, + "grad_norm": 14.5, + "learning_rate": 1.8617810341642682e-06, + "loss": 1.2647141218185425, + "step": 4866 + }, + { + "epoch": 1.4983071714373652, + "grad_norm": 14.5625, + "learning_rate": 1.8609845510283534e-06, + "loss": 1.0968520641326904, + "step": 4868 + }, + { + "epoch": 1.4989227454601415, + "grad_norm": 20.625, + "learning_rate": 1.8601880409554924e-06, + "loss": 1.589921236038208, + "step": 4870 + }, + { + "epoch": 1.4995383194829177, + "grad_norm": 10.625, + "learning_rate": 1.8593915042975043e-06, + "loss": 1.0053234100341797, + "step": 4872 + }, + { + "epoch": 1.500153893505694, + "grad_norm": 10.3125, + "learning_rate": 1.8585949414062207e-06, + "loss": 1.2411702871322632, + "step": 4874 + }, + { + "epoch": 1.5007694675284702, + "grad_norm": 32.25, + "learning_rate": 1.8577983526334847e-06, + "loss": 1.2974989414215088, + "step": 4876 + }, + { + "epoch": 1.5013850415512464, + "grad_norm": 17.5, + "learning_rate": 1.8570017383311504e-06, + "loss": 1.8156037330627441, + "step": 4878 + }, + { + "epoch": 1.5020006155740226, + "grad_norm": 7.5625, + "learning_rate": 1.8562050988510824e-06, + "loss": 1.23923659324646, + "step": 4880 + }, + { + "epoch": 1.5026161895967989, + "grad_norm": 21.125, + "learning_rate": 1.8554084345451586e-06, + "loss": 0.8623427748680115, + "step": 4882 + }, + { + "epoch": 1.5032317636195751, + "grad_norm": 22.0, + "learning_rate": 1.8546117457652654e-06, + "loss": 1.8232307434082031, + "step": 4884 + }, + { + "epoch": 1.5038473376423513, + "grad_norm": 15.6875, + "learning_rate": 1.853815032863302e-06, + "loss": 1.102805495262146, + "step": 4886 + }, + { + "epoch": 1.5044629116651276, + "grad_norm": 9.4375, + "learning_rate": 1.8530182961911766e-06, + "loss": 1.3882505893707275, + "step": 4888 + }, + { + "epoch": 1.5050784856879038, + "grad_norm": 13.1875, + "learning_rate": 1.8522215361008086e-06, + "loss": 1.0989861488342285, + "step": 4890 + }, + { + "epoch": 1.50569405971068, + "grad_norm": 26.125, + "learning_rate": 1.8514247529441292e-06, + "loss": 1.6275972127914429, + "step": 4892 + }, + { + "epoch": 1.5063096337334565, + "grad_norm": 10.375, + "learning_rate": 1.8506279470730775e-06, + "loss": 1.095172643661499, + "step": 4894 + }, + { + "epoch": 1.5069252077562327, + "grad_norm": 9.8125, + "learning_rate": 1.849831118839603e-06, + "loss": 0.8544666171073914, + "step": 4896 + }, + { + "epoch": 1.507540781779009, + "grad_norm": 6.875, + "learning_rate": 1.8490342685956666e-06, + "loss": 0.8565798997879028, + "step": 4898 + }, + { + "epoch": 1.5081563558017852, + "grad_norm": 15.0, + "learning_rate": 1.8482373966932377e-06, + "loss": 1.2892870903015137, + "step": 4900 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 13.1875, + "learning_rate": 1.8474405034842957e-06, + "loss": 1.41206955909729, + "step": 4902 + }, + { + "epoch": 1.5093875038473377, + "grad_norm": 6.53125, + "learning_rate": 1.8466435893208286e-06, + "loss": 1.286069393157959, + "step": 4904 + }, + { + "epoch": 1.510003077870114, + "grad_norm": 14.5, + "learning_rate": 1.8458466545548348e-06, + "loss": 1.253098726272583, + "step": 4906 + }, + { + "epoch": 1.5106186518928901, + "grad_norm": 10.375, + "learning_rate": 1.8450496995383209e-06, + "loss": 1.565866231918335, + "step": 4908 + }, + { + "epoch": 1.5112342259156664, + "grad_norm": 41.25, + "learning_rate": 1.8442527246233041e-06, + "loss": 0.7049061059951782, + "step": 4910 + }, + { + "epoch": 1.5118497999384426, + "grad_norm": 16.25, + "learning_rate": 1.843455730161807e-06, + "loss": 1.042777419090271, + "step": 4912 + }, + { + "epoch": 1.5124653739612188, + "grad_norm": 7.8125, + "learning_rate": 1.8426587165058651e-06, + "loss": 1.1403331756591797, + "step": 4914 + }, + { + "epoch": 1.513080947983995, + "grad_norm": 5.71875, + "learning_rate": 1.841861684007519e-06, + "loss": 1.1445809602737427, + "step": 4916 + }, + { + "epoch": 1.5136965220067713, + "grad_norm": 4.75, + "learning_rate": 1.8410646330188187e-06, + "loss": 1.1456329822540283, + "step": 4918 + }, + { + "epoch": 1.5143120960295475, + "grad_norm": 9.4375, + "learning_rate": 1.8402675638918232e-06, + "loss": 1.1502320766448975, + "step": 4920 + }, + { + "epoch": 1.5149276700523238, + "grad_norm": 20.0, + "learning_rate": 1.839470476978599e-06, + "loss": 1.4103009700775146, + "step": 4922 + }, + { + "epoch": 1.5155432440751, + "grad_norm": 14.125, + "learning_rate": 1.8386733726312196e-06, + "loss": 1.3720169067382812, + "step": 4924 + }, + { + "epoch": 1.5161588180978762, + "grad_norm": 20.0, + "learning_rate": 1.8378762512017674e-06, + "loss": 1.7142300605773926, + "step": 4926 + }, + { + "epoch": 1.5167743921206525, + "grad_norm": 15.25, + "learning_rate": 1.8370791130423314e-06, + "loss": 1.574350357055664, + "step": 4928 + }, + { + "epoch": 1.5173899661434287, + "grad_norm": 27.0, + "learning_rate": 1.836281958505009e-06, + "loss": 1.3752760887145996, + "step": 4930 + }, + { + "epoch": 1.5180055401662051, + "grad_norm": 37.0, + "learning_rate": 1.8354847879419038e-06, + "loss": 1.4977751970291138, + "step": 4932 + }, + { + "epoch": 1.5186211141889814, + "grad_norm": 22.75, + "learning_rate": 1.834687601705127e-06, + "loss": 1.4560470581054688, + "step": 4934 + }, + { + "epoch": 1.5192366882117576, + "grad_norm": 20.75, + "learning_rate": 1.8338904001467976e-06, + "loss": 1.2786219120025635, + "step": 4936 + }, + { + "epoch": 1.5198522622345338, + "grad_norm": 6.875, + "learning_rate": 1.833093183619039e-06, + "loss": 1.3729817867279053, + "step": 4938 + }, + { + "epoch": 1.52046783625731, + "grad_norm": 32.5, + "learning_rate": 1.8322959524739835e-06, + "loss": 1.3546814918518066, + "step": 4940 + }, + { + "epoch": 1.5210834102800863, + "grad_norm": 13.125, + "learning_rate": 1.8314987070637687e-06, + "loss": 1.3367919921875, + "step": 4942 + }, + { + "epoch": 1.5216989843028625, + "grad_norm": 16.875, + "learning_rate": 1.830701447740539e-06, + "loss": 1.428148865699768, + "step": 4944 + }, + { + "epoch": 1.5223145583256388, + "grad_norm": 16.375, + "learning_rate": 1.829904174856445e-06, + "loss": 1.4339628219604492, + "step": 4946 + }, + { + "epoch": 1.522930132348415, + "grad_norm": 8.625, + "learning_rate": 1.829106888763642e-06, + "loss": 1.3263335227966309, + "step": 4948 + }, + { + "epoch": 1.5235457063711912, + "grad_norm": 9.0, + "learning_rate": 1.828309589814294e-06, + "loss": 1.1701977252960205, + "step": 4950 + }, + { + "epoch": 1.5241612803939675, + "grad_norm": 15.8125, + "learning_rate": 1.8275122783605668e-06, + "loss": 1.5451289415359497, + "step": 4952 + }, + { + "epoch": 1.5247768544167437, + "grad_norm": 7.75, + "learning_rate": 1.8267149547546353e-06, + "loss": 1.0618703365325928, + "step": 4954 + }, + { + "epoch": 1.52539242843952, + "grad_norm": 13.1875, + "learning_rate": 1.8259176193486771e-06, + "loss": 1.249274730682373, + "step": 4956 + }, + { + "epoch": 1.5260080024622962, + "grad_norm": 23.5, + "learning_rate": 1.825120272494877e-06, + "loss": 1.2727432250976562, + "step": 4958 + }, + { + "epoch": 1.5266235764850724, + "grad_norm": 14.9375, + "learning_rate": 1.8243229145454242e-06, + "loss": 1.678273320198059, + "step": 4960 + }, + { + "epoch": 1.5272391505078486, + "grad_norm": 7.71875, + "learning_rate": 1.823525545852512e-06, + "loss": 1.028113842010498, + "step": 4962 + }, + { + "epoch": 1.5278547245306249, + "grad_norm": 34.5, + "learning_rate": 1.8227281667683392e-06, + "loss": 1.322481393814087, + "step": 4964 + }, + { + "epoch": 1.528470298553401, + "grad_norm": 9.625, + "learning_rate": 1.821930777645109e-06, + "loss": 1.3164238929748535, + "step": 4966 + }, + { + "epoch": 1.5290858725761773, + "grad_norm": 10.375, + "learning_rate": 1.8211333788350292e-06, + "loss": 1.2079682350158691, + "step": 4968 + }, + { + "epoch": 1.5297014465989536, + "grad_norm": 19.25, + "learning_rate": 1.8203359706903122e-06, + "loss": 1.0435426235198975, + "step": 4970 + }, + { + "epoch": 1.5303170206217298, + "grad_norm": 21.625, + "learning_rate": 1.8195385535631735e-06, + "loss": 1.2816870212554932, + "step": 4972 + }, + { + "epoch": 1.530932594644506, + "grad_norm": 4.03125, + "learning_rate": 1.8187411278058333e-06, + "loss": 1.0759096145629883, + "step": 4974 + }, + { + "epoch": 1.5315481686672823, + "grad_norm": 28.875, + "learning_rate": 1.8179436937705147e-06, + "loss": 1.3759675025939941, + "step": 4976 + }, + { + "epoch": 1.5321637426900585, + "grad_norm": 18.25, + "learning_rate": 1.8171462518094472e-06, + "loss": 1.2887439727783203, + "step": 4978 + }, + { + "epoch": 1.5327793167128347, + "grad_norm": 17.5, + "learning_rate": 1.8163488022748597e-06, + "loss": 1.437053918838501, + "step": 4980 + }, + { + "epoch": 1.533394890735611, + "grad_norm": 13.6875, + "learning_rate": 1.8155513455189886e-06, + "loss": 1.6811416149139404, + "step": 4982 + }, + { + "epoch": 1.5340104647583872, + "grad_norm": 13.5, + "learning_rate": 1.8147538818940692e-06, + "loss": 1.391784906387329, + "step": 4984 + }, + { + "epoch": 1.5346260387811634, + "grad_norm": 16.25, + "learning_rate": 1.813956411752344e-06, + "loss": 1.7641667127609253, + "step": 4986 + }, + { + "epoch": 1.5352416128039397, + "grad_norm": 18.25, + "learning_rate": 1.8131589354460563e-06, + "loss": 1.4568517208099365, + "step": 4988 + }, + { + "epoch": 1.535857186826716, + "grad_norm": 11.125, + "learning_rate": 1.8123614533274505e-06, + "loss": 1.0670044422149658, + "step": 4990 + }, + { + "epoch": 1.5364727608494921, + "grad_norm": 4.5625, + "learning_rate": 1.811563965748777e-06, + "loss": 1.1624255180358887, + "step": 4992 + }, + { + "epoch": 1.5370883348722684, + "grad_norm": 13.1875, + "learning_rate": 1.8107664730622871e-06, + "loss": 1.3043831586837769, + "step": 4994 + }, + { + "epoch": 1.5377039088950446, + "grad_norm": 41.0, + "learning_rate": 1.8099689756202334e-06, + "loss": 1.5602350234985352, + "step": 4996 + }, + { + "epoch": 1.5383194829178208, + "grad_norm": 8.5625, + "learning_rate": 1.8091714737748712e-06, + "loss": 1.1072698831558228, + "step": 4998 + }, + { + "epoch": 1.538935056940597, + "grad_norm": 5.9375, + "learning_rate": 1.8083739678784596e-06, + "loss": 1.3773562908172607, + "step": 5000 + }, + { + "epoch": 1.5395506309633733, + "grad_norm": 19.875, + "learning_rate": 1.807576458283256e-06, + "loss": 1.3957157135009766, + "step": 5002 + }, + { + "epoch": 1.5401662049861495, + "grad_norm": 6.375, + "learning_rate": 1.8067789453415222e-06, + "loss": 0.9105068445205688, + "step": 5004 + }, + { + "epoch": 1.5407817790089258, + "grad_norm": 14.6875, + "learning_rate": 1.8059814294055209e-06, + "loss": 1.1876349449157715, + "step": 5006 + }, + { + "epoch": 1.541397353031702, + "grad_norm": 3.765625, + "learning_rate": 1.8051839108275152e-06, + "loss": 1.2273304462432861, + "step": 5008 + }, + { + "epoch": 1.5420129270544782, + "grad_norm": 10.1875, + "learning_rate": 1.8043863899597704e-06, + "loss": 1.285908579826355, + "step": 5010 + }, + { + "epoch": 1.5426285010772545, + "grad_norm": 14.625, + "learning_rate": 1.803588867154551e-06, + "loss": 1.3618907928466797, + "step": 5012 + }, + { + "epoch": 1.5432440751000307, + "grad_norm": 9.1875, + "learning_rate": 1.8027913427641265e-06, + "loss": 1.556501865386963, + "step": 5014 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 21.125, + "learning_rate": 1.8019938171407614e-06, + "loss": 1.5120654106140137, + "step": 5016 + }, + { + "epoch": 1.5444752231455832, + "grad_norm": 16.75, + "learning_rate": 1.8011962906367256e-06, + "loss": 1.6533551216125488, + "step": 5018 + }, + { + "epoch": 1.5450907971683594, + "grad_norm": 15.5, + "learning_rate": 1.8003987636042864e-06, + "loss": 1.369877576828003, + "step": 5020 + }, + { + "epoch": 1.5457063711911356, + "grad_norm": 10.625, + "learning_rate": 1.7996012363957136e-06, + "loss": 1.258755087852478, + "step": 5022 + }, + { + "epoch": 1.5463219452139119, + "grad_norm": 12.125, + "learning_rate": 1.798803709363275e-06, + "loss": 1.3866196870803833, + "step": 5024 + }, + { + "epoch": 1.546937519236688, + "grad_norm": 16.125, + "learning_rate": 1.798006182859239e-06, + "loss": 1.1527132987976074, + "step": 5026 + }, + { + "epoch": 1.5475530932594643, + "grad_norm": 37.5, + "learning_rate": 1.7972086572358742e-06, + "loss": 1.7384552955627441, + "step": 5028 + }, + { + "epoch": 1.5481686672822406, + "grad_norm": 18.625, + "learning_rate": 1.7964111328454488e-06, + "loss": 1.354884386062622, + "step": 5030 + }, + { + "epoch": 1.5487842413050168, + "grad_norm": 9.75, + "learning_rate": 1.7956136100402307e-06, + "loss": 1.752507209777832, + "step": 5032 + }, + { + "epoch": 1.549399815327793, + "grad_norm": 8.1875, + "learning_rate": 1.794816089172485e-06, + "loss": 1.3854937553405762, + "step": 5034 + }, + { + "epoch": 1.5500153893505693, + "grad_norm": 19.25, + "learning_rate": 1.7940185705944792e-06, + "loss": 1.3696260452270508, + "step": 5036 + }, + { + "epoch": 1.5506309633733455, + "grad_norm": 14.125, + "learning_rate": 1.7932210546584777e-06, + "loss": 1.2782727479934692, + "step": 5038 + }, + { + "epoch": 1.5512465373961217, + "grad_norm": 22.0, + "learning_rate": 1.7924235417167442e-06, + "loss": 1.621812343597412, + "step": 5040 + }, + { + "epoch": 1.551862111418898, + "grad_norm": 472.0, + "learning_rate": 1.7916260321215409e-06, + "loss": 1.1303892135620117, + "step": 5042 + }, + { + "epoch": 1.5524776854416744, + "grad_norm": 34.5, + "learning_rate": 1.7908285262251287e-06, + "loss": 1.2478010654449463, + "step": 5044 + }, + { + "epoch": 1.5530932594644506, + "grad_norm": 43.5, + "learning_rate": 1.7900310243797673e-06, + "loss": 1.3689146041870117, + "step": 5046 + }, + { + "epoch": 1.5537088334872269, + "grad_norm": 6.84375, + "learning_rate": 1.7892335269377136e-06, + "loss": 1.0796773433685303, + "step": 5048 + }, + { + "epoch": 1.5543244075100031, + "grad_norm": 16.0, + "learning_rate": 1.7884360342512231e-06, + "loss": 1.28507661819458, + "step": 5050 + }, + { + "epoch": 1.5549399815327793, + "grad_norm": 32.75, + "learning_rate": 1.7876385466725502e-06, + "loss": 1.0493865013122559, + "step": 5052 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 18.75, + "learning_rate": 1.7868410645539444e-06, + "loss": 1.580011010169983, + "step": 5054 + }, + { + "epoch": 1.5561711295783318, + "grad_norm": 17.75, + "learning_rate": 1.7860435882476564e-06, + "loss": 1.2562227249145508, + "step": 5056 + }, + { + "epoch": 1.556786703601108, + "grad_norm": 9.6875, + "learning_rate": 1.785246118105931e-06, + "loss": 1.388770580291748, + "step": 5058 + }, + { + "epoch": 1.5574022776238843, + "grad_norm": 12.3125, + "learning_rate": 1.7844486544810121e-06, + "loss": 1.4561406373977661, + "step": 5060 + }, + { + "epoch": 1.5580178516466605, + "grad_norm": 23.125, + "learning_rate": 1.78365119772514e-06, + "loss": 1.747645378112793, + "step": 5062 + }, + { + "epoch": 1.5586334256694367, + "grad_norm": 18.375, + "learning_rate": 1.782853748190553e-06, + "loss": 1.6524887084960938, + "step": 5064 + }, + { + "epoch": 1.559248999692213, + "grad_norm": 7.09375, + "learning_rate": 1.7820563062294853e-06, + "loss": 1.2352979183197021, + "step": 5066 + }, + { + "epoch": 1.5598645737149892, + "grad_norm": 7.84375, + "learning_rate": 1.7812588721941674e-06, + "loss": 1.2969111204147339, + "step": 5068 + }, + { + "epoch": 1.5604801477377654, + "grad_norm": 10.375, + "learning_rate": 1.7804614464368272e-06, + "loss": 1.2578907012939453, + "step": 5070 + }, + { + "epoch": 1.5610957217605417, + "grad_norm": 14.5, + "learning_rate": 1.779664029309688e-06, + "loss": 1.5978870391845703, + "step": 5072 + }, + { + "epoch": 1.561711295783318, + "grad_norm": 5.71875, + "learning_rate": 1.778866621164971e-06, + "loss": 1.0977182388305664, + "step": 5074 + }, + { + "epoch": 1.5623268698060941, + "grad_norm": 11.5, + "learning_rate": 1.7780692223548915e-06, + "loss": 1.1397321224212646, + "step": 5076 + }, + { + "epoch": 1.5629424438288704, + "grad_norm": 35.75, + "learning_rate": 1.777271833231661e-06, + "loss": 1.603751540184021, + "step": 5078 + }, + { + "epoch": 1.5635580178516466, + "grad_norm": 24.0, + "learning_rate": 1.7764744541474883e-06, + "loss": 1.832308053970337, + "step": 5080 + }, + { + "epoch": 1.564173591874423, + "grad_norm": 56.75, + "learning_rate": 1.775677085454576e-06, + "loss": 1.3269708156585693, + "step": 5082 + }, + { + "epoch": 1.5647891658971993, + "grad_norm": 10.5625, + "learning_rate": 1.774879727505123e-06, + "loss": 1.3348369598388672, + "step": 5084 + }, + { + "epoch": 1.5654047399199755, + "grad_norm": 16.375, + "learning_rate": 1.7740823806513231e-06, + "loss": 1.548821210861206, + "step": 5086 + }, + { + "epoch": 1.5660203139427518, + "grad_norm": 7.5625, + "learning_rate": 1.7732850452453652e-06, + "loss": 1.516291856765747, + "step": 5088 + }, + { + "epoch": 1.566635887965528, + "grad_norm": 44.0, + "learning_rate": 1.7724877216394336e-06, + "loss": 1.5322810411453247, + "step": 5090 + }, + { + "epoch": 1.5672514619883042, + "grad_norm": 17.25, + "learning_rate": 1.7716904101857067e-06, + "loss": 1.3388553857803345, + "step": 5092 + }, + { + "epoch": 1.5678670360110805, + "grad_norm": 20.0, + "learning_rate": 1.770893111236358e-06, + "loss": 0.8509764075279236, + "step": 5094 + }, + { + "epoch": 1.5684826100338567, + "grad_norm": 18.375, + "learning_rate": 1.7700958251435551e-06, + "loss": 1.3260862827301025, + "step": 5096 + }, + { + "epoch": 1.569098184056633, + "grad_norm": 10.5, + "learning_rate": 1.7692985522594612e-06, + "loss": 1.5346083641052246, + "step": 5098 + }, + { + "epoch": 1.5697137580794092, + "grad_norm": 11.75, + "learning_rate": 1.7685012929362318e-06, + "loss": 1.3697761297225952, + "step": 5100 + }, + { + "epoch": 1.5703293321021854, + "grad_norm": 6.8125, + "learning_rate": 1.7677040475260166e-06, + "loss": 0.9767165184020996, + "step": 5102 + }, + { + "epoch": 1.5709449061249616, + "grad_norm": 5.3125, + "learning_rate": 1.766906816380961e-06, + "loss": 1.2552387714385986, + "step": 5104 + }, + { + "epoch": 1.5715604801477379, + "grad_norm": 10.5, + "learning_rate": 1.766109599853203e-06, + "loss": 1.2528355121612549, + "step": 5106 + }, + { + "epoch": 1.572176054170514, + "grad_norm": 12.5625, + "learning_rate": 1.7653123982948729e-06, + "loss": 1.2786812782287598, + "step": 5108 + }, + { + "epoch": 1.5727916281932903, + "grad_norm": 16.625, + "learning_rate": 1.7645152120580964e-06, + "loss": 1.2018368244171143, + "step": 5110 + }, + { + "epoch": 1.5734072022160666, + "grad_norm": 27.375, + "learning_rate": 1.7637180414949915e-06, + "loss": 1.089292287826538, + "step": 5112 + }, + { + "epoch": 1.5740227762388428, + "grad_norm": 8.875, + "learning_rate": 1.7629208869576693e-06, + "loss": 1.360584020614624, + "step": 5114 + }, + { + "epoch": 1.574638350261619, + "grad_norm": 10.0, + "learning_rate": 1.762123748798233e-06, + "loss": 1.3072019815444946, + "step": 5116 + }, + { + "epoch": 1.5752539242843953, + "grad_norm": 9.3125, + "learning_rate": 1.761326627368781e-06, + "loss": 1.2890172004699707, + "step": 5118 + }, + { + "epoch": 1.5758694983071715, + "grad_norm": 8.0625, + "learning_rate": 1.7605295230214015e-06, + "loss": 1.119748592376709, + "step": 5120 + }, + { + "epoch": 1.5764850723299477, + "grad_norm": 20.5, + "learning_rate": 1.759732436108177e-06, + "loss": 1.0745265483856201, + "step": 5122 + }, + { + "epoch": 1.577100646352724, + "grad_norm": 13.3125, + "learning_rate": 1.7589353669811816e-06, + "loss": 1.3376274108886719, + "step": 5124 + }, + { + "epoch": 1.5777162203755002, + "grad_norm": 13.125, + "learning_rate": 1.7581383159924818e-06, + "loss": 0.7650940418243408, + "step": 5126 + }, + { + "epoch": 1.5783317943982764, + "grad_norm": 19.375, + "learning_rate": 1.7573412834941355e-06, + "loss": 1.7586708068847656, + "step": 5128 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 16.625, + "learning_rate": 1.756544269838193e-06, + "loss": 1.4873216152191162, + "step": 5130 + }, + { + "epoch": 1.5795629424438289, + "grad_norm": 15.375, + "learning_rate": 1.7557472753766966e-06, + "loss": 1.3263533115386963, + "step": 5132 + }, + { + "epoch": 1.5801785164666051, + "grad_norm": 20.75, + "learning_rate": 1.7549503004616792e-06, + "loss": 1.0792770385742188, + "step": 5134 + }, + { + "epoch": 1.5807940904893814, + "grad_norm": 13.3125, + "learning_rate": 1.7541533454451655e-06, + "loss": 1.2500200271606445, + "step": 5136 + }, + { + "epoch": 1.5814096645121576, + "grad_norm": 14.125, + "learning_rate": 1.753356410679172e-06, + "loss": 1.622786521911621, + "step": 5138 + }, + { + "epoch": 1.5820252385349338, + "grad_norm": 14.9375, + "learning_rate": 1.752559496515705e-06, + "loss": 1.4444410800933838, + "step": 5140 + }, + { + "epoch": 1.58264081255771, + "grad_norm": 15.0, + "learning_rate": 1.7517626033067628e-06, + "loss": 1.2542698383331299, + "step": 5142 + }, + { + "epoch": 1.5832563865804863, + "grad_norm": 14.5625, + "learning_rate": 1.7509657314043339e-06, + "loss": 1.3525272607803345, + "step": 5144 + }, + { + "epoch": 1.5838719606032625, + "grad_norm": 14.0, + "learning_rate": 1.7501688811603972e-06, + "loss": 1.3406755924224854, + "step": 5146 + }, + { + "epoch": 1.5844875346260388, + "grad_norm": 16.5, + "learning_rate": 1.7493720529269227e-06, + "loss": 1.49215829372406, + "step": 5148 + }, + { + "epoch": 1.585103108648815, + "grad_norm": 18.375, + "learning_rate": 1.748575247055871e-06, + "loss": 1.2907540798187256, + "step": 5150 + }, + { + "epoch": 1.5857186826715912, + "grad_norm": 21.25, + "learning_rate": 1.7477784638991915e-06, + "loss": 1.827765941619873, + "step": 5152 + }, + { + "epoch": 1.5863342566943675, + "grad_norm": 27.625, + "learning_rate": 1.7469817038088241e-06, + "loss": 1.5776755809783936, + "step": 5154 + }, + { + "epoch": 1.5869498307171437, + "grad_norm": 8.25, + "learning_rate": 1.7461849671366987e-06, + "loss": 1.2929551601409912, + "step": 5156 + }, + { + "epoch": 1.58756540473992, + "grad_norm": 19.625, + "learning_rate": 1.7453882542347349e-06, + "loss": 1.1364490985870361, + "step": 5158 + }, + { + "epoch": 1.5881809787626961, + "grad_norm": 16.125, + "learning_rate": 1.744591565454842e-06, + "loss": 1.199864387512207, + "step": 5160 + }, + { + "epoch": 1.5887965527854724, + "grad_norm": 12.8125, + "learning_rate": 1.7437949011489179e-06, + "loss": 1.3706705570220947, + "step": 5162 + }, + { + "epoch": 1.5894121268082486, + "grad_norm": 12.375, + "learning_rate": 1.7429982616688503e-06, + "loss": 1.5815324783325195, + "step": 5164 + }, + { + "epoch": 1.5900277008310248, + "grad_norm": 21.75, + "learning_rate": 1.7422016473665151e-06, + "loss": 1.1833311319351196, + "step": 5166 + }, + { + "epoch": 1.590643274853801, + "grad_norm": 16.625, + "learning_rate": 1.74140505859378e-06, + "loss": 1.182098627090454, + "step": 5168 + }, + { + "epoch": 1.5912588488765773, + "grad_norm": 12.125, + "learning_rate": 1.7406084957024964e-06, + "loss": 1.1151795387268066, + "step": 5170 + }, + { + "epoch": 1.5918744228993535, + "grad_norm": 15.9375, + "learning_rate": 1.7398119590445083e-06, + "loss": 1.2372705936431885, + "step": 5172 + }, + { + "epoch": 1.5924899969221298, + "grad_norm": 16.5, + "learning_rate": 1.739015448971647e-06, + "loss": 1.3048455715179443, + "step": 5174 + }, + { + "epoch": 1.593105570944906, + "grad_norm": 10.375, + "learning_rate": 1.7382189658357316e-06, + "loss": 1.2205870151519775, + "step": 5176 + }, + { + "epoch": 1.5937211449676822, + "grad_norm": 12.4375, + "learning_rate": 1.73742250998857e-06, + "loss": 1.3982374668121338, + "step": 5178 + }, + { + "epoch": 1.5943367189904585, + "grad_norm": 22.75, + "learning_rate": 1.7366260817819574e-06, + "loss": 1.1613901853561401, + "step": 5180 + }, + { + "epoch": 1.5949522930132347, + "grad_norm": 11.1875, + "learning_rate": 1.7358296815676768e-06, + "loss": 1.548307180404663, + "step": 5182 + }, + { + "epoch": 1.595567867036011, + "grad_norm": 21.0, + "learning_rate": 1.7350333096974992e-06, + "loss": 1.2549443244934082, + "step": 5184 + }, + { + "epoch": 1.5961834410587872, + "grad_norm": 11.75, + "learning_rate": 1.7342369665231833e-06, + "loss": 1.5804221630096436, + "step": 5186 + }, + { + "epoch": 1.5967990150815634, + "grad_norm": 17.125, + "learning_rate": 1.7334406523964748e-06, + "loss": 1.4917235374450684, + "step": 5188 + }, + { + "epoch": 1.5974145891043396, + "grad_norm": 11.3125, + "learning_rate": 1.732644367669105e-06, + "loss": 1.3621652126312256, + "step": 5190 + }, + { + "epoch": 1.5980301631271159, + "grad_norm": 10.0625, + "learning_rate": 1.7318481126927962e-06, + "loss": 1.3136223554611206, + "step": 5192 + }, + { + "epoch": 1.598645737149892, + "grad_norm": 9.875, + "learning_rate": 1.7310518878192546e-06, + "loss": 1.3015594482421875, + "step": 5194 + }, + { + "epoch": 1.5992613111726686, + "grad_norm": 6.28125, + "learning_rate": 1.730255693400172e-06, + "loss": 1.1437548398971558, + "step": 5196 + }, + { + "epoch": 1.5998768851954448, + "grad_norm": 20.375, + "learning_rate": 1.7294595297872298e-06, + "loss": 1.1758710145950317, + "step": 5198 + }, + { + "epoch": 1.600492459218221, + "grad_norm": 12.875, + "learning_rate": 1.7286633973320943e-06, + "loss": 1.402280330657959, + "step": 5200 + }, + { + "epoch": 1.6011080332409973, + "grad_norm": 21.875, + "learning_rate": 1.7278672963864177e-06, + "loss": 1.2749931812286377, + "step": 5202 + }, + { + "epoch": 1.6017236072637735, + "grad_norm": 12.9375, + "learning_rate": 1.727071227301839e-06, + "loss": 1.0368297100067139, + "step": 5204 + }, + { + "epoch": 1.6023391812865497, + "grad_norm": 12.75, + "learning_rate": 1.7262751904299828e-06, + "loss": 1.2086350917816162, + "step": 5206 + }, + { + "epoch": 1.602954755309326, + "grad_norm": 17.0, + "learning_rate": 1.7254791861224584e-06, + "loss": 1.3812611103057861, + "step": 5208 + }, + { + "epoch": 1.6035703293321022, + "grad_norm": 15.0625, + "learning_rate": 1.724683214730864e-06, + "loss": 1.6722028255462646, + "step": 5210 + }, + { + "epoch": 1.6041859033548784, + "grad_norm": 36.25, + "learning_rate": 1.7238872766067794e-06, + "loss": 1.2400789260864258, + "step": 5212 + }, + { + "epoch": 1.6048014773776547, + "grad_norm": 18.0, + "learning_rate": 1.7230913721017715e-06, + "loss": 1.4790221452713013, + "step": 5214 + }, + { + "epoch": 1.605417051400431, + "grad_norm": 27.375, + "learning_rate": 1.7222955015673927e-06, + "loss": 0.6969753503799438, + "step": 5216 + }, + { + "epoch": 1.6060326254232071, + "grad_norm": 11.375, + "learning_rate": 1.7214996653551805e-06, + "loss": 1.1425102949142456, + "step": 5218 + }, + { + "epoch": 1.6066481994459834, + "grad_norm": 15.75, + "learning_rate": 1.7207038638166554e-06, + "loss": 1.4025464057922363, + "step": 5220 + }, + { + "epoch": 1.6072637734687596, + "grad_norm": 25.625, + "learning_rate": 1.7199080973033243e-06, + "loss": 1.1340413093566895, + "step": 5222 + }, + { + "epoch": 1.6078793474915358, + "grad_norm": 7.9375, + "learning_rate": 1.7191123661666785e-06, + "loss": 1.1119030714035034, + "step": 5224 + }, + { + "epoch": 1.608494921514312, + "grad_norm": 29.5, + "learning_rate": 1.7183166707581932e-06, + "loss": 1.1708309650421143, + "step": 5226 + }, + { + "epoch": 1.6091104955370883, + "grad_norm": 11.4375, + "learning_rate": 1.717521011429328e-06, + "loss": 0.9883697628974915, + "step": 5228 + }, + { + "epoch": 1.6097260695598645, + "grad_norm": 20.375, + "learning_rate": 1.7167253885315265e-06, + "loss": 1.5919955968856812, + "step": 5230 + }, + { + "epoch": 1.6103416435826408, + "grad_norm": 29.375, + "learning_rate": 1.7159298024162164e-06, + "loss": 1.1920177936553955, + "step": 5232 + }, + { + "epoch": 1.6109572176054172, + "grad_norm": 23.125, + "learning_rate": 1.7151342534348078e-06, + "loss": 2.032461166381836, + "step": 5234 + }, + { + "epoch": 1.6115727916281934, + "grad_norm": 6.8125, + "learning_rate": 1.7143387419386974e-06, + "loss": 1.5207996368408203, + "step": 5236 + }, + { + "epoch": 1.6121883656509697, + "grad_norm": 11.6875, + "learning_rate": 1.7135432682792634e-06, + "loss": 1.305809736251831, + "step": 5238 + }, + { + "epoch": 1.612803939673746, + "grad_norm": 5.9375, + "learning_rate": 1.712747832807865e-06, + "loss": 1.299574375152588, + "step": 5240 + }, + { + "epoch": 1.6134195136965221, + "grad_norm": 12.6875, + "learning_rate": 1.71195243587585e-06, + "loss": 1.2237389087677002, + "step": 5242 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 12.9375, + "learning_rate": 1.711157077834545e-06, + "loss": 1.2686474323272705, + "step": 5244 + }, + { + "epoch": 1.6146506617420746, + "grad_norm": 15.5, + "learning_rate": 1.7103617590352597e-06, + "loss": 1.2688703536987305, + "step": 5246 + }, + { + "epoch": 1.6152662357648508, + "grad_norm": 7.59375, + "learning_rate": 1.7095664798292884e-06, + "loss": 1.488532304763794, + "step": 5248 + }, + { + "epoch": 1.615881809787627, + "grad_norm": 15.8125, + "learning_rate": 1.7087712405679065e-06, + "loss": 1.678232192993164, + "step": 5250 + }, + { + "epoch": 1.6164973838104033, + "grad_norm": 15.4375, + "learning_rate": 1.707976041602372e-06, + "loss": 1.0275745391845703, + "step": 5252 + }, + { + "epoch": 1.6171129578331795, + "grad_norm": 22.375, + "learning_rate": 1.7071808832839258e-06, + "loss": 1.2949142456054688, + "step": 5254 + }, + { + "epoch": 1.6177285318559558, + "grad_norm": 30.75, + "learning_rate": 1.7063857659637888e-06, + "loss": 1.493941068649292, + "step": 5256 + }, + { + "epoch": 1.618344105878732, + "grad_norm": 17.125, + "learning_rate": 1.7055906899931665e-06, + "loss": 0.8534493446350098, + "step": 5258 + }, + { + "epoch": 1.6189596799015082, + "grad_norm": 17.5, + "learning_rate": 1.7047956557232446e-06, + "loss": 1.3491942882537842, + "step": 5260 + }, + { + "epoch": 1.6195752539242845, + "grad_norm": 45.75, + "learning_rate": 1.7040006635051904e-06, + "loss": 1.4251902103424072, + "step": 5262 + }, + { + "epoch": 1.6201908279470607, + "grad_norm": 24.25, + "learning_rate": 1.7032057136901533e-06, + "loss": 1.4901896715164185, + "step": 5264 + }, + { + "epoch": 1.620806401969837, + "grad_norm": 8.25, + "learning_rate": 1.7024108066292631e-06, + "loss": 1.2126847505569458, + "step": 5266 + }, + { + "epoch": 1.6214219759926132, + "grad_norm": 36.0, + "learning_rate": 1.7016159426736315e-06, + "loss": 1.3032283782958984, + "step": 5268 + }, + { + "epoch": 1.6220375500153894, + "grad_norm": 17.875, + "learning_rate": 1.7008211221743501e-06, + "loss": 1.6620005369186401, + "step": 5270 + }, + { + "epoch": 1.6226531240381656, + "grad_norm": 13.8125, + "learning_rate": 1.7000263454824928e-06, + "loss": 1.160904049873352, + "step": 5272 + }, + { + "epoch": 1.6232686980609419, + "grad_norm": 15.625, + "learning_rate": 1.6992316129491138e-06, + "loss": 1.1405680179595947, + "step": 5274 + }, + { + "epoch": 1.623884272083718, + "grad_norm": 27.375, + "learning_rate": 1.6984369249252463e-06, + "loss": 1.6693198680877686, + "step": 5276 + }, + { + "epoch": 1.6244998461064943, + "grad_norm": 27.125, + "learning_rate": 1.6976422817619049e-06, + "loss": 1.5274195671081543, + "step": 5278 + }, + { + "epoch": 1.6251154201292706, + "grad_norm": 15.5625, + "learning_rate": 1.6968476838100854e-06, + "loss": 1.1784543991088867, + "step": 5280 + }, + { + "epoch": 1.6257309941520468, + "grad_norm": 25.875, + "learning_rate": 1.6960531314207618e-06, + "loss": 1.3474241495132446, + "step": 5282 + }, + { + "epoch": 1.626346568174823, + "grad_norm": 10.6875, + "learning_rate": 1.6952586249448885e-06, + "loss": 1.2467565536499023, + "step": 5284 + }, + { + "epoch": 1.6269621421975993, + "grad_norm": 9.1875, + "learning_rate": 1.6944641647334012e-06, + "loss": 1.2050135135650635, + "step": 5286 + }, + { + "epoch": 1.6275777162203755, + "grad_norm": 23.75, + "learning_rate": 1.6936697511372128e-06, + "loss": 1.5005052089691162, + "step": 5288 + }, + { + "epoch": 1.6281932902431517, + "grad_norm": 14.1875, + "learning_rate": 1.6928753845072173e-06, + "loss": 0.44793587923049927, + "step": 5290 + }, + { + "epoch": 1.628808864265928, + "grad_norm": 8.3125, + "learning_rate": 1.6920810651942868e-06, + "loss": 1.2666034698486328, + "step": 5292 + }, + { + "epoch": 1.6294244382887042, + "grad_norm": 10.5625, + "learning_rate": 1.691286793549274e-06, + "loss": 1.2090697288513184, + "step": 5294 + }, + { + "epoch": 1.6300400123114804, + "grad_norm": 17.875, + "learning_rate": 1.690492569923008e-06, + "loss": 1.7023463249206543, + "step": 5296 + }, + { + "epoch": 1.6306555863342567, + "grad_norm": 63.25, + "learning_rate": 1.6896983946662998e-06, + "loss": 1.1873985528945923, + "step": 5298 + }, + { + "epoch": 1.631271160357033, + "grad_norm": 28.75, + "learning_rate": 1.6889042681299366e-06, + "loss": 1.5106173753738403, + "step": 5300 + }, + { + "epoch": 1.6318867343798091, + "grad_norm": 45.5, + "learning_rate": 1.6881101906646855e-06, + "loss": 1.2140668630599976, + "step": 5302 + }, + { + "epoch": 1.6325023084025854, + "grad_norm": 6.28125, + "learning_rate": 1.6873161626212914e-06, + "loss": 1.0440281629562378, + "step": 5304 + }, + { + "epoch": 1.6331178824253616, + "grad_norm": 5.84375, + "learning_rate": 1.6865221843504775e-06, + "loss": 1.156790018081665, + "step": 5306 + }, + { + "epoch": 1.6337334564481378, + "grad_norm": 28.375, + "learning_rate": 1.685728256202944e-06, + "loss": 1.4441311359405518, + "step": 5308 + }, + { + "epoch": 1.634349030470914, + "grad_norm": 20.875, + "learning_rate": 1.684934378529371e-06, + "loss": 1.5814571380615234, + "step": 5310 + }, + { + "epoch": 1.6349646044936903, + "grad_norm": 24.875, + "learning_rate": 1.6841405516804147e-06, + "loss": 1.757308006286621, + "step": 5312 + }, + { + "epoch": 1.6355801785164665, + "grad_norm": 6.03125, + "learning_rate": 1.6833467760067092e-06, + "loss": 1.1893250942230225, + "step": 5314 + }, + { + "epoch": 1.6361957525392428, + "grad_norm": 11.3125, + "learning_rate": 1.6825530518588665e-06, + "loss": 1.164353370666504, + "step": 5316 + }, + { + "epoch": 1.636811326562019, + "grad_norm": 15.875, + "learning_rate": 1.681759379587475e-06, + "loss": 1.082535743713379, + "step": 5318 + }, + { + "epoch": 1.6374269005847952, + "grad_norm": 13.4375, + "learning_rate": 1.6809657595431009e-06, + "loss": 1.3592793941497803, + "step": 5320 + }, + { + "epoch": 1.6380424746075715, + "grad_norm": 19.125, + "learning_rate": 1.6801721920762871e-06, + "loss": 1.5188610553741455, + "step": 5322 + }, + { + "epoch": 1.6386580486303477, + "grad_norm": 49.25, + "learning_rate": 1.6793786775375532e-06, + "loss": 1.9063937664031982, + "step": 5324 + }, + { + "epoch": 1.639273622653124, + "grad_norm": 13.5625, + "learning_rate": 1.6785852162773955e-06, + "loss": 1.3931628465652466, + "step": 5326 + }, + { + "epoch": 1.6398891966759002, + "grad_norm": 9.125, + "learning_rate": 1.6777918086462872e-06, + "loss": 1.5546140670776367, + "step": 5328 + }, + { + "epoch": 1.6405047706986764, + "grad_norm": 7.0, + "learning_rate": 1.6769984549946762e-06, + "loss": 1.2095413208007812, + "step": 5330 + }, + { + "epoch": 1.6411203447214526, + "grad_norm": 25.5, + "learning_rate": 1.6762051556729892e-06, + "loss": 1.5617570877075195, + "step": 5332 + }, + { + "epoch": 1.6417359187442289, + "grad_norm": 12.25, + "learning_rate": 1.675411911031626e-06, + "loss": 1.3387706279754639, + "step": 5334 + }, + { + "epoch": 1.642351492767005, + "grad_norm": 20.5, + "learning_rate": 1.6746187214209649e-06, + "loss": 1.4189784526824951, + "step": 5336 + }, + { + "epoch": 1.6429670667897813, + "grad_norm": 8.6875, + "learning_rate": 1.6738255871913576e-06, + "loss": 1.34170663356781, + "step": 5338 + }, + { + "epoch": 1.6435826408125576, + "grad_norm": 13.1875, + "learning_rate": 1.673032508693133e-06, + "loss": 1.3447788953781128, + "step": 5340 + }, + { + "epoch": 1.6441982148353338, + "grad_norm": 13.5625, + "learning_rate": 1.672239486276595e-06, + "loss": 1.5071009397506714, + "step": 5342 + }, + { + "epoch": 1.64481378885811, + "grad_norm": 21.375, + "learning_rate": 1.6714465202920218e-06, + "loss": 1.550859808921814, + "step": 5344 + }, + { + "epoch": 1.6454293628808865, + "grad_norm": 13.6875, + "learning_rate": 1.6706536110896672e-06, + "loss": 1.128093957901001, + "step": 5346 + }, + { + "epoch": 1.6460449369036627, + "grad_norm": 17.5, + "learning_rate": 1.6698607590197614e-06, + "loss": 1.2202242612838745, + "step": 5348 + }, + { + "epoch": 1.646660510926439, + "grad_norm": 13.5625, + "learning_rate": 1.6690679644325074e-06, + "loss": 1.5327153205871582, + "step": 5350 + }, + { + "epoch": 1.6472760849492152, + "grad_norm": 21.0, + "learning_rate": 1.6682752276780827e-06, + "loss": 1.5310816764831543, + "step": 5352 + }, + { + "epoch": 1.6478916589719914, + "grad_norm": 38.5, + "learning_rate": 1.6674825491066413e-06, + "loss": 1.4935948848724365, + "step": 5354 + }, + { + "epoch": 1.6485072329947676, + "grad_norm": 19.875, + "learning_rate": 1.6666899290683097e-06, + "loss": 1.5328843593597412, + "step": 5356 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 21.0, + "learning_rate": 1.6658973679131886e-06, + "loss": 1.686381220817566, + "step": 5358 + }, + { + "epoch": 1.6497383810403201, + "grad_norm": 15.0, + "learning_rate": 1.6651048659913537e-06, + "loss": 1.4446570873260498, + "step": 5360 + }, + { + "epoch": 1.6503539550630963, + "grad_norm": 26.25, + "learning_rate": 1.6643124236528544e-06, + "loss": 1.2061572074890137, + "step": 5362 + }, + { + "epoch": 1.6509695290858726, + "grad_norm": 14.5, + "learning_rate": 1.6635200412477125e-06, + "loss": 1.6465775966644287, + "step": 5364 + }, + { + "epoch": 1.6515851031086488, + "grad_norm": 23.75, + "learning_rate": 1.6627277191259246e-06, + "loss": 1.7101616859436035, + "step": 5366 + }, + { + "epoch": 1.652200677131425, + "grad_norm": 10.375, + "learning_rate": 1.6619354576374604e-06, + "loss": 1.0784965753555298, + "step": 5368 + }, + { + "epoch": 1.6528162511542013, + "grad_norm": 34.0, + "learning_rate": 1.6611432571322625e-06, + "loss": 1.399306297302246, + "step": 5370 + }, + { + "epoch": 1.6534318251769775, + "grad_norm": 12.8125, + "learning_rate": 1.660351117960247e-06, + "loss": 1.044040322303772, + "step": 5372 + }, + { + "epoch": 1.6540473991997537, + "grad_norm": 13.3125, + "learning_rate": 1.659559040471303e-06, + "loss": 1.3308005332946777, + "step": 5374 + }, + { + "epoch": 1.65466297322253, + "grad_norm": 40.0, + "learning_rate": 1.6587670250152905e-06, + "loss": 1.4872740507125854, + "step": 5376 + }, + { + "epoch": 1.6552785472453062, + "grad_norm": 29.875, + "learning_rate": 1.6579750719420454e-06, + "loss": 1.3254127502441406, + "step": 5378 + }, + { + "epoch": 1.6558941212680824, + "grad_norm": 22.375, + "learning_rate": 1.6571831816013736e-06, + "loss": 1.5545458793640137, + "step": 5380 + }, + { + "epoch": 1.6565096952908587, + "grad_norm": 12.75, + "learning_rate": 1.6563913543430538e-06, + "loss": 1.4617619514465332, + "step": 5382 + }, + { + "epoch": 1.6571252693136351, + "grad_norm": 50.25, + "learning_rate": 1.6555995905168376e-06, + "loss": 1.5333282947540283, + "step": 5384 + }, + { + "epoch": 1.6577408433364114, + "grad_norm": 10.9375, + "learning_rate": 1.654807890472447e-06, + "loss": 1.2353944778442383, + "step": 5386 + }, + { + "epoch": 1.6583564173591876, + "grad_norm": 11.375, + "learning_rate": 1.654016254559578e-06, + "loss": 1.0554957389831543, + "step": 5388 + }, + { + "epoch": 1.6589719913819638, + "grad_norm": 10.8125, + "learning_rate": 1.653224683127896e-06, + "loss": 1.4639546871185303, + "step": 5390 + }, + { + "epoch": 1.65958756540474, + "grad_norm": 11.6875, + "learning_rate": 1.6524331765270395e-06, + "loss": 1.2144356966018677, + "step": 5392 + }, + { + "epoch": 1.6602031394275163, + "grad_norm": 21.125, + "learning_rate": 1.6516417351066182e-06, + "loss": 1.5362677574157715, + "step": 5394 + }, + { + "epoch": 1.6608187134502925, + "grad_norm": 37.5, + "learning_rate": 1.650850359216212e-06, + "loss": 1.618035078048706, + "step": 5396 + }, + { + "epoch": 1.6614342874730688, + "grad_norm": 16.5, + "learning_rate": 1.650059049205373e-06, + "loss": 1.1688024997711182, + "step": 5398 + }, + { + "epoch": 1.662049861495845, + "grad_norm": 13.625, + "learning_rate": 1.6492678054236234e-06, + "loss": 1.240717887878418, + "step": 5400 + }, + { + "epoch": 1.6626654355186212, + "grad_norm": 5.0625, + "learning_rate": 1.6484766282204567e-06, + "loss": 1.212373971939087, + "step": 5402 + }, + { + "epoch": 1.6632810095413975, + "grad_norm": 23.125, + "learning_rate": 1.647685517945337e-06, + "loss": 1.596376895904541, + "step": 5404 + }, + { + "epoch": 1.6638965835641737, + "grad_norm": 10.5625, + "learning_rate": 1.6468944749476985e-06, + "loss": 1.4453678131103516, + "step": 5406 + }, + { + "epoch": 1.66451215758695, + "grad_norm": 13.375, + "learning_rate": 1.6461034995769456e-06, + "loss": 1.5522806644439697, + "step": 5408 + }, + { + "epoch": 1.6651277316097262, + "grad_norm": 17.875, + "learning_rate": 1.6453125921824527e-06, + "loss": 1.395452618598938, + "step": 5410 + }, + { + "epoch": 1.6657433056325024, + "grad_norm": 11.375, + "learning_rate": 1.6445217531135652e-06, + "loss": 1.5645806789398193, + "step": 5412 + }, + { + "epoch": 1.6663588796552786, + "grad_norm": 10.0, + "learning_rate": 1.6437309827195975e-06, + "loss": 1.3590474128723145, + "step": 5414 + }, + { + "epoch": 1.6669744536780549, + "grad_norm": 11.8125, + "learning_rate": 1.6429402813498334e-06, + "loss": 1.3016057014465332, + "step": 5416 + }, + { + "epoch": 1.667590027700831, + "grad_norm": 11.6875, + "learning_rate": 1.6421496493535271e-06, + "loss": 1.5097131729125977, + "step": 5418 + }, + { + "epoch": 1.6682056017236073, + "grad_norm": 17.5, + "learning_rate": 1.6413590870799006e-06, + "loss": 1.579138994216919, + "step": 5420 + }, + { + "epoch": 1.6688211757463836, + "grad_norm": 22.625, + "learning_rate": 1.6405685948781474e-06, + "loss": 1.4919606447219849, + "step": 5422 + }, + { + "epoch": 1.6694367497691598, + "grad_norm": 33.5, + "learning_rate": 1.639778173097429e-06, + "loss": 1.4293932914733887, + "step": 5424 + }, + { + "epoch": 1.670052323791936, + "grad_norm": 22.375, + "learning_rate": 1.6389878220868742e-06, + "loss": 0.9004104137420654, + "step": 5426 + }, + { + "epoch": 1.6706678978147123, + "grad_norm": 29.625, + "learning_rate": 1.638197542195583e-06, + "loss": 1.6715246438980103, + "step": 5428 + }, + { + "epoch": 1.6712834718374885, + "grad_norm": 8.5, + "learning_rate": 1.6374073337726228e-06, + "loss": 1.551560878753662, + "step": 5430 + }, + { + "epoch": 1.6718990458602647, + "grad_norm": 54.75, + "learning_rate": 1.6366171971670287e-06, + "loss": 1.8659555912017822, + "step": 5432 + }, + { + "epoch": 1.672514619883041, + "grad_norm": 6.6875, + "learning_rate": 1.6358271327278063e-06, + "loss": 1.1660687923431396, + "step": 5434 + }, + { + "epoch": 1.6731301939058172, + "grad_norm": 15.5625, + "learning_rate": 1.6350371408039269e-06, + "loss": 1.4193882942199707, + "step": 5436 + }, + { + "epoch": 1.6737457679285934, + "grad_norm": 12.375, + "learning_rate": 1.6342472217443313e-06, + "loss": 1.1343913078308105, + "step": 5438 + }, + { + "epoch": 1.6743613419513697, + "grad_norm": 17.0, + "learning_rate": 1.6334573758979277e-06, + "loss": 1.5491418838500977, + "step": 5440 + }, + { + "epoch": 1.6749769159741459, + "grad_norm": 16.375, + "learning_rate": 1.6326676036135919e-06, + "loss": 1.5517988204956055, + "step": 5442 + }, + { + "epoch": 1.6755924899969221, + "grad_norm": 22.0, + "learning_rate": 1.6318779052401675e-06, + "loss": 1.1505072116851807, + "step": 5444 + }, + { + "epoch": 1.6762080640196984, + "grad_norm": 18.0, + "learning_rate": 1.631088281126464e-06, + "loss": 0.7888588905334473, + "step": 5446 + }, + { + "epoch": 1.6768236380424746, + "grad_norm": 28.875, + "learning_rate": 1.6302987316212604e-06, + "loss": 0.8521158695220947, + "step": 5448 + }, + { + "epoch": 1.6774392120652508, + "grad_norm": 26.875, + "learning_rate": 1.6295092570733016e-06, + "loss": 1.5754749774932861, + "step": 5450 + }, + { + "epoch": 1.678054786088027, + "grad_norm": 6.59375, + "learning_rate": 1.6287198578312986e-06, + "loss": 1.3611557483673096, + "step": 5452 + }, + { + "epoch": 1.6786703601108033, + "grad_norm": 34.0, + "learning_rate": 1.6279305342439308e-06, + "loss": 0.6813716888427734, + "step": 5454 + }, + { + "epoch": 1.6792859341335795, + "grad_norm": 30.125, + "learning_rate": 1.6271412866598432e-06, + "loss": 1.5481641292572021, + "step": 5456 + }, + { + "epoch": 1.6799015081563557, + "grad_norm": 12.5625, + "learning_rate": 1.6263521154276469e-06, + "loss": 1.5015933513641357, + "step": 5458 + }, + { + "epoch": 1.680517082179132, + "grad_norm": 17.875, + "learning_rate": 1.62556302089592e-06, + "loss": 1.2945340871810913, + "step": 5460 + }, + { + "epoch": 1.6811326562019082, + "grad_norm": 21.375, + "learning_rate": 1.6247740034132063e-06, + "loss": 1.0310568809509277, + "step": 5462 + }, + { + "epoch": 1.6817482302246844, + "grad_norm": 9.625, + "learning_rate": 1.6239850633280158e-06, + "loss": 1.0926060676574707, + "step": 5464 + }, + { + "epoch": 1.6823638042474607, + "grad_norm": 5.9375, + "learning_rate": 1.6231962009888247e-06, + "loss": 1.2937240600585938, + "step": 5466 + }, + { + "epoch": 1.682979378270237, + "grad_norm": 21.75, + "learning_rate": 1.6224074167440741e-06, + "loss": 1.2178066968917847, + "step": 5468 + }, + { + "epoch": 1.6835949522930131, + "grad_norm": 5.65625, + "learning_rate": 1.6216187109421704e-06, + "loss": 1.1452090740203857, + "step": 5470 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 11.4375, + "learning_rate": 1.6208300839314868e-06, + "loss": 1.3337724208831787, + "step": 5472 + }, + { + "epoch": 1.6848261003385656, + "grad_norm": 5.90625, + "learning_rate": 1.6200415360603596e-06, + "loss": 1.1718332767486572, + "step": 5474 + }, + { + "epoch": 1.6854416743613418, + "grad_norm": 37.75, + "learning_rate": 1.6192530676770923e-06, + "loss": 1.3318912982940674, + "step": 5476 + }, + { + "epoch": 1.686057248384118, + "grad_norm": 19.875, + "learning_rate": 1.6184646791299515e-06, + "loss": 1.6451802253723145, + "step": 5478 + }, + { + "epoch": 1.6866728224068943, + "grad_norm": 18.5, + "learning_rate": 1.6176763707671707e-06, + "loss": 1.4989774227142334, + "step": 5480 + }, + { + "epoch": 1.6872883964296705, + "grad_norm": 27.125, + "learning_rate": 1.6168881429369443e-06, + "loss": 1.7403260469436646, + "step": 5482 + }, + { + "epoch": 1.6879039704524468, + "grad_norm": 18.375, + "learning_rate": 1.6160999959874356e-06, + "loss": 1.7129693031311035, + "step": 5484 + }, + { + "epoch": 1.688519544475223, + "grad_norm": 7.1875, + "learning_rate": 1.6153119302667695e-06, + "loss": 1.4952833652496338, + "step": 5486 + }, + { + "epoch": 1.6891351184979992, + "grad_norm": 23.75, + "learning_rate": 1.6145239461230345e-06, + "loss": 1.4523990154266357, + "step": 5488 + }, + { + "epoch": 1.6897506925207755, + "grad_norm": 11.3125, + "learning_rate": 1.6137360439042855e-06, + "loss": 1.563145399093628, + "step": 5490 + }, + { + "epoch": 1.6903662665435517, + "grad_norm": 7.375, + "learning_rate": 1.6129482239585387e-06, + "loss": 1.189328908920288, + "step": 5492 + }, + { + "epoch": 1.690981840566328, + "grad_norm": 34.5, + "learning_rate": 1.6121604866337753e-06, + "loss": 1.478722095489502, + "step": 5494 + }, + { + "epoch": 1.6915974145891042, + "grad_norm": 19.125, + "learning_rate": 1.6113728322779404e-06, + "loss": 1.5562196969985962, + "step": 5496 + }, + { + "epoch": 1.6922129886118806, + "grad_norm": 27.625, + "learning_rate": 1.6105852612389413e-06, + "loss": 1.449194073677063, + "step": 5498 + }, + { + "epoch": 1.6928285626346569, + "grad_norm": 19.25, + "learning_rate": 1.6097977738646492e-06, + "loss": 1.2681084871292114, + "step": 5500 + }, + { + "epoch": 1.693444136657433, + "grad_norm": 11.125, + "learning_rate": 1.6090103705028978e-06, + "loss": 1.376092791557312, + "step": 5502 + }, + { + "epoch": 1.6940597106802093, + "grad_norm": 18.25, + "learning_rate": 1.6082230515014844e-06, + "loss": 0.9993728995323181, + "step": 5504 + }, + { + "epoch": 1.6946752847029856, + "grad_norm": 6.5625, + "learning_rate": 1.6074358172081692e-06, + "loss": 1.1781853437423706, + "step": 5506 + }, + { + "epoch": 1.6952908587257618, + "grad_norm": 17.25, + "learning_rate": 1.6066486679706731e-06, + "loss": 1.278997778892517, + "step": 5508 + }, + { + "epoch": 1.695906432748538, + "grad_norm": 6.5, + "learning_rate": 1.6058616041366823e-06, + "loss": 1.3280869722366333, + "step": 5510 + }, + { + "epoch": 1.6965220067713143, + "grad_norm": 9.875, + "learning_rate": 1.6050746260538435e-06, + "loss": 1.5951027870178223, + "step": 5512 + }, + { + "epoch": 1.6971375807940905, + "grad_norm": 14.4375, + "learning_rate": 1.604287734069765e-06, + "loss": 1.4890497922897339, + "step": 5514 + }, + { + "epoch": 1.6977531548168667, + "grad_norm": 29.875, + "learning_rate": 1.6035009285320186e-06, + "loss": 1.2161567211151123, + "step": 5516 + }, + { + "epoch": 1.698368728839643, + "grad_norm": 17.375, + "learning_rate": 1.602714209788137e-06, + "loss": 1.6530604362487793, + "step": 5518 + }, + { + "epoch": 1.6989843028624192, + "grad_norm": 9.5, + "learning_rate": 1.6019275781856147e-06, + "loss": 1.4269185066223145, + "step": 5520 + }, + { + "epoch": 1.6995998768851954, + "grad_norm": 11.0, + "learning_rate": 1.601141034071908e-06, + "loss": 1.2811148166656494, + "step": 5522 + }, + { + "epoch": 1.7002154509079717, + "grad_norm": 15.0, + "learning_rate": 1.600354577794434e-06, + "loss": 1.6427247524261475, + "step": 5524 + }, + { + "epoch": 1.700831024930748, + "grad_norm": 10.5625, + "learning_rate": 1.5995682097005709e-06, + "loss": 1.0762931108474731, + "step": 5526 + }, + { + "epoch": 1.7014465989535241, + "grad_norm": 13.375, + "learning_rate": 1.5987819301376602e-06, + "loss": 1.3468595743179321, + "step": 5528 + }, + { + "epoch": 1.7020621729763004, + "grad_norm": 7.09375, + "learning_rate": 1.5979957394530007e-06, + "loss": 1.1727588176727295, + "step": 5530 + }, + { + "epoch": 1.7026777469990766, + "grad_norm": 25.0, + "learning_rate": 1.5972096379938543e-06, + "loss": 1.1424286365509033, + "step": 5532 + }, + { + "epoch": 1.7032933210218528, + "grad_norm": 12.0, + "learning_rate": 1.5964236261074432e-06, + "loss": 1.0981193780899048, + "step": 5534 + }, + { + "epoch": 1.7039088950446293, + "grad_norm": 8.6875, + "learning_rate": 1.5956377041409495e-06, + "loss": 1.2754977941513062, + "step": 5536 + }, + { + "epoch": 1.7045244690674055, + "grad_norm": 10.5625, + "learning_rate": 1.5948518724415154e-06, + "loss": 1.2789602279663086, + "step": 5538 + }, + { + "epoch": 1.7051400430901817, + "grad_norm": 10.75, + "learning_rate": 1.5940661313562445e-06, + "loss": 1.104224443435669, + "step": 5540 + }, + { + "epoch": 1.705755617112958, + "grad_norm": 9.3125, + "learning_rate": 1.5932804812321991e-06, + "loss": 1.0468237400054932, + "step": 5542 + }, + { + "epoch": 1.7063711911357342, + "grad_norm": 13.25, + "learning_rate": 1.5924949224164016e-06, + "loss": 1.2828927040100098, + "step": 5544 + }, + { + "epoch": 1.7069867651585104, + "grad_norm": 12.375, + "learning_rate": 1.5917094552558343e-06, + "loss": 1.4367725849151611, + "step": 5546 + }, + { + "epoch": 1.7076023391812867, + "grad_norm": 11.0625, + "learning_rate": 1.5909240800974395e-06, + "loss": 1.252647876739502, + "step": 5548 + }, + { + "epoch": 1.708217913204063, + "grad_norm": 8.875, + "learning_rate": 1.5901387972881177e-06, + "loss": 1.0920422077178955, + "step": 5550 + }, + { + "epoch": 1.7088334872268391, + "grad_norm": 11.75, + "learning_rate": 1.5893536071747289e-06, + "loss": 1.3846423625946045, + "step": 5552 + }, + { + "epoch": 1.7094490612496154, + "grad_norm": 19.0, + "learning_rate": 1.5885685101040934e-06, + "loss": 1.7853939533233643, + "step": 5554 + }, + { + "epoch": 1.7100646352723916, + "grad_norm": 13.0, + "learning_rate": 1.587783506422989e-06, + "loss": 1.0750497579574585, + "step": 5556 + }, + { + "epoch": 1.7106802092951678, + "grad_norm": 10.0625, + "learning_rate": 1.5869985964781524e-06, + "loss": 1.1922788619995117, + "step": 5558 + }, + { + "epoch": 1.711295783317944, + "grad_norm": 23.125, + "learning_rate": 1.58621378061628e-06, + "loss": 1.2307591438293457, + "step": 5560 + }, + { + "epoch": 1.7119113573407203, + "grad_norm": 19.75, + "learning_rate": 1.585429059184025e-06, + "loss": 1.4407920837402344, + "step": 5562 + }, + { + "epoch": 1.7125269313634965, + "grad_norm": 55.25, + "learning_rate": 1.5846444325280004e-06, + "loss": 1.307939887046814, + "step": 5564 + }, + { + "epoch": 1.7131425053862728, + "grad_norm": 28.125, + "learning_rate": 1.5838599009947765e-06, + "loss": 1.2787885665893555, + "step": 5566 + }, + { + "epoch": 1.713758079409049, + "grad_norm": 17.5, + "learning_rate": 1.5830754649308819e-06, + "loss": 1.3981409072875977, + "step": 5568 + }, + { + "epoch": 1.7143736534318252, + "grad_norm": 25.625, + "learning_rate": 1.5822911246828024e-06, + "loss": 0.7358442544937134, + "step": 5570 + }, + { + "epoch": 1.7149892274546015, + "grad_norm": 11.0, + "learning_rate": 1.5815068805969829e-06, + "loss": 1.253906011581421, + "step": 5572 + }, + { + "epoch": 1.7156048014773777, + "grad_norm": 16.5, + "learning_rate": 1.5807227330198241e-06, + "loss": 1.49554443359375, + "step": 5574 + }, + { + "epoch": 1.716220375500154, + "grad_norm": 21.25, + "learning_rate": 1.5799386822976849e-06, + "loss": 1.349895715713501, + "step": 5576 + }, + { + "epoch": 1.7168359495229302, + "grad_norm": 29.0, + "learning_rate": 1.5791547287768816e-06, + "loss": 1.6913847923278809, + "step": 5578 + }, + { + "epoch": 1.7174515235457064, + "grad_norm": 73.0, + "learning_rate": 1.578370872803688e-06, + "loss": 1.261209487915039, + "step": 5580 + }, + { + "epoch": 1.7180670975684826, + "grad_norm": 24.625, + "learning_rate": 1.577587114724333e-06, + "loss": 1.4275460243225098, + "step": 5582 + }, + { + "epoch": 1.7186826715912589, + "grad_norm": 17.125, + "learning_rate": 1.5768034548850043e-06, + "loss": 1.113776683807373, + "step": 5584 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 10.625, + "learning_rate": 1.5760198936318447e-06, + "loss": 1.2948540449142456, + "step": 5586 + }, + { + "epoch": 1.7199138196368113, + "grad_norm": 10.25, + "learning_rate": 1.575236431310955e-06, + "loss": 1.360956072807312, + "step": 5588 + }, + { + "epoch": 1.7205293936595876, + "grad_norm": 7.84375, + "learning_rate": 1.5744530682683908e-06, + "loss": 0.9487060308456421, + "step": 5590 + }, + { + "epoch": 1.7211449676823638, + "grad_norm": 39.0, + "learning_rate": 1.573669804850164e-06, + "loss": 1.8299914598464966, + "step": 5592 + }, + { + "epoch": 1.72176054170514, + "grad_norm": 13.0625, + "learning_rate": 1.572886641402244e-06, + "loss": 1.2598625421524048, + "step": 5594 + }, + { + "epoch": 1.7223761157279163, + "grad_norm": 7.875, + "learning_rate": 1.5721035782705546e-06, + "loss": 1.1927969455718994, + "step": 5596 + }, + { + "epoch": 1.7229916897506925, + "grad_norm": 7.875, + "learning_rate": 1.5713206158009751e-06, + "loss": 1.1537501811981201, + "step": 5598 + }, + { + "epoch": 1.7236072637734687, + "grad_norm": 17.25, + "learning_rate": 1.5705377543393415e-06, + "loss": 1.6629645824432373, + "step": 5600 + }, + { + "epoch": 1.724222837796245, + "grad_norm": 21.0, + "learning_rate": 1.5697549942314443e-06, + "loss": 1.2592394351959229, + "step": 5602 + }, + { + "epoch": 1.7248384118190212, + "grad_norm": 12.5, + "learning_rate": 1.5689723358230306e-06, + "loss": 1.2869166135787964, + "step": 5604 + }, + { + "epoch": 1.7254539858417974, + "grad_norm": 15.25, + "learning_rate": 1.5681897794598e-06, + "loss": 1.4332456588745117, + "step": 5606 + }, + { + "epoch": 1.7260695598645737, + "grad_norm": 43.0, + "learning_rate": 1.5674073254874092e-06, + "loss": 1.5304079055786133, + "step": 5608 + }, + { + "epoch": 1.72668513388735, + "grad_norm": 19.875, + "learning_rate": 1.566624974251469e-06, + "loss": 1.7841585874557495, + "step": 5610 + }, + { + "epoch": 1.7273007079101261, + "grad_norm": 8.6875, + "learning_rate": 1.5658427260975448e-06, + "loss": 1.252263069152832, + "step": 5612 + }, + { + "epoch": 1.7279162819329024, + "grad_norm": 10.3125, + "learning_rate": 1.5650605813711564e-06, + "loss": 1.31412935256958, + "step": 5614 + }, + { + "epoch": 1.7285318559556786, + "grad_norm": 11.5, + "learning_rate": 1.5642785404177783e-06, + "loss": 1.274031162261963, + "step": 5616 + }, + { + "epoch": 1.7291474299784548, + "grad_norm": 13.875, + "learning_rate": 1.563496603582839e-06, + "loss": 1.690305233001709, + "step": 5618 + }, + { + "epoch": 1.729763004001231, + "grad_norm": 13.9375, + "learning_rate": 1.56271477121172e-06, + "loss": 1.4655866622924805, + "step": 5620 + }, + { + "epoch": 1.7303785780240073, + "grad_norm": 5.8125, + "learning_rate": 1.5619330436497587e-06, + "loss": 1.5229971408843994, + "step": 5622 + }, + { + "epoch": 1.7309941520467835, + "grad_norm": 15.0, + "learning_rate": 1.561151421242245e-06, + "loss": 1.4558359384536743, + "step": 5624 + }, + { + "epoch": 1.7316097260695598, + "grad_norm": 15.1875, + "learning_rate": 1.5603699043344205e-06, + "loss": 1.6087075471878052, + "step": 5626 + }, + { + "epoch": 1.732225300092336, + "grad_norm": 13.0, + "learning_rate": 1.5595884932714848e-06, + "loss": 1.607154369354248, + "step": 5628 + }, + { + "epoch": 1.7328408741151122, + "grad_norm": 14.8125, + "learning_rate": 1.5588071883985866e-06, + "loss": 1.6805574893951416, + "step": 5630 + }, + { + "epoch": 1.7334564481378885, + "grad_norm": 10.25, + "learning_rate": 1.558025990060829e-06, + "loss": 1.3068217039108276, + "step": 5632 + }, + { + "epoch": 1.7340720221606647, + "grad_norm": 5.5625, + "learning_rate": 1.557244898603269e-06, + "loss": 1.0744372606277466, + "step": 5634 + }, + { + "epoch": 1.734687596183441, + "grad_norm": 12.75, + "learning_rate": 1.5564639143709149e-06, + "loss": 1.440709114074707, + "step": 5636 + }, + { + "epoch": 1.7353031702062172, + "grad_norm": 37.0, + "learning_rate": 1.5556830377087283e-06, + "loss": 1.4084618091583252, + "step": 5638 + }, + { + "epoch": 1.7359187442289934, + "grad_norm": 17.375, + "learning_rate": 1.5549022689616245e-06, + "loss": 1.2655439376831055, + "step": 5640 + }, + { + "epoch": 1.7365343182517696, + "grad_norm": 24.0, + "learning_rate": 1.5541216084744686e-06, + "loss": 1.3823131322860718, + "step": 5642 + }, + { + "epoch": 1.7371498922745459, + "grad_norm": 64.0, + "learning_rate": 1.5533410565920793e-06, + "loss": 1.3828883171081543, + "step": 5644 + }, + { + "epoch": 1.737765466297322, + "grad_norm": 56.5, + "learning_rate": 1.552560613659228e-06, + "loss": 1.0084829330444336, + "step": 5646 + }, + { + "epoch": 1.7383810403200985, + "grad_norm": 21.875, + "learning_rate": 1.5517802800206373e-06, + "loss": 1.5251989364624023, + "step": 5648 + }, + { + "epoch": 1.7389966143428748, + "grad_norm": 40.25, + "learning_rate": 1.5510000560209802e-06, + "loss": 1.2445313930511475, + "step": 5650 + }, + { + "epoch": 1.739612188365651, + "grad_norm": 23.125, + "learning_rate": 1.550219942004884e-06, + "loss": 1.4254047870635986, + "step": 5652 + }, + { + "epoch": 1.7402277623884272, + "grad_norm": 10.625, + "learning_rate": 1.5494399383169256e-06, + "loss": 1.275312066078186, + "step": 5654 + }, + { + "epoch": 1.7408433364112035, + "grad_norm": 13.0, + "learning_rate": 1.5486600453016328e-06, + "loss": 1.4267611503601074, + "step": 5656 + }, + { + "epoch": 1.7414589104339797, + "grad_norm": 32.0, + "learning_rate": 1.5478802633034857e-06, + "loss": 1.2506225109100342, + "step": 5658 + }, + { + "epoch": 1.742074484456756, + "grad_norm": 10.375, + "learning_rate": 1.5471005926669154e-06, + "loss": 1.3239362239837646, + "step": 5660 + }, + { + "epoch": 1.7426900584795322, + "grad_norm": 9.0, + "learning_rate": 1.5463210337363023e-06, + "loss": 1.0405347347259521, + "step": 5662 + }, + { + "epoch": 1.7433056325023084, + "grad_norm": 18.875, + "learning_rate": 1.5455415868559788e-06, + "loss": 1.4487731456756592, + "step": 5664 + }, + { + "epoch": 1.7439212065250846, + "grad_norm": 12.5, + "learning_rate": 1.5447622523702285e-06, + "loss": 1.560713291168213, + "step": 5666 + }, + { + "epoch": 1.7445367805478609, + "grad_norm": 8.6875, + "learning_rate": 1.5439830306232833e-06, + "loss": 1.1130002737045288, + "step": 5668 + }, + { + "epoch": 1.745152354570637, + "grad_norm": 16.125, + "learning_rate": 1.5432039219593258e-06, + "loss": 1.4023849964141846, + "step": 5670 + }, + { + "epoch": 1.7457679285934133, + "grad_norm": 13.125, + "learning_rate": 1.5424249267224906e-06, + "loss": 1.2204084396362305, + "step": 5672 + }, + { + "epoch": 1.7463835026161896, + "grad_norm": 6.09375, + "learning_rate": 1.5416460452568602e-06, + "loss": 1.0636701583862305, + "step": 5674 + }, + { + "epoch": 1.7469990766389658, + "grad_norm": 32.75, + "learning_rate": 1.5408672779064668e-06, + "loss": 1.237810492515564, + "step": 5676 + }, + { + "epoch": 1.747614650661742, + "grad_norm": 8.4375, + "learning_rate": 1.5400886250152934e-06, + "loss": 0.7522115707397461, + "step": 5678 + }, + { + "epoch": 1.7482302246845183, + "grad_norm": 9.25, + "learning_rate": 1.5393100869272726e-06, + "loss": 1.1993417739868164, + "step": 5680 + }, + { + "epoch": 1.7488457987072945, + "grad_norm": 36.0, + "learning_rate": 1.538531663986284e-06, + "loss": 1.330182433128357, + "step": 5682 + }, + { + "epoch": 1.7494613727300707, + "grad_norm": 14.0, + "learning_rate": 1.5377533565361593e-06, + "loss": 1.4139305353164673, + "step": 5684 + }, + { + "epoch": 1.7500769467528472, + "grad_norm": 8.4375, + "learning_rate": 1.536975164920677e-06, + "loss": 1.4003902673721313, + "step": 5686 + }, + { + "epoch": 1.7506925207756234, + "grad_norm": 7.4375, + "learning_rate": 1.536197089483565e-06, + "loss": 1.389980435371399, + "step": 5688 + }, + { + "epoch": 1.7513080947983997, + "grad_norm": 15.8125, + "learning_rate": 1.5354191305685015e-06, + "loss": 1.1338951587677002, + "step": 5690 + }, + { + "epoch": 1.751923668821176, + "grad_norm": 110.5, + "learning_rate": 1.5346412885191106e-06, + "loss": 1.5165810585021973, + "step": 5692 + }, + { + "epoch": 1.7525392428439521, + "grad_norm": 10.25, + "learning_rate": 1.5338635636789656e-06, + "loss": 1.3715205192565918, + "step": 5694 + }, + { + "epoch": 1.7531548168667284, + "grad_norm": 11.5625, + "learning_rate": 1.5330859563915895e-06, + "loss": 1.2929373979568481, + "step": 5696 + }, + { + "epoch": 1.7537703908895046, + "grad_norm": 9.5625, + "learning_rate": 1.5323084670004526e-06, + "loss": 1.5255780220031738, + "step": 5698 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 8.0, + "learning_rate": 1.5315310958489717e-06, + "loss": 0.971356987953186, + "step": 5700 + }, + { + "epoch": 1.755001538935057, + "grad_norm": 15.5, + "learning_rate": 1.5307538432805136e-06, + "loss": 1.3214330673217773, + "step": 5702 + }, + { + "epoch": 1.7556171129578333, + "grad_norm": 10.1875, + "learning_rate": 1.5299767096383908e-06, + "loss": 1.6737107038497925, + "step": 5704 + }, + { + "epoch": 1.7562326869806095, + "grad_norm": 11.375, + "learning_rate": 1.5291996952658643e-06, + "loss": 1.386177897453308, + "step": 5706 + }, + { + "epoch": 1.7568482610033858, + "grad_norm": 13.25, + "learning_rate": 1.5284228005061426e-06, + "loss": 1.318868637084961, + "step": 5708 + }, + { + "epoch": 1.757463835026162, + "grad_norm": 13.5625, + "learning_rate": 1.527646025702381e-06, + "loss": 1.4786326885223389, + "step": 5710 + }, + { + "epoch": 1.7580794090489382, + "grad_norm": 16.875, + "learning_rate": 1.526869371197681e-06, + "loss": 1.4338879585266113, + "step": 5712 + }, + { + "epoch": 1.7586949830717145, + "grad_norm": 9.125, + "learning_rate": 1.5260928373350926e-06, + "loss": 1.275748372077942, + "step": 5714 + }, + { + "epoch": 1.7593105570944907, + "grad_norm": 16.625, + "learning_rate": 1.5253164244576118e-06, + "loss": 1.496471881866455, + "step": 5716 + }, + { + "epoch": 1.759926131117267, + "grad_norm": 18.25, + "learning_rate": 1.5245401329081808e-06, + "loss": 1.175106406211853, + "step": 5718 + }, + { + "epoch": 1.7605417051400432, + "grad_norm": 11.0, + "learning_rate": 1.5237639630296883e-06, + "loss": 1.1941828727722168, + "step": 5720 + }, + { + "epoch": 1.7611572791628194, + "grad_norm": 40.25, + "learning_rate": 1.5229879151649696e-06, + "loss": 1.853593349456787, + "step": 5722 + }, + { + "epoch": 1.7617728531855956, + "grad_norm": 20.0, + "learning_rate": 1.5222119896568056e-06, + "loss": 1.6410892009735107, + "step": 5724 + }, + { + "epoch": 1.7623884272083719, + "grad_norm": 16.5, + "learning_rate": 1.521436186847924e-06, + "loss": 1.4642964601516724, + "step": 5726 + }, + { + "epoch": 1.763004001231148, + "grad_norm": 14.8125, + "learning_rate": 1.5206605070809976e-06, + "loss": 1.4115772247314453, + "step": 5728 + }, + { + "epoch": 1.7636195752539243, + "grad_norm": 36.75, + "learning_rate": 1.5198849506986451e-06, + "loss": 1.3137972354888916, + "step": 5730 + }, + { + "epoch": 1.7642351492767006, + "grad_norm": 16.75, + "learning_rate": 1.51910951804343e-06, + "loss": 1.5579824447631836, + "step": 5732 + }, + { + "epoch": 1.7648507232994768, + "grad_norm": 15.0, + "learning_rate": 1.518334209457863e-06, + "loss": 1.5074825286865234, + "step": 5734 + }, + { + "epoch": 1.765466297322253, + "grad_norm": 3.703125, + "learning_rate": 1.5175590252843977e-06, + "loss": 1.1944538354873657, + "step": 5736 + }, + { + "epoch": 1.7660818713450293, + "grad_norm": 10.0, + "learning_rate": 1.516783965865434e-06, + "loss": 1.349628210067749, + "step": 5738 + }, + { + "epoch": 1.7666974453678055, + "grad_norm": 24.5, + "learning_rate": 1.5160090315433172e-06, + "loss": 1.5806326866149902, + "step": 5740 + }, + { + "epoch": 1.7673130193905817, + "grad_norm": 11.8125, + "learning_rate": 1.5152342226603363e-06, + "loss": 0.827213704586029, + "step": 5742 + }, + { + "epoch": 1.767928593413358, + "grad_norm": 12.25, + "learning_rate": 1.514459539558725e-06, + "loss": 1.1036474704742432, + "step": 5744 + }, + { + "epoch": 1.7685441674361342, + "grad_norm": 16.375, + "learning_rate": 1.5136849825806622e-06, + "loss": 1.5632669925689697, + "step": 5746 + }, + { + "epoch": 1.7691597414589104, + "grad_norm": 11.1875, + "learning_rate": 1.5129105520682701e-06, + "loss": 1.4175076484680176, + "step": 5748 + }, + { + "epoch": 1.7697753154816867, + "grad_norm": 9.9375, + "learning_rate": 1.5121362483636157e-06, + "loss": 1.2292062044143677, + "step": 5750 + }, + { + "epoch": 1.7703908895044629, + "grad_norm": 18.125, + "learning_rate": 1.5113620718087104e-06, + "loss": 1.5425353050231934, + "step": 5752 + }, + { + "epoch": 1.7710064635272391, + "grad_norm": 13.0, + "learning_rate": 1.510588022745508e-06, + "loss": 1.1885464191436768, + "step": 5754 + }, + { + "epoch": 1.7716220375500153, + "grad_norm": 15.625, + "learning_rate": 1.5098141015159072e-06, + "loss": 1.244379997253418, + "step": 5756 + }, + { + "epoch": 1.7722376115727916, + "grad_norm": 5.21875, + "learning_rate": 1.50904030846175e-06, + "loss": 1.322556972503662, + "step": 5758 + }, + { + "epoch": 1.7728531855955678, + "grad_norm": 21.125, + "learning_rate": 1.5082666439248217e-06, + "loss": 1.8244168758392334, + "step": 5760 + }, + { + "epoch": 1.773468759618344, + "grad_norm": 42.0, + "learning_rate": 1.5074931082468508e-06, + "loss": 1.696357250213623, + "step": 5762 + }, + { + "epoch": 1.7740843336411203, + "grad_norm": 6.40625, + "learning_rate": 1.5067197017695088e-06, + "loss": 1.1527273654937744, + "step": 5764 + }, + { + "epoch": 1.7746999076638965, + "grad_norm": 6.375, + "learning_rate": 1.5059464248344102e-06, + "loss": 1.110656976699829, + "step": 5766 + }, + { + "epoch": 1.7753154816866727, + "grad_norm": 9.5, + "learning_rate": 1.5051732777831122e-06, + "loss": 1.2908079624176025, + "step": 5768 + }, + { + "epoch": 1.775931055709449, + "grad_norm": 26.0, + "learning_rate": 1.5044002609571155e-06, + "loss": 1.5653884410858154, + "step": 5770 + }, + { + "epoch": 1.7765466297322252, + "grad_norm": 13.4375, + "learning_rate": 1.5036273746978614e-06, + "loss": 1.3660342693328857, + "step": 5772 + }, + { + "epoch": 1.7771622037550014, + "grad_norm": 21.5, + "learning_rate": 1.5028546193467355e-06, + "loss": 1.4595375061035156, + "step": 5774 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 9.25, + "learning_rate": 1.5020819952450642e-06, + "loss": 1.145453691482544, + "step": 5776 + }, + { + "epoch": 1.778393351800554, + "grad_norm": 14.375, + "learning_rate": 1.5013095027341166e-06, + "loss": 1.477217435836792, + "step": 5778 + }, + { + "epoch": 1.7790089258233301, + "grad_norm": 8.8125, + "learning_rate": 1.5005371421551035e-06, + "loss": 1.1808829307556152, + "step": 5780 + }, + { + "epoch": 1.7796244998461064, + "grad_norm": 7.0, + "learning_rate": 1.4997649138491773e-06, + "loss": 1.1786186695098877, + "step": 5782 + }, + { + "epoch": 1.7802400738688826, + "grad_norm": 6.21875, + "learning_rate": 1.4989928181574329e-06, + "loss": 1.0353856086730957, + "step": 5784 + }, + { + "epoch": 1.7808556478916588, + "grad_norm": 13.5, + "learning_rate": 1.498220855420905e-06, + "loss": 1.128792405128479, + "step": 5786 + }, + { + "epoch": 1.781471221914435, + "grad_norm": 12.0, + "learning_rate": 1.4974490259805706e-06, + "loss": 1.2422773838043213, + "step": 5788 + }, + { + "epoch": 1.7820867959372113, + "grad_norm": 12.25, + "learning_rate": 1.4966773301773479e-06, + "loss": 1.050607681274414, + "step": 5790 + }, + { + "epoch": 1.7827023699599875, + "grad_norm": 6.09375, + "learning_rate": 1.4959057683520958e-06, + "loss": 1.0194100141525269, + "step": 5792 + }, + { + "epoch": 1.7833179439827638, + "grad_norm": 11.9375, + "learning_rate": 1.4951343408456137e-06, + "loss": 1.3442493677139282, + "step": 5794 + }, + { + "epoch": 1.78393351800554, + "grad_norm": 15.3125, + "learning_rate": 1.4943630479986425e-06, + "loss": 1.3655450344085693, + "step": 5796 + }, + { + "epoch": 1.7845490920283162, + "grad_norm": 10.75, + "learning_rate": 1.4935918901518626e-06, + "loss": 1.3164520263671875, + "step": 5798 + }, + { + "epoch": 1.7851646660510927, + "grad_norm": 17.25, + "learning_rate": 1.4928208676458954e-06, + "loss": 1.3639867305755615, + "step": 5800 + }, + { + "epoch": 1.785780240073869, + "grad_norm": 9.625, + "learning_rate": 1.4920499808213032e-06, + "loss": 1.3705554008483887, + "step": 5802 + }, + { + "epoch": 1.7863958140966452, + "grad_norm": 7.21875, + "learning_rate": 1.491279230018587e-06, + "loss": 1.1877235174179077, + "step": 5804 + }, + { + "epoch": 1.7870113881194214, + "grad_norm": 13.1875, + "learning_rate": 1.4905086155781874e-06, + "loss": 1.4518612623214722, + "step": 5806 + }, + { + "epoch": 1.7876269621421976, + "grad_norm": 11.875, + "learning_rate": 1.4897381378404874e-06, + "loss": 1.3673511743545532, + "step": 5808 + }, + { + "epoch": 1.7882425361649739, + "grad_norm": 12.3125, + "learning_rate": 1.4889677971458068e-06, + "loss": 1.3825560808181763, + "step": 5810 + }, + { + "epoch": 1.78885811018775, + "grad_norm": 10.5625, + "learning_rate": 1.4881975938344063e-06, + "loss": 1.306952953338623, + "step": 5812 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 14.0, + "learning_rate": 1.4874275282464858e-06, + "loss": 1.19303297996521, + "step": 5814 + }, + { + "epoch": 1.7900892582333026, + "grad_norm": 44.25, + "learning_rate": 1.4866576007221835e-06, + "loss": 1.4067680835723877, + "step": 5816 + }, + { + "epoch": 1.7907048322560788, + "grad_norm": 20.0, + "learning_rate": 1.4858878116015778e-06, + "loss": 1.089118480682373, + "step": 5818 + }, + { + "epoch": 1.791320406278855, + "grad_norm": 8.3125, + "learning_rate": 1.4851181612246851e-06, + "loss": 1.4552359580993652, + "step": 5820 + }, + { + "epoch": 1.7919359803016313, + "grad_norm": 27.5, + "learning_rate": 1.4843486499314615e-06, + "loss": 1.4873318672180176, + "step": 5822 + }, + { + "epoch": 1.7925515543244075, + "grad_norm": 17.5, + "learning_rate": 1.4835792780617998e-06, + "loss": 1.5674926042556763, + "step": 5824 + }, + { + "epoch": 1.7931671283471837, + "grad_norm": 8.9375, + "learning_rate": 1.4828100459555338e-06, + "loss": 1.306696891784668, + "step": 5826 + }, + { + "epoch": 1.79378270236996, + "grad_norm": 4.84375, + "learning_rate": 1.4820409539524335e-06, + "loss": 1.0492743253707886, + "step": 5828 + }, + { + "epoch": 1.7943982763927362, + "grad_norm": 15.5, + "learning_rate": 1.4812720023922082e-06, + "loss": 1.1006546020507812, + "step": 5830 + }, + { + "epoch": 1.7950138504155124, + "grad_norm": 10.0, + "learning_rate": 1.4805031916145034e-06, + "loss": 1.339001178741455, + "step": 5832 + }, + { + "epoch": 1.7956294244382887, + "grad_norm": 38.25, + "learning_rate": 1.4797345219589052e-06, + "loss": 1.3763446807861328, + "step": 5834 + }, + { + "epoch": 1.796244998461065, + "grad_norm": 14.5, + "learning_rate": 1.4789659937649353e-06, + "loss": 1.6027696132659912, + "step": 5836 + }, + { + "epoch": 1.7968605724838413, + "grad_norm": 15.3125, + "learning_rate": 1.478197607372053e-06, + "loss": 1.2696528434753418, + "step": 5838 + }, + { + "epoch": 1.7974761465066176, + "grad_norm": 10.6875, + "learning_rate": 1.4774293631196565e-06, + "loss": 1.2174136638641357, + "step": 5840 + }, + { + "epoch": 1.7980917205293938, + "grad_norm": 16.125, + "learning_rate": 1.4766612613470795e-06, + "loss": 1.2887904644012451, + "step": 5842 + }, + { + "epoch": 1.79870729455217, + "grad_norm": 15.0625, + "learning_rate": 1.4758933023935927e-06, + "loss": 1.4577984809875488, + "step": 5844 + }, + { + "epoch": 1.7993228685749463, + "grad_norm": 9.75, + "learning_rate": 1.475125486598406e-06, + "loss": 1.325479507446289, + "step": 5846 + }, + { + "epoch": 1.7999384425977225, + "grad_norm": 13.5625, + "learning_rate": 1.474357814300664e-06, + "loss": 1.4405393600463867, + "step": 5848 + }, + { + "epoch": 1.8005540166204987, + "grad_norm": 14.5625, + "learning_rate": 1.4735902858394473e-06, + "loss": 1.1311811208724976, + "step": 5850 + }, + { + "epoch": 1.801169590643275, + "grad_norm": 9.4375, + "learning_rate": 1.4728229015537758e-06, + "loss": 1.2644745111465454, + "step": 5852 + }, + { + "epoch": 1.8017851646660512, + "grad_norm": 14.0625, + "learning_rate": 1.4720556617826028e-06, + "loss": 0.9831246137619019, + "step": 5854 + }, + { + "epoch": 1.8024007386888274, + "grad_norm": 13.8125, + "learning_rate": 1.4712885668648192e-06, + "loss": 1.3548238277435303, + "step": 5856 + }, + { + "epoch": 1.8030163127116037, + "grad_norm": 56.75, + "learning_rate": 1.4705216171392525e-06, + "loss": 1.4002970457077026, + "step": 5858 + }, + { + "epoch": 1.80363188673438, + "grad_norm": 11.0625, + "learning_rate": 1.4697548129446644e-06, + "loss": 1.233629584312439, + "step": 5860 + }, + { + "epoch": 1.8042474607571561, + "grad_norm": 7.8125, + "learning_rate": 1.4689881546197533e-06, + "loss": 0.9660263657569885, + "step": 5862 + }, + { + "epoch": 1.8048630347799324, + "grad_norm": 15.0, + "learning_rate": 1.4682216425031536e-06, + "loss": 0.9532240629196167, + "step": 5864 + }, + { + "epoch": 1.8054786088027086, + "grad_norm": 22.0, + "learning_rate": 1.467455276933434e-06, + "loss": 1.2945117950439453, + "step": 5866 + }, + { + "epoch": 1.8060941828254848, + "grad_norm": 14.0, + "learning_rate": 1.4666890582490986e-06, + "loss": 1.5546940565109253, + "step": 5868 + }, + { + "epoch": 1.806709756848261, + "grad_norm": 19.75, + "learning_rate": 1.4659229867885885e-06, + "loss": 0.7492038011550903, + "step": 5870 + }, + { + "epoch": 1.8073253308710373, + "grad_norm": 11.25, + "learning_rate": 1.4651570628902779e-06, + "loss": 1.2762389183044434, + "step": 5872 + }, + { + "epoch": 1.8079409048938135, + "grad_norm": 13.4375, + "learning_rate": 1.464391286892475e-06, + "loss": 1.2377161979675293, + "step": 5874 + }, + { + "epoch": 1.8085564789165898, + "grad_norm": 13.125, + "learning_rate": 1.4636256591334253e-06, + "loss": 1.1009962558746338, + "step": 5876 + }, + { + "epoch": 1.809172052939366, + "grad_norm": 16.375, + "learning_rate": 1.4628601799513072e-06, + "loss": 1.4094276428222656, + "step": 5878 + }, + { + "epoch": 1.8097876269621422, + "grad_norm": 3.96875, + "learning_rate": 1.4620948496842332e-06, + "loss": 0.9528759717941284, + "step": 5880 + }, + { + "epoch": 1.8104032009849185, + "grad_norm": 5.59375, + "learning_rate": 1.4613296686702512e-06, + "loss": 1.1366877555847168, + "step": 5882 + }, + { + "epoch": 1.8110187750076947, + "grad_norm": 16.875, + "learning_rate": 1.4605646372473423e-06, + "loss": 1.3106170892715454, + "step": 5884 + }, + { + "epoch": 1.811634349030471, + "grad_norm": 17.375, + "learning_rate": 1.4597997557534215e-06, + "loss": 1.3541948795318604, + "step": 5886 + }, + { + "epoch": 1.8122499230532472, + "grad_norm": 30.625, + "learning_rate": 1.4590350245263377e-06, + "loss": 1.4160265922546387, + "step": 5888 + }, + { + "epoch": 1.8128654970760234, + "grad_norm": 18.25, + "learning_rate": 1.4582704439038745e-06, + "loss": 1.4113764762878418, + "step": 5890 + }, + { + "epoch": 1.8134810710987996, + "grad_norm": 7.3125, + "learning_rate": 1.4575060142237472e-06, + "loss": 1.2803568840026855, + "step": 5892 + }, + { + "epoch": 1.8140966451215759, + "grad_norm": 19.75, + "learning_rate": 1.456741735823605e-06, + "loss": 1.5021907091140747, + "step": 5894 + }, + { + "epoch": 1.814712219144352, + "grad_norm": 13.875, + "learning_rate": 1.4559776090410314e-06, + "loss": 1.150805950164795, + "step": 5896 + }, + { + "epoch": 1.8153277931671283, + "grad_norm": 13.5, + "learning_rate": 1.4552136342135418e-06, + "loss": 1.2975443601608276, + "step": 5898 + }, + { + "epoch": 1.8159433671899046, + "grad_norm": 30.75, + "learning_rate": 1.4544498116785845e-06, + "loss": 1.4264914989471436, + "step": 5900 + }, + { + "epoch": 1.8165589412126808, + "grad_norm": 10.875, + "learning_rate": 1.4536861417735412e-06, + "loss": 1.366098165512085, + "step": 5902 + }, + { + "epoch": 1.817174515235457, + "grad_norm": 10.3125, + "learning_rate": 1.4529226248357255e-06, + "loss": 1.4455392360687256, + "step": 5904 + }, + { + "epoch": 1.8177900892582333, + "grad_norm": 16.5, + "learning_rate": 1.4521592612023832e-06, + "loss": 1.1741234064102173, + "step": 5906 + }, + { + "epoch": 1.8184056632810095, + "grad_norm": 17.625, + "learning_rate": 1.451396051210694e-06, + "loss": 1.3307926654815674, + "step": 5908 + }, + { + "epoch": 1.8190212373037857, + "grad_norm": 10.625, + "learning_rate": 1.4506329951977679e-06, + "loss": 1.3589565753936768, + "step": 5910 + }, + { + "epoch": 1.819636811326562, + "grad_norm": 10.0, + "learning_rate": 1.4498700935006472e-06, + "loss": 1.3831872940063477, + "step": 5912 + }, + { + "epoch": 1.8202523853493382, + "grad_norm": 9.5, + "learning_rate": 1.4491073464563079e-06, + "loss": 1.084181308746338, + "step": 5914 + }, + { + "epoch": 1.8208679593721144, + "grad_norm": 18.75, + "learning_rate": 1.448344754401655e-06, + "loss": 0.9600379467010498, + "step": 5916 + }, + { + "epoch": 1.8214835333948907, + "grad_norm": 16.375, + "learning_rate": 1.4475823176735261e-06, + "loss": 1.3483178615570068, + "step": 5918 + }, + { + "epoch": 1.822099107417667, + "grad_norm": 23.625, + "learning_rate": 1.4468200366086915e-06, + "loss": 1.416744351387024, + "step": 5920 + }, + { + "epoch": 1.8227146814404431, + "grad_norm": 14.0625, + "learning_rate": 1.446057911543851e-06, + "loss": 0.886028528213501, + "step": 5922 + }, + { + "epoch": 1.8233302554632194, + "grad_norm": 14.8125, + "learning_rate": 1.4452959428156358e-06, + "loss": 1.695695161819458, + "step": 5924 + }, + { + "epoch": 1.8239458294859956, + "grad_norm": 14.0, + "learning_rate": 1.4445341307606091e-06, + "loss": 1.2986364364624023, + "step": 5926 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 17.125, + "learning_rate": 1.4437724757152637e-06, + "loss": 1.0665688514709473, + "step": 5928 + }, + { + "epoch": 1.825176977531548, + "grad_norm": 26.875, + "learning_rate": 1.4430109780160231e-06, + "loss": 1.1360788345336914, + "step": 5930 + }, + { + "epoch": 1.8257925515543243, + "grad_norm": 19.875, + "learning_rate": 1.4422496379992428e-06, + "loss": 1.3271257877349854, + "step": 5932 + }, + { + "epoch": 1.8264081255771005, + "grad_norm": 14.9375, + "learning_rate": 1.4414884560012065e-06, + "loss": 1.3400228023529053, + "step": 5934 + }, + { + "epoch": 1.8270236995998768, + "grad_norm": 27.625, + "learning_rate": 1.4407274323581296e-06, + "loss": 1.4710955619812012, + "step": 5936 + }, + { + "epoch": 1.827639273622653, + "grad_norm": 10.1875, + "learning_rate": 1.439966567406157e-06, + "loss": 1.0766589641571045, + "step": 5938 + }, + { + "epoch": 1.8282548476454292, + "grad_norm": 9.5625, + "learning_rate": 1.4392058614813643e-06, + "loss": 1.5178849697113037, + "step": 5940 + }, + { + "epoch": 1.8288704216682055, + "grad_norm": 13.3125, + "learning_rate": 1.4384453149197553e-06, + "loss": 1.1585719585418701, + "step": 5942 + }, + { + "epoch": 1.8294859956909817, + "grad_norm": 39.75, + "learning_rate": 1.4376849280572643e-06, + "loss": 1.561208724975586, + "step": 5944 + }, + { + "epoch": 1.830101569713758, + "grad_norm": 30.25, + "learning_rate": 1.4369247012297557e-06, + "loss": 1.2275128364562988, + "step": 5946 + }, + { + "epoch": 1.8307171437365342, + "grad_norm": 10.5, + "learning_rate": 1.4361646347730221e-06, + "loss": 1.3385765552520752, + "step": 5948 + }, + { + "epoch": 1.8313327177593104, + "grad_norm": 18.75, + "learning_rate": 1.4354047290227856e-06, + "loss": 1.3635698556900024, + "step": 5950 + }, + { + "epoch": 1.8319482917820868, + "grad_norm": 9.5, + "learning_rate": 1.4346449843146978e-06, + "loss": 1.3228318691253662, + "step": 5952 + }, + { + "epoch": 1.832563865804863, + "grad_norm": 10.3125, + "learning_rate": 1.433885400984339e-06, + "loss": 1.1687228679656982, + "step": 5954 + }, + { + "epoch": 1.8331794398276393, + "grad_norm": 23.0, + "learning_rate": 1.4331259793672172e-06, + "loss": 1.4278203248977661, + "step": 5956 + }, + { + "epoch": 1.8337950138504155, + "grad_norm": 16.625, + "learning_rate": 1.4323667197987704e-06, + "loss": 1.6822397708892822, + "step": 5958 + }, + { + "epoch": 1.8344105878731918, + "grad_norm": 13.8125, + "learning_rate": 1.4316076226143645e-06, + "loss": 1.465928554534912, + "step": 5960 + }, + { + "epoch": 1.835026161895968, + "grad_norm": 3.5, + "learning_rate": 1.4308486881492927e-06, + "loss": 1.2920657396316528, + "step": 5962 + }, + { + "epoch": 1.8356417359187442, + "grad_norm": 12.125, + "learning_rate": 1.4300899167387787e-06, + "loss": 0.9915714859962463, + "step": 5964 + }, + { + "epoch": 1.8362573099415205, + "grad_norm": 12.1875, + "learning_rate": 1.4293313087179717e-06, + "loss": 1.468205451965332, + "step": 5966 + }, + { + "epoch": 1.8368728839642967, + "grad_norm": 31.125, + "learning_rate": 1.4285728644219499e-06, + "loss": 1.684161901473999, + "step": 5968 + }, + { + "epoch": 1.837488457987073, + "grad_norm": 9.5, + "learning_rate": 1.427814584185719e-06, + "loss": 1.4519860744476318, + "step": 5970 + }, + { + "epoch": 1.8381040320098492, + "grad_norm": 12.0625, + "learning_rate": 1.4270564683442127e-06, + "loss": 1.1766678094863892, + "step": 5972 + }, + { + "epoch": 1.8387196060326254, + "grad_norm": 24.375, + "learning_rate": 1.426298517232291e-06, + "loss": 1.4680485725402832, + "step": 5974 + }, + { + "epoch": 1.8393351800554016, + "grad_norm": 3.1875, + "learning_rate": 1.425540731184742e-06, + "loss": 1.1224040985107422, + "step": 5976 + }, + { + "epoch": 1.8399507540781779, + "grad_norm": 17.875, + "learning_rate": 1.4247831105362807e-06, + "loss": 1.3447721004486084, + "step": 5978 + }, + { + "epoch": 1.840566328100954, + "grad_norm": 22.125, + "learning_rate": 1.4240256556215486e-06, + "loss": 1.0965287685394287, + "step": 5980 + }, + { + "epoch": 1.8411819021237303, + "grad_norm": 11.6875, + "learning_rate": 1.4232683667751152e-06, + "loss": 1.4905890226364136, + "step": 5982 + }, + { + "epoch": 1.8417974761465066, + "grad_norm": 15.8125, + "learning_rate": 1.422511244331476e-06, + "loss": 1.76155424118042, + "step": 5984 + }, + { + "epoch": 1.8424130501692828, + "grad_norm": 31.75, + "learning_rate": 1.4217542886250513e-06, + "loss": 1.6751346588134766, + "step": 5986 + }, + { + "epoch": 1.8430286241920593, + "grad_norm": 22.0, + "learning_rate": 1.420997499990191e-06, + "loss": 1.5737178325653076, + "step": 5988 + }, + { + "epoch": 1.8436441982148355, + "grad_norm": 11.3125, + "learning_rate": 1.4202408787611686e-06, + "loss": 1.3350948095321655, + "step": 5990 + }, + { + "epoch": 1.8442597722376117, + "grad_norm": 30.25, + "learning_rate": 1.4194844252721846e-06, + "loss": 1.5872807502746582, + "step": 5992 + }, + { + "epoch": 1.844875346260388, + "grad_norm": 17.5, + "learning_rate": 1.4187281398573659e-06, + "loss": 1.6296907663345337, + "step": 5994 + }, + { + "epoch": 1.8454909202831642, + "grad_norm": 13.6875, + "learning_rate": 1.417972022850764e-06, + "loss": 1.1191483736038208, + "step": 5996 + }, + { + "epoch": 1.8461064943059404, + "grad_norm": 14.875, + "learning_rate": 1.4172160745863571e-06, + "loss": 1.686267375946045, + "step": 5998 + }, + { + "epoch": 1.8467220683287167, + "grad_norm": 23.75, + "learning_rate": 1.416460295398048e-06, + "loss": 1.6978259086608887, + "step": 6000 + }, + { + "epoch": 1.847337642351493, + "grad_norm": 11.25, + "learning_rate": 1.4157046856196658e-06, + "loss": 1.3672219514846802, + "step": 6002 + }, + { + "epoch": 1.8479532163742691, + "grad_norm": 10.6875, + "learning_rate": 1.4149492455849637e-06, + "loss": 0.9509298801422119, + "step": 6004 + }, + { + "epoch": 1.8485687903970454, + "grad_norm": 6.125, + "learning_rate": 1.4141939756276203e-06, + "loss": 1.0525821447372437, + "step": 6006 + }, + { + "epoch": 1.8491843644198216, + "grad_norm": 23.5, + "learning_rate": 1.4134388760812399e-06, + "loss": 1.6087608337402344, + "step": 6008 + }, + { + "epoch": 1.8497999384425978, + "grad_norm": 19.125, + "learning_rate": 1.4126839472793508e-06, + "loss": 1.2136560678482056, + "step": 6010 + }, + { + "epoch": 1.850415512465374, + "grad_norm": 30.25, + "learning_rate": 1.4119291895554045e-06, + "loss": 1.345487356185913, + "step": 6012 + }, + { + "epoch": 1.8510310864881503, + "grad_norm": 11.375, + "learning_rate": 1.4111746032427802e-06, + "loss": 1.2782511711120605, + "step": 6014 + }, + { + "epoch": 1.8516466605109265, + "grad_norm": 10.625, + "learning_rate": 1.4104201886747787e-06, + "loss": 0.9771016240119934, + "step": 6016 + }, + { + "epoch": 1.8522622345337028, + "grad_norm": 7.75, + "learning_rate": 1.4096659461846256e-06, + "loss": 1.4104878902435303, + "step": 6018 + }, + { + "epoch": 1.852877808556479, + "grad_norm": 18.125, + "learning_rate": 1.4089118761054711e-06, + "loss": 1.3528629541397095, + "step": 6020 + }, + { + "epoch": 1.8534933825792552, + "grad_norm": 38.0, + "learning_rate": 1.4081579787703885e-06, + "loss": 1.4981791973114014, + "step": 6022 + }, + { + "epoch": 1.8541089566020315, + "grad_norm": 15.0625, + "learning_rate": 1.4074042545123747e-06, + "loss": 1.4606707096099854, + "step": 6024 + }, + { + "epoch": 1.8547245306248077, + "grad_norm": 19.875, + "learning_rate": 1.406650703664352e-06, + "loss": 1.4054559469223022, + "step": 6026 + }, + { + "epoch": 1.855340104647584, + "grad_norm": 7.78125, + "learning_rate": 1.4058973265591637e-06, + "loss": 1.2006244659423828, + "step": 6028 + }, + { + "epoch": 1.8559556786703602, + "grad_norm": 20.375, + "learning_rate": 1.4051441235295766e-06, + "loss": 1.7498140335083008, + "step": 6030 + }, + { + "epoch": 1.8565712526931364, + "grad_norm": 12.875, + "learning_rate": 1.404391094908283e-06, + "loss": 1.2948148250579834, + "step": 6032 + }, + { + "epoch": 1.8571868267159126, + "grad_norm": 46.25, + "learning_rate": 1.4036382410278952e-06, + "loss": 0.7961997389793396, + "step": 6034 + }, + { + "epoch": 1.8578024007386889, + "grad_norm": 15.25, + "learning_rate": 1.4028855622209501e-06, + "loss": 1.1270966529846191, + "step": 6036 + }, + { + "epoch": 1.858417974761465, + "grad_norm": 14.75, + "learning_rate": 1.402133058819907e-06, + "loss": 1.3352243900299072, + "step": 6038 + }, + { + "epoch": 1.8590335487842413, + "grad_norm": 13.3125, + "learning_rate": 1.4013807311571472e-06, + "loss": 0.8288969993591309, + "step": 6040 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 27.75, + "learning_rate": 1.4006285795649744e-06, + "loss": 1.330993413925171, + "step": 6042 + }, + { + "epoch": 1.8602646968297938, + "grad_norm": 7.21875, + "learning_rate": 1.3998766043756157e-06, + "loss": 0.905243992805481, + "step": 6044 + }, + { + "epoch": 1.86088027085257, + "grad_norm": 20.625, + "learning_rate": 1.3991248059212186e-06, + "loss": 1.3450603485107422, + "step": 6046 + }, + { + "epoch": 1.8614958448753463, + "grad_norm": 13.625, + "learning_rate": 1.3983731845338533e-06, + "loss": 1.3866983652114868, + "step": 6048 + }, + { + "epoch": 1.8621114188981225, + "grad_norm": 11.375, + "learning_rate": 1.3976217405455126e-06, + "loss": 0.7493995428085327, + "step": 6050 + }, + { + "epoch": 1.8627269929208987, + "grad_norm": 10.0625, + "learning_rate": 1.3968704742881097e-06, + "loss": 1.3307311534881592, + "step": 6052 + }, + { + "epoch": 1.863342566943675, + "grad_norm": 19.375, + "learning_rate": 1.39611938609348e-06, + "loss": 1.3420662879943848, + "step": 6054 + }, + { + "epoch": 1.8639581409664512, + "grad_norm": 9.8125, + "learning_rate": 1.3953684762933792e-06, + "loss": 1.4183435440063477, + "step": 6056 + }, + { + "epoch": 1.8645737149892274, + "grad_norm": 13.125, + "learning_rate": 1.3946177452194858e-06, + "loss": 1.4013707637786865, + "step": 6058 + }, + { + "epoch": 1.8651892890120036, + "grad_norm": 18.5, + "learning_rate": 1.3938671932033987e-06, + "loss": 1.352440595626831, + "step": 6060 + }, + { + "epoch": 1.8658048630347799, + "grad_norm": 11.6875, + "learning_rate": 1.3931168205766368e-06, + "loss": 1.3323051929473877, + "step": 6062 + }, + { + "epoch": 1.8664204370575561, + "grad_norm": 23.125, + "learning_rate": 1.3923666276706413e-06, + "loss": 1.316023349761963, + "step": 6064 + }, + { + "epoch": 1.8670360110803323, + "grad_norm": 10.5625, + "learning_rate": 1.391616614816773e-06, + "loss": 1.466226577758789, + "step": 6066 + }, + { + "epoch": 1.8676515851031086, + "grad_norm": 6.8125, + "learning_rate": 1.3908667823463133e-06, + "loss": 1.3296306133270264, + "step": 6068 + }, + { + "epoch": 1.8682671591258848, + "grad_norm": 39.75, + "learning_rate": 1.3901171305904645e-06, + "loss": 1.5542833805084229, + "step": 6070 + }, + { + "epoch": 1.868882733148661, + "grad_norm": 8.125, + "learning_rate": 1.3893676598803481e-06, + "loss": 1.1805827617645264, + "step": 6072 + }, + { + "epoch": 1.8694983071714373, + "grad_norm": 15.3125, + "learning_rate": 1.3886183705470061e-06, + "loss": 1.3519244194030762, + "step": 6074 + }, + { + "epoch": 1.8701138811942135, + "grad_norm": 31.5, + "learning_rate": 1.3878692629214011e-06, + "loss": 1.0316970348358154, + "step": 6076 + }, + { + "epoch": 1.8707294552169897, + "grad_norm": 17.625, + "learning_rate": 1.3871203373344145e-06, + "loss": 1.20766019821167, + "step": 6078 + }, + { + "epoch": 1.871345029239766, + "grad_norm": 6.75, + "learning_rate": 1.3863715941168474e-06, + "loss": 1.157658576965332, + "step": 6080 + }, + { + "epoch": 1.8719606032625422, + "grad_norm": 9.8125, + "learning_rate": 1.3856230335994208e-06, + "loss": 1.1696847677230835, + "step": 6082 + }, + { + "epoch": 1.8725761772853184, + "grad_norm": 11.375, + "learning_rate": 1.3848746561127744e-06, + "loss": 1.259034276008606, + "step": 6084 + }, + { + "epoch": 1.8731917513080947, + "grad_norm": 14.125, + "learning_rate": 1.3841264619874678e-06, + "loss": 1.7221574783325195, + "step": 6086 + }, + { + "epoch": 1.873807325330871, + "grad_norm": 69.0, + "learning_rate": 1.383378451553979e-06, + "loss": 1.5460913181304932, + "step": 6088 + }, + { + "epoch": 1.8744228993536471, + "grad_norm": 56.75, + "learning_rate": 1.382630625142705e-06, + "loss": 1.4097764492034912, + "step": 6090 + }, + { + "epoch": 1.8750384733764234, + "grad_norm": 13.6875, + "learning_rate": 1.3818829830839614e-06, + "loss": 1.6211826801300049, + "step": 6092 + }, + { + "epoch": 1.8756540473991996, + "grad_norm": 20.75, + "learning_rate": 1.3811355257079834e-06, + "loss": 1.6837477684020996, + "step": 6094 + }, + { + "epoch": 1.8762696214219758, + "grad_norm": 24.375, + "learning_rate": 1.3803882533449228e-06, + "loss": 1.5443352460861206, + "step": 6096 + }, + { + "epoch": 1.876885195444752, + "grad_norm": 13.75, + "learning_rate": 1.3796411663248508e-06, + "loss": 1.5669658184051514, + "step": 6098 + }, + { + "epoch": 1.8775007694675283, + "grad_norm": 18.25, + "learning_rate": 1.378894264977757e-06, + "loss": 1.6158027648925781, + "step": 6100 + }, + { + "epoch": 1.8781163434903048, + "grad_norm": 59.75, + "learning_rate": 1.3781475496335487e-06, + "loss": 1.166754961013794, + "step": 6102 + }, + { + "epoch": 1.878731917513081, + "grad_norm": 18.25, + "learning_rate": 1.3774010206220504e-06, + "loss": 0.9254148006439209, + "step": 6104 + }, + { + "epoch": 1.8793474915358572, + "grad_norm": 4.1875, + "learning_rate": 1.376654678273005e-06, + "loss": 1.2385072708129883, + "step": 6106 + }, + { + "epoch": 1.8799630655586335, + "grad_norm": 8.5625, + "learning_rate": 1.3759085229160734e-06, + "loss": 1.2265651226043701, + "step": 6108 + }, + { + "epoch": 1.8805786395814097, + "grad_norm": 45.75, + "learning_rate": 1.3751625548808322e-06, + "loss": 1.5409295558929443, + "step": 6110 + }, + { + "epoch": 1.881194213604186, + "grad_norm": 21.5, + "learning_rate": 1.3744167744967768e-06, + "loss": 1.5468591451644897, + "step": 6112 + }, + { + "epoch": 1.8818097876269622, + "grad_norm": 19.625, + "learning_rate": 1.3736711820933193e-06, + "loss": 1.1492195129394531, + "step": 6114 + }, + { + "epoch": 1.8824253616497384, + "grad_norm": 12.375, + "learning_rate": 1.3729257779997888e-06, + "loss": 0.7553317546844482, + "step": 6116 + }, + { + "epoch": 1.8830409356725146, + "grad_norm": 39.0, + "learning_rate": 1.3721805625454307e-06, + "loss": 1.6203052997589111, + "step": 6118 + }, + { + "epoch": 1.8836565096952909, + "grad_norm": 44.75, + "learning_rate": 1.3714355360594082e-06, + "loss": 1.593339204788208, + "step": 6120 + }, + { + "epoch": 1.884272083718067, + "grad_norm": 7.28125, + "learning_rate": 1.3706906988708003e-06, + "loss": 0.950556755065918, + "step": 6122 + }, + { + "epoch": 1.8848876577408433, + "grad_norm": 14.3125, + "learning_rate": 1.3699460513086015e-06, + "loss": 1.2142386436462402, + "step": 6124 + }, + { + "epoch": 1.8855032317636196, + "grad_norm": 16.125, + "learning_rate": 1.3692015937017246e-06, + "loss": 1.2912973165512085, + "step": 6126 + }, + { + "epoch": 1.8861188057863958, + "grad_norm": 10.0625, + "learning_rate": 1.3684573263789967e-06, + "loss": 1.3026981353759766, + "step": 6128 + }, + { + "epoch": 1.886734379809172, + "grad_norm": 12.75, + "learning_rate": 1.3677132496691617e-06, + "loss": 1.272803783416748, + "step": 6130 + }, + { + "epoch": 1.8873499538319483, + "grad_norm": 7.875, + "learning_rate": 1.3669693639008794e-06, + "loss": 1.3544511795043945, + "step": 6132 + }, + { + "epoch": 1.8879655278547245, + "grad_norm": 14.6875, + "learning_rate": 1.3662256694027248e-06, + "loss": 1.2876298427581787, + "step": 6134 + }, + { + "epoch": 1.8885811018775007, + "grad_norm": 16.75, + "learning_rate": 1.3654821665031882e-06, + "loss": 0.8064992427825928, + "step": 6136 + }, + { + "epoch": 1.889196675900277, + "grad_norm": 24.875, + "learning_rate": 1.3647388555306766e-06, + "loss": 1.1648228168487549, + "step": 6138 + }, + { + "epoch": 1.8898122499230534, + "grad_norm": 15.5625, + "learning_rate": 1.3639957368135105e-06, + "loss": 1.1499993801116943, + "step": 6140 + }, + { + "epoch": 1.8904278239458296, + "grad_norm": 6.5, + "learning_rate": 1.3632528106799261e-06, + "loss": 1.2055202722549438, + "step": 6142 + }, + { + "epoch": 1.8910433979686059, + "grad_norm": 16.375, + "learning_rate": 1.3625100774580757e-06, + "loss": 0.9683582782745361, + "step": 6144 + }, + { + "epoch": 1.891658971991382, + "grad_norm": 9.875, + "learning_rate": 1.3617675374760248e-06, + "loss": 1.428824782371521, + "step": 6146 + }, + { + "epoch": 1.8922745460141583, + "grad_norm": 44.0, + "learning_rate": 1.3610251910617542e-06, + "loss": 1.4965403079986572, + "step": 6148 + }, + { + "epoch": 1.8928901200369346, + "grad_norm": 18.125, + "learning_rate": 1.3602830385431593e-06, + "loss": 1.4022653102874756, + "step": 6150 + }, + { + "epoch": 1.8935056940597108, + "grad_norm": 6.5, + "learning_rate": 1.3595410802480496e-06, + "loss": 1.2949481010437012, + "step": 6152 + }, + { + "epoch": 1.894121268082487, + "grad_norm": 16.125, + "learning_rate": 1.358799316504149e-06, + "loss": 1.4073116779327393, + "step": 6154 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 13.875, + "learning_rate": 1.3580577476390954e-06, + "loss": 1.3306310176849365, + "step": 6156 + }, + { + "epoch": 1.8953524161280395, + "grad_norm": 28.25, + "learning_rate": 1.3573163739804405e-06, + "loss": 1.3141050338745117, + "step": 6158 + }, + { + "epoch": 1.8959679901508157, + "grad_norm": 17.0, + "learning_rate": 1.3565751958556498e-06, + "loss": 1.7682037353515625, + "step": 6160 + }, + { + "epoch": 1.896583564173592, + "grad_norm": 42.0, + "learning_rate": 1.3558342135921026e-06, + "loss": 0.986321210861206, + "step": 6162 + }, + { + "epoch": 1.8971991381963682, + "grad_norm": 16.25, + "learning_rate": 1.3550934275170924e-06, + "loss": 1.5372295379638672, + "step": 6164 + }, + { + "epoch": 1.8978147122191444, + "grad_norm": 56.25, + "learning_rate": 1.3543528379578242e-06, + "loss": 1.5373578071594238, + "step": 6166 + }, + { + "epoch": 1.8984302862419207, + "grad_norm": 10.875, + "learning_rate": 1.3536124452414173e-06, + "loss": 1.3291563987731934, + "step": 6168 + }, + { + "epoch": 1.899045860264697, + "grad_norm": 9.1875, + "learning_rate": 1.3528722496949046e-06, + "loss": 1.4090769290924072, + "step": 6170 + }, + { + "epoch": 1.8996614342874731, + "grad_norm": 7.3125, + "learning_rate": 1.3521322516452313e-06, + "loss": 0.9713155031204224, + "step": 6172 + }, + { + "epoch": 1.9002770083102494, + "grad_norm": 25.0, + "learning_rate": 1.3513924514192546e-06, + "loss": 1.47679603099823, + "step": 6174 + }, + { + "epoch": 1.9008925823330256, + "grad_norm": 43.0, + "learning_rate": 1.3506528493437459e-06, + "loss": 1.8054498434066772, + "step": 6176 + }, + { + "epoch": 1.9015081563558018, + "grad_norm": 13.375, + "learning_rate": 1.3499134457453883e-06, + "loss": 1.1690771579742432, + "step": 6178 + }, + { + "epoch": 1.902123730378578, + "grad_norm": 20.25, + "learning_rate": 1.3491742409507764e-06, + "loss": 1.4926562309265137, + "step": 6180 + }, + { + "epoch": 1.9027393044013543, + "grad_norm": 10.0625, + "learning_rate": 1.3484352352864186e-06, + "loss": 1.525926113128662, + "step": 6182 + }, + { + "epoch": 1.9033548784241305, + "grad_norm": 14.5625, + "learning_rate": 1.3476964290787346e-06, + "loss": 1.117192268371582, + "step": 6184 + }, + { + "epoch": 1.9039704524469068, + "grad_norm": 34.0, + "learning_rate": 1.346957822654055e-06, + "loss": 1.599233627319336, + "step": 6186 + }, + { + "epoch": 1.904586026469683, + "grad_norm": 8.5, + "learning_rate": 1.3462194163386241e-06, + "loss": 1.2044343948364258, + "step": 6188 + }, + { + "epoch": 1.9052016004924592, + "grad_norm": 18.875, + "learning_rate": 1.3454812104585969e-06, + "loss": 1.5050618648529053, + "step": 6190 + }, + { + "epoch": 1.9058171745152355, + "grad_norm": 36.25, + "learning_rate": 1.3447432053400386e-06, + "loss": 1.3360257148742676, + "step": 6192 + }, + { + "epoch": 1.9064327485380117, + "grad_norm": 11.625, + "learning_rate": 1.344005401308928e-06, + "loss": 1.4488022327423096, + "step": 6194 + }, + { + "epoch": 1.907048322560788, + "grad_norm": 19.75, + "learning_rate": 1.3432677986911537e-06, + "loss": 1.5527957677841187, + "step": 6196 + }, + { + "epoch": 1.9076638965835642, + "grad_norm": 17.375, + "learning_rate": 1.3425303978125154e-06, + "loss": 1.5068588256835938, + "step": 6198 + }, + { + "epoch": 1.9082794706063404, + "grad_norm": 15.0625, + "learning_rate": 1.3417931989987242e-06, + "loss": 1.3681578636169434, + "step": 6200 + }, + { + "epoch": 1.9088950446291166, + "grad_norm": 13.125, + "learning_rate": 1.3410562025754015e-06, + "loss": 0.6912654638290405, + "step": 6202 + }, + { + "epoch": 1.9095106186518929, + "grad_norm": 16.875, + "learning_rate": 1.340319408868079e-06, + "loss": 0.8955786228179932, + "step": 6204 + }, + { + "epoch": 1.910126192674669, + "grad_norm": 11.625, + "learning_rate": 1.3395828182022005e-06, + "loss": 1.6106204986572266, + "step": 6206 + }, + { + "epoch": 1.9107417666974453, + "grad_norm": 7.90625, + "learning_rate": 1.3388464309031182e-06, + "loss": 1.243058681488037, + "step": 6208 + }, + { + "epoch": 1.9113573407202216, + "grad_norm": 11.875, + "learning_rate": 1.3381102472960947e-06, + "loss": 1.4035508632659912, + "step": 6210 + }, + { + "epoch": 1.9119729147429978, + "grad_norm": 9.375, + "learning_rate": 1.3373742677063042e-06, + "loss": 1.5388617515563965, + "step": 6212 + }, + { + "epoch": 1.912588488765774, + "grad_norm": 13.5625, + "learning_rate": 1.336638492458829e-06, + "loss": 1.381677508354187, + "step": 6214 + }, + { + "epoch": 1.9132040627885503, + "grad_norm": 17.0, + "learning_rate": 1.3359029218786623e-06, + "loss": 1.1878316402435303, + "step": 6216 + }, + { + "epoch": 1.9138196368113265, + "grad_norm": 35.5, + "learning_rate": 1.3351675562907064e-06, + "loss": 1.4496146440505981, + "step": 6218 + }, + { + "epoch": 1.9144352108341027, + "grad_norm": 37.0, + "learning_rate": 1.3344323960197728e-06, + "loss": 1.3815988302230835, + "step": 6220 + }, + { + "epoch": 1.915050784856879, + "grad_norm": 22.5, + "learning_rate": 1.3336974413905827e-06, + "loss": 1.449317216873169, + "step": 6222 + }, + { + "epoch": 1.9156663588796552, + "grad_norm": 11.375, + "learning_rate": 1.3329626927277669e-06, + "loss": 1.5711392164230347, + "step": 6224 + }, + { + "epoch": 1.9162819329024314, + "grad_norm": 11.125, + "learning_rate": 1.332228150355864e-06, + "loss": 1.304990530014038, + "step": 6226 + }, + { + "epoch": 1.9168975069252077, + "grad_norm": 30.0, + "learning_rate": 1.3314938145993227e-06, + "loss": 1.4342398643493652, + "step": 6228 + }, + { + "epoch": 1.917513080947984, + "grad_norm": 5.75, + "learning_rate": 1.3307596857824994e-06, + "loss": 1.0260145664215088, + "step": 6230 + }, + { + "epoch": 1.9181286549707601, + "grad_norm": 7.125, + "learning_rate": 1.3300257642296605e-06, + "loss": 1.0618666410446167, + "step": 6232 + }, + { + "epoch": 1.9187442289935364, + "grad_norm": 22.375, + "learning_rate": 1.3292920502649797e-06, + "loss": 1.2553677558898926, + "step": 6234 + }, + { + "epoch": 1.9193598030163126, + "grad_norm": 8.4375, + "learning_rate": 1.3285585442125386e-06, + "loss": 1.163750410079956, + "step": 6236 + }, + { + "epoch": 1.9199753770390888, + "grad_norm": 15.875, + "learning_rate": 1.3278252463963285e-06, + "loss": 1.3896212577819824, + "step": 6238 + }, + { + "epoch": 1.920590951061865, + "grad_norm": 17.5, + "learning_rate": 1.3270921571402477e-06, + "loss": 1.5164833068847656, + "step": 6240 + }, + { + "epoch": 1.9212065250846413, + "grad_norm": 4.6875, + "learning_rate": 1.3263592767681022e-06, + "loss": 1.0327818393707275, + "step": 6242 + }, + { + "epoch": 1.9218220991074175, + "grad_norm": 10.0, + "learning_rate": 1.3256266056036064e-06, + "loss": 1.2779362201690674, + "step": 6244 + }, + { + "epoch": 1.9224376731301938, + "grad_norm": 21.375, + "learning_rate": 1.3248941439703822e-06, + "loss": 1.6328758001327515, + "step": 6246 + }, + { + "epoch": 1.92305324715297, + "grad_norm": 9.5, + "learning_rate": 1.324161892191958e-06, + "loss": 1.6194400787353516, + "step": 6248 + }, + { + "epoch": 1.9236688211757462, + "grad_norm": 10.3125, + "learning_rate": 1.3234298505917713e-06, + "loss": 1.5902223587036133, + "step": 6250 + }, + { + "epoch": 1.9242843951985225, + "grad_norm": 29.125, + "learning_rate": 1.3226980194931648e-06, + "loss": 1.2746918201446533, + "step": 6252 + }, + { + "epoch": 1.924899969221299, + "grad_norm": 17.5, + "learning_rate": 1.3219663992193891e-06, + "loss": 1.4883370399475098, + "step": 6254 + }, + { + "epoch": 1.9255155432440751, + "grad_norm": 11.5, + "learning_rate": 1.3212349900936027e-06, + "loss": 1.228645920753479, + "step": 6256 + }, + { + "epoch": 1.9261311172668514, + "grad_norm": 26.875, + "learning_rate": 1.320503792438869e-06, + "loss": 1.536346435546875, + "step": 6258 + }, + { + "epoch": 1.9267466912896276, + "grad_norm": 35.0, + "learning_rate": 1.319772806578159e-06, + "loss": 1.0358887910842896, + "step": 6260 + }, + { + "epoch": 1.9273622653124038, + "grad_norm": 11.5, + "learning_rate": 1.3190420328343503e-06, + "loss": 1.271331787109375, + "step": 6262 + }, + { + "epoch": 1.92797783933518, + "grad_norm": 6.8125, + "learning_rate": 1.318311471530226e-06, + "loss": 1.1881802082061768, + "step": 6264 + }, + { + "epoch": 1.9285934133579563, + "grad_norm": 12.875, + "learning_rate": 1.3175811229884762e-06, + "loss": 1.2093167304992676, + "step": 6266 + }, + { + "epoch": 1.9292089873807325, + "grad_norm": 15.8125, + "learning_rate": 1.3168509875316968e-06, + "loss": 1.361979365348816, + "step": 6268 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 22.5, + "learning_rate": 1.3161210654823893e-06, + "loss": 1.1650192737579346, + "step": 6270 + }, + { + "epoch": 1.930440135426285, + "grad_norm": 10.9375, + "learning_rate": 1.3153913571629605e-06, + "loss": 1.1376789808273315, + "step": 6272 + }, + { + "epoch": 1.9310557094490612, + "grad_norm": 11.5, + "learning_rate": 1.3146618628957244e-06, + "loss": 1.3090848922729492, + "step": 6274 + }, + { + "epoch": 1.9316712834718375, + "grad_norm": 14.0, + "learning_rate": 1.3139325830028996e-06, + "loss": 1.6002295017242432, + "step": 6276 + }, + { + "epoch": 1.9322868574946137, + "grad_norm": 9.0625, + "learning_rate": 1.3132035178066089e-06, + "loss": 1.232072353363037, + "step": 6278 + }, + { + "epoch": 1.93290243151739, + "grad_norm": 12.6875, + "learning_rate": 1.312474667628881e-06, + "loss": 1.537278652191162, + "step": 6280 + }, + { + "epoch": 1.9335180055401662, + "grad_norm": 14.9375, + "learning_rate": 1.3117460327916512e-06, + "loss": 1.2492306232452393, + "step": 6282 + }, + { + "epoch": 1.9341335795629424, + "grad_norm": 15.375, + "learning_rate": 1.3110176136167576e-06, + "loss": 1.626798391342163, + "step": 6284 + }, + { + "epoch": 1.9347491535857186, + "grad_norm": 10.1875, + "learning_rate": 1.3102894104259433e-06, + "loss": 1.4308114051818848, + "step": 6286 + }, + { + "epoch": 1.9353647276084949, + "grad_norm": 42.75, + "learning_rate": 1.3095614235408575e-06, + "loss": 1.4480242729187012, + "step": 6288 + }, + { + "epoch": 1.935980301631271, + "grad_norm": 22.125, + "learning_rate": 1.3088336532830524e-06, + "loss": 1.646793007850647, + "step": 6290 + }, + { + "epoch": 1.9365958756540476, + "grad_norm": 8.1875, + "learning_rate": 1.3081060999739844e-06, + "loss": 1.1365125179290771, + "step": 6292 + }, + { + "epoch": 1.9372114496768238, + "grad_norm": 6.375, + "learning_rate": 1.3073787639350156e-06, + "loss": 0.9681548476219177, + "step": 6294 + }, + { + "epoch": 1.9378270236996, + "grad_norm": 26.0, + "learning_rate": 1.3066516454874107e-06, + "loss": 1.3221466541290283, + "step": 6296 + }, + { + "epoch": 1.9384425977223763, + "grad_norm": 7.3125, + "learning_rate": 1.305924744952338e-06, + "loss": 1.2677980661392212, + "step": 6298 + }, + { + "epoch": 1.9390581717451525, + "grad_norm": 25.625, + "learning_rate": 1.3051980626508717e-06, + "loss": 1.460378646850586, + "step": 6300 + }, + { + "epoch": 1.9396737457679287, + "grad_norm": 104.5, + "learning_rate": 1.304471598903988e-06, + "loss": 1.6934248208999634, + "step": 6302 + }, + { + "epoch": 1.940289319790705, + "grad_norm": 12.4375, + "learning_rate": 1.3037453540325651e-06, + "loss": 1.4465457201004028, + "step": 6304 + }, + { + "epoch": 1.9409048938134812, + "grad_norm": 12.6875, + "learning_rate": 1.3030193283573883e-06, + "loss": 1.2186150550842285, + "step": 6306 + }, + { + "epoch": 1.9415204678362574, + "grad_norm": 11.0, + "learning_rate": 1.3022935221991428e-06, + "loss": 1.6539312601089478, + "step": 6308 + }, + { + "epoch": 1.9421360418590337, + "grad_norm": 12.0, + "learning_rate": 1.3015679358784183e-06, + "loss": 1.5259175300598145, + "step": 6310 + }, + { + "epoch": 1.94275161588181, + "grad_norm": 19.0, + "learning_rate": 1.300842569715707e-06, + "loss": 1.3606197834014893, + "step": 6312 + }, + { + "epoch": 1.9433671899045861, + "grad_norm": 30.375, + "learning_rate": 1.3001174240314043e-06, + "loss": 1.5610425472259521, + "step": 6314 + }, + { + "epoch": 1.9439827639273624, + "grad_norm": 10.8125, + "learning_rate": 1.2993924991458072e-06, + "loss": 1.2968568801879883, + "step": 6316 + }, + { + "epoch": 1.9445983379501386, + "grad_norm": 13.875, + "learning_rate": 1.2986677953791168e-06, + "loss": 1.3084802627563477, + "step": 6318 + }, + { + "epoch": 1.9452139119729148, + "grad_norm": 47.25, + "learning_rate": 1.297943313051435e-06, + "loss": 1.3081390857696533, + "step": 6320 + }, + { + "epoch": 1.945829485995691, + "grad_norm": 9.9375, + "learning_rate": 1.2972190524827661e-06, + "loss": 1.126495122909546, + "step": 6322 + }, + { + "epoch": 1.9464450600184673, + "grad_norm": 11.875, + "learning_rate": 1.296495013993018e-06, + "loss": 1.385104775428772, + "step": 6324 + }, + { + "epoch": 1.9470606340412435, + "grad_norm": 5.0, + "learning_rate": 1.2957711979019981e-06, + "loss": 1.249680757522583, + "step": 6326 + }, + { + "epoch": 1.9476762080640198, + "grad_norm": 15.0625, + "learning_rate": 1.2950476045294178e-06, + "loss": 1.3745570182800293, + "step": 6328 + }, + { + "epoch": 1.948291782086796, + "grad_norm": 21.875, + "learning_rate": 1.294324234194888e-06, + "loss": 1.6673004627227783, + "step": 6330 + }, + { + "epoch": 1.9489073561095722, + "grad_norm": 22.125, + "learning_rate": 1.2936010872179227e-06, + "loss": 0.8813947439193726, + "step": 6332 + }, + { + "epoch": 1.9495229301323485, + "grad_norm": 22.125, + "learning_rate": 1.2928781639179377e-06, + "loss": 1.6500790119171143, + "step": 6334 + }, + { + "epoch": 1.9501385041551247, + "grad_norm": 5.9375, + "learning_rate": 1.292155464614247e-06, + "loss": 1.0126196146011353, + "step": 6336 + }, + { + "epoch": 1.950754078177901, + "grad_norm": 14.375, + "learning_rate": 1.29143298962607e-06, + "loss": 1.327214002609253, + "step": 6338 + }, + { + "epoch": 1.9513696522006772, + "grad_norm": 68.5, + "learning_rate": 1.290710739272523e-06, + "loss": 1.2860033512115479, + "step": 6340 + }, + { + "epoch": 1.9519852262234534, + "grad_norm": 16.125, + "learning_rate": 1.2899887138726256e-06, + "loss": 1.706552267074585, + "step": 6342 + }, + { + "epoch": 1.9526008002462296, + "grad_norm": 19.125, + "learning_rate": 1.2892669137452975e-06, + "loss": 1.4060251712799072, + "step": 6344 + }, + { + "epoch": 1.9532163742690059, + "grad_norm": 11.9375, + "learning_rate": 1.2885453392093578e-06, + "loss": 1.5031074285507202, + "step": 6346 + }, + { + "epoch": 1.953831948291782, + "grad_norm": 12.125, + "learning_rate": 1.2878239905835275e-06, + "loss": 1.1290861368179321, + "step": 6348 + }, + { + "epoch": 1.9544475223145583, + "grad_norm": 6.25, + "learning_rate": 1.287102868186427e-06, + "loss": 1.2029154300689697, + "step": 6350 + }, + { + "epoch": 1.9550630963373345, + "grad_norm": 15.875, + "learning_rate": 1.2863819723365765e-06, + "loss": 1.4671310186386108, + "step": 6352 + }, + { + "epoch": 1.9556786703601108, + "grad_norm": 11.0, + "learning_rate": 1.2856613033523968e-06, + "loss": 1.035024642944336, + "step": 6354 + }, + { + "epoch": 1.956294244382887, + "grad_norm": 30.25, + "learning_rate": 1.284940861552208e-06, + "loss": 1.1749812364578247, + "step": 6356 + }, + { + "epoch": 1.9569098184056632, + "grad_norm": 15.875, + "learning_rate": 1.2842206472542308e-06, + "loss": 1.2916359901428223, + "step": 6358 + }, + { + "epoch": 1.9575253924284395, + "grad_norm": 12.125, + "learning_rate": 1.2835006607765837e-06, + "loss": 1.0509839057922363, + "step": 6360 + }, + { + "epoch": 1.9581409664512157, + "grad_norm": 17.0, + "learning_rate": 1.2827809024372858e-06, + "loss": 1.5406067371368408, + "step": 6362 + }, + { + "epoch": 1.958756540473992, + "grad_norm": 20.0, + "learning_rate": 1.282061372554256e-06, + "loss": 1.4444975852966309, + "step": 6364 + }, + { + "epoch": 1.9593721144967682, + "grad_norm": 30.375, + "learning_rate": 1.2813420714453106e-06, + "loss": 1.199216604232788, + "step": 6366 + }, + { + "epoch": 1.9599876885195444, + "grad_norm": 3.640625, + "learning_rate": 1.2806229994281656e-06, + "loss": 1.1930866241455078, + "step": 6368 + }, + { + "epoch": 1.9606032625423206, + "grad_norm": 20.0, + "learning_rate": 1.2799041568204368e-06, + "loss": 1.1290255784988403, + "step": 6370 + }, + { + "epoch": 1.9612188365650969, + "grad_norm": 16.125, + "learning_rate": 1.2791855439396369e-06, + "loss": 1.2265934944152832, + "step": 6372 + }, + { + "epoch": 1.9618344105878731, + "grad_norm": 17.0, + "learning_rate": 1.2784671611031787e-06, + "loss": 1.585903525352478, + "step": 6374 + }, + { + "epoch": 1.9624499846106493, + "grad_norm": 11.25, + "learning_rate": 1.2777490086283728e-06, + "loss": 1.3189527988433838, + "step": 6376 + }, + { + "epoch": 1.9630655586334256, + "grad_norm": 7.34375, + "learning_rate": 1.2770310868324266e-06, + "loss": 1.0683166980743408, + "step": 6378 + }, + { + "epoch": 1.9636811326562018, + "grad_norm": 91.0, + "learning_rate": 1.2763133960324486e-06, + "loss": 1.0657436847686768, + "step": 6380 + }, + { + "epoch": 1.964296706678978, + "grad_norm": 14.0, + "learning_rate": 1.2755959365454433e-06, + "loss": 1.1493613719940186, + "step": 6382 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 17.75, + "learning_rate": 1.274878708688313e-06, + "loss": 1.5804072618484497, + "step": 6384 + }, + { + "epoch": 1.9655278547245305, + "grad_norm": 15.75, + "learning_rate": 1.2741617127778581e-06, + "loss": 1.352163553237915, + "step": 6386 + }, + { + "epoch": 1.9661434287473067, + "grad_norm": 19.125, + "learning_rate": 1.2734449491307764e-06, + "loss": 1.1006097793579102, + "step": 6388 + }, + { + "epoch": 1.966759002770083, + "grad_norm": 20.375, + "learning_rate": 1.2727284180636641e-06, + "loss": 1.8996819257736206, + "step": 6390 + }, + { + "epoch": 1.9673745767928592, + "grad_norm": 25.5, + "learning_rate": 1.2720121198930123e-06, + "loss": 1.290725588798523, + "step": 6392 + }, + { + "epoch": 1.9679901508156354, + "grad_norm": 15.4375, + "learning_rate": 1.271296054935212e-06, + "loss": 1.235483169555664, + "step": 6394 + }, + { + "epoch": 1.9686057248384117, + "grad_norm": 14.625, + "learning_rate": 1.2705802235065495e-06, + "loss": 1.2222580909729004, + "step": 6396 + }, + { + "epoch": 1.969221298861188, + "grad_norm": 12.75, + "learning_rate": 1.2698646259232077e-06, + "loss": 1.5252647399902344, + "step": 6398 + }, + { + "epoch": 1.9698368728839641, + "grad_norm": 10.9375, + "learning_rate": 1.2691492625012682e-06, + "loss": 1.555239200592041, + "step": 6400 + }, + { + "epoch": 1.9704524469067404, + "grad_norm": 36.25, + "learning_rate": 1.2684341335567074e-06, + "loss": 1.5563361644744873, + "step": 6402 + }, + { + "epoch": 1.9710680209295168, + "grad_norm": 12.1875, + "learning_rate": 1.2677192394053971e-06, + "loss": 1.6911733150482178, + "step": 6404 + }, + { + "epoch": 1.971683594952293, + "grad_norm": 10.8125, + "learning_rate": 1.2670045803631093e-06, + "loss": 1.5041658878326416, + "step": 6406 + }, + { + "epoch": 1.9722991689750693, + "grad_norm": 8.8125, + "learning_rate": 1.2662901567455077e-06, + "loss": 0.9709604978561401, + "step": 6408 + }, + { + "epoch": 1.9729147429978455, + "grad_norm": 11.0625, + "learning_rate": 1.2655759688681548e-06, + "loss": 1.1826274394989014, + "step": 6410 + }, + { + "epoch": 1.9735303170206218, + "grad_norm": 21.625, + "learning_rate": 1.264862017046509e-06, + "loss": 1.141406774520874, + "step": 6412 + }, + { + "epoch": 1.974145891043398, + "grad_norm": 132.0, + "learning_rate": 1.2641483015959223e-06, + "loss": 1.436084270477295, + "step": 6414 + }, + { + "epoch": 1.9747614650661742, + "grad_norm": 15.625, + "learning_rate": 1.2634348228316442e-06, + "loss": 1.4876971244812012, + "step": 6416 + }, + { + "epoch": 1.9753770390889505, + "grad_norm": 20.875, + "learning_rate": 1.2627215810688196e-06, + "loss": 1.5110372304916382, + "step": 6418 + }, + { + "epoch": 1.9759926131117267, + "grad_norm": 12.625, + "learning_rate": 1.262008576622487e-06, + "loss": 1.2345492839813232, + "step": 6420 + }, + { + "epoch": 1.976608187134503, + "grad_norm": 33.5, + "learning_rate": 1.261295809807582e-06, + "loss": 1.3591794967651367, + "step": 6422 + }, + { + "epoch": 1.9772237611572792, + "grad_norm": 13.75, + "learning_rate": 1.2605832809389347e-06, + "loss": 1.5868587493896484, + "step": 6424 + }, + { + "epoch": 1.9778393351800554, + "grad_norm": 10.0625, + "learning_rate": 1.25987099033127e-06, + "loss": 1.4743338823318481, + "step": 6426 + }, + { + "epoch": 1.9784549092028316, + "grad_norm": 12.125, + "learning_rate": 1.2591589382992066e-06, + "loss": 1.4161126613616943, + "step": 6428 + }, + { + "epoch": 1.9790704832256079, + "grad_norm": 17.875, + "learning_rate": 1.2584471251572596e-06, + "loss": 2.168795108795166, + "step": 6430 + }, + { + "epoch": 1.979686057248384, + "grad_norm": 19.25, + "learning_rate": 1.2577355512198374e-06, + "loss": 1.795354962348938, + "step": 6432 + }, + { + "epoch": 1.9803016312711603, + "grad_norm": 12.5625, + "learning_rate": 1.2570242168012427e-06, + "loss": 1.554661750793457, + "step": 6434 + }, + { + "epoch": 1.9809172052939366, + "grad_norm": 13.6875, + "learning_rate": 1.256313122215673e-06, + "loss": 1.3290802240371704, + "step": 6436 + }, + { + "epoch": 1.9815327793167128, + "grad_norm": 14.4375, + "learning_rate": 1.2556022677772202e-06, + "loss": 1.7011728286743164, + "step": 6438 + }, + { + "epoch": 1.982148353339489, + "grad_norm": 14.875, + "learning_rate": 1.2548916537998682e-06, + "loss": 1.0243396759033203, + "step": 6440 + }, + { + "epoch": 1.9827639273622655, + "grad_norm": 17.375, + "learning_rate": 1.2541812805974969e-06, + "loss": 0.9982007741928101, + "step": 6442 + }, + { + "epoch": 1.9833795013850417, + "grad_norm": 43.25, + "learning_rate": 1.2534711484838791e-06, + "loss": 1.570636510848999, + "step": 6444 + }, + { + "epoch": 1.983995075407818, + "grad_norm": 38.75, + "learning_rate": 1.2527612577726805e-06, + "loss": 0.9931204915046692, + "step": 6446 + }, + { + "epoch": 1.9846106494305942, + "grad_norm": 11.5, + "learning_rate": 1.2520516087774603e-06, + "loss": 1.2048075199127197, + "step": 6448 + }, + { + "epoch": 1.9852262234533704, + "grad_norm": 11.375, + "learning_rate": 1.251342201811672e-06, + "loss": 1.1237223148345947, + "step": 6450 + }, + { + "epoch": 1.9858417974761466, + "grad_norm": 10.5625, + "learning_rate": 1.2506330371886616e-06, + "loss": 1.353630781173706, + "step": 6452 + }, + { + "epoch": 1.9864573714989229, + "grad_norm": 31.5, + "learning_rate": 1.2499241152216673e-06, + "loss": 0.7195751070976257, + "step": 6454 + }, + { + "epoch": 1.987072945521699, + "grad_norm": 6.25, + "learning_rate": 1.2492154362238208e-06, + "loss": 1.288529872894287, + "step": 6456 + }, + { + "epoch": 1.9876885195444753, + "grad_norm": 9.4375, + "learning_rate": 1.2485070005081473e-06, + "loss": 1.4264882802963257, + "step": 6458 + }, + { + "epoch": 1.9883040935672516, + "grad_norm": 17.25, + "learning_rate": 1.2477988083875625e-06, + "loss": 1.44509756565094, + "step": 6460 + }, + { + "epoch": 1.9889196675900278, + "grad_norm": 19.0, + "learning_rate": 1.2470908601748759e-06, + "loss": 1.390662670135498, + "step": 6462 + }, + { + "epoch": 1.989535241612804, + "grad_norm": 44.75, + "learning_rate": 1.24638315618279e-06, + "loss": 1.1538774967193604, + "step": 6464 + }, + { + "epoch": 1.9901508156355803, + "grad_norm": 14.625, + "learning_rate": 1.2456756967238967e-06, + "loss": 1.4914668798446655, + "step": 6466 + }, + { + "epoch": 1.9907663896583565, + "grad_norm": 7.46875, + "learning_rate": 1.2449684821106837e-06, + "loss": 1.0413475036621094, + "step": 6468 + }, + { + "epoch": 1.9913819636811327, + "grad_norm": 15.125, + "learning_rate": 1.2442615126555275e-06, + "loss": 1.5457627773284912, + "step": 6470 + }, + { + "epoch": 1.991997537703909, + "grad_norm": 17.375, + "learning_rate": 1.2435547886706963e-06, + "loss": 1.5767290592193604, + "step": 6472 + }, + { + "epoch": 1.9926131117266852, + "grad_norm": 21.75, + "learning_rate": 1.242848310468353e-06, + "loss": 1.1986875534057617, + "step": 6474 + }, + { + "epoch": 1.9932286857494614, + "grad_norm": 10.3125, + "learning_rate": 1.2421420783605481e-06, + "loss": 1.5655173063278198, + "step": 6476 + }, + { + "epoch": 1.9938442597722377, + "grad_norm": 11.625, + "learning_rate": 1.2414360926592258e-06, + "loss": 1.719822883605957, + "step": 6478 + }, + { + "epoch": 1.994459833795014, + "grad_norm": 23.625, + "learning_rate": 1.2407303536762217e-06, + "loss": 1.5850684642791748, + "step": 6480 + }, + { + "epoch": 1.9950754078177901, + "grad_norm": 248.0, + "learning_rate": 1.2400248617232597e-06, + "loss": 0.8525184392929077, + "step": 6482 + }, + { + "epoch": 1.9956909818405664, + "grad_norm": 6.15625, + "learning_rate": 1.2393196171119575e-06, + "loss": 0.9294012188911438, + "step": 6484 + }, + { + "epoch": 1.9963065558633426, + "grad_norm": 21.875, + "learning_rate": 1.2386146201538224e-06, + "loss": 1.3426337242126465, + "step": 6486 + }, + { + "epoch": 1.9969221298861188, + "grad_norm": 11.25, + "learning_rate": 1.237909871160252e-06, + "loss": 0.5351234078407288, + "step": 6488 + }, + { + "epoch": 1.997537703908895, + "grad_norm": 29.875, + "learning_rate": 1.237205370442535e-06, + "loss": 1.0683671236038208, + "step": 6490 + }, + { + "epoch": 1.9981532779316713, + "grad_norm": 15.8125, + "learning_rate": 1.23650111831185e-06, + "loss": 1.2516847848892212, + "step": 6492 + }, + { + "epoch": 1.9987688519544475, + "grad_norm": 14.125, + "learning_rate": 1.2357971150792667e-06, + "loss": 1.3247123956680298, + "step": 6494 + }, + { + "epoch": 1.9993844259772238, + "grad_norm": 15.5625, + "learning_rate": 1.2350933610557434e-06, + "loss": 1.4294352531433105, + "step": 6496 + }, + { + "epoch": 2.0, + "grad_norm": 15.875, + "learning_rate": 1.2343898565521283e-06, + "loss": 1.3242424726486206, + "step": 6498 + }, + { + "epoch": 2.0006155740227762, + "grad_norm": 3.359375, + "learning_rate": 1.233686601879162e-06, + "loss": 1.1256086826324463, + "step": 6500 + }, + { + "epoch": 2.0012311480455525, + "grad_norm": 11.25, + "learning_rate": 1.2329835973474713e-06, + "loss": 1.5124751329421997, + "step": 6502 + }, + { + "epoch": 2.0018467220683287, + "grad_norm": 127.5, + "learning_rate": 1.2322808432675746e-06, + "loss": 1.2730624675750732, + "step": 6504 + }, + { + "epoch": 2.002462296091105, + "grad_norm": 5.5, + "learning_rate": 1.2315783399498802e-06, + "loss": 1.5110902786254883, + "step": 6506 + }, + { + "epoch": 2.003077870113881, + "grad_norm": 10.75, + "learning_rate": 1.2308760877046833e-06, + "loss": 1.7414963245391846, + "step": 6508 + }, + { + "epoch": 2.0036934441366574, + "grad_norm": 34.75, + "learning_rate": 1.2301740868421696e-06, + "loss": 1.1642667055130005, + "step": 6510 + }, + { + "epoch": 2.0043090181594336, + "grad_norm": 26.125, + "learning_rate": 1.2294723376724145e-06, + "loss": 0.8890253901481628, + "step": 6512 + }, + { + "epoch": 2.00492459218221, + "grad_norm": 31.75, + "learning_rate": 1.2287708405053806e-06, + "loss": 1.2963333129882812, + "step": 6514 + }, + { + "epoch": 2.005540166204986, + "grad_norm": 42.5, + "learning_rate": 1.2280695956509205e-06, + "loss": 1.245513916015625, + "step": 6516 + }, + { + "epoch": 2.0061557402277623, + "grad_norm": 8.3125, + "learning_rate": 1.2273686034187746e-06, + "loss": 1.2022202014923096, + "step": 6518 + }, + { + "epoch": 2.0067713142505386, + "grad_norm": 5.625, + "learning_rate": 1.2266678641185725e-06, + "loss": 1.3870012760162354, + "step": 6520 + }, + { + "epoch": 2.007386888273315, + "grad_norm": 19.875, + "learning_rate": 1.2259673780598306e-06, + "loss": 1.3389239311218262, + "step": 6522 + }, + { + "epoch": 2.008002462296091, + "grad_norm": 7.0, + "learning_rate": 1.2252671455519553e-06, + "loss": 1.1467727422714233, + "step": 6524 + }, + { + "epoch": 2.0086180363188673, + "grad_norm": 6.71875, + "learning_rate": 1.2245671669042399e-06, + "loss": 1.2564332485198975, + "step": 6526 + }, + { + "epoch": 2.0092336103416435, + "grad_norm": 6.15625, + "learning_rate": 1.2238674424258652e-06, + "loss": 1.5031843185424805, + "step": 6528 + }, + { + "epoch": 2.0098491843644197, + "grad_norm": 9.5625, + "learning_rate": 1.2231679724259005e-06, + "loss": 1.339223861694336, + "step": 6530 + }, + { + "epoch": 2.010464758387196, + "grad_norm": 8.4375, + "learning_rate": 1.2224687572133034e-06, + "loss": 1.743408203125, + "step": 6532 + }, + { + "epoch": 2.011080332409972, + "grad_norm": 12.625, + "learning_rate": 1.2217697970969164e-06, + "loss": 1.2714446783065796, + "step": 6534 + }, + { + "epoch": 2.0116959064327484, + "grad_norm": 27.75, + "learning_rate": 1.2210710923854726e-06, + "loss": 1.3177649974822998, + "step": 6536 + }, + { + "epoch": 2.0123114804555247, + "grad_norm": 4.40625, + "learning_rate": 1.2203726433875904e-06, + "loss": 1.0545878410339355, + "step": 6538 + }, + { + "epoch": 2.012927054478301, + "grad_norm": 10.4375, + "learning_rate": 1.219674450411774e-06, + "loss": 0.9841394424438477, + "step": 6540 + }, + { + "epoch": 2.013542628501077, + "grad_norm": 26.5, + "learning_rate": 1.2189765137664182e-06, + "loss": 1.654037356376648, + "step": 6542 + }, + { + "epoch": 2.0141582025238534, + "grad_norm": 45.25, + "learning_rate": 1.2182788337598009e-06, + "loss": 1.1750099658966064, + "step": 6544 + }, + { + "epoch": 2.0147737765466296, + "grad_norm": 14.25, + "learning_rate": 1.2175814107000885e-06, + "loss": 1.3668975830078125, + "step": 6546 + }, + { + "epoch": 2.015389350569406, + "grad_norm": 15.1875, + "learning_rate": 1.2168842448953343e-06, + "loss": 1.1165693998336792, + "step": 6548 + }, + { + "epoch": 2.016004924592182, + "grad_norm": 15.0, + "learning_rate": 1.216187336653476e-06, + "loss": 1.4498151540756226, + "step": 6550 + }, + { + "epoch": 2.0166204986149583, + "grad_norm": 52.5, + "learning_rate": 1.2154906862823402e-06, + "loss": 1.5345754623413086, + "step": 6552 + }, + { + "epoch": 2.0172360726377345, + "grad_norm": 12.75, + "learning_rate": 1.2147942940896367e-06, + "loss": 1.228990077972412, + "step": 6554 + }, + { + "epoch": 2.0178516466605108, + "grad_norm": 9.75, + "learning_rate": 1.2140981603829635e-06, + "loss": 1.6220732927322388, + "step": 6556 + }, + { + "epoch": 2.018467220683287, + "grad_norm": 10.75, + "learning_rate": 1.2134022854698037e-06, + "loss": 1.5125510692596436, + "step": 6558 + }, + { + "epoch": 2.019082794706063, + "grad_norm": 13.875, + "learning_rate": 1.2127066696575252e-06, + "loss": 1.2239012718200684, + "step": 6560 + }, + { + "epoch": 2.0196983687288395, + "grad_norm": 21.375, + "learning_rate": 1.212011313253384e-06, + "loss": 1.0492621660232544, + "step": 6562 + }, + { + "epoch": 2.0203139427516157, + "grad_norm": 23.75, + "learning_rate": 1.211316216564519e-06, + "loss": 1.2073233127593994, + "step": 6564 + }, + { + "epoch": 2.020929516774392, + "grad_norm": 8.375, + "learning_rate": 1.2106213798979539e-06, + "loss": 1.0625733137130737, + "step": 6566 + }, + { + "epoch": 2.021545090797168, + "grad_norm": 24.125, + "learning_rate": 1.209926803560601e-06, + "loss": 1.758016586303711, + "step": 6568 + }, + { + "epoch": 2.0221606648199444, + "grad_norm": 29.5, + "learning_rate": 1.2092324878592537e-06, + "loss": 1.503721833229065, + "step": 6570 + }, + { + "epoch": 2.0227762388427206, + "grad_norm": 16.625, + "learning_rate": 1.2085384331005931e-06, + "loss": 1.3319995403289795, + "step": 6572 + }, + { + "epoch": 2.023391812865497, + "grad_norm": 14.75, + "learning_rate": 1.2078446395911838e-06, + "loss": 1.56700599193573, + "step": 6574 + }, + { + "epoch": 2.0240073868882735, + "grad_norm": 16.25, + "learning_rate": 1.207151107637475e-06, + "loss": 1.5571670532226562, + "step": 6576 + }, + { + "epoch": 2.0246229609110498, + "grad_norm": 11.625, + "learning_rate": 1.2064578375458004e-06, + "loss": 0.8407614231109619, + "step": 6578 + }, + { + "epoch": 2.025238534933826, + "grad_norm": 14.3125, + "learning_rate": 1.2057648296223788e-06, + "loss": 1.2025877237319946, + "step": 6580 + }, + { + "epoch": 2.0258541089566022, + "grad_norm": 8.3125, + "learning_rate": 1.2050720841733117e-06, + "loss": 1.1296197175979614, + "step": 6582 + }, + { + "epoch": 2.0264696829793785, + "grad_norm": 10.3125, + "learning_rate": 1.204379601504586e-06, + "loss": 1.2144582271575928, + "step": 6584 + }, + { + "epoch": 2.0270852570021547, + "grad_norm": 9.6875, + "learning_rate": 1.2036873819220725e-06, + "loss": 1.43599534034729, + "step": 6586 + }, + { + "epoch": 2.027700831024931, + "grad_norm": 19.375, + "learning_rate": 1.2029954257315252e-06, + "loss": 1.6993939876556396, + "step": 6588 + }, + { + "epoch": 2.028316405047707, + "grad_norm": 9.75, + "learning_rate": 1.2023037332385814e-06, + "loss": 1.2860901355743408, + "step": 6590 + }, + { + "epoch": 2.0289319790704834, + "grad_norm": 8.625, + "learning_rate": 1.2016123047487627e-06, + "loss": 1.5420019626617432, + "step": 6592 + }, + { + "epoch": 2.0295475530932596, + "grad_norm": 4.96875, + "learning_rate": 1.2009211405674746e-06, + "loss": 1.373903512954712, + "step": 6594 + }, + { + "epoch": 2.030163127116036, + "grad_norm": 16.25, + "learning_rate": 1.2002302410000039e-06, + "loss": 1.3337182998657227, + "step": 6596 + }, + { + "epoch": 2.030778701138812, + "grad_norm": 7.96875, + "learning_rate": 1.1995396063515227e-06, + "loss": 1.0187783241271973, + "step": 6598 + }, + { + "epoch": 2.0313942751615883, + "grad_norm": 10.8125, + "learning_rate": 1.1988492369270847e-06, + "loss": 1.351269245147705, + "step": 6600 + }, + { + "epoch": 2.0320098491843646, + "grad_norm": 13.0, + "learning_rate": 1.1981591330316262e-06, + "loss": 1.346450686454773, + "step": 6602 + }, + { + "epoch": 2.032625423207141, + "grad_norm": 15.9375, + "learning_rate": 1.1974692949699687e-06, + "loss": 1.3943252563476562, + "step": 6604 + }, + { + "epoch": 2.033240997229917, + "grad_norm": 4.3125, + "learning_rate": 1.196779723046813e-06, + "loss": 1.275787353515625, + "step": 6606 + }, + { + "epoch": 2.0338565712526933, + "grad_norm": 4.65625, + "learning_rate": 1.196090417566744e-06, + "loss": 1.4417996406555176, + "step": 6608 + }, + { + "epoch": 2.0344721452754695, + "grad_norm": 23.75, + "learning_rate": 1.1954013788342285e-06, + "loss": 1.6339545249938965, + "step": 6610 + }, + { + "epoch": 2.0350877192982457, + "grad_norm": 13.0, + "learning_rate": 1.1947126071536165e-06, + "loss": 1.7662523984909058, + "step": 6612 + }, + { + "epoch": 2.035703293321022, + "grad_norm": 5.71875, + "learning_rate": 1.194024102829139e-06, + "loss": 1.236097812652588, + "step": 6614 + }, + { + "epoch": 2.036318867343798, + "grad_norm": 56.25, + "learning_rate": 1.1933358661649086e-06, + "loss": 1.5584526062011719, + "step": 6616 + }, + { + "epoch": 2.0369344413665744, + "grad_norm": 9.0, + "learning_rate": 1.1926478974649205e-06, + "loss": 1.2706172466278076, + "step": 6618 + }, + { + "epoch": 2.0375500153893507, + "grad_norm": 8.0625, + "learning_rate": 1.191960197033052e-06, + "loss": 1.143151879310608, + "step": 6620 + }, + { + "epoch": 2.038165589412127, + "grad_norm": 22.75, + "learning_rate": 1.1912727651730598e-06, + "loss": 1.0128357410430908, + "step": 6622 + }, + { + "epoch": 2.038781163434903, + "grad_norm": 20.75, + "learning_rate": 1.1905856021885842e-06, + "loss": 1.2151031494140625, + "step": 6624 + }, + { + "epoch": 2.0393967374576794, + "grad_norm": 19.0, + "learning_rate": 1.189898708383146e-06, + "loss": 1.426274061203003, + "step": 6626 + }, + { + "epoch": 2.0400123114804556, + "grad_norm": 29.0, + "learning_rate": 1.189212084060146e-06, + "loss": 0.9838099479675293, + "step": 6628 + }, + { + "epoch": 2.040627885503232, + "grad_norm": 58.0, + "learning_rate": 1.1885257295228685e-06, + "loss": 1.4184445142745972, + "step": 6630 + }, + { + "epoch": 2.041243459526008, + "grad_norm": 15.125, + "learning_rate": 1.1878396450744758e-06, + "loss": 1.3893468379974365, + "step": 6632 + }, + { + "epoch": 2.0418590335487843, + "grad_norm": 12.0, + "learning_rate": 1.1871538310180128e-06, + "loss": 0.8931977152824402, + "step": 6634 + }, + { + "epoch": 2.0424746075715605, + "grad_norm": 40.25, + "learning_rate": 1.1864682876564044e-06, + "loss": 1.477832555770874, + "step": 6636 + }, + { + "epoch": 2.0430901815943368, + "grad_norm": 16.5, + "learning_rate": 1.1857830152924553e-06, + "loss": 1.6426715850830078, + "step": 6638 + }, + { + "epoch": 2.043705755617113, + "grad_norm": 14.5625, + "learning_rate": 1.1850980142288515e-06, + "loss": 1.2351162433624268, + "step": 6640 + }, + { + "epoch": 2.044321329639889, + "grad_norm": 11.125, + "learning_rate": 1.184413284768159e-06, + "loss": 1.2899537086486816, + "step": 6642 + }, + { + "epoch": 2.0449369036626655, + "grad_norm": 9.3125, + "learning_rate": 1.183728827212823e-06, + "loss": 1.3072597980499268, + "step": 6644 + }, + { + "epoch": 2.0455524776854417, + "grad_norm": 25.0, + "learning_rate": 1.1830446418651694e-06, + "loss": 1.6979049444198608, + "step": 6646 + }, + { + "epoch": 2.046168051708218, + "grad_norm": 10.5625, + "learning_rate": 1.1823607290274045e-06, + "loss": 1.087388515472412, + "step": 6648 + }, + { + "epoch": 2.046783625730994, + "grad_norm": 51.5, + "learning_rate": 1.181677089001612e-06, + "loss": 1.2746059894561768, + "step": 6650 + }, + { + "epoch": 2.0473991997537704, + "grad_norm": 9.625, + "learning_rate": 1.1809937220897568e-06, + "loss": 1.4777030944824219, + "step": 6652 + }, + { + "epoch": 2.0480147737765466, + "grad_norm": 10.9375, + "learning_rate": 1.1803106285936834e-06, + "loss": 1.0702719688415527, + "step": 6654 + }, + { + "epoch": 2.048630347799323, + "grad_norm": 12.25, + "learning_rate": 1.1796278088151152e-06, + "loss": 1.3066442012786865, + "step": 6656 + }, + { + "epoch": 2.049245921822099, + "grad_norm": 42.0, + "learning_rate": 1.1789452630556535e-06, + "loss": 1.4079071283340454, + "step": 6658 + }, + { + "epoch": 2.0498614958448753, + "grad_norm": 6.53125, + "learning_rate": 1.17826299161678e-06, + "loss": 1.1593713760375977, + "step": 6660 + }, + { + "epoch": 2.0504770698676515, + "grad_norm": 12.0, + "learning_rate": 1.1775809947998552e-06, + "loss": 1.1441447734832764, + "step": 6662 + }, + { + "epoch": 2.051092643890428, + "grad_norm": 27.5, + "learning_rate": 1.1768992729061173e-06, + "loss": 1.1255943775177002, + "step": 6664 + }, + { + "epoch": 2.051708217913204, + "grad_norm": 4.28125, + "learning_rate": 1.1762178262366838e-06, + "loss": 1.6918699741363525, + "step": 6666 + }, + { + "epoch": 2.0523237919359802, + "grad_norm": 7.0625, + "learning_rate": 1.175536655092551e-06, + "loss": 1.2675297260284424, + "step": 6668 + }, + { + "epoch": 2.0529393659587565, + "grad_norm": 24.25, + "learning_rate": 1.1748557597745918e-06, + "loss": 1.716539978981018, + "step": 6670 + }, + { + "epoch": 2.0535549399815327, + "grad_norm": 14.1875, + "learning_rate": 1.1741751405835593e-06, + "loss": 0.9748501777648926, + "step": 6672 + }, + { + "epoch": 2.054170514004309, + "grad_norm": 11.125, + "learning_rate": 1.1734947978200835e-06, + "loss": 1.3657870292663574, + "step": 6674 + }, + { + "epoch": 2.054786088027085, + "grad_norm": 19.875, + "learning_rate": 1.1728147317846733e-06, + "loss": 1.5253214836120605, + "step": 6676 + }, + { + "epoch": 2.0554016620498614, + "grad_norm": 12.5625, + "learning_rate": 1.1721349427777133e-06, + "loss": 1.2408127784729004, + "step": 6678 + }, + { + "epoch": 2.0560172360726376, + "grad_norm": 9.8125, + "learning_rate": 1.171455431099468e-06, + "loss": 1.2663090229034424, + "step": 6680 + }, + { + "epoch": 2.056632810095414, + "grad_norm": 9.125, + "learning_rate": 1.1707761970500787e-06, + "loss": 0.8668129444122314, + "step": 6682 + }, + { + "epoch": 2.05724838411819, + "grad_norm": 10.875, + "learning_rate": 1.1700972409295631e-06, + "loss": 1.4875694513320923, + "step": 6684 + }, + { + "epoch": 2.0578639581409663, + "grad_norm": 22.125, + "learning_rate": 1.1694185630378171e-06, + "loss": 1.2524561882019043, + "step": 6686 + }, + { + "epoch": 2.0584795321637426, + "grad_norm": 17.0, + "learning_rate": 1.1687401636746143e-06, + "loss": 1.4323711395263672, + "step": 6688 + }, + { + "epoch": 2.059095106186519, + "grad_norm": 36.25, + "learning_rate": 1.1680620431396033e-06, + "loss": 1.189788818359375, + "step": 6690 + }, + { + "epoch": 2.059710680209295, + "grad_norm": 12.5, + "learning_rate": 1.1673842017323112e-06, + "loss": 1.1640629768371582, + "step": 6692 + }, + { + "epoch": 2.0603262542320713, + "grad_norm": 24.0, + "learning_rate": 1.1667066397521418e-06, + "loss": 1.3109760284423828, + "step": 6694 + }, + { + "epoch": 2.0609418282548475, + "grad_norm": 7.625, + "learning_rate": 1.1660293574983739e-06, + "loss": 1.2151691913604736, + "step": 6696 + }, + { + "epoch": 2.0615574022776237, + "grad_norm": 7.125, + "learning_rate": 1.165352355270165e-06, + "loss": 1.3004823923110962, + "step": 6698 + }, + { + "epoch": 2.0621729763004, + "grad_norm": 12.5625, + "learning_rate": 1.1646756333665472e-06, + "loss": 1.4980103969573975, + "step": 6700 + }, + { + "epoch": 2.062788550323176, + "grad_norm": 5.25, + "learning_rate": 1.1639991920864292e-06, + "loss": 1.001665711402893, + "step": 6702 + }, + { + "epoch": 2.0634041243459524, + "grad_norm": 22.625, + "learning_rate": 1.1633230317285967e-06, + "loss": 1.5311155319213867, + "step": 6704 + }, + { + "epoch": 2.0640196983687287, + "grad_norm": 12.125, + "learning_rate": 1.1626471525917093e-06, + "loss": 1.5896987915039062, + "step": 6706 + }, + { + "epoch": 2.064635272391505, + "grad_norm": 13.5625, + "learning_rate": 1.1619715549743045e-06, + "loss": 1.5665377378463745, + "step": 6708 + }, + { + "epoch": 2.065250846414281, + "grad_norm": 27.25, + "learning_rate": 1.1612962391747944e-06, + "loss": 1.6170427799224854, + "step": 6710 + }, + { + "epoch": 2.0658664204370574, + "grad_norm": 40.75, + "learning_rate": 1.1606212054914662e-06, + "loss": 1.57864248752594, + "step": 6712 + }, + { + "epoch": 2.0664819944598336, + "grad_norm": 59.0, + "learning_rate": 1.1599464542224836e-06, + "loss": 1.138413906097412, + "step": 6714 + }, + { + "epoch": 2.06709756848261, + "grad_norm": 13.9375, + "learning_rate": 1.1592719856658846e-06, + "loss": 1.4642479419708252, + "step": 6716 + }, + { + "epoch": 2.067713142505386, + "grad_norm": 12.875, + "learning_rate": 1.1585978001195838e-06, + "loss": 1.3136050701141357, + "step": 6718 + }, + { + "epoch": 2.0683287165281623, + "grad_norm": 3.21875, + "learning_rate": 1.1579238978813686e-06, + "loss": 1.2860009670257568, + "step": 6720 + }, + { + "epoch": 2.0689442905509385, + "grad_norm": 15.0, + "learning_rate": 1.1572502792489018e-06, + "loss": 1.012883186340332, + "step": 6722 + }, + { + "epoch": 2.0695598645737148, + "grad_norm": 5.0, + "learning_rate": 1.1565769445197234e-06, + "loss": 1.2962499856948853, + "step": 6724 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 33.75, + "learning_rate": 1.1559038939912448e-06, + "loss": 1.3131591081619263, + "step": 6726 + }, + { + "epoch": 2.0707910126192677, + "grad_norm": 29.875, + "learning_rate": 1.1552311279607536e-06, + "loss": 1.5089943408966064, + "step": 6728 + }, + { + "epoch": 2.071406586642044, + "grad_norm": 23.5, + "learning_rate": 1.1545586467254113e-06, + "loss": 1.2558484077453613, + "step": 6730 + }, + { + "epoch": 2.07202216066482, + "grad_norm": 9.6875, + "learning_rate": 1.1538864505822537e-06, + "loss": 1.2030107975006104, + "step": 6732 + }, + { + "epoch": 2.0726377346875964, + "grad_norm": 4.15625, + "learning_rate": 1.1532145398281904e-06, + "loss": 1.2497684955596924, + "step": 6734 + }, + { + "epoch": 2.0732533087103726, + "grad_norm": 10.25, + "learning_rate": 1.1525429147600054e-06, + "loss": 1.3862303495407104, + "step": 6736 + }, + { + "epoch": 2.073868882733149, + "grad_norm": 13.4375, + "learning_rate": 1.1518715756743558e-06, + "loss": 1.1934767961502075, + "step": 6738 + }, + { + "epoch": 2.074484456755925, + "grad_norm": 3.6875, + "learning_rate": 1.1512005228677735e-06, + "loss": 1.0756824016571045, + "step": 6740 + }, + { + "epoch": 2.0751000307787013, + "grad_norm": 12.375, + "learning_rate": 1.1505297566366623e-06, + "loss": 1.2047674655914307, + "step": 6742 + }, + { + "epoch": 2.0757156048014775, + "grad_norm": 19.75, + "learning_rate": 1.149859277277302e-06, + "loss": 1.3884174823760986, + "step": 6744 + }, + { + "epoch": 2.0763311788242538, + "grad_norm": 12.9375, + "learning_rate": 1.1491890850858426e-06, + "loss": 1.3445823192596436, + "step": 6746 + }, + { + "epoch": 2.07694675284703, + "grad_norm": 8.25, + "learning_rate": 1.1485191803583091e-06, + "loss": 1.028502345085144, + "step": 6748 + }, + { + "epoch": 2.0775623268698062, + "grad_norm": 12.125, + "learning_rate": 1.1478495633906003e-06, + "loss": 1.3428664207458496, + "step": 6750 + }, + { + "epoch": 2.0781779008925825, + "grad_norm": 10.875, + "learning_rate": 1.147180234478485e-06, + "loss": 1.5432965755462646, + "step": 6752 + }, + { + "epoch": 2.0787934749153587, + "grad_norm": 26.125, + "learning_rate": 1.1465111939176077e-06, + "loss": 1.7907958030700684, + "step": 6754 + }, + { + "epoch": 2.079409048938135, + "grad_norm": 9.75, + "learning_rate": 1.1458424420034846e-06, + "loss": 1.359710931777954, + "step": 6756 + }, + { + "epoch": 2.080024622960911, + "grad_norm": 5.65625, + "learning_rate": 1.1451739790315026e-06, + "loss": 1.5104893445968628, + "step": 6758 + }, + { + "epoch": 2.0806401969836874, + "grad_norm": 29.875, + "learning_rate": 1.1445058052969246e-06, + "loss": 1.3971478939056396, + "step": 6760 + }, + { + "epoch": 2.0812557710064636, + "grad_norm": 16.375, + "learning_rate": 1.143837921094883e-06, + "loss": 1.4559658765792847, + "step": 6762 + }, + { + "epoch": 2.08187134502924, + "grad_norm": 13.375, + "learning_rate": 1.1431703267203817e-06, + "loss": 1.188551664352417, + "step": 6764 + }, + { + "epoch": 2.082486919052016, + "grad_norm": 9.1875, + "learning_rate": 1.1425030224682998e-06, + "loss": 1.3196886777877808, + "step": 6766 + }, + { + "epoch": 2.0831024930747923, + "grad_norm": 4.84375, + "learning_rate": 1.1418360086333852e-06, + "loss": 1.2066409587860107, + "step": 6768 + }, + { + "epoch": 2.0837180670975686, + "grad_norm": 13.875, + "learning_rate": 1.141169285510259e-06, + "loss": 1.4319138526916504, + "step": 6770 + }, + { + "epoch": 2.084333641120345, + "grad_norm": 24.75, + "learning_rate": 1.1405028533934138e-06, + "loss": 0.9755879044532776, + "step": 6772 + }, + { + "epoch": 2.084949215143121, + "grad_norm": 3.375, + "learning_rate": 1.1398367125772132e-06, + "loss": 1.4275426864624023, + "step": 6774 + }, + { + "epoch": 2.0855647891658973, + "grad_norm": 11.25, + "learning_rate": 1.1391708633558924e-06, + "loss": 1.2716853618621826, + "step": 6776 + }, + { + "epoch": 2.0861803631886735, + "grad_norm": 17.25, + "learning_rate": 1.1385053060235576e-06, + "loss": 1.4581801891326904, + "step": 6778 + }, + { + "epoch": 2.0867959372114497, + "grad_norm": 13.6875, + "learning_rate": 1.1378400408741862e-06, + "loss": 1.4064788818359375, + "step": 6780 + }, + { + "epoch": 2.087411511234226, + "grad_norm": 27.375, + "learning_rate": 1.1371750682016272e-06, + "loss": 1.2097954750061035, + "step": 6782 + }, + { + "epoch": 2.088027085257002, + "grad_norm": 22.375, + "learning_rate": 1.1365103882995986e-06, + "loss": 1.9022862911224365, + "step": 6784 + }, + { + "epoch": 2.0886426592797784, + "grad_norm": 12.0625, + "learning_rate": 1.1358460014616915e-06, + "loss": 0.9254089593887329, + "step": 6786 + }, + { + "epoch": 2.0892582333025547, + "grad_norm": 16.5, + "learning_rate": 1.135181907981366e-06, + "loss": 1.5083138942718506, + "step": 6788 + }, + { + "epoch": 2.089873807325331, + "grad_norm": 16.625, + "learning_rate": 1.1345181081519521e-06, + "loss": 0.9767194986343384, + "step": 6790 + }, + { + "epoch": 2.090489381348107, + "grad_norm": 12.875, + "learning_rate": 1.1338546022666525e-06, + "loss": 1.3762192726135254, + "step": 6792 + }, + { + "epoch": 2.0911049553708834, + "grad_norm": 8.6875, + "learning_rate": 1.133191390618537e-06, + "loss": 1.3783683776855469, + "step": 6794 + }, + { + "epoch": 2.0917205293936596, + "grad_norm": 18.0, + "learning_rate": 1.1325284735005478e-06, + "loss": 0.9821135997772217, + "step": 6796 + }, + { + "epoch": 2.092336103416436, + "grad_norm": 20.5, + "learning_rate": 1.1318658512054961e-06, + "loss": 1.3283894062042236, + "step": 6798 + }, + { + "epoch": 2.092951677439212, + "grad_norm": 13.4375, + "learning_rate": 1.1312035240260623e-06, + "loss": 0.9971583485603333, + "step": 6800 + }, + { + "epoch": 2.0935672514619883, + "grad_norm": 30.375, + "learning_rate": 1.1305414922547976e-06, + "loss": 1.8160250186920166, + "step": 6802 + }, + { + "epoch": 2.0941828254847645, + "grad_norm": 24.875, + "learning_rate": 1.129879756184123e-06, + "loss": 1.6659497022628784, + "step": 6804 + }, + { + "epoch": 2.0947983995075408, + "grad_norm": 7.25, + "learning_rate": 1.1292183161063262e-06, + "loss": 0.7340489029884338, + "step": 6806 + }, + { + "epoch": 2.095413973530317, + "grad_norm": 25.125, + "learning_rate": 1.1285571723135673e-06, + "loss": 1.9677259922027588, + "step": 6808 + }, + { + "epoch": 2.0960295475530932, + "grad_norm": 21.0, + "learning_rate": 1.127896325097874e-06, + "loss": 1.7345890998840332, + "step": 6810 + }, + { + "epoch": 2.0966451215758695, + "grad_norm": 26.5, + "learning_rate": 1.1272357747511437e-06, + "loss": 1.692164659500122, + "step": 6812 + }, + { + "epoch": 2.0972606955986457, + "grad_norm": 4.25, + "learning_rate": 1.1265755215651414e-06, + "loss": 0.6040706038475037, + "step": 6814 + }, + { + "epoch": 2.097876269621422, + "grad_norm": 26.75, + "learning_rate": 1.125915565831502e-06, + "loss": 1.1488302946090698, + "step": 6816 + }, + { + "epoch": 2.098491843644198, + "grad_norm": 10.8125, + "learning_rate": 1.1252559078417293e-06, + "loss": 0.9155806303024292, + "step": 6818 + }, + { + "epoch": 2.0991074176669744, + "grad_norm": 9.125, + "learning_rate": 1.124596547887194e-06, + "loss": 1.2064423561096191, + "step": 6820 + }, + { + "epoch": 2.0997229916897506, + "grad_norm": 10.4375, + "learning_rate": 1.123937486259137e-06, + "loss": 1.3566370010375977, + "step": 6822 + }, + { + "epoch": 2.100338565712527, + "grad_norm": 32.5, + "learning_rate": 1.1232787232486663e-06, + "loss": 1.3902403116226196, + "step": 6824 + }, + { + "epoch": 2.100954139735303, + "grad_norm": 24.5, + "learning_rate": 1.1226202591467575e-06, + "loss": 1.4831122159957886, + "step": 6826 + }, + { + "epoch": 2.1015697137580793, + "grad_norm": 18.125, + "learning_rate": 1.1219620942442565e-06, + "loss": 1.1349093914031982, + "step": 6828 + }, + { + "epoch": 2.1021852877808556, + "grad_norm": 9.0625, + "learning_rate": 1.1213042288318747e-06, + "loss": 1.1530771255493164, + "step": 6830 + }, + { + "epoch": 2.102800861803632, + "grad_norm": 23.875, + "learning_rate": 1.1206466632001915e-06, + "loss": 1.4058551788330078, + "step": 6832 + }, + { + "epoch": 2.103416435826408, + "grad_norm": 3.859375, + "learning_rate": 1.1199893976396548e-06, + "loss": 1.1480472087860107, + "step": 6834 + }, + { + "epoch": 2.1040320098491843, + "grad_norm": 12.9375, + "learning_rate": 1.1193324324405795e-06, + "loss": 1.0479693412780762, + "step": 6836 + }, + { + "epoch": 2.1046475838719605, + "grad_norm": 17.125, + "learning_rate": 1.1186757678931484e-06, + "loss": 1.5369406938552856, + "step": 6838 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 15.0625, + "learning_rate": 1.1180194042874104e-06, + "loss": 1.1449368000030518, + "step": 6840 + }, + { + "epoch": 2.105878731917513, + "grad_norm": 13.125, + "learning_rate": 1.117363341913282e-06, + "loss": 1.3344638347625732, + "step": 6842 + }, + { + "epoch": 2.106494305940289, + "grad_norm": 4.6875, + "learning_rate": 1.1167075810605473e-06, + "loss": 1.231259822845459, + "step": 6844 + }, + { + "epoch": 2.1071098799630654, + "grad_norm": 11.6875, + "learning_rate": 1.1160521220188558e-06, + "loss": 1.2796053886413574, + "step": 6846 + }, + { + "epoch": 2.1077254539858417, + "grad_norm": 20.625, + "learning_rate": 1.1153969650777249e-06, + "loss": 0.9106309413909912, + "step": 6848 + }, + { + "epoch": 2.108341028008618, + "grad_norm": 4.9375, + "learning_rate": 1.114742110526539e-06, + "loss": 1.1471333503723145, + "step": 6850 + }, + { + "epoch": 2.108956602031394, + "grad_norm": 8.8125, + "learning_rate": 1.114087558654546e-06, + "loss": 1.4695062637329102, + "step": 6852 + }, + { + "epoch": 2.1095721760541704, + "grad_norm": 11.25, + "learning_rate": 1.1134333097508647e-06, + "loss": 1.306732177734375, + "step": 6854 + }, + { + "epoch": 2.1101877500769466, + "grad_norm": 12.5625, + "learning_rate": 1.1127793641044763e-06, + "loss": 1.2547025680541992, + "step": 6856 + }, + { + "epoch": 2.110803324099723, + "grad_norm": 8.875, + "learning_rate": 1.1121257220042286e-06, + "loss": 1.3866357803344727, + "step": 6858 + }, + { + "epoch": 2.111418898122499, + "grad_norm": 24.125, + "learning_rate": 1.1114723837388378e-06, + "loss": 1.4638195037841797, + "step": 6860 + }, + { + "epoch": 2.1120344721452753, + "grad_norm": 10.125, + "learning_rate": 1.110819349596883e-06, + "loss": 1.4681785106658936, + "step": 6862 + }, + { + "epoch": 2.1126500461680515, + "grad_norm": 7.09375, + "learning_rate": 1.11016661986681e-06, + "loss": 0.5226856470108032, + "step": 6864 + }, + { + "epoch": 2.1132656201908278, + "grad_norm": 24.875, + "learning_rate": 1.1095141948369316e-06, + "loss": 1.569403052330017, + "step": 6866 + }, + { + "epoch": 2.113881194213604, + "grad_norm": 19.5, + "learning_rate": 1.108862074795423e-06, + "loss": 1.3763833045959473, + "step": 6868 + }, + { + "epoch": 2.11449676823638, + "grad_norm": 7.0625, + "learning_rate": 1.1082102600303272e-06, + "loss": 0.9807989001274109, + "step": 6870 + }, + { + "epoch": 2.1151123422591565, + "grad_norm": 11.625, + "learning_rate": 1.107558750829552e-06, + "loss": 1.6106832027435303, + "step": 6872 + }, + { + "epoch": 2.1157279162819327, + "grad_norm": 8.125, + "learning_rate": 1.1069075474808685e-06, + "loss": 1.1969914436340332, + "step": 6874 + }, + { + "epoch": 2.1163434903047094, + "grad_norm": 18.5, + "learning_rate": 1.1062566502719151e-06, + "loss": 1.3149508237838745, + "step": 6876 + }, + { + "epoch": 2.116959064327485, + "grad_norm": 6.09375, + "learning_rate": 1.1056060594901931e-06, + "loss": 1.1083767414093018, + "step": 6878 + }, + { + "epoch": 2.117574638350262, + "grad_norm": 3.71875, + "learning_rate": 1.1049557754230703e-06, + "loss": 1.1356470584869385, + "step": 6880 + }, + { + "epoch": 2.118190212373038, + "grad_norm": 23.625, + "learning_rate": 1.1043057983577762e-06, + "loss": 1.579628825187683, + "step": 6882 + }, + { + "epoch": 2.1188057863958143, + "grad_norm": 16.625, + "learning_rate": 1.1036561285814077e-06, + "loss": 1.2171895503997803, + "step": 6884 + }, + { + "epoch": 2.1194213604185905, + "grad_norm": 11.0625, + "learning_rate": 1.1030067663809247e-06, + "loss": 1.6301791667938232, + "step": 6886 + }, + { + "epoch": 2.1200369344413668, + "grad_norm": 15.0, + "learning_rate": 1.1023577120431505e-06, + "loss": 0.9143880605697632, + "step": 6888 + }, + { + "epoch": 2.120652508464143, + "grad_norm": 25.0, + "learning_rate": 1.1017089658547733e-06, + "loss": 1.3826243877410889, + "step": 6890 + }, + { + "epoch": 2.1212680824869192, + "grad_norm": 7.03125, + "learning_rate": 1.1010605281023458e-06, + "loss": 1.3522517681121826, + "step": 6892 + }, + { + "epoch": 2.1218836565096955, + "grad_norm": 15.3125, + "learning_rate": 1.1004123990722829e-06, + "loss": 1.2374582290649414, + "step": 6894 + }, + { + "epoch": 2.1224992305324717, + "grad_norm": 8.3125, + "learning_rate": 1.0997645790508637e-06, + "loss": 1.2146995067596436, + "step": 6896 + }, + { + "epoch": 2.123114804555248, + "grad_norm": 17.125, + "learning_rate": 1.0991170683242324e-06, + "loss": 1.3192272186279297, + "step": 6898 + }, + { + "epoch": 2.123730378578024, + "grad_norm": 15.8125, + "learning_rate": 1.0984698671783936e-06, + "loss": 1.4727823734283447, + "step": 6900 + }, + { + "epoch": 2.1243459526008004, + "grad_norm": 28.375, + "learning_rate": 1.0978229758992177e-06, + "loss": 0.9449286460876465, + "step": 6902 + }, + { + "epoch": 2.1249615266235766, + "grad_norm": 24.5, + "learning_rate": 1.097176394772437e-06, + "loss": 1.1769983768463135, + "step": 6904 + }, + { + "epoch": 2.125577100646353, + "grad_norm": 16.125, + "learning_rate": 1.0965301240836481e-06, + "loss": 1.314269781112671, + "step": 6906 + }, + { + "epoch": 2.126192674669129, + "grad_norm": 8.0, + "learning_rate": 1.0958841641183077e-06, + "loss": 1.2756083011627197, + "step": 6908 + }, + { + "epoch": 2.1268082486919053, + "grad_norm": 39.25, + "learning_rate": 1.0952385151617384e-06, + "loss": 1.1773420572280884, + "step": 6910 + }, + { + "epoch": 2.1274238227146816, + "grad_norm": 28.75, + "learning_rate": 1.094593177499124e-06, + "loss": 1.4545814990997314, + "step": 6912 + }, + { + "epoch": 2.128039396737458, + "grad_norm": 15.5, + "learning_rate": 1.0939481514155102e-06, + "loss": 1.445265769958496, + "step": 6914 + }, + { + "epoch": 2.128654970760234, + "grad_norm": 35.0, + "learning_rate": 1.0933034371958061e-06, + "loss": 0.7057738304138184, + "step": 6916 + }, + { + "epoch": 2.1292705447830103, + "grad_norm": 19.125, + "learning_rate": 1.0926590351247835e-06, + "loss": 1.3218157291412354, + "step": 6918 + }, + { + "epoch": 2.1298861188057865, + "grad_norm": 34.25, + "learning_rate": 1.0920149454870736e-06, + "loss": 1.4286811351776123, + "step": 6920 + }, + { + "epoch": 2.1305016928285627, + "grad_norm": 10.5, + "learning_rate": 1.0913711685671738e-06, + "loss": 0.9320307970046997, + "step": 6922 + }, + { + "epoch": 2.131117266851339, + "grad_norm": 9.875, + "learning_rate": 1.0907277046494398e-06, + "loss": 1.3198941946029663, + "step": 6924 + }, + { + "epoch": 2.131732840874115, + "grad_norm": 32.25, + "learning_rate": 1.09008455401809e-06, + "loss": 1.175795555114746, + "step": 6926 + }, + { + "epoch": 2.1323484148968914, + "grad_norm": 12.1875, + "learning_rate": 1.089441716957206e-06, + "loss": 1.2813923358917236, + "step": 6928 + }, + { + "epoch": 2.1329639889196677, + "grad_norm": 13.375, + "learning_rate": 1.0887991937507287e-06, + "loss": 1.3827874660491943, + "step": 6930 + }, + { + "epoch": 2.133579562942444, + "grad_norm": 55.0, + "learning_rate": 1.0881569846824615e-06, + "loss": 1.8015201091766357, + "step": 6932 + }, + { + "epoch": 2.13419513696522, + "grad_norm": 35.25, + "learning_rate": 1.0875150900360695e-06, + "loss": 1.8140422105789185, + "step": 6934 + }, + { + "epoch": 2.1348107109879964, + "grad_norm": 12.8125, + "learning_rate": 1.0868735100950775e-06, + "loss": 1.3741447925567627, + "step": 6936 + }, + { + "epoch": 2.1354262850107726, + "grad_norm": 10.875, + "learning_rate": 1.0862322451428725e-06, + "loss": 0.892223596572876, + "step": 6938 + }, + { + "epoch": 2.136041859033549, + "grad_norm": 25.25, + "learning_rate": 1.085591295462702e-06, + "loss": 1.5447278022766113, + "step": 6940 + }, + { + "epoch": 2.136657433056325, + "grad_norm": 15.3125, + "learning_rate": 1.084950661337674e-06, + "loss": 1.270053744316101, + "step": 6942 + }, + { + "epoch": 2.1372730070791013, + "grad_norm": 17.75, + "learning_rate": 1.0843103430507579e-06, + "loss": 1.3592324256896973, + "step": 6944 + }, + { + "epoch": 2.1378885811018775, + "grad_norm": 53.5, + "learning_rate": 1.0836703408847815e-06, + "loss": 1.8099262714385986, + "step": 6946 + }, + { + "epoch": 2.1385041551246537, + "grad_norm": 9.5, + "learning_rate": 1.0830306551224365e-06, + "loss": 1.254908561706543, + "step": 6948 + }, + { + "epoch": 2.13911972914743, + "grad_norm": 37.5, + "learning_rate": 1.0823912860462715e-06, + "loss": 1.4922285079956055, + "step": 6950 + }, + { + "epoch": 2.139735303170206, + "grad_norm": 15.0625, + "learning_rate": 1.0817522339386967e-06, + "loss": 1.1510124206542969, + "step": 6952 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 14.25, + "learning_rate": 1.081113499081983e-06, + "loss": 1.4475257396697998, + "step": 6954 + }, + { + "epoch": 2.1409664512157587, + "grad_norm": 16.875, + "learning_rate": 1.080475081758259e-06, + "loss": 1.4003031253814697, + "step": 6956 + }, + { + "epoch": 2.141582025238535, + "grad_norm": 13.125, + "learning_rate": 1.0798369822495153e-06, + "loss": 1.8019049167633057, + "step": 6958 + }, + { + "epoch": 2.142197599261311, + "grad_norm": 13.1875, + "learning_rate": 1.0791992008376013e-06, + "loss": 1.6473685503005981, + "step": 6960 + }, + { + "epoch": 2.1428131732840874, + "grad_norm": 25.125, + "learning_rate": 1.078561737804225e-06, + "loss": 1.3453480005264282, + "step": 6962 + }, + { + "epoch": 2.1434287473068636, + "grad_norm": 11.75, + "learning_rate": 1.077924593430955e-06, + "loss": 1.3464794158935547, + "step": 6964 + }, + { + "epoch": 2.14404432132964, + "grad_norm": 14.25, + "learning_rate": 1.077287767999219e-06, + "loss": 1.691328525543213, + "step": 6966 + }, + { + "epoch": 2.144659895352416, + "grad_norm": 12.1875, + "learning_rate": 1.0766512617903028e-06, + "loss": 1.588313341140747, + "step": 6968 + }, + { + "epoch": 2.1452754693751923, + "grad_norm": 17.625, + "learning_rate": 1.0760150750853522e-06, + "loss": 1.3188862800598145, + "step": 6970 + }, + { + "epoch": 2.1458910433979685, + "grad_norm": 11.625, + "learning_rate": 1.0753792081653718e-06, + "loss": 1.4394432306289673, + "step": 6972 + }, + { + "epoch": 2.146506617420745, + "grad_norm": 10.1875, + "learning_rate": 1.0747436613112253e-06, + "loss": 1.6718394756317139, + "step": 6974 + }, + { + "epoch": 2.147122191443521, + "grad_norm": 8.75, + "learning_rate": 1.074108434803633e-06, + "loss": 0.9956381320953369, + "step": 6976 + }, + { + "epoch": 2.1477377654662972, + "grad_norm": 10.4375, + "learning_rate": 1.0734735289231762e-06, + "loss": 1.2394037246704102, + "step": 6978 + }, + { + "epoch": 2.1483533394890735, + "grad_norm": 18.75, + "learning_rate": 1.0728389439502937e-06, + "loss": 1.2736289501190186, + "step": 6980 + }, + { + "epoch": 2.1489689135118497, + "grad_norm": 7.75, + "learning_rate": 1.0722046801652819e-06, + "loss": 1.2063171863555908, + "step": 6982 + }, + { + "epoch": 2.149584487534626, + "grad_norm": 21.25, + "learning_rate": 1.0715707378482955e-06, + "loss": 1.4747381210327148, + "step": 6984 + }, + { + "epoch": 2.150200061557402, + "grad_norm": 13.375, + "learning_rate": 1.0709371172793488e-06, + "loss": 1.5591285228729248, + "step": 6986 + }, + { + "epoch": 2.1508156355801784, + "grad_norm": 26.75, + "learning_rate": 1.0703038187383112e-06, + "loss": 1.4576010704040527, + "step": 6988 + }, + { + "epoch": 2.1514312096029546, + "grad_norm": 4.65625, + "learning_rate": 1.069670842504913e-06, + "loss": 1.1618280410766602, + "step": 6990 + }, + { + "epoch": 2.152046783625731, + "grad_norm": 14.3125, + "learning_rate": 1.069038188858739e-06, + "loss": 0.8342984914779663, + "step": 6992 + }, + { + "epoch": 2.152662357648507, + "grad_norm": 35.25, + "learning_rate": 1.0684058580792339e-06, + "loss": 1.4440381526947021, + "step": 6994 + }, + { + "epoch": 2.1532779316712833, + "grad_norm": 21.5, + "learning_rate": 1.0677738504456992e-06, + "loss": 1.6453757286071777, + "step": 6996 + }, + { + "epoch": 2.1538935056940596, + "grad_norm": 12.6875, + "learning_rate": 1.0671421662372927e-06, + "loss": 1.1821353435516357, + "step": 6998 + }, + { + "epoch": 2.154509079716836, + "grad_norm": 11.6875, + "learning_rate": 1.0665108057330306e-06, + "loss": 1.4075431823730469, + "step": 7000 + }, + { + "epoch": 2.155124653739612, + "grad_norm": 28.5, + "learning_rate": 1.0658797692117847e-06, + "loss": 1.6227091550827026, + "step": 7002 + }, + { + "epoch": 2.1557402277623883, + "grad_norm": 15.125, + "learning_rate": 1.065249056952285e-06, + "loss": 1.389638900756836, + "step": 7004 + }, + { + "epoch": 2.1563558017851645, + "grad_norm": 14.9375, + "learning_rate": 1.0646186692331187e-06, + "loss": 1.2804958820343018, + "step": 7006 + }, + { + "epoch": 2.1569713758079407, + "grad_norm": 9.0, + "learning_rate": 1.0639886063327275e-06, + "loss": 1.4667779207229614, + "step": 7008 + }, + { + "epoch": 2.157586949830717, + "grad_norm": 62.0, + "learning_rate": 1.0633588685294114e-06, + "loss": 1.7286169528961182, + "step": 7010 + }, + { + "epoch": 2.158202523853493, + "grad_norm": 12.375, + "learning_rate": 1.0627294561013269e-06, + "loss": 1.4651620388031006, + "step": 7012 + }, + { + "epoch": 2.1588180978762694, + "grad_norm": 9.375, + "learning_rate": 1.0621003693264845e-06, + "loss": 1.2908897399902344, + "step": 7014 + }, + { + "epoch": 2.1594336718990457, + "grad_norm": 10.3125, + "learning_rate": 1.0614716084827546e-06, + "loss": 1.427079200744629, + "step": 7016 + }, + { + "epoch": 2.160049245921822, + "grad_norm": 21.875, + "learning_rate": 1.0608431738478604e-06, + "loss": 1.3272745609283447, + "step": 7018 + }, + { + "epoch": 2.160664819944598, + "grad_norm": 26.625, + "learning_rate": 1.0602150656993821e-06, + "loss": 1.7375531196594238, + "step": 7020 + }, + { + "epoch": 2.1612803939673744, + "grad_norm": 17.375, + "learning_rate": 1.0595872843147568e-06, + "loss": 1.364192247390747, + "step": 7022 + }, + { + "epoch": 2.1618959679901506, + "grad_norm": 20.375, + "learning_rate": 1.0589598299712753e-06, + "loss": 1.699464201927185, + "step": 7024 + }, + { + "epoch": 2.1625115420129273, + "grad_norm": 10.6875, + "learning_rate": 1.058332702946085e-06, + "loss": 1.4647746086120605, + "step": 7026 + }, + { + "epoch": 2.163127116035703, + "grad_norm": 24.625, + "learning_rate": 1.0577059035161893e-06, + "loss": 0.9580203294754028, + "step": 7028 + }, + { + "epoch": 2.1637426900584797, + "grad_norm": 11.375, + "learning_rate": 1.0570794319584451e-06, + "loss": 1.3357770442962646, + "step": 7030 + }, + { + "epoch": 2.164358264081256, + "grad_norm": 80.0, + "learning_rate": 1.0564532885495665e-06, + "loss": 1.3333418369293213, + "step": 7032 + }, + { + "epoch": 2.164973838104032, + "grad_norm": 35.0, + "learning_rate": 1.0558274735661214e-06, + "loss": 1.3418631553649902, + "step": 7034 + }, + { + "epoch": 2.1655894121268084, + "grad_norm": 26.125, + "learning_rate": 1.0552019872845336e-06, + "loss": 1.6847832202911377, + "step": 7036 + }, + { + "epoch": 2.1662049861495847, + "grad_norm": 7.78125, + "learning_rate": 1.05457682998108e-06, + "loss": 1.330476999282837, + "step": 7038 + }, + { + "epoch": 2.166820560172361, + "grad_norm": 10.8125, + "learning_rate": 1.0539520019318943e-06, + "loss": 1.4650633335113525, + "step": 7040 + }, + { + "epoch": 2.167436134195137, + "grad_norm": 17.5, + "learning_rate": 1.0533275034129636e-06, + "loss": 1.2590150833129883, + "step": 7042 + }, + { + "epoch": 2.1680517082179134, + "grad_norm": 10.3125, + "learning_rate": 1.0527033347001288e-06, + "loss": 0.8662137985229492, + "step": 7044 + }, + { + "epoch": 2.1686672822406896, + "grad_norm": 9.5625, + "learning_rate": 1.0520794960690866e-06, + "loss": 1.225198745727539, + "step": 7046 + }, + { + "epoch": 2.169282856263466, + "grad_norm": 11.375, + "learning_rate": 1.0514559877953876e-06, + "loss": 1.300356149673462, + "step": 7048 + }, + { + "epoch": 2.169898430286242, + "grad_norm": 43.25, + "learning_rate": 1.0508328101544355e-06, + "loss": 1.5632920265197754, + "step": 7050 + }, + { + "epoch": 2.1705140043090183, + "grad_norm": 44.25, + "learning_rate": 1.0502099634214882e-06, + "loss": 1.3846997022628784, + "step": 7052 + }, + { + "epoch": 2.1711295783317945, + "grad_norm": 39.25, + "learning_rate": 1.0495874478716593e-06, + "loss": 1.5945847034454346, + "step": 7054 + }, + { + "epoch": 2.1717451523545708, + "grad_norm": 13.9375, + "learning_rate": 1.0489652637799131e-06, + "loss": 1.6202548742294312, + "step": 7056 + }, + { + "epoch": 2.172360726377347, + "grad_norm": 23.0, + "learning_rate": 1.0483434114210694e-06, + "loss": 1.3037610054016113, + "step": 7058 + }, + { + "epoch": 2.1729763004001232, + "grad_norm": 8.5625, + "learning_rate": 1.0477218910698017e-06, + "loss": 0.9244287014007568, + "step": 7060 + }, + { + "epoch": 2.1735918744228995, + "grad_norm": 17.75, + "learning_rate": 1.047100703000636e-06, + "loss": 1.3129549026489258, + "step": 7062 + }, + { + "epoch": 2.1742074484456757, + "grad_norm": 5.90625, + "learning_rate": 1.0464798474879512e-06, + "loss": 0.8765146732330322, + "step": 7064 + }, + { + "epoch": 2.174823022468452, + "grad_norm": 8.9375, + "learning_rate": 1.0458593248059807e-06, + "loss": 1.2174797058105469, + "step": 7066 + }, + { + "epoch": 2.175438596491228, + "grad_norm": 11.4375, + "learning_rate": 1.04523913522881e-06, + "loss": 1.4736378192901611, + "step": 7068 + }, + { + "epoch": 2.1760541705140044, + "grad_norm": 30.625, + "learning_rate": 1.0446192790303766e-06, + "loss": 1.3222408294677734, + "step": 7070 + }, + { + "epoch": 2.1766697445367806, + "grad_norm": 10.125, + "learning_rate": 1.0439997564844726e-06, + "loss": 1.1831345558166504, + "step": 7072 + }, + { + "epoch": 2.177285318559557, + "grad_norm": 16.5, + "learning_rate": 1.043380567864742e-06, + "loss": 1.5872745513916016, + "step": 7074 + }, + { + "epoch": 2.177900892582333, + "grad_norm": 4.625, + "learning_rate": 1.0427617134446797e-06, + "loss": 0.9572898149490356, + "step": 7076 + }, + { + "epoch": 2.1785164666051093, + "grad_norm": 23.0, + "learning_rate": 1.0421431934976363e-06, + "loss": 0.9428175091743469, + "step": 7078 + }, + { + "epoch": 2.1791320406278856, + "grad_norm": 43.25, + "learning_rate": 1.041525008296812e-06, + "loss": 1.2451021671295166, + "step": 7080 + }, + { + "epoch": 2.179747614650662, + "grad_norm": 25.125, + "learning_rate": 1.040907158115259e-06, + "loss": 1.7040786743164062, + "step": 7082 + }, + { + "epoch": 2.180363188673438, + "grad_norm": 13.6875, + "learning_rate": 1.0402896432258838e-06, + "loss": 1.5211122035980225, + "step": 7084 + }, + { + "epoch": 2.1809787626962143, + "grad_norm": 15.0, + "learning_rate": 1.0396724639014427e-06, + "loss": 1.5954887866973877, + "step": 7086 + }, + { + "epoch": 2.1815943367189905, + "grad_norm": 17.875, + "learning_rate": 1.0390556204145444e-06, + "loss": 1.3188745975494385, + "step": 7088 + }, + { + "epoch": 2.1822099107417667, + "grad_norm": 28.25, + "learning_rate": 1.0384391130376502e-06, + "loss": 1.4498337507247925, + "step": 7090 + }, + { + "epoch": 2.182825484764543, + "grad_norm": 11.375, + "learning_rate": 1.037822942043071e-06, + "loss": 1.3745802640914917, + "step": 7092 + }, + { + "epoch": 2.183441058787319, + "grad_norm": 9.1875, + "learning_rate": 1.0372071077029713e-06, + "loss": 1.3752477169036865, + "step": 7094 + }, + { + "epoch": 2.1840566328100954, + "grad_norm": 6.09375, + "learning_rate": 1.036591610289365e-06, + "loss": 0.9320778846740723, + "step": 7096 + }, + { + "epoch": 2.1846722068328717, + "grad_norm": 27.0, + "learning_rate": 1.0359764500741184e-06, + "loss": 0.8172409534454346, + "step": 7098 + }, + { + "epoch": 2.185287780855648, + "grad_norm": 18.25, + "learning_rate": 1.0353616273289483e-06, + "loss": 1.3536324501037598, + "step": 7100 + }, + { + "epoch": 2.185903354878424, + "grad_norm": 33.25, + "learning_rate": 1.0347471423254226e-06, + "loss": 1.423474907875061, + "step": 7102 + }, + { + "epoch": 2.1865189289012004, + "grad_norm": 18.25, + "learning_rate": 1.0341329953349606e-06, + "loss": 1.3694900274276733, + "step": 7104 + }, + { + "epoch": 2.1871345029239766, + "grad_norm": 11.125, + "learning_rate": 1.0335191866288312e-06, + "loss": 1.5513331890106201, + "step": 7106 + }, + { + "epoch": 2.187750076946753, + "grad_norm": 23.125, + "learning_rate": 1.032905716478154e-06, + "loss": 1.5492780208587646, + "step": 7108 + }, + { + "epoch": 2.188365650969529, + "grad_norm": 15.5, + "learning_rate": 1.0322925851539005e-06, + "loss": 0.5792900919914246, + "step": 7110 + }, + { + "epoch": 2.1889812249923053, + "grad_norm": 6.65625, + "learning_rate": 1.0316797929268905e-06, + "loss": 1.271028995513916, + "step": 7112 + }, + { + "epoch": 2.1895967990150815, + "grad_norm": 25.125, + "learning_rate": 1.0310673400677957e-06, + "loss": 1.341614007949829, + "step": 7114 + }, + { + "epoch": 2.1902123730378578, + "grad_norm": 18.375, + "learning_rate": 1.0304552268471373e-06, + "loss": 1.2016503810882568, + "step": 7116 + }, + { + "epoch": 2.190827947060634, + "grad_norm": 14.1875, + "learning_rate": 1.029843453535286e-06, + "loss": 0.934708297252655, + "step": 7118 + }, + { + "epoch": 2.1914435210834102, + "grad_norm": 36.25, + "learning_rate": 1.0292320204024623e-06, + "loss": 1.4819412231445312, + "step": 7120 + }, + { + "epoch": 2.1920590951061865, + "grad_norm": 24.5, + "learning_rate": 1.0286209277187384e-06, + "loss": 1.5816731452941895, + "step": 7122 + }, + { + "epoch": 2.1926746691289627, + "grad_norm": 16.625, + "learning_rate": 1.028010175754033e-06, + "loss": 0.9610680341720581, + "step": 7124 + }, + { + "epoch": 2.193290243151739, + "grad_norm": 22.375, + "learning_rate": 1.027399764778117e-06, + "loss": 1.6568622589111328, + "step": 7126 + }, + { + "epoch": 2.193905817174515, + "grad_norm": 24.75, + "learning_rate": 1.026789695060609e-06, + "loss": 1.8416197299957275, + "step": 7128 + }, + { + "epoch": 2.1945213911972914, + "grad_norm": 15.8125, + "learning_rate": 1.0261799668709785e-06, + "loss": 1.2987570762634277, + "step": 7130 + }, + { + "epoch": 2.1951369652200676, + "grad_norm": 10.5, + "learning_rate": 1.025570580478542e-06, + "loss": 1.2055429220199585, + "step": 7132 + }, + { + "epoch": 2.195752539242844, + "grad_norm": 8.5, + "learning_rate": 1.0249615361524663e-06, + "loss": 1.4445521831512451, + "step": 7134 + }, + { + "epoch": 2.19636811326562, + "grad_norm": 16.375, + "learning_rate": 1.0243528341617681e-06, + "loss": 1.2120137214660645, + "step": 7136 + }, + { + "epoch": 2.1969836872883963, + "grad_norm": 16.625, + "learning_rate": 1.0237444747753104e-06, + "loss": 1.4288749694824219, + "step": 7138 + }, + { + "epoch": 2.1975992613111726, + "grad_norm": 13.5625, + "learning_rate": 1.023136458261807e-06, + "loss": 1.5977098941802979, + "step": 7140 + }, + { + "epoch": 2.198214835333949, + "grad_norm": 7.28125, + "learning_rate": 1.0225287848898193e-06, + "loss": 1.297478437423706, + "step": 7142 + }, + { + "epoch": 2.198830409356725, + "grad_norm": 52.75, + "learning_rate": 1.021921454927757e-06, + "loss": 1.3651518821716309, + "step": 7144 + }, + { + "epoch": 2.1994459833795013, + "grad_norm": 19.5, + "learning_rate": 1.0213144686438791e-06, + "loss": 1.0678225755691528, + "step": 7146 + }, + { + "epoch": 2.2000615574022775, + "grad_norm": 24.375, + "learning_rate": 1.0207078263062918e-06, + "loss": 1.4905214309692383, + "step": 7148 + }, + { + "epoch": 2.2006771314250537, + "grad_norm": 25.75, + "learning_rate": 1.0201015281829492e-06, + "loss": 1.4449894428253174, + "step": 7150 + }, + { + "epoch": 2.20129270544783, + "grad_norm": 13.3125, + "learning_rate": 1.0194955745416549e-06, + "loss": 1.0292598009109497, + "step": 7152 + }, + { + "epoch": 2.201908279470606, + "grad_norm": 32.25, + "learning_rate": 1.0188899656500588e-06, + "loss": 1.6224634647369385, + "step": 7154 + }, + { + "epoch": 2.2025238534933824, + "grad_norm": 8.125, + "learning_rate": 1.0182847017756585e-06, + "loss": 1.441828966140747, + "step": 7156 + }, + { + "epoch": 2.2031394275161587, + "grad_norm": 12.5625, + "learning_rate": 1.0176797831858012e-06, + "loss": 1.4349195957183838, + "step": 7158 + }, + { + "epoch": 2.203755001538935, + "grad_norm": 8.3125, + "learning_rate": 1.0170752101476783e-06, + "loss": 1.3309807777404785, + "step": 7160 + }, + { + "epoch": 2.204370575561711, + "grad_norm": 14.375, + "learning_rate": 1.0164709829283315e-06, + "loss": 1.332626223564148, + "step": 7162 + }, + { + "epoch": 2.2049861495844874, + "grad_norm": 9.9375, + "learning_rate": 1.0158671017946491e-06, + "loss": 0.7435551285743713, + "step": 7164 + }, + { + "epoch": 2.2056017236072636, + "grad_norm": 19.5, + "learning_rate": 1.0152635670133648e-06, + "loss": 1.1366829872131348, + "step": 7166 + }, + { + "epoch": 2.20621729763004, + "grad_norm": 8.8125, + "learning_rate": 1.0146603788510617e-06, + "loss": 1.3661516904830933, + "step": 7168 + }, + { + "epoch": 2.206832871652816, + "grad_norm": 8.5, + "learning_rate": 1.0140575375741676e-06, + "loss": 1.4987188577651978, + "step": 7170 + }, + { + "epoch": 2.2074484456755923, + "grad_norm": 20.25, + "learning_rate": 1.0134550434489594e-06, + "loss": 1.2765529155731201, + "step": 7172 + }, + { + "epoch": 2.2080640196983685, + "grad_norm": 13.0, + "learning_rate": 1.0128528967415594e-06, + "loss": 1.1727826595306396, + "step": 7174 + }, + { + "epoch": 2.208679593721145, + "grad_norm": 17.5, + "learning_rate": 1.0122510977179347e-06, + "loss": 1.641790747642517, + "step": 7176 + }, + { + "epoch": 2.209295167743921, + "grad_norm": 21.125, + "learning_rate": 1.0116496466439029e-06, + "loss": 1.4589368104934692, + "step": 7178 + }, + { + "epoch": 2.2099107417666977, + "grad_norm": 12.25, + "learning_rate": 1.0110485437851243e-06, + "loss": 1.2559046745300293, + "step": 7180 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 11.8125, + "learning_rate": 1.0104477894071066e-06, + "loss": 1.2748613357543945, + "step": 7182 + }, + { + "epoch": 2.21114188981225, + "grad_norm": 12.5, + "learning_rate": 1.0098473837752049e-06, + "loss": 1.525190830230713, + "step": 7184 + }, + { + "epoch": 2.2117574638350264, + "grad_norm": 14.6875, + "learning_rate": 1.009247327154618e-06, + "loss": 1.805044174194336, + "step": 7186 + }, + { + "epoch": 2.2123730378578026, + "grad_norm": 16.625, + "learning_rate": 1.0086476198103918e-06, + "loss": 1.4728741645812988, + "step": 7188 + }, + { + "epoch": 2.212988611880579, + "grad_norm": 29.875, + "learning_rate": 1.008048262007418e-06, + "loss": 1.122135877609253, + "step": 7190 + }, + { + "epoch": 2.213604185903355, + "grad_norm": 22.125, + "learning_rate": 1.0074492540104334e-06, + "loss": 1.3765050172805786, + "step": 7192 + }, + { + "epoch": 2.2142197599261313, + "grad_norm": 8.5, + "learning_rate": 1.0068505960840204e-06, + "loss": 1.5011130571365356, + "step": 7194 + }, + { + "epoch": 2.2148353339489075, + "grad_norm": 6.28125, + "learning_rate": 1.0062522884926068e-06, + "loss": 1.3109992742538452, + "step": 7196 + }, + { + "epoch": 2.2154509079716838, + "grad_norm": 9.375, + "learning_rate": 1.0056543315004669e-06, + "loss": 1.220149278640747, + "step": 7198 + }, + { + "epoch": 2.21606648199446, + "grad_norm": 11.0, + "learning_rate": 1.0050567253717172e-06, + "loss": 1.2714948654174805, + "step": 7200 + }, + { + "epoch": 2.2166820560172362, + "grad_norm": 21.125, + "learning_rate": 1.0044594703703222e-06, + "loss": 1.0933783054351807, + "step": 7202 + }, + { + "epoch": 2.2172976300400125, + "grad_norm": 18.25, + "learning_rate": 1.0038625667600903e-06, + "loss": 1.6369684934616089, + "step": 7204 + }, + { + "epoch": 2.2179132040627887, + "grad_norm": 9.5625, + "learning_rate": 1.0032660148046734e-06, + "loss": 1.3350924253463745, + "step": 7206 + }, + { + "epoch": 2.218528778085565, + "grad_norm": 13.9375, + "learning_rate": 1.00266981476757e-06, + "loss": 1.3873803615570068, + "step": 7208 + }, + { + "epoch": 2.219144352108341, + "grad_norm": 20.375, + "learning_rate": 1.0020739669121223e-06, + "loss": 1.4078598022460938, + "step": 7210 + }, + { + "epoch": 2.2197599261311174, + "grad_norm": 38.5, + "learning_rate": 1.0014784715015161e-06, + "loss": 1.376378059387207, + "step": 7212 + }, + { + "epoch": 2.2203755001538936, + "grad_norm": 14.5, + "learning_rate": 1.0008833287987842e-06, + "loss": 1.2546615600585938, + "step": 7214 + }, + { + "epoch": 2.22099107417667, + "grad_norm": 17.625, + "learning_rate": 1.0002885390668002e-06, + "loss": 1.1415352821350098, + "step": 7216 + }, + { + "epoch": 2.221606648199446, + "grad_norm": 15.1875, + "learning_rate": 9.996941025682843e-07, + "loss": 1.455315113067627, + "step": 7218 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 31.875, + "learning_rate": 9.991000195657988e-07, + "loss": 1.6557023525238037, + "step": 7220 + }, + { + "epoch": 2.2228377962449986, + "grad_norm": 10.375, + "learning_rate": 9.985062903217516e-07, + "loss": 0.691315770149231, + "step": 7222 + }, + { + "epoch": 2.223453370267775, + "grad_norm": 16.0, + "learning_rate": 9.97912915098394e-07, + "loss": 1.4408737421035767, + "step": 7224 + }, + { + "epoch": 2.224068944290551, + "grad_norm": 20.0, + "learning_rate": 9.973198941578195e-07, + "loss": 1.5671606063842773, + "step": 7226 + }, + { + "epoch": 2.2246845183133273, + "grad_norm": 11.1875, + "learning_rate": 9.967272277619666e-07, + "loss": 1.2749438285827637, + "step": 7228 + }, + { + "epoch": 2.2253000923361035, + "grad_norm": 11.9375, + "learning_rate": 9.961349161726172e-07, + "loss": 1.1752612590789795, + "step": 7230 + }, + { + "epoch": 2.2259156663588797, + "grad_norm": 6.03125, + "learning_rate": 9.955429596513951e-07, + "loss": 1.1268336772918701, + "step": 7232 + }, + { + "epoch": 2.226531240381656, + "grad_norm": 29.0, + "learning_rate": 9.949513584597688e-07, + "loss": 1.4443705081939697, + "step": 7234 + }, + { + "epoch": 2.227146814404432, + "grad_norm": 11.625, + "learning_rate": 9.943601128590489e-07, + "loss": 1.3743427991867065, + "step": 7236 + }, + { + "epoch": 2.2277623884272084, + "grad_norm": 71.0, + "learning_rate": 9.93769223110389e-07, + "loss": 1.7574753761291504, + "step": 7238 + }, + { + "epoch": 2.2283779624499847, + "grad_norm": 17.25, + "learning_rate": 9.93178689474787e-07, + "loss": 1.6704533100128174, + "step": 7240 + }, + { + "epoch": 2.228993536472761, + "grad_norm": 14.4375, + "learning_rate": 9.925885122130812e-07, + "loss": 0.9240731000900269, + "step": 7242 + }, + { + "epoch": 2.229609110495537, + "grad_norm": 12.5625, + "learning_rate": 9.919986915859533e-07, + "loss": 1.3428912162780762, + "step": 7244 + }, + { + "epoch": 2.2302246845183133, + "grad_norm": 18.5, + "learning_rate": 9.914092278539291e-07, + "loss": 1.1740977764129639, + "step": 7246 + }, + { + "epoch": 2.2308402585410896, + "grad_norm": 6.65625, + "learning_rate": 9.90820121277374e-07, + "loss": 1.195859670639038, + "step": 7248 + }, + { + "epoch": 2.231455832563866, + "grad_norm": 15.375, + "learning_rate": 9.902313721164975e-07, + "loss": 1.3590707778930664, + "step": 7250 + }, + { + "epoch": 2.232071406586642, + "grad_norm": 14.5625, + "learning_rate": 9.896429806313515e-07, + "loss": 1.4215617179870605, + "step": 7252 + }, + { + "epoch": 2.2326869806094183, + "grad_norm": 20.0, + "learning_rate": 9.890549470818282e-07, + "loss": 1.3735675811767578, + "step": 7254 + }, + { + "epoch": 2.2333025546321945, + "grad_norm": 12.5, + "learning_rate": 9.88467271727663e-07, + "loss": 1.3217053413391113, + "step": 7256 + }, + { + "epoch": 2.2339181286549707, + "grad_norm": 9.25, + "learning_rate": 9.878799548284332e-07, + "loss": 1.4295120239257812, + "step": 7258 + }, + { + "epoch": 2.234533702677747, + "grad_norm": 11.75, + "learning_rate": 9.872929966435569e-07, + "loss": 1.1404094696044922, + "step": 7260 + }, + { + "epoch": 2.235149276700523, + "grad_norm": 14.0625, + "learning_rate": 9.86706397432294e-07, + "loss": 1.7106270790100098, + "step": 7262 + }, + { + "epoch": 2.2357648507232994, + "grad_norm": 25.0, + "learning_rate": 9.861201574537466e-07, + "loss": 1.638358235359192, + "step": 7264 + }, + { + "epoch": 2.2363804247460757, + "grad_norm": 62.75, + "learning_rate": 9.855342769668576e-07, + "loss": 1.517430067062378, + "step": 7266 + }, + { + "epoch": 2.236995998768852, + "grad_norm": 10.4375, + "learning_rate": 9.849487562304105e-07, + "loss": 1.170607089996338, + "step": 7268 + }, + { + "epoch": 2.237611572791628, + "grad_norm": 13.0625, + "learning_rate": 9.843635955030307e-07, + "loss": 1.1815297603607178, + "step": 7270 + }, + { + "epoch": 2.2382271468144044, + "grad_norm": 21.625, + "learning_rate": 9.837787950431848e-07, + "loss": 1.6441622972488403, + "step": 7272 + }, + { + "epoch": 2.2388427208371806, + "grad_norm": 17.875, + "learning_rate": 9.831943551091793e-07, + "loss": 1.020333170890808, + "step": 7274 + }, + { + "epoch": 2.239458294859957, + "grad_norm": 19.125, + "learning_rate": 9.82610275959162e-07, + "loss": 1.3882975578308105, + "step": 7276 + }, + { + "epoch": 2.240073868882733, + "grad_norm": 18.125, + "learning_rate": 9.820265578511218e-07, + "loss": 1.380478024482727, + "step": 7278 + }, + { + "epoch": 2.2406894429055093, + "grad_norm": 16.375, + "learning_rate": 9.814432010428871e-07, + "loss": 1.0519275665283203, + "step": 7280 + }, + { + "epoch": 2.2413050169282855, + "grad_norm": 8.75, + "learning_rate": 9.808602057921276e-07, + "loss": 1.449505090713501, + "step": 7282 + }, + { + "epoch": 2.2419205909510618, + "grad_norm": 20.375, + "learning_rate": 9.80277572356353e-07, + "loss": 1.6460387706756592, + "step": 7284 + }, + { + "epoch": 2.242536164973838, + "grad_norm": 13.5625, + "learning_rate": 9.79695300992913e-07, + "loss": 1.3300621509552002, + "step": 7286 + }, + { + "epoch": 2.2431517389966142, + "grad_norm": 14.6875, + "learning_rate": 9.791133919589974e-07, + "loss": 1.4139244556427002, + "step": 7288 + }, + { + "epoch": 2.2437673130193905, + "grad_norm": 12.5625, + "learning_rate": 9.785318455116363e-07, + "loss": 1.2685773372650146, + "step": 7290 + }, + { + "epoch": 2.2443828870421667, + "grad_norm": 18.125, + "learning_rate": 9.779506619076998e-07, + "loss": 1.2302950620651245, + "step": 7292 + }, + { + "epoch": 2.244998461064943, + "grad_norm": 15.375, + "learning_rate": 9.773698414038966e-07, + "loss": 1.5846174955368042, + "step": 7294 + }, + { + "epoch": 2.245614035087719, + "grad_norm": 10.5625, + "learning_rate": 9.767893842567763e-07, + "loss": 1.4596608877182007, + "step": 7296 + }, + { + "epoch": 2.2462296091104954, + "grad_norm": 14.375, + "learning_rate": 9.762092907227272e-07, + "loss": 1.5452144145965576, + "step": 7298 + }, + { + "epoch": 2.2468451831332716, + "grad_norm": 16.125, + "learning_rate": 9.756295610579775e-07, + "loss": 1.6285851001739502, + "step": 7300 + }, + { + "epoch": 2.247460757156048, + "grad_norm": 12.375, + "learning_rate": 9.750501955185943e-07, + "loss": 1.5760045051574707, + "step": 7302 + }, + { + "epoch": 2.248076331178824, + "grad_norm": 18.125, + "learning_rate": 9.744711943604844e-07, + "loss": 1.7773367166519165, + "step": 7304 + }, + { + "epoch": 2.2486919052016003, + "grad_norm": 17.625, + "learning_rate": 9.738925578393922e-07, + "loss": 1.3781275749206543, + "step": 7306 + }, + { + "epoch": 2.2493074792243766, + "grad_norm": 20.5, + "learning_rate": 9.733142862109036e-07, + "loss": 0.925908088684082, + "step": 7308 + }, + { + "epoch": 2.249923053247153, + "grad_norm": 15.125, + "learning_rate": 9.727363797304407e-07, + "loss": 1.5970616340637207, + "step": 7310 + }, + { + "epoch": 2.250538627269929, + "grad_norm": 6.15625, + "learning_rate": 9.721588386532659e-07, + "loss": 1.098137617111206, + "step": 7312 + }, + { + "epoch": 2.2511542012927053, + "grad_norm": 8.5625, + "learning_rate": 9.715816632344803e-07, + "loss": 1.1556612253189087, + "step": 7314 + }, + { + "epoch": 2.2517697753154815, + "grad_norm": 20.75, + "learning_rate": 9.710048537290218e-07, + "loss": 0.9828079342842102, + "step": 7316 + }, + { + "epoch": 2.2523853493382577, + "grad_norm": 14.4375, + "learning_rate": 9.704284103916682e-07, + "loss": 1.13887619972229, + "step": 7318 + }, + { + "epoch": 2.253000923361034, + "grad_norm": 17.375, + "learning_rate": 9.69852333477036e-07, + "loss": 1.549184799194336, + "step": 7320 + }, + { + "epoch": 2.2536164973838106, + "grad_norm": 5.9375, + "learning_rate": 9.69276623239578e-07, + "loss": 1.257533311843872, + "step": 7322 + }, + { + "epoch": 2.2542320714065864, + "grad_norm": 11.4375, + "learning_rate": 9.687012799335864e-07, + "loss": 1.1815621852874756, + "step": 7324 + }, + { + "epoch": 2.254847645429363, + "grad_norm": 17.75, + "learning_rate": 9.681263038131916e-07, + "loss": 0.6288884878158569, + "step": 7326 + }, + { + "epoch": 2.255463219452139, + "grad_norm": 11.4375, + "learning_rate": 9.675516951323602e-07, + "loss": 1.4890342950820923, + "step": 7328 + }, + { + "epoch": 2.2560787934749156, + "grad_norm": 9.5, + "learning_rate": 9.669774541448983e-07, + "loss": 1.2622697353363037, + "step": 7330 + }, + { + "epoch": 2.2566943674976914, + "grad_norm": 16.125, + "learning_rate": 9.664035811044478e-07, + "loss": 1.391549825668335, + "step": 7332 + }, + { + "epoch": 2.257309941520468, + "grad_norm": 15.75, + "learning_rate": 9.658300762644905e-07, + "loss": 0.9622032642364502, + "step": 7334 + }, + { + "epoch": 2.257925515543244, + "grad_norm": 36.0, + "learning_rate": 9.652569398783432e-07, + "loss": 0.6518381237983704, + "step": 7336 + }, + { + "epoch": 2.2585410895660205, + "grad_norm": 35.5, + "learning_rate": 9.64684172199161e-07, + "loss": 1.6143358945846558, + "step": 7338 + }, + { + "epoch": 2.2591566635887967, + "grad_norm": 10.9375, + "learning_rate": 9.641117734799363e-07, + "loss": 1.3510743379592896, + "step": 7340 + }, + { + "epoch": 2.259772237611573, + "grad_norm": 14.375, + "learning_rate": 9.635397439734982e-07, + "loss": 1.3647127151489258, + "step": 7342 + }, + { + "epoch": 2.260387811634349, + "grad_norm": 18.5, + "learning_rate": 9.629680839325124e-07, + "loss": 1.4263982772827148, + "step": 7344 + }, + { + "epoch": 2.2610033856571254, + "grad_norm": 12.9375, + "learning_rate": 9.623967936094823e-07, + "loss": 1.775465488433838, + "step": 7346 + }, + { + "epoch": 2.2616189596799017, + "grad_norm": 6.96875, + "learning_rate": 9.618258732567473e-07, + "loss": 1.0765156745910645, + "step": 7348 + }, + { + "epoch": 2.262234533702678, + "grad_norm": 21.125, + "learning_rate": 9.612553231264832e-07, + "loss": 1.3329508304595947, + "step": 7350 + }, + { + "epoch": 2.262850107725454, + "grad_norm": 8.5, + "learning_rate": 9.606851434707034e-07, + "loss": 1.1238059997558594, + "step": 7352 + }, + { + "epoch": 2.2634656817482304, + "grad_norm": 30.25, + "learning_rate": 9.601153345412567e-07, + "loss": 1.6501717567443848, + "step": 7354 + }, + { + "epoch": 2.2640812557710066, + "grad_norm": 13.6875, + "learning_rate": 9.595458965898277e-07, + "loss": 1.6731393337249756, + "step": 7356 + }, + { + "epoch": 2.264696829793783, + "grad_norm": 2.953125, + "learning_rate": 9.589768298679387e-07, + "loss": 1.5364469289779663, + "step": 7358 + }, + { + "epoch": 2.265312403816559, + "grad_norm": 18.125, + "learning_rate": 9.584081346269465e-07, + "loss": 1.5376877784729004, + "step": 7360 + }, + { + "epoch": 2.2659279778393353, + "grad_norm": 80.0, + "learning_rate": 9.578398111180447e-07, + "loss": 1.4920644760131836, + "step": 7362 + }, + { + "epoch": 2.2665435518621115, + "grad_norm": 14.0, + "learning_rate": 9.572718595922622e-07, + "loss": 0.7458437085151672, + "step": 7364 + }, + { + "epoch": 2.2671591258848878, + "grad_norm": 9.0625, + "learning_rate": 9.567042803004643e-07, + "loss": 1.1495895385742188, + "step": 7366 + }, + { + "epoch": 2.267774699907664, + "grad_norm": 9.9375, + "learning_rate": 9.56137073493351e-07, + "loss": 1.1583331823349, + "step": 7368 + }, + { + "epoch": 2.2683902739304402, + "grad_norm": 8.125, + "learning_rate": 9.555702394214581e-07, + "loss": 1.0664770603179932, + "step": 7370 + }, + { + "epoch": 2.2690058479532165, + "grad_norm": 39.25, + "learning_rate": 9.550037783351575e-07, + "loss": 1.0306137800216675, + "step": 7372 + }, + { + "epoch": 2.2696214219759927, + "grad_norm": 11.0625, + "learning_rate": 9.544376904846547e-07, + "loss": 0.9307693839073181, + "step": 7374 + }, + { + "epoch": 2.270236995998769, + "grad_norm": 16.125, + "learning_rate": 9.538719761199927e-07, + "loss": 1.4178649187088013, + "step": 7376 + }, + { + "epoch": 2.270852570021545, + "grad_norm": 25.75, + "learning_rate": 9.533066354910469e-07, + "loss": 1.4406909942626953, + "step": 7378 + }, + { + "epoch": 2.2714681440443214, + "grad_norm": 11.625, + "learning_rate": 9.527416688475295e-07, + "loss": 0.8210102319717407, + "step": 7380 + }, + { + "epoch": 2.2720837180670976, + "grad_norm": 40.75, + "learning_rate": 9.521770764389873e-07, + "loss": 1.2936944961547852, + "step": 7382 + }, + { + "epoch": 2.272699292089874, + "grad_norm": 6.84375, + "learning_rate": 9.516128585148006e-07, + "loss": 1.1711663007736206, + "step": 7384 + }, + { + "epoch": 2.27331486611265, + "grad_norm": 14.8125, + "learning_rate": 9.510490153241858e-07, + "loss": 1.367745041847229, + "step": 7386 + }, + { + "epoch": 2.2739304401354263, + "grad_norm": 13.875, + "learning_rate": 9.504855471161922e-07, + "loss": 1.5758328437805176, + "step": 7388 + }, + { + "epoch": 2.2745460141582026, + "grad_norm": 13.25, + "learning_rate": 9.499224541397051e-07, + "loss": 1.3060379028320312, + "step": 7390 + }, + { + "epoch": 2.275161588180979, + "grad_norm": 11.3125, + "learning_rate": 9.493597366434435e-07, + "loss": 1.6739366054534912, + "step": 7392 + }, + { + "epoch": 2.275777162203755, + "grad_norm": 21.125, + "learning_rate": 9.487973948759593e-07, + "loss": 1.3423973321914673, + "step": 7394 + }, + { + "epoch": 2.2763927362265313, + "grad_norm": 12.5625, + "learning_rate": 9.482354290856407e-07, + "loss": 1.2677085399627686, + "step": 7396 + }, + { + "epoch": 2.2770083102493075, + "grad_norm": 23.25, + "learning_rate": 9.476738395207082e-07, + "loss": 1.1683954000473022, + "step": 7398 + }, + { + "epoch": 2.2776238842720837, + "grad_norm": 24.25, + "learning_rate": 9.471126264292158e-07, + "loss": 1.2260327339172363, + "step": 7400 + }, + { + "epoch": 2.27823945829486, + "grad_norm": 13.375, + "learning_rate": 9.465517900590535e-07, + "loss": 1.0855873823165894, + "step": 7402 + }, + { + "epoch": 2.278855032317636, + "grad_norm": 11.625, + "learning_rate": 9.459913306579419e-07, + "loss": 1.4089841842651367, + "step": 7404 + }, + { + "epoch": 2.2794706063404124, + "grad_norm": 8.3125, + "learning_rate": 9.454312484734374e-07, + "loss": 1.1654994487762451, + "step": 7406 + }, + { + "epoch": 2.2800861803631887, + "grad_norm": 11.3125, + "learning_rate": 9.448715437529287e-07, + "loss": 1.3815590143203735, + "step": 7408 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 22.25, + "learning_rate": 9.44312216743638e-07, + "loss": 0.8755576610565186, + "step": 7410 + }, + { + "epoch": 2.281317328408741, + "grad_norm": 10.75, + "learning_rate": 9.437532676926205e-07, + "loss": 0.9427816867828369, + "step": 7412 + }, + { + "epoch": 2.2819329024315174, + "grad_norm": 6.4375, + "learning_rate": 9.431946968467651e-07, + "loss": 0.912126898765564, + "step": 7414 + }, + { + "epoch": 2.2825484764542936, + "grad_norm": 17.0, + "learning_rate": 9.426365044527928e-07, + "loss": 1.6929936408996582, + "step": 7416 + }, + { + "epoch": 2.28316405047707, + "grad_norm": 15.8125, + "learning_rate": 9.420786907572579e-07, + "loss": 1.2102103233337402, + "step": 7418 + }, + { + "epoch": 2.283779624499846, + "grad_norm": 10.1875, + "learning_rate": 9.415212560065473e-07, + "loss": 1.3206998109817505, + "step": 7420 + }, + { + "epoch": 2.2843951985226223, + "grad_norm": 22.5, + "learning_rate": 9.40964200446881e-07, + "loss": 0.8749468326568604, + "step": 7422 + }, + { + "epoch": 2.2850107725453985, + "grad_norm": 36.5, + "learning_rate": 9.404075243243105e-07, + "loss": 1.5027368068695068, + "step": 7424 + }, + { + "epoch": 2.2856263465681748, + "grad_norm": 26.875, + "learning_rate": 9.398512278847203e-07, + "loss": 1.502715826034546, + "step": 7426 + }, + { + "epoch": 2.286241920590951, + "grad_norm": 22.0, + "learning_rate": 9.392953113738278e-07, + "loss": 1.255620002746582, + "step": 7428 + }, + { + "epoch": 2.2868574946137272, + "grad_norm": 6.53125, + "learning_rate": 9.387397750371812e-07, + "loss": 1.1149355173110962, + "step": 7430 + }, + { + "epoch": 2.2874730686365035, + "grad_norm": 16.25, + "learning_rate": 9.381846191201622e-07, + "loss": 1.1521387100219727, + "step": 7432 + }, + { + "epoch": 2.2880886426592797, + "grad_norm": 35.5, + "learning_rate": 9.376298438679835e-07, + "loss": 1.296518087387085, + "step": 7434 + }, + { + "epoch": 2.288704216682056, + "grad_norm": 6.0, + "learning_rate": 9.370754495256893e-07, + "loss": 1.2158961296081543, + "step": 7436 + }, + { + "epoch": 2.289319790704832, + "grad_norm": 9.625, + "learning_rate": 9.365214363381575e-07, + "loss": 1.3488669395446777, + "step": 7438 + }, + { + "epoch": 2.2899353647276084, + "grad_norm": 5.96875, + "learning_rate": 9.359678045500958e-07, + "loss": 1.019263744354248, + "step": 7440 + }, + { + "epoch": 2.2905509387503846, + "grad_norm": 10.1875, + "learning_rate": 9.354145544060435e-07, + "loss": 0.9132190942764282, + "step": 7442 + }, + { + "epoch": 2.291166512773161, + "grad_norm": 12.75, + "learning_rate": 9.348616861503722e-07, + "loss": 1.2218446731567383, + "step": 7444 + }, + { + "epoch": 2.291782086795937, + "grad_norm": 19.25, + "learning_rate": 9.343092000272847e-07, + "loss": 1.7379786968231201, + "step": 7446 + }, + { + "epoch": 2.2923976608187133, + "grad_norm": 12.5625, + "learning_rate": 9.337570962808148e-07, + "loss": 1.300768494606018, + "step": 7448 + }, + { + "epoch": 2.2930132348414896, + "grad_norm": 16.5, + "learning_rate": 9.332053751548265e-07, + "loss": 1.4459890127182007, + "step": 7450 + }, + { + "epoch": 2.293628808864266, + "grad_norm": 9.625, + "learning_rate": 9.326540368930166e-07, + "loss": 1.1389377117156982, + "step": 7452 + }, + { + "epoch": 2.294244382887042, + "grad_norm": 6.15625, + "learning_rate": 9.321030817389115e-07, + "loss": 1.1465227603912354, + "step": 7454 + }, + { + "epoch": 2.2948599569098183, + "grad_norm": 14.625, + "learning_rate": 9.315525099358687e-07, + "loss": 1.3622372150421143, + "step": 7456 + }, + { + "epoch": 2.2954755309325945, + "grad_norm": 17.125, + "learning_rate": 9.31002321727076e-07, + "loss": 1.5798914432525635, + "step": 7458 + }, + { + "epoch": 2.2960911049553707, + "grad_norm": 27.625, + "learning_rate": 9.304525173555531e-07, + "loss": 1.3255949020385742, + "step": 7460 + }, + { + "epoch": 2.296706678978147, + "grad_norm": 12.75, + "learning_rate": 9.299030970641481e-07, + "loss": 1.316671371459961, + "step": 7462 + }, + { + "epoch": 2.297322253000923, + "grad_norm": 13.1875, + "learning_rate": 9.293540610955418e-07, + "loss": 1.4738574028015137, + "step": 7464 + }, + { + "epoch": 2.2979378270236994, + "grad_norm": 15.625, + "learning_rate": 9.288054096922433e-07, + "loss": 1.0142216682434082, + "step": 7466 + }, + { + "epoch": 2.2985534010464757, + "grad_norm": 14.1875, + "learning_rate": 9.282571430965921e-07, + "loss": 1.0364811420440674, + "step": 7468 + }, + { + "epoch": 2.299168975069252, + "grad_norm": 17.5, + "learning_rate": 9.277092615507592e-07, + "loss": 0.938054621219635, + "step": 7470 + }, + { + "epoch": 2.2997845490920286, + "grad_norm": 13.4375, + "learning_rate": 9.271617652967437e-07, + "loss": 1.342480182647705, + "step": 7472 + }, + { + "epoch": 2.3004001231148044, + "grad_norm": 16.25, + "learning_rate": 9.266146545763756e-07, + "loss": 1.4579970836639404, + "step": 7474 + }, + { + "epoch": 2.301015697137581, + "grad_norm": 16.375, + "learning_rate": 9.26067929631315e-07, + "loss": 1.1322877407073975, + "step": 7476 + }, + { + "epoch": 2.301631271160357, + "grad_norm": 6.34375, + "learning_rate": 9.255215907030496e-07, + "loss": 1.4794162511825562, + "step": 7478 + }, + { + "epoch": 2.3022468451831335, + "grad_norm": 25.625, + "learning_rate": 9.249756380328987e-07, + "loss": 1.5662634372711182, + "step": 7480 + }, + { + "epoch": 2.3028624192059093, + "grad_norm": 19.25, + "learning_rate": 9.244300718620106e-07, + "loss": 1.1814448833465576, + "step": 7482 + }, + { + "epoch": 2.303477993228686, + "grad_norm": 9.375, + "learning_rate": 9.238848924313618e-07, + "loss": 1.393389344215393, + "step": 7484 + }, + { + "epoch": 2.3040935672514617, + "grad_norm": 13.25, + "learning_rate": 9.233400999817587e-07, + "loss": 1.3429032564163208, + "step": 7486 + }, + { + "epoch": 2.3047091412742384, + "grad_norm": 50.75, + "learning_rate": 9.227956947538371e-07, + "loss": 1.1032218933105469, + "step": 7488 + }, + { + "epoch": 2.3053247152970147, + "grad_norm": 9.8125, + "learning_rate": 9.222516769880616e-07, + "loss": 1.412358283996582, + "step": 7490 + }, + { + "epoch": 2.305940289319791, + "grad_norm": 26.25, + "learning_rate": 9.217080469247248e-07, + "loss": 1.5782816410064697, + "step": 7492 + }, + { + "epoch": 2.306555863342567, + "grad_norm": 13.4375, + "learning_rate": 9.211648048039491e-07, + "loss": 1.264474868774414, + "step": 7494 + }, + { + "epoch": 2.3071714373653434, + "grad_norm": 9.0625, + "learning_rate": 9.206219508656857e-07, + "loss": 1.283721923828125, + "step": 7496 + }, + { + "epoch": 2.3077870113881196, + "grad_norm": 28.625, + "learning_rate": 9.200794853497126e-07, + "loss": 1.184441089630127, + "step": 7498 + }, + { + "epoch": 2.308402585410896, + "grad_norm": 47.5, + "learning_rate": 9.195374084956382e-07, + "loss": 1.6521155834197998, + "step": 7500 + }, + { + "epoch": 2.309018159433672, + "grad_norm": 15.6875, + "learning_rate": 9.189957205428987e-07, + "loss": 1.0816607475280762, + "step": 7502 + }, + { + "epoch": 2.3096337334564483, + "grad_norm": 19.0, + "learning_rate": 9.184544217307577e-07, + "loss": 1.6764347553253174, + "step": 7504 + }, + { + "epoch": 2.3102493074792245, + "grad_norm": 4.625, + "learning_rate": 9.179135122983076e-07, + "loss": 1.303048849105835, + "step": 7506 + }, + { + "epoch": 2.3108648815020008, + "grad_norm": 10.0, + "learning_rate": 9.173729924844692e-07, + "loss": 1.2569313049316406, + "step": 7508 + }, + { + "epoch": 2.311480455524777, + "grad_norm": 9.875, + "learning_rate": 9.168328625279903e-07, + "loss": 1.2223122119903564, + "step": 7510 + }, + { + "epoch": 2.312096029547553, + "grad_norm": 8.0625, + "learning_rate": 9.162931226674469e-07, + "loss": 1.1510555744171143, + "step": 7512 + }, + { + "epoch": 2.3127116035703295, + "grad_norm": 28.25, + "learning_rate": 9.157537731412427e-07, + "loss": 1.366336464881897, + "step": 7514 + }, + { + "epoch": 2.3133271775931057, + "grad_norm": 15.5625, + "learning_rate": 9.152148141876096e-07, + "loss": 1.4099074602127075, + "step": 7516 + }, + { + "epoch": 2.313942751615882, + "grad_norm": 11.875, + "learning_rate": 9.146762460446054e-07, + "loss": 1.520900011062622, + "step": 7518 + }, + { + "epoch": 2.314558325638658, + "grad_norm": 10.0, + "learning_rate": 9.141380689501168e-07, + "loss": 1.2113454341888428, + "step": 7520 + }, + { + "epoch": 2.3151738996614344, + "grad_norm": 13.9375, + "learning_rate": 9.136002831418578e-07, + "loss": 1.0119948387145996, + "step": 7522 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 86.5, + "learning_rate": 9.13062888857368e-07, + "loss": 0.6641063094139099, + "step": 7524 + }, + { + "epoch": 2.316405047706987, + "grad_norm": 15.25, + "learning_rate": 9.125258863340153e-07, + "loss": 1.3526802062988281, + "step": 7526 + }, + { + "epoch": 2.317020621729763, + "grad_norm": 25.375, + "learning_rate": 9.119892758089951e-07, + "loss": 1.0443966388702393, + "step": 7528 + }, + { + "epoch": 2.3176361957525393, + "grad_norm": 10.625, + "learning_rate": 9.114530575193276e-07, + "loss": 1.4699032306671143, + "step": 7530 + }, + { + "epoch": 2.3182517697753156, + "grad_norm": 15.875, + "learning_rate": 9.109172317018624e-07, + "loss": 1.4337401390075684, + "step": 7532 + }, + { + "epoch": 2.318867343798092, + "grad_norm": 13.5, + "learning_rate": 9.103817985932737e-07, + "loss": 1.5521501302719116, + "step": 7534 + }, + { + "epoch": 2.319482917820868, + "grad_norm": 9.1875, + "learning_rate": 9.098467584300621e-07, + "loss": 1.0651452541351318, + "step": 7536 + }, + { + "epoch": 2.3200984918436443, + "grad_norm": 11.9375, + "learning_rate": 9.093121114485572e-07, + "loss": 1.4797773361206055, + "step": 7538 + }, + { + "epoch": 2.3207140658664205, + "grad_norm": 37.5, + "learning_rate": 9.087778578849118e-07, + "loss": 1.4951937198638916, + "step": 7540 + }, + { + "epoch": 2.3213296398891967, + "grad_norm": 18.5, + "learning_rate": 9.082439979751068e-07, + "loss": 1.453412652015686, + "step": 7542 + }, + { + "epoch": 2.321945213911973, + "grad_norm": 10.625, + "learning_rate": 9.077105319549492e-07, + "loss": 1.2274169921875, + "step": 7544 + }, + { + "epoch": 2.322560787934749, + "grad_norm": 8.4375, + "learning_rate": 9.071774600600706e-07, + "loss": 1.292219638824463, + "step": 7546 + }, + { + "epoch": 2.3231763619575254, + "grad_norm": 14.1875, + "learning_rate": 9.066447825259298e-07, + "loss": 1.2093820571899414, + "step": 7548 + }, + { + "epoch": 2.3237919359803016, + "grad_norm": 13.5, + "learning_rate": 9.061124995878117e-07, + "loss": 1.073814034461975, + "step": 7550 + }, + { + "epoch": 2.324407510003078, + "grad_norm": 23.25, + "learning_rate": 9.055806114808254e-07, + "loss": 0.8666530251502991, + "step": 7552 + }, + { + "epoch": 2.325023084025854, + "grad_norm": 13.6875, + "learning_rate": 9.050491184399073e-07, + "loss": 1.2789653539657593, + "step": 7554 + }, + { + "epoch": 2.3256386580486303, + "grad_norm": 9.5625, + "learning_rate": 9.045180206998174e-07, + "loss": 1.3404902219772339, + "step": 7556 + }, + { + "epoch": 2.3262542320714066, + "grad_norm": 14.4375, + "learning_rate": 9.039873184951435e-07, + "loss": 1.2336769104003906, + "step": 7558 + }, + { + "epoch": 2.326869806094183, + "grad_norm": 50.5, + "learning_rate": 9.034570120602964e-07, + "loss": 1.631737232208252, + "step": 7560 + }, + { + "epoch": 2.327485380116959, + "grad_norm": 24.375, + "learning_rate": 9.029271016295132e-07, + "loss": 1.5029709339141846, + "step": 7562 + }, + { + "epoch": 2.3281009541397353, + "grad_norm": 50.5, + "learning_rate": 9.023975874368567e-07, + "loss": 1.0878214836120605, + "step": 7564 + }, + { + "epoch": 2.3287165281625115, + "grad_norm": 75.5, + "learning_rate": 9.018684697162127e-07, + "loss": 0.8998363614082336, + "step": 7566 + }, + { + "epoch": 2.3293321021852877, + "grad_norm": 15.5625, + "learning_rate": 9.013397487012941e-07, + "loss": 1.1986682415008545, + "step": 7568 + }, + { + "epoch": 2.329947676208064, + "grad_norm": 23.75, + "learning_rate": 9.008114246256374e-07, + "loss": 1.7010976076126099, + "step": 7570 + }, + { + "epoch": 2.33056325023084, + "grad_norm": 26.125, + "learning_rate": 9.002834977226034e-07, + "loss": 1.6871237754821777, + "step": 7572 + }, + { + "epoch": 2.3311788242536164, + "grad_norm": 65.5, + "learning_rate": 8.997559682253784e-07, + "loss": 1.5429918766021729, + "step": 7574 + }, + { + "epoch": 2.3317943982763927, + "grad_norm": 26.5, + "learning_rate": 8.992288363669732e-07, + "loss": 1.3595178127288818, + "step": 7576 + }, + { + "epoch": 2.332409972299169, + "grad_norm": 5.09375, + "learning_rate": 8.987021023802215e-07, + "loss": 1.4431962966918945, + "step": 7578 + }, + { + "epoch": 2.333025546321945, + "grad_norm": 19.75, + "learning_rate": 8.981757664977833e-07, + "loss": 1.348787546157837, + "step": 7580 + }, + { + "epoch": 2.3336411203447214, + "grad_norm": 15.375, + "learning_rate": 8.976498289521412e-07, + "loss": 1.2389013767242432, + "step": 7582 + }, + { + "epoch": 2.3342566943674976, + "grad_norm": 8.125, + "learning_rate": 8.97124289975603e-07, + "loss": 1.3180859088897705, + "step": 7584 + }, + { + "epoch": 2.334872268390274, + "grad_norm": 31.5, + "learning_rate": 8.965991498002991e-07, + "loss": 0.9535354375839233, + "step": 7586 + }, + { + "epoch": 2.33548784241305, + "grad_norm": 5.8125, + "learning_rate": 8.960744086581849e-07, + "loss": 1.2119085788726807, + "step": 7588 + }, + { + "epoch": 2.3361034164358263, + "grad_norm": 15.8125, + "learning_rate": 8.955500667810395e-07, + "loss": 1.145918369293213, + "step": 7590 + }, + { + "epoch": 2.3367189904586025, + "grad_norm": 13.9375, + "learning_rate": 8.950261244004645e-07, + "loss": 0.48710447549819946, + "step": 7592 + }, + { + "epoch": 2.3373345644813788, + "grad_norm": 17.125, + "learning_rate": 8.945025817478864e-07, + "loss": 1.31890869140625, + "step": 7594 + }, + { + "epoch": 2.337950138504155, + "grad_norm": 7.15625, + "learning_rate": 8.939794390545547e-07, + "loss": 1.133885383605957, + "step": 7596 + }, + { + "epoch": 2.3385657125269312, + "grad_norm": 10.9375, + "learning_rate": 8.934566965515415e-07, + "loss": 1.3523964881896973, + "step": 7598 + }, + { + "epoch": 2.3391812865497075, + "grad_norm": 17.625, + "learning_rate": 8.929343544697438e-07, + "loss": 1.5850661993026733, + "step": 7600 + }, + { + "epoch": 2.3397968605724837, + "grad_norm": 23.75, + "learning_rate": 8.924124130398796e-07, + "loss": 1.057510256767273, + "step": 7602 + }, + { + "epoch": 2.34041243459526, + "grad_norm": 13.625, + "learning_rate": 8.918908724924915e-07, + "loss": 1.1042832136154175, + "step": 7604 + }, + { + "epoch": 2.341028008618036, + "grad_norm": 19.0, + "learning_rate": 8.913697330579452e-07, + "loss": 1.1760351657867432, + "step": 7606 + }, + { + "epoch": 2.3416435826408124, + "grad_norm": 12.125, + "learning_rate": 8.908489949664272e-07, + "loss": 1.384007453918457, + "step": 7608 + }, + { + "epoch": 2.3422591566635886, + "grad_norm": 23.625, + "learning_rate": 8.903286584479493e-07, + "loss": 1.2748414278030396, + "step": 7610 + }, + { + "epoch": 2.342874730686365, + "grad_norm": 12.4375, + "learning_rate": 8.898087237323441e-07, + "loss": 1.4564826488494873, + "step": 7612 + }, + { + "epoch": 2.343490304709141, + "grad_norm": 25.125, + "learning_rate": 8.892891910492675e-07, + "loss": 1.7585818767547607, + "step": 7614 + }, + { + "epoch": 2.3441058787319173, + "grad_norm": 13.75, + "learning_rate": 8.88770060628198e-07, + "loss": 1.3288085460662842, + "step": 7616 + }, + { + "epoch": 2.3447214527546936, + "grad_norm": 9.5, + "learning_rate": 8.882513326984355e-07, + "loss": 1.4811022281646729, + "step": 7618 + }, + { + "epoch": 2.34533702677747, + "grad_norm": 4.5625, + "learning_rate": 8.877330074891032e-07, + "loss": 0.993120551109314, + "step": 7620 + }, + { + "epoch": 2.3459526008002465, + "grad_norm": 20.375, + "learning_rate": 8.872150852291461e-07, + "loss": 1.3954640626907349, + "step": 7622 + }, + { + "epoch": 2.3465681748230223, + "grad_norm": 17.625, + "learning_rate": 8.866975661473299e-07, + "loss": 1.5178502798080444, + "step": 7624 + }, + { + "epoch": 2.347183748845799, + "grad_norm": 8.3125, + "learning_rate": 8.86180450472245e-07, + "loss": 1.3272488117218018, + "step": 7626 + }, + { + "epoch": 2.3477993228685747, + "grad_norm": 39.75, + "learning_rate": 8.856637384323009e-07, + "loss": 1.7186293601989746, + "step": 7628 + }, + { + "epoch": 2.3484148968913514, + "grad_norm": 24.5, + "learning_rate": 8.8514743025573e-07, + "loss": 1.2998948097229004, + "step": 7630 + }, + { + "epoch": 2.349030470914127, + "grad_norm": 24.375, + "learning_rate": 8.846315261705871e-07, + "loss": 1.6087599992752075, + "step": 7632 + }, + { + "epoch": 2.349646044936904, + "grad_norm": 15.75, + "learning_rate": 8.841160264047466e-07, + "loss": 1.5249056816101074, + "step": 7634 + }, + { + "epoch": 2.3502616189596797, + "grad_norm": 5.84375, + "learning_rate": 8.836009311859053e-07, + "loss": 1.2703745365142822, + "step": 7636 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 25.0, + "learning_rate": 8.830862407415824e-07, + "loss": 1.3457015752792358, + "step": 7638 + }, + { + "epoch": 2.351492767005232, + "grad_norm": 3.0, + "learning_rate": 8.82571955299116e-07, + "loss": 1.0862663984298706, + "step": 7640 + }, + { + "epoch": 2.352108341028009, + "grad_norm": 53.5, + "learning_rate": 8.820580750856673e-07, + "loss": 1.2190375328063965, + "step": 7642 + }, + { + "epoch": 2.352723915050785, + "grad_norm": 29.5, + "learning_rate": 8.815446003282177e-07, + "loss": 1.4536583423614502, + "step": 7644 + }, + { + "epoch": 2.3533394890735613, + "grad_norm": 36.5, + "learning_rate": 8.810315312535697e-07, + "loss": 1.399013638496399, + "step": 7646 + }, + { + "epoch": 2.3539550630963375, + "grad_norm": 18.375, + "learning_rate": 8.80518868088346e-07, + "loss": 1.4619370698928833, + "step": 7648 + }, + { + "epoch": 2.3545706371191137, + "grad_norm": 36.25, + "learning_rate": 8.80006611058991e-07, + "loss": 1.626915693283081, + "step": 7650 + }, + { + "epoch": 2.35518621114189, + "grad_norm": 30.75, + "learning_rate": 8.794947603917691e-07, + "loss": 1.156236171722412, + "step": 7652 + }, + { + "epoch": 2.355801785164666, + "grad_norm": 15.0, + "learning_rate": 8.789833163127652e-07, + "loss": 1.5183656215667725, + "step": 7654 + }, + { + "epoch": 2.3564173591874424, + "grad_norm": 7.8125, + "learning_rate": 8.784722790478847e-07, + "loss": 1.1837083101272583, + "step": 7656 + }, + { + "epoch": 2.3570329332102187, + "grad_norm": 22.0, + "learning_rate": 8.779616488228538e-07, + "loss": 1.3680181503295898, + "step": 7658 + }, + { + "epoch": 2.357648507232995, + "grad_norm": 18.25, + "learning_rate": 8.774514258632178e-07, + "loss": 1.4081780910491943, + "step": 7660 + }, + { + "epoch": 2.358264081255771, + "grad_norm": 34.25, + "learning_rate": 8.769416103943428e-07, + "loss": 1.5171761512756348, + "step": 7662 + }, + { + "epoch": 2.3588796552785474, + "grad_norm": 15.25, + "learning_rate": 8.764322026414157e-07, + "loss": 1.1231598854064941, + "step": 7664 + }, + { + "epoch": 2.3594952293013236, + "grad_norm": 8.5, + "learning_rate": 8.759232028294418e-07, + "loss": 1.3174912929534912, + "step": 7666 + }, + { + "epoch": 2.3601108033241, + "grad_norm": 15.8125, + "learning_rate": 8.754146111832467e-07, + "loss": 1.4191495180130005, + "step": 7668 + }, + { + "epoch": 2.360726377346876, + "grad_norm": 13.0, + "learning_rate": 8.749064279274764e-07, + "loss": 1.5959736108779907, + "step": 7670 + }, + { + "epoch": 2.3613419513696523, + "grad_norm": 38.0, + "learning_rate": 8.743986532865962e-07, + "loss": 1.6755528450012207, + "step": 7672 + }, + { + "epoch": 2.3619575253924285, + "grad_norm": 8.75, + "learning_rate": 8.7389128748489e-07, + "loss": 1.21274995803833, + "step": 7674 + }, + { + "epoch": 2.3625730994152048, + "grad_norm": 23.5, + "learning_rate": 8.733843307464623e-07, + "loss": 1.5054268836975098, + "step": 7676 + }, + { + "epoch": 2.363188673437981, + "grad_norm": 14.625, + "learning_rate": 8.728777832952366e-07, + "loss": 1.2082719802856445, + "step": 7678 + }, + { + "epoch": 2.3638042474607572, + "grad_norm": 52.25, + "learning_rate": 8.723716453549553e-07, + "loss": 1.5130014419555664, + "step": 7680 + }, + { + "epoch": 2.3644198214835335, + "grad_norm": 14.6875, + "learning_rate": 8.7186591714918e-07, + "loss": 1.3520900011062622, + "step": 7682 + }, + { + "epoch": 2.3650353955063097, + "grad_norm": 27.75, + "learning_rate": 8.713605989012918e-07, + "loss": 1.385072946548462, + "step": 7684 + }, + { + "epoch": 2.365650969529086, + "grad_norm": 12.75, + "learning_rate": 8.708556908344895e-07, + "loss": 1.2833045721054077, + "step": 7686 + }, + { + "epoch": 2.366266543551862, + "grad_norm": 9.625, + "learning_rate": 8.703511931717929e-07, + "loss": 1.4542651176452637, + "step": 7688 + }, + { + "epoch": 2.3668821175746384, + "grad_norm": 18.25, + "learning_rate": 8.698471061360385e-07, + "loss": 1.1327033042907715, + "step": 7690 + }, + { + "epoch": 2.3674976915974146, + "grad_norm": 8.1875, + "learning_rate": 8.693434299498812e-07, + "loss": 1.386979103088379, + "step": 7692 + }, + { + "epoch": 2.368113265620191, + "grad_norm": 14.4375, + "learning_rate": 8.688401648357972e-07, + "loss": 1.348421335220337, + "step": 7694 + }, + { + "epoch": 2.368728839642967, + "grad_norm": 18.125, + "learning_rate": 8.683373110160779e-07, + "loss": 1.8261079788208008, + "step": 7696 + }, + { + "epoch": 2.3693444136657433, + "grad_norm": 24.25, + "learning_rate": 8.678348687128348e-07, + "loss": 1.5496175289154053, + "step": 7698 + }, + { + "epoch": 2.3699599876885196, + "grad_norm": 5.34375, + "learning_rate": 8.673328381479978e-07, + "loss": 1.3678693771362305, + "step": 7700 + }, + { + "epoch": 2.370575561711296, + "grad_norm": 22.125, + "learning_rate": 8.668312195433132e-07, + "loss": 1.2740004062652588, + "step": 7702 + }, + { + "epoch": 2.371191135734072, + "grad_norm": 20.0, + "learning_rate": 8.663300131203472e-07, + "loss": 1.5466177463531494, + "step": 7704 + }, + { + "epoch": 2.3718067097568483, + "grad_norm": 12.9375, + "learning_rate": 8.658292191004838e-07, + "loss": 1.6574368476867676, + "step": 7706 + }, + { + "epoch": 2.3724222837796245, + "grad_norm": 12.0, + "learning_rate": 8.653288377049229e-07, + "loss": 1.3180005550384521, + "step": 7708 + }, + { + "epoch": 2.3730378578024007, + "grad_norm": 23.75, + "learning_rate": 8.648288691546848e-07, + "loss": 1.4729022979736328, + "step": 7710 + }, + { + "epoch": 2.373653431825177, + "grad_norm": 13.4375, + "learning_rate": 8.643293136706055e-07, + "loss": 0.969083309173584, + "step": 7712 + }, + { + "epoch": 2.374269005847953, + "grad_norm": 15.1875, + "learning_rate": 8.638301714733399e-07, + "loss": 1.605223298072815, + "step": 7714 + }, + { + "epoch": 2.3748845798707294, + "grad_norm": 11.6875, + "learning_rate": 8.633314427833587e-07, + "loss": 0.9919534921646118, + "step": 7716 + }, + { + "epoch": 2.3755001538935057, + "grad_norm": 5.8125, + "learning_rate": 8.628331278209516e-07, + "loss": 1.044752597808838, + "step": 7718 + }, + { + "epoch": 2.376115727916282, + "grad_norm": 11.1875, + "learning_rate": 8.62335226806225e-07, + "loss": 1.4849684238433838, + "step": 7720 + }, + { + "epoch": 2.376731301939058, + "grad_norm": 10.5625, + "learning_rate": 8.618377399591017e-07, + "loss": 1.4038259983062744, + "step": 7722 + }, + { + "epoch": 2.3773468759618344, + "grad_norm": 17.25, + "learning_rate": 8.613406674993228e-07, + "loss": 1.2761948108673096, + "step": 7724 + }, + { + "epoch": 2.3779624499846106, + "grad_norm": 10.4375, + "learning_rate": 8.608440096464458e-07, + "loss": 1.2625563144683838, + "step": 7726 + }, + { + "epoch": 2.378578024007387, + "grad_norm": 10.8125, + "learning_rate": 8.603477666198445e-07, + "loss": 1.325385570526123, + "step": 7728 + }, + { + "epoch": 2.379193598030163, + "grad_norm": 14.3125, + "learning_rate": 8.598519386387104e-07, + "loss": 1.3754222393035889, + "step": 7730 + }, + { + "epoch": 2.3798091720529393, + "grad_norm": 11.5, + "learning_rate": 8.593565259220514e-07, + "loss": 1.410444736480713, + "step": 7732 + }, + { + "epoch": 2.3804247460757155, + "grad_norm": 16.75, + "learning_rate": 8.588615286886914e-07, + "loss": 1.3901515007019043, + "step": 7734 + }, + { + "epoch": 2.3810403200984918, + "grad_norm": 16.375, + "learning_rate": 8.583669471572716e-07, + "loss": 1.8944385051727295, + "step": 7736 + }, + { + "epoch": 2.381655894121268, + "grad_norm": 12.5, + "learning_rate": 8.578727815462492e-07, + "loss": 1.2769474983215332, + "step": 7738 + }, + { + "epoch": 2.3822714681440442, + "grad_norm": 11.25, + "learning_rate": 8.573790320738979e-07, + "loss": 1.8802108764648438, + "step": 7740 + }, + { + "epoch": 2.3828870421668205, + "grad_norm": 11.0, + "learning_rate": 8.568856989583068e-07, + "loss": 1.3289381265640259, + "step": 7742 + }, + { + "epoch": 2.3835026161895967, + "grad_norm": 14.4375, + "learning_rate": 8.563927824173822e-07, + "loss": 1.67485511302948, + "step": 7744 + }, + { + "epoch": 2.384118190212373, + "grad_norm": 16.625, + "learning_rate": 8.559002826688462e-07, + "loss": 1.2186708450317383, + "step": 7746 + }, + { + "epoch": 2.384733764235149, + "grad_norm": 12.375, + "learning_rate": 8.554081999302356e-07, + "loss": 1.433382272720337, + "step": 7748 + }, + { + "epoch": 2.3853493382579254, + "grad_norm": 55.75, + "learning_rate": 8.549165344189045e-07, + "loss": 1.1944999694824219, + "step": 7750 + }, + { + "epoch": 2.3859649122807016, + "grad_norm": 5.875, + "learning_rate": 8.544252863520221e-07, + "loss": 1.154907464981079, + "step": 7752 + }, + { + "epoch": 2.386580486303478, + "grad_norm": 15.0, + "learning_rate": 8.539344559465728e-07, + "loss": 1.3341572284698486, + "step": 7754 + }, + { + "epoch": 2.387196060326254, + "grad_norm": 10.625, + "learning_rate": 8.534440434193579e-07, + "loss": 1.5109237432479858, + "step": 7756 + }, + { + "epoch": 2.3878116343490303, + "grad_norm": 31.25, + "learning_rate": 8.529540489869925e-07, + "loss": 1.6490485668182373, + "step": 7758 + }, + { + "epoch": 2.3884272083718066, + "grad_norm": 36.75, + "learning_rate": 8.524644728659071e-07, + "loss": 1.447040319442749, + "step": 7760 + }, + { + "epoch": 2.389042782394583, + "grad_norm": 18.75, + "learning_rate": 8.519753152723493e-07, + "loss": 1.6449151039123535, + "step": 7762 + }, + { + "epoch": 2.389658356417359, + "grad_norm": 6.6875, + "learning_rate": 8.514865764223799e-07, + "loss": 1.2321791648864746, + "step": 7764 + }, + { + "epoch": 2.3902739304401353, + "grad_norm": 5.96875, + "learning_rate": 8.509982565318752e-07, + "loss": 1.1312897205352783, + "step": 7766 + }, + { + "epoch": 2.3908895044629115, + "grad_norm": 14.0, + "learning_rate": 8.505103558165274e-07, + "loss": 1.233027696609497, + "step": 7768 + }, + { + "epoch": 2.3915050784856877, + "grad_norm": 22.5, + "learning_rate": 8.50022874491842e-07, + "loss": 1.7808541059494019, + "step": 7770 + }, + { + "epoch": 2.392120652508464, + "grad_norm": 15.1875, + "learning_rate": 8.495358127731406e-07, + "loss": 1.5311319828033447, + "step": 7772 + }, + { + "epoch": 2.39273622653124, + "grad_norm": 6.90625, + "learning_rate": 8.490491708755588e-07, + "loss": 1.3701417446136475, + "step": 7774 + }, + { + "epoch": 2.393351800554017, + "grad_norm": 55.0, + "learning_rate": 8.485629490140465e-07, + "loss": 1.1161890029907227, + "step": 7776 + }, + { + "epoch": 2.3939673745767927, + "grad_norm": 9.625, + "learning_rate": 8.480771474033691e-07, + "loss": 1.2349655628204346, + "step": 7778 + }, + { + "epoch": 2.3945829485995693, + "grad_norm": 20.5, + "learning_rate": 8.47591766258105e-07, + "loss": 0.8640923500061035, + "step": 7780 + }, + { + "epoch": 2.395198522622345, + "grad_norm": 8.5, + "learning_rate": 8.471068057926483e-07, + "loss": 1.1073216199874878, + "step": 7782 + }, + { + "epoch": 2.395814096645122, + "grad_norm": 18.875, + "learning_rate": 8.466222662212063e-07, + "loss": 1.2677631378173828, + "step": 7784 + }, + { + "epoch": 2.3964296706678976, + "grad_norm": 17.875, + "learning_rate": 8.461381477578003e-07, + "loss": 1.2740721702575684, + "step": 7786 + }, + { + "epoch": 2.3970452446906743, + "grad_norm": 10.625, + "learning_rate": 8.456544506162668e-07, + "loss": 1.4284172058105469, + "step": 7788 + }, + { + "epoch": 2.39766081871345, + "grad_norm": 14.4375, + "learning_rate": 8.451711750102546e-07, + "loss": 1.7935305833816528, + "step": 7790 + }, + { + "epoch": 2.3982763927362267, + "grad_norm": 20.375, + "learning_rate": 8.446883211532275e-07, + "loss": 1.6979771852493286, + "step": 7792 + }, + { + "epoch": 2.398891966759003, + "grad_norm": 11.25, + "learning_rate": 8.442058892584629e-07, + "loss": 1.3462008237838745, + "step": 7794 + }, + { + "epoch": 2.399507540781779, + "grad_norm": 10.4375, + "learning_rate": 8.437238795390507e-07, + "loss": 1.3277199268341064, + "step": 7796 + }, + { + "epoch": 2.4001231148045554, + "grad_norm": 14.6875, + "learning_rate": 8.432422922078955e-07, + "loss": 1.2554333209991455, + "step": 7798 + }, + { + "epoch": 2.4007386888273317, + "grad_norm": 25.5, + "learning_rate": 8.427611274777156e-07, + "loss": 1.3627779483795166, + "step": 7800 + }, + { + "epoch": 2.401354262850108, + "grad_norm": 11.375, + "learning_rate": 8.422803855610411e-07, + "loss": 1.3388714790344238, + "step": 7802 + }, + { + "epoch": 2.401969836872884, + "grad_norm": 17.625, + "learning_rate": 8.418000666702166e-07, + "loss": 1.5131101608276367, + "step": 7804 + }, + { + "epoch": 2.4025854108956604, + "grad_norm": 13.4375, + "learning_rate": 8.413201710173996e-07, + "loss": 1.553926944732666, + "step": 7806 + }, + { + "epoch": 2.4032009849184366, + "grad_norm": 10.9375, + "learning_rate": 8.408406988145607e-07, + "loss": 1.2082258462905884, + "step": 7808 + }, + { + "epoch": 2.403816558941213, + "grad_norm": 17.5, + "learning_rate": 8.403616502734828e-07, + "loss": 1.420758605003357, + "step": 7810 + }, + { + "epoch": 2.404432132963989, + "grad_norm": 20.0, + "learning_rate": 8.398830256057625e-07, + "loss": 1.002287745475769, + "step": 7812 + }, + { + "epoch": 2.4050477069867653, + "grad_norm": 15.0625, + "learning_rate": 8.394048250228093e-07, + "loss": 1.3062329292297363, + "step": 7814 + }, + { + "epoch": 2.4056632810095415, + "grad_norm": 33.0, + "learning_rate": 8.38927048735844e-07, + "loss": 1.532996416091919, + "step": 7816 + }, + { + "epoch": 2.4062788550323178, + "grad_norm": 14.25, + "learning_rate": 8.384496969559016e-07, + "loss": 1.4776278734207153, + "step": 7818 + }, + { + "epoch": 2.406894429055094, + "grad_norm": 12.25, + "learning_rate": 8.37972769893829e-07, + "loss": 1.2780346870422363, + "step": 7820 + }, + { + "epoch": 2.40751000307787, + "grad_norm": 5.59375, + "learning_rate": 8.374962677602847e-07, + "loss": 1.1072016954421997, + "step": 7822 + }, + { + "epoch": 2.4081255771006465, + "grad_norm": 20.125, + "learning_rate": 8.370201907657415e-07, + "loss": 1.4812865257263184, + "step": 7824 + }, + { + "epoch": 2.4087411511234227, + "grad_norm": 6.84375, + "learning_rate": 8.36544539120482e-07, + "loss": 1.0782959461212158, + "step": 7826 + }, + { + "epoch": 2.409356725146199, + "grad_norm": 22.375, + "learning_rate": 8.360693130346021e-07, + "loss": 1.2231228351593018, + "step": 7828 + }, + { + "epoch": 2.409972299168975, + "grad_norm": 8.875, + "learning_rate": 8.355945127180107e-07, + "loss": 1.3597500324249268, + "step": 7830 + }, + { + "epoch": 2.4105878731917514, + "grad_norm": 34.5, + "learning_rate": 8.351201383804266e-07, + "loss": 1.655010461807251, + "step": 7832 + }, + { + "epoch": 2.4112034472145276, + "grad_norm": 20.125, + "learning_rate": 8.346461902313823e-07, + "loss": 1.4868947267532349, + "step": 7834 + }, + { + "epoch": 2.411819021237304, + "grad_norm": 16.5, + "learning_rate": 8.341726684802205e-07, + "loss": 1.2523350715637207, + "step": 7836 + }, + { + "epoch": 2.41243459526008, + "grad_norm": 11.6875, + "learning_rate": 8.336995733360966e-07, + "loss": 1.1832252740859985, + "step": 7838 + }, + { + "epoch": 2.4130501692828563, + "grad_norm": 23.875, + "learning_rate": 8.332269050079777e-07, + "loss": 1.2065129280090332, + "step": 7840 + }, + { + "epoch": 2.4136657433056325, + "grad_norm": 23.0, + "learning_rate": 8.327546637046411e-07, + "loss": 1.335740089416504, + "step": 7842 + }, + { + "epoch": 2.414281317328409, + "grad_norm": 13.5625, + "learning_rate": 8.32282849634677e-07, + "loss": 1.565500020980835, + "step": 7844 + }, + { + "epoch": 2.414896891351185, + "grad_norm": 20.375, + "learning_rate": 8.318114630064861e-07, + "loss": 1.4919871091842651, + "step": 7846 + }, + { + "epoch": 2.4155124653739612, + "grad_norm": 19.0, + "learning_rate": 8.313405040282797e-07, + "loss": 1.5719404220581055, + "step": 7848 + }, + { + "epoch": 2.4161280393967375, + "grad_norm": 16.625, + "learning_rate": 8.308699729080822e-07, + "loss": 1.315718650817871, + "step": 7850 + }, + { + "epoch": 2.4167436134195137, + "grad_norm": 7.84375, + "learning_rate": 8.303998698537272e-07, + "loss": 1.0368592739105225, + "step": 7852 + }, + { + "epoch": 2.41735918744229, + "grad_norm": 12.625, + "learning_rate": 8.29930195072859e-07, + "loss": 1.2762839794158936, + "step": 7854 + }, + { + "epoch": 2.417974761465066, + "grad_norm": 15.1875, + "learning_rate": 8.294609487729346e-07, + "loss": 1.638676404953003, + "step": 7856 + }, + { + "epoch": 2.4185903354878424, + "grad_norm": 24.125, + "learning_rate": 8.289921311612198e-07, + "loss": 1.1454219818115234, + "step": 7858 + }, + { + "epoch": 2.4192059095106186, + "grad_norm": 19.875, + "learning_rate": 8.285237424447923e-07, + "loss": 1.719085693359375, + "step": 7860 + }, + { + "epoch": 2.419821483533395, + "grad_norm": 12.0, + "learning_rate": 8.2805578283054e-07, + "loss": 1.0720081329345703, + "step": 7862 + }, + { + "epoch": 2.420437057556171, + "grad_norm": 10.875, + "learning_rate": 8.275882525251607e-07, + "loss": 1.2760975360870361, + "step": 7864 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 18.125, + "learning_rate": 8.271211517351636e-07, + "loss": 1.639174222946167, + "step": 7866 + }, + { + "epoch": 2.4216682056017236, + "grad_norm": 37.25, + "learning_rate": 8.266544806668678e-07, + "loss": 1.2773809432983398, + "step": 7868 + }, + { + "epoch": 2.4222837796245, + "grad_norm": 10.1875, + "learning_rate": 8.261882395264017e-07, + "loss": 1.1791555881500244, + "step": 7870 + }, + { + "epoch": 2.422899353647276, + "grad_norm": 21.375, + "learning_rate": 8.257224285197049e-07, + "loss": 1.487235188484192, + "step": 7872 + }, + { + "epoch": 2.4235149276700523, + "grad_norm": 11.9375, + "learning_rate": 8.252570478525268e-07, + "loss": 1.3929115533828735, + "step": 7874 + }, + { + "epoch": 2.4241305016928285, + "grad_norm": 12.25, + "learning_rate": 8.247920977304267e-07, + "loss": 1.374371886253357, + "step": 7876 + }, + { + "epoch": 2.4247460757156047, + "grad_norm": 7.65625, + "learning_rate": 8.243275783587732e-07, + "loss": 1.2207591533660889, + "step": 7878 + }, + { + "epoch": 2.425361649738381, + "grad_norm": 18.25, + "learning_rate": 8.238634899427451e-07, + "loss": 0.96770840883255, + "step": 7880 + }, + { + "epoch": 2.425977223761157, + "grad_norm": 16.75, + "learning_rate": 8.233998326873314e-07, + "loss": 1.5982918739318848, + "step": 7882 + }, + { + "epoch": 2.4265927977839334, + "grad_norm": 8.25, + "learning_rate": 8.229366067973291e-07, + "loss": 1.2492954730987549, + "step": 7884 + }, + { + "epoch": 2.4272083718067097, + "grad_norm": 20.875, + "learning_rate": 8.22473812477346e-07, + "loss": 1.3705569505691528, + "step": 7886 + }, + { + "epoch": 2.427823945829486, + "grad_norm": 23.75, + "learning_rate": 8.220114499317994e-07, + "loss": 1.0353972911834717, + "step": 7888 + }, + { + "epoch": 2.428439519852262, + "grad_norm": 11.0, + "learning_rate": 8.215495193649144e-07, + "loss": 1.1741468906402588, + "step": 7890 + }, + { + "epoch": 2.4290550938750384, + "grad_norm": 48.0, + "learning_rate": 8.21088020980727e-07, + "loss": 1.2422951459884644, + "step": 7892 + }, + { + "epoch": 2.4296706678978146, + "grad_norm": 45.0, + "learning_rate": 8.206269549830813e-07, + "loss": 1.3556740283966064, + "step": 7894 + }, + { + "epoch": 2.430286241920591, + "grad_norm": 17.5, + "learning_rate": 8.201663215756308e-07, + "loss": 1.5279185771942139, + "step": 7896 + }, + { + "epoch": 2.430901815943367, + "grad_norm": 6.15625, + "learning_rate": 8.197061209618374e-07, + "loss": 1.1852245330810547, + "step": 7898 + }, + { + "epoch": 2.4315173899661433, + "grad_norm": 23.5, + "learning_rate": 8.192463533449727e-07, + "loss": 1.5237150192260742, + "step": 7900 + }, + { + "epoch": 2.4321329639889195, + "grad_norm": 10.6875, + "learning_rate": 8.187870189281167e-07, + "loss": 1.355332612991333, + "step": 7902 + }, + { + "epoch": 2.4327485380116958, + "grad_norm": 6.375, + "learning_rate": 8.183281179141573e-07, + "loss": 1.1575570106506348, + "step": 7904 + }, + { + "epoch": 2.433364112034472, + "grad_norm": 14.375, + "learning_rate": 8.178696505057921e-07, + "loss": 1.236515998840332, + "step": 7906 + }, + { + "epoch": 2.4339796860572482, + "grad_norm": 9.8125, + "learning_rate": 8.174116169055268e-07, + "loss": 1.3578482866287231, + "step": 7908 + }, + { + "epoch": 2.4345952600800245, + "grad_norm": 15.3125, + "learning_rate": 8.169540173156746e-07, + "loss": 1.4025132656097412, + "step": 7910 + }, + { + "epoch": 2.4352108341028007, + "grad_norm": 5.0625, + "learning_rate": 8.164968519383585e-07, + "loss": 1.1161717176437378, + "step": 7912 + }, + { + "epoch": 2.435826408125577, + "grad_norm": 13.75, + "learning_rate": 8.16040120975509e-07, + "loss": 1.6636682748794556, + "step": 7914 + }, + { + "epoch": 2.436441982148353, + "grad_norm": 27.25, + "learning_rate": 8.155838246288638e-07, + "loss": 1.5401861667633057, + "step": 7916 + }, + { + "epoch": 2.4370575561711294, + "grad_norm": 12.0, + "learning_rate": 8.151279630999709e-07, + "loss": 1.5521032810211182, + "step": 7918 + }, + { + "epoch": 2.4376731301939056, + "grad_norm": 90.5, + "learning_rate": 8.146725365901836e-07, + "loss": 1.5973589420318604, + "step": 7920 + }, + { + "epoch": 2.438288704216682, + "grad_norm": 15.125, + "learning_rate": 8.142175453006649e-07, + "loss": 1.0958237648010254, + "step": 7922 + }, + { + "epoch": 2.438904278239458, + "grad_norm": 12.3125, + "learning_rate": 8.137629894323854e-07, + "loss": 0.8826869130134583, + "step": 7924 + }, + { + "epoch": 2.439519852262235, + "grad_norm": 18.625, + "learning_rate": 8.133088691861219e-07, + "loss": 1.2588679790496826, + "step": 7926 + }, + { + "epoch": 2.4401354262850106, + "grad_norm": 8.6875, + "learning_rate": 8.128551847624605e-07, + "loss": 1.0845204591751099, + "step": 7928 + }, + { + "epoch": 2.4407510003077872, + "grad_norm": 12.3125, + "learning_rate": 8.12401936361794e-07, + "loss": 1.260317087173462, + "step": 7930 + }, + { + "epoch": 2.441366574330563, + "grad_norm": 23.75, + "learning_rate": 8.119491241843229e-07, + "loss": 1.0685640573501587, + "step": 7932 + }, + { + "epoch": 2.4419821483533397, + "grad_norm": 12.75, + "learning_rate": 8.114967484300544e-07, + "loss": 1.4659960269927979, + "step": 7934 + }, + { + "epoch": 2.4425977223761155, + "grad_norm": 31.625, + "learning_rate": 8.110448092988041e-07, + "loss": 1.2615103721618652, + "step": 7936 + }, + { + "epoch": 2.443213296398892, + "grad_norm": 29.625, + "learning_rate": 8.105933069901935e-07, + "loss": 1.3865262269973755, + "step": 7938 + }, + { + "epoch": 2.443828870421668, + "grad_norm": 6.8125, + "learning_rate": 8.101422417036516e-07, + "loss": 1.39396071434021, + "step": 7940 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 5.21875, + "learning_rate": 8.096916136384146e-07, + "loss": 1.4983265399932861, + "step": 7942 + }, + { + "epoch": 2.445060018467221, + "grad_norm": 11.5, + "learning_rate": 8.09241422993526e-07, + "loss": 1.1928825378417969, + "step": 7944 + }, + { + "epoch": 2.445675592489997, + "grad_norm": 8.5625, + "learning_rate": 8.087916699678345e-07, + "loss": 1.1064355373382568, + "step": 7946 + }, + { + "epoch": 2.4462911665127733, + "grad_norm": 21.0, + "learning_rate": 8.08342354759997e-07, + "loss": 1.461334466934204, + "step": 7948 + }, + { + "epoch": 2.4469067405355496, + "grad_norm": 15.1875, + "learning_rate": 8.078934775684771e-07, + "loss": 1.3841646909713745, + "step": 7950 + }, + { + "epoch": 2.447522314558326, + "grad_norm": 6.65625, + "learning_rate": 8.074450385915435e-07, + "loss": 0.9941138625144958, + "step": 7952 + }, + { + "epoch": 2.448137888581102, + "grad_norm": 39.75, + "learning_rate": 8.069970380272727e-07, + "loss": 0.9700048565864563, + "step": 7954 + }, + { + "epoch": 2.4487534626038783, + "grad_norm": 8.625, + "learning_rate": 8.065494760735468e-07, + "loss": 1.2525842189788818, + "step": 7956 + }, + { + "epoch": 2.4493690366266545, + "grad_norm": 32.75, + "learning_rate": 8.061023529280546e-07, + "loss": 1.5355992317199707, + "step": 7958 + }, + { + "epoch": 2.4499846106494307, + "grad_norm": 9.875, + "learning_rate": 8.056556687882909e-07, + "loss": 1.3056560754776, + "step": 7960 + }, + { + "epoch": 2.450600184672207, + "grad_norm": 11.1875, + "learning_rate": 8.052094238515563e-07, + "loss": 1.2505466938018799, + "step": 7962 + }, + { + "epoch": 2.451215758694983, + "grad_norm": 16.75, + "learning_rate": 8.047636183149584e-07, + "loss": 1.2493584156036377, + "step": 7964 + }, + { + "epoch": 2.4518313327177594, + "grad_norm": 7.3125, + "learning_rate": 8.043182523754092e-07, + "loss": 1.2162585258483887, + "step": 7966 + }, + { + "epoch": 2.4524469067405357, + "grad_norm": 15.625, + "learning_rate": 8.038733262296278e-07, + "loss": 1.416115403175354, + "step": 7968 + }, + { + "epoch": 2.453062480763312, + "grad_norm": 12.625, + "learning_rate": 8.034288400741388e-07, + "loss": 1.5428014993667603, + "step": 7970 + }, + { + "epoch": 2.453678054786088, + "grad_norm": 28.125, + "learning_rate": 8.029847941052717e-07, + "loss": 1.3966515064239502, + "step": 7972 + }, + { + "epoch": 2.4542936288088644, + "grad_norm": 47.75, + "learning_rate": 8.025411885191622e-07, + "loss": 1.602909803390503, + "step": 7974 + }, + { + "epoch": 2.4549092028316406, + "grad_norm": 20.125, + "learning_rate": 8.020980235117518e-07, + "loss": 1.5091824531555176, + "step": 7976 + }, + { + "epoch": 2.455524776854417, + "grad_norm": 32.5, + "learning_rate": 8.016552992787867e-07, + "loss": 1.4437615871429443, + "step": 7978 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 25.625, + "learning_rate": 8.012130160158187e-07, + "loss": 1.4464353322982788, + "step": 7980 + }, + { + "epoch": 2.4567559248999693, + "grad_norm": 14.8125, + "learning_rate": 8.007711739182051e-07, + "loss": 1.307931661605835, + "step": 7982 + }, + { + "epoch": 2.4573714989227455, + "grad_norm": 28.875, + "learning_rate": 8.003297731811072e-07, + "loss": 1.121088981628418, + "step": 7984 + }, + { + "epoch": 2.4579870729455218, + "grad_norm": 100.0, + "learning_rate": 7.998888139994933e-07, + "loss": 0.8891786336898804, + "step": 7986 + }, + { + "epoch": 2.458602646968298, + "grad_norm": 7.5, + "learning_rate": 7.994482965681352e-07, + "loss": 0.7378986477851868, + "step": 7988 + }, + { + "epoch": 2.4592182209910742, + "grad_norm": 26.0, + "learning_rate": 7.990082210816096e-07, + "loss": 1.3136024475097656, + "step": 7990 + }, + { + "epoch": 2.4598337950138505, + "grad_norm": 18.625, + "learning_rate": 7.985685877342989e-07, + "loss": 1.7769261598587036, + "step": 7992 + }, + { + "epoch": 2.4604493690366267, + "grad_norm": 39.0, + "learning_rate": 7.981293967203893e-07, + "loss": 1.4590258598327637, + "step": 7994 + }, + { + "epoch": 2.461064943059403, + "grad_norm": 32.75, + "learning_rate": 7.976906482338718e-07, + "loss": 1.2280614376068115, + "step": 7996 + }, + { + "epoch": 2.461680517082179, + "grad_norm": 7.75, + "learning_rate": 7.972523424685428e-07, + "loss": 0.8923991322517395, + "step": 7998 + }, + { + "epoch": 2.4622960911049554, + "grad_norm": 23.625, + "learning_rate": 7.968144796180014e-07, + "loss": 1.5898621082305908, + "step": 8000 + }, + { + "epoch": 2.4629116651277316, + "grad_norm": 30.125, + "learning_rate": 7.963770598756535e-07, + "loss": 1.3274173736572266, + "step": 8002 + }, + { + "epoch": 2.463527239150508, + "grad_norm": 21.25, + "learning_rate": 7.959400834347062e-07, + "loss": 1.9697543382644653, + "step": 8004 + }, + { + "epoch": 2.464142813173284, + "grad_norm": 16.0, + "learning_rate": 7.955035504881741e-07, + "loss": 1.3735530376434326, + "step": 8006 + }, + { + "epoch": 2.4647583871960603, + "grad_norm": 15.0625, + "learning_rate": 7.950674612288737e-07, + "loss": 1.231592059135437, + "step": 8008 + }, + { + "epoch": 2.4653739612188366, + "grad_norm": 35.5, + "learning_rate": 7.946318158494255e-07, + "loss": 1.4949934482574463, + "step": 8010 + }, + { + "epoch": 2.465989535241613, + "grad_norm": 161.0, + "learning_rate": 7.941966145422555e-07, + "loss": 0.8490093946456909, + "step": 8012 + }, + { + "epoch": 2.466605109264389, + "grad_norm": 59.25, + "learning_rate": 7.93761857499592e-07, + "loss": 0.8294261693954468, + "step": 8014 + }, + { + "epoch": 2.4672206832871653, + "grad_norm": 13.75, + "learning_rate": 7.93327544913468e-07, + "loss": 1.3439993858337402, + "step": 8016 + }, + { + "epoch": 2.4678362573099415, + "grad_norm": 23.25, + "learning_rate": 7.928936769757201e-07, + "loss": 1.3793857097625732, + "step": 8018 + }, + { + "epoch": 2.4684518313327177, + "grad_norm": 12.5625, + "learning_rate": 7.924602538779877e-07, + "loss": 1.548597812652588, + "step": 8020 + }, + { + "epoch": 2.469067405355494, + "grad_norm": 22.625, + "learning_rate": 7.920272758117145e-07, + "loss": 1.4939284324645996, + "step": 8022 + }, + { + "epoch": 2.46968297937827, + "grad_norm": 16.625, + "learning_rate": 7.915947429681476e-07, + "loss": 1.4503446817398071, + "step": 8024 + }, + { + "epoch": 2.4702985534010464, + "grad_norm": 6.6875, + "learning_rate": 7.911626555383368e-07, + "loss": 1.2970399856567383, + "step": 8026 + }, + { + "epoch": 2.4709141274238227, + "grad_norm": 5.34375, + "learning_rate": 7.907310137131358e-07, + "loss": 0.9716742634773254, + "step": 8028 + }, + { + "epoch": 2.471529701446599, + "grad_norm": 23.5, + "learning_rate": 7.902998176832018e-07, + "loss": 1.3309000730514526, + "step": 8030 + }, + { + "epoch": 2.472145275469375, + "grad_norm": 28.125, + "learning_rate": 7.89869067638994e-07, + "loss": 1.6835633516311646, + "step": 8032 + }, + { + "epoch": 2.4727608494921514, + "grad_norm": 25.75, + "learning_rate": 7.894387637707753e-07, + "loss": 1.668080449104309, + "step": 8034 + }, + { + "epoch": 2.4733764235149276, + "grad_norm": 14.625, + "learning_rate": 7.890089062686115e-07, + "loss": 1.832417607307434, + "step": 8036 + }, + { + "epoch": 2.473991997537704, + "grad_norm": 10.5625, + "learning_rate": 7.885794953223713e-07, + "loss": 1.4618122577667236, + "step": 8038 + }, + { + "epoch": 2.47460757156048, + "grad_norm": 8.6875, + "learning_rate": 7.881505311217255e-07, + "loss": 1.3789515495300293, + "step": 8040 + }, + { + "epoch": 2.4752231455832563, + "grad_norm": 12.875, + "learning_rate": 7.877220138561485e-07, + "loss": 1.2313289642333984, + "step": 8042 + }, + { + "epoch": 2.4758387196060325, + "grad_norm": 4.0, + "learning_rate": 7.87293943714917e-07, + "loss": 1.0658435821533203, + "step": 8044 + }, + { + "epoch": 2.4764542936288088, + "grad_norm": 15.3125, + "learning_rate": 7.868663208871092e-07, + "loss": 1.407973051071167, + "step": 8046 + }, + { + "epoch": 2.477069867651585, + "grad_norm": 12.75, + "learning_rate": 7.864391455616078e-07, + "loss": 1.230506181716919, + "step": 8048 + }, + { + "epoch": 2.477685441674361, + "grad_norm": 10.375, + "learning_rate": 7.860124179270963e-07, + "loss": 0.9987541437149048, + "step": 8050 + }, + { + "epoch": 2.4783010156971375, + "grad_norm": 14.0625, + "learning_rate": 7.855861381720601e-07, + "loss": 1.5021973848342896, + "step": 8052 + }, + { + "epoch": 2.4789165897199137, + "grad_norm": 17.125, + "learning_rate": 7.851603064847879e-07, + "loss": 1.5795702934265137, + "step": 8054 + }, + { + "epoch": 2.47953216374269, + "grad_norm": 15.75, + "learning_rate": 7.8473492305337e-07, + "loss": 1.112412929534912, + "step": 8056 + }, + { + "epoch": 2.480147737765466, + "grad_norm": 14.25, + "learning_rate": 7.843099880656992e-07, + "loss": 1.1409666538238525, + "step": 8058 + }, + { + "epoch": 2.4807633117882424, + "grad_norm": 11.4375, + "learning_rate": 7.83885501709469e-07, + "loss": 1.554758071899414, + "step": 8060 + }, + { + "epoch": 2.4813788858110186, + "grad_norm": 16.0, + "learning_rate": 7.834614641721759e-07, + "loss": 1.277086853981018, + "step": 8062 + }, + { + "epoch": 2.481994459833795, + "grad_norm": 7.625, + "learning_rate": 7.83037875641118e-07, + "loss": 1.2530086040496826, + "step": 8064 + }, + { + "epoch": 2.482610033856571, + "grad_norm": 12.5625, + "learning_rate": 7.826147363033943e-07, + "loss": 1.2744269371032715, + "step": 8066 + }, + { + "epoch": 2.4832256078793473, + "grad_norm": 11.25, + "learning_rate": 7.821920463459062e-07, + "loss": 1.3146778345108032, + "step": 8068 + }, + { + "epoch": 2.4838411819021236, + "grad_norm": 13.125, + "learning_rate": 7.817698059553566e-07, + "loss": 1.4526034593582153, + "step": 8070 + }, + { + "epoch": 2.4844567559249, + "grad_norm": 19.0, + "learning_rate": 7.813480153182487e-07, + "loss": 1.1881377696990967, + "step": 8072 + }, + { + "epoch": 2.485072329947676, + "grad_norm": 2.9375, + "learning_rate": 7.809266746208894e-07, + "loss": 1.0977604389190674, + "step": 8074 + }, + { + "epoch": 2.4856879039704527, + "grad_norm": 23.125, + "learning_rate": 7.805057840493841e-07, + "loss": 1.4201725721359253, + "step": 8076 + }, + { + "epoch": 2.4863034779932285, + "grad_norm": 42.25, + "learning_rate": 7.800853437896407e-07, + "loss": 1.129119873046875, + "step": 8078 + }, + { + "epoch": 2.486919052016005, + "grad_norm": 18.0, + "learning_rate": 7.796653540273689e-07, + "loss": 1.2863776683807373, + "step": 8080 + }, + { + "epoch": 2.487534626038781, + "grad_norm": 24.75, + "learning_rate": 7.792458149480781e-07, + "loss": 1.3503018617630005, + "step": 8082 + }, + { + "epoch": 2.4881502000615576, + "grad_norm": 9.8125, + "learning_rate": 7.788267267370792e-07, + "loss": 1.3690378665924072, + "step": 8084 + }, + { + "epoch": 2.4887657740843334, + "grad_norm": 10.4375, + "learning_rate": 7.784080895794845e-07, + "loss": 1.1764789819717407, + "step": 8086 + }, + { + "epoch": 2.48938134810711, + "grad_norm": 21.75, + "learning_rate": 7.779899036602055e-07, + "loss": 1.2622982263565063, + "step": 8088 + }, + { + "epoch": 2.489996922129886, + "grad_norm": 19.0, + "learning_rate": 7.775721691639563e-07, + "loss": 1.3501579761505127, + "step": 8090 + }, + { + "epoch": 2.4906124961526626, + "grad_norm": 10.6875, + "learning_rate": 7.771548862752504e-07, + "loss": 0.9969066381454468, + "step": 8092 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 6.25, + "learning_rate": 7.767380551784021e-07, + "loss": 0.5417050123214722, + "step": 8094 + }, + { + "epoch": 2.491843644198215, + "grad_norm": 14.1875, + "learning_rate": 7.76321676057526e-07, + "loss": 1.3599417209625244, + "step": 8096 + }, + { + "epoch": 2.4924592182209913, + "grad_norm": 24.5, + "learning_rate": 7.759057490965375e-07, + "loss": 1.6254465579986572, + "step": 8098 + }, + { + "epoch": 2.4930747922437675, + "grad_norm": 132.0, + "learning_rate": 7.754902744791523e-07, + "loss": 1.958270788192749, + "step": 8100 + }, + { + "epoch": 2.4936903662665437, + "grad_norm": 12.1875, + "learning_rate": 7.750752523888852e-07, + "loss": 1.4026708602905273, + "step": 8102 + }, + { + "epoch": 2.49430594028932, + "grad_norm": 94.5, + "learning_rate": 7.746606830090525e-07, + "loss": 1.394837498664856, + "step": 8104 + }, + { + "epoch": 2.494921514312096, + "grad_norm": 9.0, + "learning_rate": 7.742465665227702e-07, + "loss": 1.3615858554840088, + "step": 8106 + }, + { + "epoch": 2.4955370883348724, + "grad_norm": 65.5, + "learning_rate": 7.738329031129533e-07, + "loss": 1.4655237197875977, + "step": 8108 + }, + { + "epoch": 2.4961526623576487, + "grad_norm": 10.0625, + "learning_rate": 7.734196929623177e-07, + "loss": 1.4283119440078735, + "step": 8110 + }, + { + "epoch": 2.496768236380425, + "grad_norm": 6.90625, + "learning_rate": 7.730069362533791e-07, + "loss": 1.05420982837677, + "step": 8112 + }, + { + "epoch": 2.497383810403201, + "grad_norm": 11.4375, + "learning_rate": 7.725946331684523e-07, + "loss": 1.1853525638580322, + "step": 8114 + }, + { + "epoch": 2.4979993844259774, + "grad_norm": 4.625, + "learning_rate": 7.721827838896523e-07, + "loss": 1.0965955257415771, + "step": 8116 + }, + { + "epoch": 2.4986149584487536, + "grad_norm": 19.75, + "learning_rate": 7.717713885988933e-07, + "loss": 1.4483702182769775, + "step": 8118 + }, + { + "epoch": 2.49923053247153, + "grad_norm": 23.5, + "learning_rate": 7.713604474778886e-07, + "loss": 1.3182878494262695, + "step": 8120 + }, + { + "epoch": 2.499846106494306, + "grad_norm": 15.375, + "learning_rate": 7.709499607081519e-07, + "loss": 1.0512882471084595, + "step": 8122 + }, + { + "epoch": 2.5004616805170823, + "grad_norm": 17.875, + "learning_rate": 7.705399284709955e-07, + "loss": 1.2965221405029297, + "step": 8124 + }, + { + "epoch": 2.5010772545398585, + "grad_norm": 40.75, + "learning_rate": 7.701303509475315e-07, + "loss": 1.5295205116271973, + "step": 8126 + }, + { + "epoch": 2.5016928285626348, + "grad_norm": 27.0, + "learning_rate": 7.6972122831867e-07, + "loss": 1.6842844486236572, + "step": 8128 + }, + { + "epoch": 2.502308402585411, + "grad_norm": 7.625, + "learning_rate": 7.693125607651216e-07, + "loss": 0.8428970575332642, + "step": 8130 + }, + { + "epoch": 2.502923976608187, + "grad_norm": 15.3125, + "learning_rate": 7.68904348467395e-07, + "loss": 1.4775192737579346, + "step": 8132 + }, + { + "epoch": 2.5035395506309635, + "grad_norm": 22.75, + "learning_rate": 7.684965916057978e-07, + "loss": 1.201244831085205, + "step": 8134 + }, + { + "epoch": 2.5041551246537397, + "grad_norm": 14.0625, + "learning_rate": 7.680892903604369e-07, + "loss": 1.4047563076019287, + "step": 8136 + }, + { + "epoch": 2.504770698676516, + "grad_norm": 12.375, + "learning_rate": 7.676824449112181e-07, + "loss": 1.0803864002227783, + "step": 8138 + }, + { + "epoch": 2.505386272699292, + "grad_norm": 13.6875, + "learning_rate": 7.672760554378444e-07, + "loss": 1.3757635354995728, + "step": 8140 + }, + { + "epoch": 2.5060018467220684, + "grad_norm": 28.875, + "learning_rate": 7.668701221198197e-07, + "loss": 1.7247374057769775, + "step": 8142 + }, + { + "epoch": 2.5066174207448446, + "grad_norm": 9.0, + "learning_rate": 7.664646451364448e-07, + "loss": 0.49472346901893616, + "step": 8144 + }, + { + "epoch": 2.507232994767621, + "grad_norm": 4.5625, + "learning_rate": 7.660596246668188e-07, + "loss": 1.024422526359558, + "step": 8146 + }, + { + "epoch": 2.507848568790397, + "grad_norm": 21.75, + "learning_rate": 7.656550608898407e-07, + "loss": 1.0848078727722168, + "step": 8148 + }, + { + "epoch": 2.5084641428131733, + "grad_norm": 17.0, + "learning_rate": 7.652509539842058e-07, + "loss": 1.2989914417266846, + "step": 8150 + }, + { + "epoch": 2.5090797168359495, + "grad_norm": 11.5625, + "learning_rate": 7.648473041284093e-07, + "loss": 1.4980051517486572, + "step": 8152 + }, + { + "epoch": 2.509695290858726, + "grad_norm": 11.0625, + "learning_rate": 7.644441115007437e-07, + "loss": 1.068181037902832, + "step": 8154 + }, + { + "epoch": 2.510310864881502, + "grad_norm": 13.8125, + "learning_rate": 7.640413762792991e-07, + "loss": 1.6641740798950195, + "step": 8156 + }, + { + "epoch": 2.5109264389042782, + "grad_norm": 20.0, + "learning_rate": 7.636390986419646e-07, + "loss": 1.0981097221374512, + "step": 8158 + }, + { + "epoch": 2.5115420129270545, + "grad_norm": 131.0, + "learning_rate": 7.632372787664268e-07, + "loss": 0.4805665910243988, + "step": 8160 + }, + { + "epoch": 2.5121575869498307, + "grad_norm": 8.625, + "learning_rate": 7.628359168301697e-07, + "loss": 1.3994219303131104, + "step": 8162 + }, + { + "epoch": 2.512773160972607, + "grad_norm": 11.375, + "learning_rate": 7.624350130104754e-07, + "loss": 1.0327104330062866, + "step": 8164 + }, + { + "epoch": 2.513388734995383, + "grad_norm": 10.625, + "learning_rate": 7.620345674844232e-07, + "loss": 1.2655659914016724, + "step": 8166 + }, + { + "epoch": 2.5140043090181594, + "grad_norm": 17.125, + "learning_rate": 7.616345804288912e-07, + "loss": 0.9777556657791138, + "step": 8168 + }, + { + "epoch": 2.5146198830409356, + "grad_norm": 24.125, + "learning_rate": 7.612350520205537e-07, + "loss": 1.3112051486968994, + "step": 8170 + }, + { + "epoch": 2.515235457063712, + "grad_norm": 17.5, + "learning_rate": 7.608359824358824e-07, + "loss": 1.42042076587677, + "step": 8172 + }, + { + "epoch": 2.515851031086488, + "grad_norm": 21.5, + "learning_rate": 7.604373718511477e-07, + "loss": 1.642563819885254, + "step": 8174 + }, + { + "epoch": 2.5164666051092643, + "grad_norm": 20.625, + "learning_rate": 7.600392204424156e-07, + "loss": 1.469331979751587, + "step": 8176 + }, + { + "epoch": 2.5170821791320406, + "grad_norm": 12.75, + "learning_rate": 7.596415283855503e-07, + "loss": 1.5045514106750488, + "step": 8178 + }, + { + "epoch": 2.517697753154817, + "grad_norm": 19.875, + "learning_rate": 7.592442958562132e-07, + "loss": 1.5238573551177979, + "step": 8180 + }, + { + "epoch": 2.518313327177593, + "grad_norm": 25.625, + "learning_rate": 7.588475230298616e-07, + "loss": 1.4592609405517578, + "step": 8182 + }, + { + "epoch": 2.5189289012003693, + "grad_norm": 15.125, + "learning_rate": 7.584512100817509e-07, + "loss": 1.4213488101959229, + "step": 8184 + }, + { + "epoch": 2.5195444752231455, + "grad_norm": 94.5, + "learning_rate": 7.580553571869333e-07, + "loss": 1.2677407264709473, + "step": 8186 + }, + { + "epoch": 2.5201600492459217, + "grad_norm": 16.75, + "learning_rate": 7.576599645202571e-07, + "loss": 1.285733699798584, + "step": 8188 + }, + { + "epoch": 2.520775623268698, + "grad_norm": 15.125, + "learning_rate": 7.572650322563676e-07, + "loss": 1.280822515487671, + "step": 8190 + }, + { + "epoch": 2.521391197291474, + "grad_norm": 24.875, + "learning_rate": 7.568705605697071e-07, + "loss": 1.3867173194885254, + "step": 8192 + }, + { + "epoch": 2.5220067713142504, + "grad_norm": 16.125, + "learning_rate": 7.564765496345142e-07, + "loss": 1.411359190940857, + "step": 8194 + }, + { + "epoch": 2.5226223453370267, + "grad_norm": 13.9375, + "learning_rate": 7.560829996248237e-07, + "loss": 1.5883018970489502, + "step": 8196 + }, + { + "epoch": 2.523237919359803, + "grad_norm": 18.75, + "learning_rate": 7.556899107144672e-07, + "loss": 1.0765776634216309, + "step": 8198 + }, + { + "epoch": 2.523853493382579, + "grad_norm": 11.4375, + "learning_rate": 7.55297283077073e-07, + "loss": 1.353743553161621, + "step": 8200 + }, + { + "epoch": 2.5244690674053554, + "grad_norm": 5.25, + "learning_rate": 7.549051168860643e-07, + "loss": 1.3110792636871338, + "step": 8202 + }, + { + "epoch": 2.5250846414281316, + "grad_norm": 9.1875, + "learning_rate": 7.545134123146621e-07, + "loss": 1.0713155269622803, + "step": 8204 + }, + { + "epoch": 2.525700215450908, + "grad_norm": 8.6875, + "learning_rate": 7.541221695358827e-07, + "loss": 1.2455883026123047, + "step": 8206 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 16.875, + "learning_rate": 7.537313887225374e-07, + "loss": 1.690041184425354, + "step": 8208 + }, + { + "epoch": 2.5269313634964603, + "grad_norm": 10.5, + "learning_rate": 7.533410700472362e-07, + "loss": 1.1286089420318604, + "step": 8210 + }, + { + "epoch": 2.5275469375192365, + "grad_norm": 27.5, + "learning_rate": 7.529512136823826e-07, + "loss": 1.151153802871704, + "step": 8212 + }, + { + "epoch": 2.5281625115420128, + "grad_norm": 15.125, + "learning_rate": 7.525618198001758e-07, + "loss": 1.4475476741790771, + "step": 8214 + }, + { + "epoch": 2.528778085564789, + "grad_norm": 6.125, + "learning_rate": 7.521728885726129e-07, + "loss": 1.101869821548462, + "step": 8216 + }, + { + "epoch": 2.5293936595875657, + "grad_norm": 9.0625, + "learning_rate": 7.517844201714842e-07, + "loss": 0.9757754802703857, + "step": 8218 + }, + { + "epoch": 2.5300092336103415, + "grad_norm": 9.5625, + "learning_rate": 7.513964147683775e-07, + "loss": 1.3237364292144775, + "step": 8220 + }, + { + "epoch": 2.530624807633118, + "grad_norm": 12.0, + "learning_rate": 7.510088725346742e-07, + "loss": 1.3525787591934204, + "step": 8222 + }, + { + "epoch": 2.531240381655894, + "grad_norm": 12.3125, + "learning_rate": 7.506217936415528e-07, + "loss": 1.103072166442871, + "step": 8224 + }, + { + "epoch": 2.5318559556786706, + "grad_norm": 16.875, + "learning_rate": 7.50235178259987e-07, + "loss": 1.3378674983978271, + "step": 8226 + }, + { + "epoch": 2.5324715297014464, + "grad_norm": 15.125, + "learning_rate": 7.49849026560744e-07, + "loss": 1.2925280332565308, + "step": 8228 + }, + { + "epoch": 2.533087103724223, + "grad_norm": 16.0, + "learning_rate": 7.494633387143883e-07, + "loss": 1.5961421728134155, + "step": 8230 + }, + { + "epoch": 2.533702677746999, + "grad_norm": 10.125, + "learning_rate": 7.490781148912786e-07, + "loss": 1.464501142501831, + "step": 8232 + }, + { + "epoch": 2.5343182517697755, + "grad_norm": 24.625, + "learning_rate": 7.486933552615682e-07, + "loss": 1.707505702972412, + "step": 8234 + }, + { + "epoch": 2.5349338257925513, + "grad_norm": 21.75, + "learning_rate": 7.483090599952067e-07, + "loss": 1.503312110900879, + "step": 8236 + }, + { + "epoch": 2.535549399815328, + "grad_norm": 6.15625, + "learning_rate": 7.479252292619371e-07, + "loss": 1.1280899047851562, + "step": 8238 + }, + { + "epoch": 2.536164973838104, + "grad_norm": 81.0, + "learning_rate": 7.47541863231298e-07, + "loss": 1.3761932849884033, + "step": 8240 + }, + { + "epoch": 2.5367805478608805, + "grad_norm": 12.0625, + "learning_rate": 7.47158962072623e-07, + "loss": 1.0959453582763672, + "step": 8242 + }, + { + "epoch": 2.5373961218836563, + "grad_norm": 14.5, + "learning_rate": 7.467765259550394e-07, + "loss": 1.3974474668502808, + "step": 8244 + }, + { + "epoch": 2.538011695906433, + "grad_norm": 7.46875, + "learning_rate": 7.463945550474699e-07, + "loss": 1.3391976356506348, + "step": 8246 + }, + { + "epoch": 2.5386272699292087, + "grad_norm": 17.75, + "learning_rate": 7.460130495186319e-07, + "loss": 1.3113188743591309, + "step": 8248 + }, + { + "epoch": 2.5392428439519854, + "grad_norm": 13.375, + "learning_rate": 7.45632009537036e-07, + "loss": 1.2737629413604736, + "step": 8250 + }, + { + "epoch": 2.539858417974761, + "grad_norm": 6.375, + "learning_rate": 7.452514352709887e-07, + "loss": 1.1970089673995972, + "step": 8252 + }, + { + "epoch": 2.540473991997538, + "grad_norm": 29.5, + "learning_rate": 7.4487132688859e-07, + "loss": 1.0147309303283691, + "step": 8254 + }, + { + "epoch": 2.541089566020314, + "grad_norm": 46.5, + "learning_rate": 7.444916845577338e-07, + "loss": 1.3090674877166748, + "step": 8256 + }, + { + "epoch": 2.5417051400430903, + "grad_norm": 10.75, + "learning_rate": 7.441125084461088e-07, + "loss": 1.1613578796386719, + "step": 8258 + }, + { + "epoch": 2.5423207140658666, + "grad_norm": 20.125, + "learning_rate": 7.437337987211975e-07, + "loss": 1.4671497344970703, + "step": 8260 + }, + { + "epoch": 2.542936288088643, + "grad_norm": 15.875, + "learning_rate": 7.433555555502766e-07, + "loss": 1.3811712265014648, + "step": 8262 + }, + { + "epoch": 2.543551862111419, + "grad_norm": 19.25, + "learning_rate": 7.429777791004164e-07, + "loss": 1.6986422538757324, + "step": 8264 + }, + { + "epoch": 2.5441674361341953, + "grad_norm": 56.5, + "learning_rate": 7.42600469538481e-07, + "loss": 1.27402925491333, + "step": 8266 + }, + { + "epoch": 2.5447830101569715, + "grad_norm": 20.875, + "learning_rate": 7.42223627031129e-07, + "loss": 1.6471729278564453, + "step": 8268 + }, + { + "epoch": 2.5453985841797477, + "grad_norm": 14.6875, + "learning_rate": 7.418472517448114e-07, + "loss": 1.2318017482757568, + "step": 8270 + }, + { + "epoch": 2.546014158202524, + "grad_norm": 12.4375, + "learning_rate": 7.414713438457741e-07, + "loss": 1.270857810974121, + "step": 8272 + }, + { + "epoch": 2.5466297322253, + "grad_norm": 26.625, + "learning_rate": 7.410959035000563e-07, + "loss": 1.4487571716308594, + "step": 8274 + }, + { + "epoch": 2.5472453062480764, + "grad_norm": 14.8125, + "learning_rate": 7.407209308734898e-07, + "loss": 1.3470463752746582, + "step": 8276 + }, + { + "epoch": 2.5478608802708527, + "grad_norm": 9.4375, + "learning_rate": 7.403464261317005e-07, + "loss": 1.4863656759262085, + "step": 8278 + }, + { + "epoch": 2.548476454293629, + "grad_norm": 31.5, + "learning_rate": 7.399723894401081e-07, + "loss": 1.744840383529663, + "step": 8280 + }, + { + "epoch": 2.549092028316405, + "grad_norm": 21.375, + "learning_rate": 7.395988209639248e-07, + "loss": 1.571589708328247, + "step": 8282 + }, + { + "epoch": 2.5497076023391814, + "grad_norm": 15.5, + "learning_rate": 7.392257208681559e-07, + "loss": 1.1173479557037354, + "step": 8284 + }, + { + "epoch": 2.5503231763619576, + "grad_norm": 25.75, + "learning_rate": 7.388530893176005e-07, + "loss": 1.352543830871582, + "step": 8286 + }, + { + "epoch": 2.550938750384734, + "grad_norm": 32.25, + "learning_rate": 7.384809264768504e-07, + "loss": 1.4264599084854126, + "step": 8288 + }, + { + "epoch": 2.55155432440751, + "grad_norm": 9.0, + "learning_rate": 7.381092325102902e-07, + "loss": 1.4546525478363037, + "step": 8290 + }, + { + "epoch": 2.5521698984302863, + "grad_norm": 11.4375, + "learning_rate": 7.377380075820974e-07, + "loss": 0.9854034781455994, + "step": 8292 + }, + { + "epoch": 2.5527854724530625, + "grad_norm": 13.6875, + "learning_rate": 7.37367251856243e-07, + "loss": 1.3239917755126953, + "step": 8294 + }, + { + "epoch": 2.5534010464758388, + "grad_norm": 9.75, + "learning_rate": 7.369969654964895e-07, + "loss": 1.4116442203521729, + "step": 8296 + }, + { + "epoch": 2.554016620498615, + "grad_norm": 16.875, + "learning_rate": 7.366271486663933e-07, + "loss": 1.1564812660217285, + "step": 8298 + }, + { + "epoch": 2.5546321945213912, + "grad_norm": 21.0, + "learning_rate": 7.36257801529303e-07, + "loss": 1.0999027490615845, + "step": 8300 + }, + { + "epoch": 2.5552477685441675, + "grad_norm": 18.0, + "learning_rate": 7.35888924248359e-07, + "loss": 1.4925274848937988, + "step": 8302 + }, + { + "epoch": 2.5558633425669437, + "grad_norm": 20.0, + "learning_rate": 7.355205169864957e-07, + "loss": 0.9652723073959351, + "step": 8304 + }, + { + "epoch": 2.55647891658972, + "grad_norm": 27.75, + "learning_rate": 7.351525799064384e-07, + "loss": 1.7413601875305176, + "step": 8306 + }, + { + "epoch": 2.557094490612496, + "grad_norm": 9.3125, + "learning_rate": 7.347851131707057e-07, + "loss": 1.0957928895950317, + "step": 8308 + }, + { + "epoch": 2.5577100646352724, + "grad_norm": 30.75, + "learning_rate": 7.34418116941608e-07, + "loss": 1.7662091255187988, + "step": 8310 + }, + { + "epoch": 2.5583256386580486, + "grad_norm": 29.625, + "learning_rate": 7.340515913812476e-07, + "loss": 1.799209713935852, + "step": 8312 + }, + { + "epoch": 2.558941212680825, + "grad_norm": 20.5, + "learning_rate": 7.336855366515195e-07, + "loss": 1.5272533893585205, + "step": 8314 + }, + { + "epoch": 2.559556786703601, + "grad_norm": 24.625, + "learning_rate": 7.333199529141107e-07, + "loss": 1.0781242847442627, + "step": 8316 + }, + { + "epoch": 2.5601723607263773, + "grad_norm": 13.9375, + "learning_rate": 7.329548403304996e-07, + "loss": 1.262157678604126, + "step": 8318 + }, + { + "epoch": 2.5607879347491536, + "grad_norm": 65.5, + "learning_rate": 7.32590199061957e-07, + "loss": 1.4290770292282104, + "step": 8320 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 10.125, + "learning_rate": 7.322260292695454e-07, + "loss": 1.4656684398651123, + "step": 8322 + }, + { + "epoch": 2.562019082794706, + "grad_norm": 9.25, + "learning_rate": 7.318623311141191e-07, + "loss": 1.0915682315826416, + "step": 8324 + }, + { + "epoch": 2.5626346568174823, + "grad_norm": 85.5, + "learning_rate": 7.314991047563237e-07, + "loss": 1.1750187873840332, + "step": 8326 + }, + { + "epoch": 2.5632502308402585, + "grad_norm": 9.1875, + "learning_rate": 7.311363503565969e-07, + "loss": 1.7010297775268555, + "step": 8328 + }, + { + "epoch": 2.5638658048630347, + "grad_norm": 15.0625, + "learning_rate": 7.30774068075168e-07, + "loss": 1.6195107698440552, + "step": 8330 + }, + { + "epoch": 2.564481378885811, + "grad_norm": 17.625, + "learning_rate": 7.304122580720569e-07, + "loss": 1.3422554731369019, + "step": 8332 + }, + { + "epoch": 2.565096952908587, + "grad_norm": 13.25, + "learning_rate": 7.300509205070758e-07, + "loss": 1.464439868927002, + "step": 8334 + }, + { + "epoch": 2.5657125269313634, + "grad_norm": 17.75, + "learning_rate": 7.296900555398282e-07, + "loss": 1.6185365915298462, + "step": 8336 + }, + { + "epoch": 2.5663281009541397, + "grad_norm": 10.625, + "learning_rate": 7.293296633297081e-07, + "loss": 1.3132727146148682, + "step": 8338 + }, + { + "epoch": 2.566943674976916, + "grad_norm": 19.875, + "learning_rate": 7.289697440359012e-07, + "loss": 1.5001401901245117, + "step": 8340 + }, + { + "epoch": 2.567559248999692, + "grad_norm": 19.125, + "learning_rate": 7.286102978173847e-07, + "loss": 1.0073649883270264, + "step": 8342 + }, + { + "epoch": 2.5681748230224684, + "grad_norm": 27.5, + "learning_rate": 7.282513248329258e-07, + "loss": 1.0872913599014282, + "step": 8344 + }, + { + "epoch": 2.5687903970452446, + "grad_norm": 15.375, + "learning_rate": 7.278928252410838e-07, + "loss": 1.535001516342163, + "step": 8346 + }, + { + "epoch": 2.569405971068021, + "grad_norm": 11.8125, + "learning_rate": 7.275347992002079e-07, + "loss": 1.4017884731292725, + "step": 8348 + }, + { + "epoch": 2.570021545090797, + "grad_norm": 18.25, + "learning_rate": 7.271772468684393e-07, + "loss": 1.2841901779174805, + "step": 8350 + }, + { + "epoch": 2.5706371191135733, + "grad_norm": 41.0, + "learning_rate": 7.268201684037085e-07, + "loss": 1.235544204711914, + "step": 8352 + }, + { + "epoch": 2.5712526931363495, + "grad_norm": 16.625, + "learning_rate": 7.26463563963738e-07, + "loss": 0.9660817980766296, + "step": 8354 + }, + { + "epoch": 2.5718682671591258, + "grad_norm": 9.6875, + "learning_rate": 7.261074337060402e-07, + "loss": 1.301021933555603, + "step": 8356 + }, + { + "epoch": 2.572483841181902, + "grad_norm": 8.125, + "learning_rate": 7.257517777879182e-07, + "loss": 1.1702752113342285, + "step": 8358 + }, + { + "epoch": 2.573099415204678, + "grad_norm": 10.375, + "learning_rate": 7.253965963664656e-07, + "loss": 1.286567211151123, + "step": 8360 + }, + { + "epoch": 2.5737149892274545, + "grad_norm": 11.5625, + "learning_rate": 7.250418895985668e-07, + "loss": 1.185631513595581, + "step": 8362 + }, + { + "epoch": 2.5743305632502307, + "grad_norm": 8.4375, + "learning_rate": 7.246876576408954e-07, + "loss": 1.2492417097091675, + "step": 8364 + }, + { + "epoch": 2.574946137273007, + "grad_norm": 9.1875, + "learning_rate": 7.243339006499171e-07, + "loss": 1.3682482242584229, + "step": 8366 + }, + { + "epoch": 2.575561711295783, + "grad_norm": 22.5, + "learning_rate": 7.239806187818861e-07, + "loss": 1.232422113418579, + "step": 8368 + }, + { + "epoch": 2.5761772853185594, + "grad_norm": 31.125, + "learning_rate": 7.236278121928472e-07, + "loss": 0.8495137691497803, + "step": 8370 + }, + { + "epoch": 2.576792859341336, + "grad_norm": 12.4375, + "learning_rate": 7.232754810386362e-07, + "loss": 1.4019091129302979, + "step": 8372 + }, + { + "epoch": 2.577408433364112, + "grad_norm": 6.15625, + "learning_rate": 7.229236254748776e-07, + "loss": 1.1350939273834229, + "step": 8374 + }, + { + "epoch": 2.5780240073868885, + "grad_norm": 27.0, + "learning_rate": 7.225722456569866e-07, + "loss": 1.1491904258728027, + "step": 8376 + }, + { + "epoch": 2.5786395814096643, + "grad_norm": 7.46875, + "learning_rate": 7.222213417401682e-07, + "loss": 1.5071659088134766, + "step": 8378 + }, + { + "epoch": 2.579255155432441, + "grad_norm": 13.4375, + "learning_rate": 7.218709138794167e-07, + "loss": 1.4240700006484985, + "step": 8380 + }, + { + "epoch": 2.579870729455217, + "grad_norm": 23.625, + "learning_rate": 7.215209622295168e-07, + "loss": 1.2526869773864746, + "step": 8382 + }, + { + "epoch": 2.5804863034779935, + "grad_norm": 26.25, + "learning_rate": 7.211714869450427e-07, + "loss": 1.2095661163330078, + "step": 8384 + }, + { + "epoch": 2.5811018775007692, + "grad_norm": 25.5, + "learning_rate": 7.208224881803574e-07, + "loss": 1.530590534210205, + "step": 8386 + }, + { + "epoch": 2.581717451523546, + "grad_norm": 14.0625, + "learning_rate": 7.204739660896148e-07, + "loss": 1.5323766469955444, + "step": 8388 + }, + { + "epoch": 2.5823330255463217, + "grad_norm": 29.375, + "learning_rate": 7.201259208267567e-07, + "loss": 0.969746470451355, + "step": 8390 + }, + { + "epoch": 2.5829485995690984, + "grad_norm": 6.875, + "learning_rate": 7.197783525455159e-07, + "loss": 1.546391248703003, + "step": 8392 + }, + { + "epoch": 2.583564173591874, + "grad_norm": 12.0625, + "learning_rate": 7.194312613994134e-07, + "loss": 1.3328742980957031, + "step": 8394 + }, + { + "epoch": 2.584179747614651, + "grad_norm": 12.5625, + "learning_rate": 7.190846475417593e-07, + "loss": 1.2772825956344604, + "step": 8396 + }, + { + "epoch": 2.5847953216374266, + "grad_norm": 12.5, + "learning_rate": 7.187385111256541e-07, + "loss": 1.4215928316116333, + "step": 8398 + }, + { + "epoch": 2.5854108956602033, + "grad_norm": 22.125, + "learning_rate": 7.183928523039861e-07, + "loss": 1.6627047061920166, + "step": 8400 + }, + { + "epoch": 2.586026469682979, + "grad_norm": 24.75, + "learning_rate": 7.180476712294335e-07, + "loss": 1.6336177587509155, + "step": 8402 + }, + { + "epoch": 2.586642043705756, + "grad_norm": 18.125, + "learning_rate": 7.17702968054463e-07, + "loss": 1.4483904838562012, + "step": 8404 + }, + { + "epoch": 2.587257617728532, + "grad_norm": 15.375, + "learning_rate": 7.173587429313306e-07, + "loss": 1.3190503120422363, + "step": 8406 + }, + { + "epoch": 2.5878731917513083, + "grad_norm": 11.6875, + "learning_rate": 7.170149960120804e-07, + "loss": 1.1253911256790161, + "step": 8408 + }, + { + "epoch": 2.5884887657740845, + "grad_norm": 12.0, + "learning_rate": 7.166717274485467e-07, + "loss": 1.256879448890686, + "step": 8410 + }, + { + "epoch": 2.5891043397968607, + "grad_norm": 33.25, + "learning_rate": 7.163289373923507e-07, + "loss": 1.4488470554351807, + "step": 8412 + }, + { + "epoch": 2.589719913819637, + "grad_norm": 10.75, + "learning_rate": 7.159866259949036e-07, + "loss": 1.2682024240493774, + "step": 8414 + }, + { + "epoch": 2.590335487842413, + "grad_norm": 9.8125, + "learning_rate": 7.156447934074048e-07, + "loss": 1.371632695198059, + "step": 8416 + }, + { + "epoch": 2.5909510618651894, + "grad_norm": 71.5, + "learning_rate": 7.153034397808421e-07, + "loss": 0.9261094927787781, + "step": 8418 + }, + { + "epoch": 2.5915666358879657, + "grad_norm": 12.625, + "learning_rate": 7.149625652659918e-07, + "loss": 1.3142518997192383, + "step": 8420 + }, + { + "epoch": 2.592182209910742, + "grad_norm": 9.9375, + "learning_rate": 7.146221700134182e-07, + "loss": 1.2637038230895996, + "step": 8422 + }, + { + "epoch": 2.592797783933518, + "grad_norm": 8.5, + "learning_rate": 7.142822541734751e-07, + "loss": 1.1678962707519531, + "step": 8424 + }, + { + "epoch": 2.5934133579562944, + "grad_norm": 19.0, + "learning_rate": 7.139428178963027e-07, + "loss": 1.3438940048217773, + "step": 8426 + }, + { + "epoch": 2.5940289319790706, + "grad_norm": 9.375, + "learning_rate": 7.136038613318309e-07, + "loss": 1.3192870616912842, + "step": 8428 + }, + { + "epoch": 2.594644506001847, + "grad_norm": 15.875, + "learning_rate": 7.132653846297776e-07, + "loss": 1.523959755897522, + "step": 8430 + }, + { + "epoch": 2.595260080024623, + "grad_norm": 18.25, + "learning_rate": 7.129273879396473e-07, + "loss": 1.1334865093231201, + "step": 8432 + }, + { + "epoch": 2.5958756540473993, + "grad_norm": 11.75, + "learning_rate": 7.125898714107347e-07, + "loss": 1.4075385332107544, + "step": 8434 + }, + { + "epoch": 2.5964912280701755, + "grad_norm": 15.125, + "learning_rate": 7.122528351921207e-07, + "loss": 1.591729998588562, + "step": 8436 + }, + { + "epoch": 2.5971068020929517, + "grad_norm": 13.6875, + "learning_rate": 7.11916279432674e-07, + "loss": 1.4569002389907837, + "step": 8438 + }, + { + "epoch": 2.597722376115728, + "grad_norm": 11.25, + "learning_rate": 7.115802042810529e-07, + "loss": 1.2565406560897827, + "step": 8440 + }, + { + "epoch": 2.598337950138504, + "grad_norm": 13.75, + "learning_rate": 7.112446098857011e-07, + "loss": 1.421222448348999, + "step": 8442 + }, + { + "epoch": 2.5989535241612804, + "grad_norm": 4.59375, + "learning_rate": 7.109094963948518e-07, + "loss": 1.1741873025894165, + "step": 8444 + }, + { + "epoch": 2.5995690981840567, + "grad_norm": 6.9375, + "learning_rate": 7.105748639565243e-07, + "loss": 0.9959874749183655, + "step": 8446 + }, + { + "epoch": 2.600184672206833, + "grad_norm": 23.125, + "learning_rate": 7.102407127185266e-07, + "loss": 1.4912631511688232, + "step": 8448 + }, + { + "epoch": 2.600800246229609, + "grad_norm": 18.125, + "learning_rate": 7.099070428284537e-07, + "loss": 1.0366930961608887, + "step": 8450 + }, + { + "epoch": 2.6014158202523854, + "grad_norm": 27.625, + "learning_rate": 7.095738544336877e-07, + "loss": 1.302032709121704, + "step": 8452 + }, + { + "epoch": 2.6020313942751616, + "grad_norm": 15.9375, + "learning_rate": 7.092411476813984e-07, + "loss": 1.2757986783981323, + "step": 8454 + }, + { + "epoch": 2.602646968297938, + "grad_norm": 11.0, + "learning_rate": 7.089089227185432e-07, + "loss": 1.0776348114013672, + "step": 8456 + }, + { + "epoch": 2.603262542320714, + "grad_norm": 15.5625, + "learning_rate": 7.085771796918652e-07, + "loss": 1.6991130113601685, + "step": 8458 + }, + { + "epoch": 2.6038781163434903, + "grad_norm": 7.9375, + "learning_rate": 7.082459187478967e-07, + "loss": 1.3396224975585938, + "step": 8460 + }, + { + "epoch": 2.6044936903662665, + "grad_norm": 23.25, + "learning_rate": 7.079151400329557e-07, + "loss": 1.3427355289459229, + "step": 8462 + }, + { + "epoch": 2.605109264389043, + "grad_norm": 36.5, + "learning_rate": 7.075848436931472e-07, + "loss": 1.116722822189331, + "step": 8464 + }, + { + "epoch": 2.605724838411819, + "grad_norm": 15.9375, + "learning_rate": 7.072550298743641e-07, + "loss": 0.7307620048522949, + "step": 8466 + }, + { + "epoch": 2.6063404124345952, + "grad_norm": 14.25, + "learning_rate": 7.06925698722285e-07, + "loss": 1.4221217632293701, + "step": 8468 + }, + { + "epoch": 2.6069559864573715, + "grad_norm": 13.1875, + "learning_rate": 7.065968503823761e-07, + "loss": 1.3831367492675781, + "step": 8470 + }, + { + "epoch": 2.6075715604801477, + "grad_norm": 17.0, + "learning_rate": 7.062684849998903e-07, + "loss": 0.9629683494567871, + "step": 8472 + }, + { + "epoch": 2.608187134502924, + "grad_norm": 11.625, + "learning_rate": 7.059406027198662e-07, + "loss": 1.333807349205017, + "step": 8474 + }, + { + "epoch": 2.6088027085257, + "grad_norm": 33.5, + "learning_rate": 7.056132036871306e-07, + "loss": 0.8511074781417847, + "step": 8476 + }, + { + "epoch": 2.6094182825484764, + "grad_norm": 21.5, + "learning_rate": 7.052862880462958e-07, + "loss": 1.3947702646255493, + "step": 8478 + }, + { + "epoch": 2.6100338565712526, + "grad_norm": 26.5, + "learning_rate": 7.049598559417604e-07, + "loss": 1.2550370693206787, + "step": 8480 + }, + { + "epoch": 2.610649430594029, + "grad_norm": 33.25, + "learning_rate": 7.046339075177104e-07, + "loss": 1.7725454568862915, + "step": 8482 + }, + { + "epoch": 2.611265004616805, + "grad_norm": 17.0, + "learning_rate": 7.043084429181172e-07, + "loss": 1.7614102363586426, + "step": 8484 + }, + { + "epoch": 2.6118805786395813, + "grad_norm": 15.6875, + "learning_rate": 7.039834622867393e-07, + "loss": 1.2160773277282715, + "step": 8486 + }, + { + "epoch": 2.6124961526623576, + "grad_norm": 39.75, + "learning_rate": 7.036589657671205e-07, + "loss": 1.4206631183624268, + "step": 8488 + }, + { + "epoch": 2.613111726685134, + "grad_norm": 30.5, + "learning_rate": 7.033349535025915e-07, + "loss": 1.267873764038086, + "step": 8490 + }, + { + "epoch": 2.61372730070791, + "grad_norm": 12.875, + "learning_rate": 7.030114256362693e-07, + "loss": 1.347649335861206, + "step": 8492 + }, + { + "epoch": 2.6143428747306863, + "grad_norm": 35.75, + "learning_rate": 7.026883823110557e-07, + "loss": 0.9955426454544067, + "step": 8494 + }, + { + "epoch": 2.6149584487534625, + "grad_norm": 13.5, + "learning_rate": 7.023658236696399e-07, + "loss": 1.5194344520568848, + "step": 8496 + }, + { + "epoch": 2.6155740227762387, + "grad_norm": 19.0, + "learning_rate": 7.020437498544965e-07, + "loss": 1.7742666006088257, + "step": 8498 + }, + { + "epoch": 2.616189596799015, + "grad_norm": 17.0, + "learning_rate": 7.017221610078855e-07, + "loss": 1.0506432056427002, + "step": 8500 + }, + { + "epoch": 2.616805170821791, + "grad_norm": 18.25, + "learning_rate": 7.014010572718532e-07, + "loss": 1.3107993602752686, + "step": 8502 + }, + { + "epoch": 2.6174207448445674, + "grad_norm": 19.875, + "learning_rate": 7.010804387882316e-07, + "loss": 1.4073238372802734, + "step": 8504 + }, + { + "epoch": 2.6180363188673437, + "grad_norm": 13.375, + "learning_rate": 7.00760305698638e-07, + "loss": 1.240088701248169, + "step": 8506 + }, + { + "epoch": 2.61865189289012, + "grad_norm": 7.375, + "learning_rate": 7.004406581444758e-07, + "loss": 0.9746329188346863, + "step": 8508 + }, + { + "epoch": 2.619267466912896, + "grad_norm": 12.4375, + "learning_rate": 7.001214962669335e-07, + "loss": 1.2753421068191528, + "step": 8510 + }, + { + "epoch": 2.6198830409356724, + "grad_norm": 16.625, + "learning_rate": 6.998028202069855e-07, + "loss": 1.495098352432251, + "step": 8512 + }, + { + "epoch": 2.6204986149584486, + "grad_norm": 10.75, + "learning_rate": 6.994846301053912e-07, + "loss": 1.4813196659088135, + "step": 8514 + }, + { + "epoch": 2.621114188981225, + "grad_norm": 11.0, + "learning_rate": 6.991669261026955e-07, + "loss": 1.1549345254898071, + "step": 8516 + }, + { + "epoch": 2.621729763004001, + "grad_norm": 16.0, + "learning_rate": 6.988497083392288e-07, + "loss": 1.3601171970367432, + "step": 8518 + }, + { + "epoch": 2.6223453370267773, + "grad_norm": 14.125, + "learning_rate": 6.985329769551065e-07, + "loss": 1.7009246349334717, + "step": 8520 + }, + { + "epoch": 2.622960911049554, + "grad_norm": 20.0, + "learning_rate": 6.98216732090229e-07, + "loss": 0.9509875774383545, + "step": 8522 + }, + { + "epoch": 2.6235764850723298, + "grad_norm": 8.0625, + "learning_rate": 6.979009738842824e-07, + "loss": 1.3136813640594482, + "step": 8524 + }, + { + "epoch": 2.6241920590951064, + "grad_norm": 10.9375, + "learning_rate": 6.975857024767368e-07, + "loss": 1.6199955940246582, + "step": 8526 + }, + { + "epoch": 2.6248076331178822, + "grad_norm": 45.0, + "learning_rate": 6.972709180068488e-07, + "loss": 1.7706856727600098, + "step": 8528 + }, + { + "epoch": 2.625423207140659, + "grad_norm": 23.625, + "learning_rate": 6.969566206136588e-07, + "loss": 1.009418249130249, + "step": 8530 + }, + { + "epoch": 2.6260387811634347, + "grad_norm": 16.25, + "learning_rate": 6.966428104359916e-07, + "loss": 1.1988286972045898, + "step": 8532 + }, + { + "epoch": 2.6266543551862114, + "grad_norm": 10.9375, + "learning_rate": 6.963294876124585e-07, + "loss": 1.2532944679260254, + "step": 8534 + }, + { + "epoch": 2.627269929208987, + "grad_norm": 23.375, + "learning_rate": 6.96016652281454e-07, + "loss": 1.2428243160247803, + "step": 8536 + }, + { + "epoch": 2.627885503231764, + "grad_norm": 25.75, + "learning_rate": 6.957043045811579e-07, + "loss": 0.9903680682182312, + "step": 8538 + }, + { + "epoch": 2.6285010772545396, + "grad_norm": 41.5, + "learning_rate": 6.953924446495348e-07, + "loss": 0.9027482867240906, + "step": 8540 + }, + { + "epoch": 2.6291166512773163, + "grad_norm": 3.578125, + "learning_rate": 6.950810726243332e-07, + "loss": 1.1158403158187866, + "step": 8542 + }, + { + "epoch": 2.629732225300092, + "grad_norm": 23.25, + "learning_rate": 6.947701886430868e-07, + "loss": 1.6318439245224, + "step": 8544 + }, + { + "epoch": 2.6303477993228688, + "grad_norm": 13.25, + "learning_rate": 6.944597928431132e-07, + "loss": 1.3490636348724365, + "step": 8546 + }, + { + "epoch": 2.6309633733456446, + "grad_norm": 13.5, + "learning_rate": 6.941498853615145e-07, + "loss": 1.3829448223114014, + "step": 8548 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 24.5, + "learning_rate": 6.938404663351774e-07, + "loss": 1.3896286487579346, + "step": 8550 + }, + { + "epoch": 2.632194521391197, + "grad_norm": 33.5, + "learning_rate": 6.935315359007725e-07, + "loss": 1.0280698537826538, + "step": 8552 + }, + { + "epoch": 2.6328100954139737, + "grad_norm": 11.1875, + "learning_rate": 6.932230941947551e-07, + "loss": 1.194199562072754, + "step": 8554 + }, + { + "epoch": 2.6334256694367495, + "grad_norm": 19.875, + "learning_rate": 6.929151413533638e-07, + "loss": 1.0795966386795044, + "step": 8556 + }, + { + "epoch": 2.634041243459526, + "grad_norm": 21.75, + "learning_rate": 6.926076775126217e-07, + "loss": 1.6833020448684692, + "step": 8558 + }, + { + "epoch": 2.6346568174823024, + "grad_norm": 18.75, + "learning_rate": 6.923007028083365e-07, + "loss": 1.5397007465362549, + "step": 8560 + }, + { + "epoch": 2.6352723915050786, + "grad_norm": 16.125, + "learning_rate": 6.919942173760986e-07, + "loss": 1.5429842472076416, + "step": 8562 + }, + { + "epoch": 2.635887965527855, + "grad_norm": 6.75, + "learning_rate": 6.916882213512831e-07, + "loss": 1.1548995971679688, + "step": 8564 + }, + { + "epoch": 2.636503539550631, + "grad_norm": 20.125, + "learning_rate": 6.913827148690494e-07, + "loss": 1.2357516288757324, + "step": 8566 + }, + { + "epoch": 2.6371191135734073, + "grad_norm": 25.5, + "learning_rate": 6.910776980643394e-07, + "loss": 1.1196115016937256, + "step": 8568 + }, + { + "epoch": 2.6377346875961836, + "grad_norm": 12.125, + "learning_rate": 6.907731710718794e-07, + "loss": 1.3863110542297363, + "step": 8570 + }, + { + "epoch": 2.63835026161896, + "grad_norm": 17.125, + "learning_rate": 6.904691340261799e-07, + "loss": 1.6990160942077637, + "step": 8572 + }, + { + "epoch": 2.638965835641736, + "grad_norm": 9.5, + "learning_rate": 6.901655870615336e-07, + "loss": 1.697874665260315, + "step": 8574 + }, + { + "epoch": 2.6395814096645123, + "grad_norm": 59.25, + "learning_rate": 6.898625303120182e-07, + "loss": 1.6032500267028809, + "step": 8576 + }, + { + "epoch": 2.6401969836872885, + "grad_norm": 10.125, + "learning_rate": 6.895599639114939e-07, + "loss": 1.2693076133728027, + "step": 8578 + }, + { + "epoch": 2.6408125577100647, + "grad_norm": 11.25, + "learning_rate": 6.892578879936048e-07, + "loss": 1.2726253271102905, + "step": 8580 + }, + { + "epoch": 2.641428131732841, + "grad_norm": 12.5625, + "learning_rate": 6.88956302691778e-07, + "loss": 1.5192341804504395, + "step": 8582 + }, + { + "epoch": 2.642043705755617, + "grad_norm": 26.0, + "learning_rate": 6.88655208139224e-07, + "loss": 1.3771846294403076, + "step": 8584 + }, + { + "epoch": 2.6426592797783934, + "grad_norm": 18.5, + "learning_rate": 6.88354604468937e-07, + "loss": 1.4578754901885986, + "step": 8586 + }, + { + "epoch": 2.6432748538011697, + "grad_norm": 11.4375, + "learning_rate": 6.880544918136936e-07, + "loss": 1.217280387878418, + "step": 8588 + }, + { + "epoch": 2.643890427823946, + "grad_norm": 24.0, + "learning_rate": 6.877548703060541e-07, + "loss": 1.3406977653503418, + "step": 8590 + }, + { + "epoch": 2.644506001846722, + "grad_norm": 14.5625, + "learning_rate": 6.874557400783616e-07, + "loss": 1.7917208671569824, + "step": 8592 + }, + { + "epoch": 2.6451215758694984, + "grad_norm": 19.25, + "learning_rate": 6.871571012627421e-07, + "loss": 1.2604199647903442, + "step": 8594 + }, + { + "epoch": 2.6457371498922746, + "grad_norm": 13.375, + "learning_rate": 6.868589539911052e-07, + "loss": 1.0737535953521729, + "step": 8596 + }, + { + "epoch": 2.646352723915051, + "grad_norm": 25.125, + "learning_rate": 6.865612983951423e-07, + "loss": 1.3047271966934204, + "step": 8598 + }, + { + "epoch": 2.646968297937827, + "grad_norm": 25.25, + "learning_rate": 6.862641346063285e-07, + "loss": 1.7620556354522705, + "step": 8600 + }, + { + "epoch": 2.6475838719606033, + "grad_norm": 19.25, + "learning_rate": 6.859674627559217e-07, + "loss": 1.283871054649353, + "step": 8602 + }, + { + "epoch": 2.6481994459833795, + "grad_norm": 12.0, + "learning_rate": 6.856712829749616e-07, + "loss": 1.4960191249847412, + "step": 8604 + }, + { + "epoch": 2.6488150200061558, + "grad_norm": 22.125, + "learning_rate": 6.853755953942714e-07, + "loss": 1.9348828792572021, + "step": 8606 + }, + { + "epoch": 2.649430594028932, + "grad_norm": 19.75, + "learning_rate": 6.85080400144457e-07, + "loss": 1.135761022567749, + "step": 8608 + }, + { + "epoch": 2.6500461680517082, + "grad_norm": 109.5, + "learning_rate": 6.847856973559057e-07, + "loss": 1.211793303489685, + "step": 8610 + }, + { + "epoch": 2.6506617420744845, + "grad_norm": 24.375, + "learning_rate": 6.844914871587888e-07, + "loss": 1.6693403720855713, + "step": 8612 + }, + { + "epoch": 2.6512773160972607, + "grad_norm": 25.25, + "learning_rate": 6.841977696830587e-07, + "loss": 1.7439048290252686, + "step": 8614 + }, + { + "epoch": 2.651892890120037, + "grad_norm": 5.0625, + "learning_rate": 6.839045450584512e-07, + "loss": 1.3092360496520996, + "step": 8616 + }, + { + "epoch": 2.652508464142813, + "grad_norm": 10.1875, + "learning_rate": 6.836118134144839e-07, + "loss": 1.1425950527191162, + "step": 8618 + }, + { + "epoch": 2.6531240381655894, + "grad_norm": 3.609375, + "learning_rate": 6.833195748804561e-07, + "loss": 1.2227166891098022, + "step": 8620 + }, + { + "epoch": 2.6537396121883656, + "grad_norm": 14.125, + "learning_rate": 6.830278295854509e-07, + "loss": 1.398766279220581, + "step": 8622 + }, + { + "epoch": 2.654355186211142, + "grad_norm": 13.3125, + "learning_rate": 6.827365776583319e-07, + "loss": 1.239612102508545, + "step": 8624 + }, + { + "epoch": 2.654970760233918, + "grad_norm": 12.5, + "learning_rate": 6.824458192277451e-07, + "loss": 1.468193531036377, + "step": 8626 + }, + { + "epoch": 2.6555863342566943, + "grad_norm": 10.8125, + "learning_rate": 6.821555544221199e-07, + "loss": 1.3009378910064697, + "step": 8628 + }, + { + "epoch": 2.6562019082794706, + "grad_norm": 15.5, + "learning_rate": 6.818657833696655e-07, + "loss": 1.7035462856292725, + "step": 8630 + }, + { + "epoch": 2.656817482302247, + "grad_norm": 8.1875, + "learning_rate": 6.815765061983747e-07, + "loss": 1.3358898162841797, + "step": 8632 + }, + { + "epoch": 2.657433056325023, + "grad_norm": 32.0, + "learning_rate": 6.812877230360214e-07, + "loss": 1.3851983547210693, + "step": 8634 + }, + { + "epoch": 2.6580486303477993, + "grad_norm": 9.5625, + "learning_rate": 6.809994340101614e-07, + "loss": 1.2320975065231323, + "step": 8636 + }, + { + "epoch": 2.6586642043705755, + "grad_norm": 27.5, + "learning_rate": 6.807116392481322e-07, + "loss": 1.1153748035430908, + "step": 8638 + }, + { + "epoch": 2.6592797783933517, + "grad_norm": 6.75, + "learning_rate": 6.804243388770534e-07, + "loss": 1.4036579132080078, + "step": 8640 + }, + { + "epoch": 2.659895352416128, + "grad_norm": 18.5, + "learning_rate": 6.801375330238259e-07, + "loss": 1.3697552680969238, + "step": 8642 + }, + { + "epoch": 2.660510926438904, + "grad_norm": 20.125, + "learning_rate": 6.798512218151318e-07, + "loss": 1.6686692237854004, + "step": 8644 + }, + { + "epoch": 2.6611265004616804, + "grad_norm": 15.25, + "learning_rate": 6.795654053774355e-07, + "loss": 1.219560146331787, + "step": 8646 + }, + { + "epoch": 2.6617420744844567, + "grad_norm": 14.875, + "learning_rate": 6.792800838369821e-07, + "loss": 1.338904619216919, + "step": 8648 + }, + { + "epoch": 2.662357648507233, + "grad_norm": 15.6875, + "learning_rate": 6.789952573197986e-07, + "loss": 1.3034480810165405, + "step": 8650 + }, + { + "epoch": 2.662973222530009, + "grad_norm": 20.75, + "learning_rate": 6.787109259516932e-07, + "loss": 1.2775514125823975, + "step": 8652 + }, + { + "epoch": 2.6635887965527854, + "grad_norm": 25.125, + "learning_rate": 6.784270898582553e-07, + "loss": 1.7190017700195312, + "step": 8654 + }, + { + "epoch": 2.6642043705755616, + "grad_norm": 12.0625, + "learning_rate": 6.781437491648559e-07, + "loss": 1.3352466821670532, + "step": 8656 + }, + { + "epoch": 2.664819944598338, + "grad_norm": 11.5625, + "learning_rate": 6.778609039966464e-07, + "loss": 1.4454190731048584, + "step": 8658 + }, + { + "epoch": 2.665435518621114, + "grad_norm": 22.125, + "learning_rate": 6.775785544785606e-07, + "loss": 1.3824318647384644, + "step": 8660 + }, + { + "epoch": 2.6660510926438903, + "grad_norm": 19.875, + "learning_rate": 6.772967007353117e-07, + "loss": 1.6892662048339844, + "step": 8662 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 19.25, + "learning_rate": 6.770153428913956e-07, + "loss": 1.223496437072754, + "step": 8664 + }, + { + "epoch": 2.6672822406894428, + "grad_norm": 19.25, + "learning_rate": 6.767344810710878e-07, + "loss": 1.4526058435440063, + "step": 8666 + }, + { + "epoch": 2.667897814712219, + "grad_norm": 38.0, + "learning_rate": 6.764541153984458e-07, + "loss": 1.5432841777801514, + "step": 8668 + }, + { + "epoch": 2.668513388734995, + "grad_norm": 12.6875, + "learning_rate": 6.76174245997307e-07, + "loss": 1.3734015226364136, + "step": 8670 + }, + { + "epoch": 2.669128962757772, + "grad_norm": 18.75, + "learning_rate": 6.758948729912904e-07, + "loss": 1.566282033920288, + "step": 8672 + }, + { + "epoch": 2.6697445367805477, + "grad_norm": 10.875, + "learning_rate": 6.756159965037951e-07, + "loss": 1.2576236724853516, + "step": 8674 + }, + { + "epoch": 2.6703601108033244, + "grad_norm": 11.3125, + "learning_rate": 6.753376166580013e-07, + "loss": 1.0591974258422852, + "step": 8676 + }, + { + "epoch": 2.6709756848261, + "grad_norm": 37.25, + "learning_rate": 6.750597335768695e-07, + "loss": 1.7590951919555664, + "step": 8678 + }, + { + "epoch": 2.671591258848877, + "grad_norm": 17.25, + "learning_rate": 6.747823473831416e-07, + "loss": 1.7607227563858032, + "step": 8680 + }, + { + "epoch": 2.6722068328716526, + "grad_norm": 17.5, + "learning_rate": 6.745054581993382e-07, + "loss": 1.4102087020874023, + "step": 8682 + }, + { + "epoch": 2.6728224068944293, + "grad_norm": 12.0625, + "learning_rate": 6.742290661477629e-07, + "loss": 1.2666304111480713, + "step": 8684 + }, + { + "epoch": 2.673437980917205, + "grad_norm": 30.625, + "learning_rate": 6.739531713504978e-07, + "loss": 1.2427082061767578, + "step": 8686 + }, + { + "epoch": 2.6740535549399818, + "grad_norm": 18.75, + "learning_rate": 6.736777739294055e-07, + "loss": 1.1650917530059814, + "step": 8688 + }, + { + "epoch": 2.6746691289627575, + "grad_norm": 22.125, + "learning_rate": 6.734028740061301e-07, + "loss": 1.6606533527374268, + "step": 8690 + }, + { + "epoch": 2.6752847029855342, + "grad_norm": 11.8125, + "learning_rate": 6.731284717020948e-07, + "loss": 1.676476240158081, + "step": 8692 + }, + { + "epoch": 2.67590027700831, + "grad_norm": 23.125, + "learning_rate": 6.728545671385035e-07, + "loss": 0.9003405570983887, + "step": 8694 + }, + { + "epoch": 2.6765158510310867, + "grad_norm": 8.6875, + "learning_rate": 6.725811604363403e-07, + "loss": 0.7444573640823364, + "step": 8696 + }, + { + "epoch": 2.6771314250538625, + "grad_norm": 14.75, + "learning_rate": 6.72308251716369e-07, + "loss": 1.0186790227890015, + "step": 8698 + }, + { + "epoch": 2.677746999076639, + "grad_norm": 13.9375, + "learning_rate": 6.720358410991337e-07, + "loss": 1.7139232158660889, + "step": 8700 + }, + { + "epoch": 2.678362573099415, + "grad_norm": 17.5, + "learning_rate": 6.717639287049589e-07, + "loss": 0.9571051597595215, + "step": 8702 + }, + { + "epoch": 2.6789781471221916, + "grad_norm": 13.4375, + "learning_rate": 6.714925146539483e-07, + "loss": 0.839079737663269, + "step": 8704 + }, + { + "epoch": 2.6795937211449674, + "grad_norm": 27.75, + "learning_rate": 6.712215990659856e-07, + "loss": 1.8214905261993408, + "step": 8706 + }, + { + "epoch": 2.680209295167744, + "grad_norm": 23.375, + "learning_rate": 6.709511820607348e-07, + "loss": 1.3192217350006104, + "step": 8708 + }, + { + "epoch": 2.6808248691905203, + "grad_norm": 12.875, + "learning_rate": 6.706812637576395e-07, + "loss": 0.8749622106552124, + "step": 8710 + }, + { + "epoch": 2.6814404432132966, + "grad_norm": 7.5625, + "learning_rate": 6.704118442759226e-07, + "loss": 1.2919583320617676, + "step": 8712 + }, + { + "epoch": 2.682056017236073, + "grad_norm": 20.25, + "learning_rate": 6.701429237345872e-07, + "loss": 1.2938746213912964, + "step": 8714 + }, + { + "epoch": 2.682671591258849, + "grad_norm": 11.3125, + "learning_rate": 6.698745022524159e-07, + "loss": 1.1630654335021973, + "step": 8716 + }, + { + "epoch": 2.6832871652816253, + "grad_norm": 4.5, + "learning_rate": 6.696065799479704e-07, + "loss": 1.2157087326049805, + "step": 8718 + }, + { + "epoch": 2.6839027393044015, + "grad_norm": 19.625, + "learning_rate": 6.693391569395927e-07, + "loss": 1.2119028568267822, + "step": 8720 + }, + { + "epoch": 2.6845183133271777, + "grad_norm": 13.875, + "learning_rate": 6.690722333454038e-07, + "loss": 1.262474536895752, + "step": 8722 + }, + { + "epoch": 2.685133887349954, + "grad_norm": 11.125, + "learning_rate": 6.688058092833038e-07, + "loss": 1.2290127277374268, + "step": 8724 + }, + { + "epoch": 2.68574946137273, + "grad_norm": 11.1875, + "learning_rate": 6.685398848709727e-07, + "loss": 1.341980218887329, + "step": 8726 + }, + { + "epoch": 2.6863650353955064, + "grad_norm": 17.875, + "learning_rate": 6.6827446022587e-07, + "loss": 1.7965751886367798, + "step": 8728 + }, + { + "epoch": 2.6869806094182827, + "grad_norm": 18.5, + "learning_rate": 6.680095354652336e-07, + "loss": 1.4040262699127197, + "step": 8730 + }, + { + "epoch": 2.687596183441059, + "grad_norm": 26.25, + "learning_rate": 6.67745110706081e-07, + "loss": 1.7785351276397705, + "step": 8732 + }, + { + "epoch": 2.688211757463835, + "grad_norm": 18.625, + "learning_rate": 6.674811860652094e-07, + "loss": 1.7813286781311035, + "step": 8734 + }, + { + "epoch": 2.6888273314866113, + "grad_norm": 22.75, + "learning_rate": 6.672177616591943e-07, + "loss": 1.3169972896575928, + "step": 8736 + }, + { + "epoch": 2.6894429055093876, + "grad_norm": 10.8125, + "learning_rate": 6.669548376043905e-07, + "loss": 1.4833617210388184, + "step": 8738 + }, + { + "epoch": 2.690058479532164, + "grad_norm": 11.625, + "learning_rate": 6.666924140169319e-07, + "loss": 1.3738125562667847, + "step": 8740 + }, + { + "epoch": 2.69067405355494, + "grad_norm": 27.875, + "learning_rate": 6.664304910127317e-07, + "loss": 1.2362579107284546, + "step": 8742 + }, + { + "epoch": 2.6912896275777163, + "grad_norm": 28.625, + "learning_rate": 6.66169068707481e-07, + "loss": 1.728048324584961, + "step": 8744 + }, + { + "epoch": 2.6919052016004925, + "grad_norm": 30.875, + "learning_rate": 6.659081472166506e-07, + "loss": 1.4855546951293945, + "step": 8746 + }, + { + "epoch": 2.6925207756232687, + "grad_norm": 8.25, + "learning_rate": 6.656477266554898e-07, + "loss": 1.1835436820983887, + "step": 8748 + }, + { + "epoch": 2.693136349646045, + "grad_norm": 16.625, + "learning_rate": 6.653878071390264e-07, + "loss": 1.6319580078125, + "step": 8750 + }, + { + "epoch": 2.693751923668821, + "grad_norm": 11.4375, + "learning_rate": 6.651283887820678e-07, + "loss": 0.6635792255401611, + "step": 8752 + }, + { + "epoch": 2.6943674976915974, + "grad_norm": 5.375, + "learning_rate": 6.648694716991992e-07, + "loss": 1.3232951164245605, + "step": 8754 + }, + { + "epoch": 2.6949830717143737, + "grad_norm": 11.6875, + "learning_rate": 6.646110560047838e-07, + "loss": 1.319361686706543, + "step": 8756 + }, + { + "epoch": 2.69559864573715, + "grad_norm": 8.375, + "learning_rate": 6.643531418129651e-07, + "loss": 1.2317544221878052, + "step": 8758 + }, + { + "epoch": 2.696214219759926, + "grad_norm": 57.0, + "learning_rate": 6.640957292376635e-07, + "loss": 1.5650432109832764, + "step": 8760 + }, + { + "epoch": 2.6968297937827024, + "grad_norm": 23.875, + "learning_rate": 6.638388183925786e-07, + "loss": 1.5568079948425293, + "step": 8762 + }, + { + "epoch": 2.6974453678054786, + "grad_norm": 4.75, + "learning_rate": 6.635824093911883e-07, + "loss": 1.2287406921386719, + "step": 8764 + }, + { + "epoch": 2.698060941828255, + "grad_norm": 16.0, + "learning_rate": 6.633265023467485e-07, + "loss": 1.466149091720581, + "step": 8766 + }, + { + "epoch": 2.698676515851031, + "grad_norm": 18.25, + "learning_rate": 6.63071097372294e-07, + "loss": 1.5374802350997925, + "step": 8768 + }, + { + "epoch": 2.6992920898738073, + "grad_norm": 12.4375, + "learning_rate": 6.628161945806372e-07, + "loss": 1.3909275531768799, + "step": 8770 + }, + { + "epoch": 2.6999076638965835, + "grad_norm": 17.375, + "learning_rate": 6.625617940843691e-07, + "loss": 1.3820092678070068, + "step": 8772 + }, + { + "epoch": 2.7005232379193598, + "grad_norm": 12.75, + "learning_rate": 6.623078959958584e-07, + "loss": 1.3533204793930054, + "step": 8774 + }, + { + "epoch": 2.701138811942136, + "grad_norm": 18.125, + "learning_rate": 6.620545004272524e-07, + "loss": 1.3313978910446167, + "step": 8776 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 11.0625, + "learning_rate": 6.618016074904765e-07, + "loss": 1.130128026008606, + "step": 8778 + }, + { + "epoch": 2.7023699599876885, + "grad_norm": 13.625, + "learning_rate": 6.615492172972335e-07, + "loss": 1.2231190204620361, + "step": 8780 + }, + { + "epoch": 2.7029855340104647, + "grad_norm": 16.625, + "learning_rate": 6.612973299590039e-07, + "loss": 0.885326623916626, + "step": 8782 + }, + { + "epoch": 2.703601108033241, + "grad_norm": 11.6875, + "learning_rate": 6.610459455870476e-07, + "loss": 1.4117552042007446, + "step": 8784 + }, + { + "epoch": 2.704216682056017, + "grad_norm": 5.03125, + "learning_rate": 6.607950642924009e-07, + "loss": 1.1439400911331177, + "step": 8786 + }, + { + "epoch": 2.7048322560787934, + "grad_norm": 5.875, + "learning_rate": 6.605446861858783e-07, + "loss": 1.2332112789154053, + "step": 8788 + }, + { + "epoch": 2.7054478301015696, + "grad_norm": 3.453125, + "learning_rate": 6.602948113780724e-07, + "loss": 1.0315492153167725, + "step": 8790 + }, + { + "epoch": 2.706063404124346, + "grad_norm": 24.125, + "learning_rate": 6.600454399793526e-07, + "loss": 1.2301273345947266, + "step": 8792 + }, + { + "epoch": 2.706678978147122, + "grad_norm": 14.5, + "learning_rate": 6.597965720998672e-07, + "loss": 1.4328529834747314, + "step": 8794 + }, + { + "epoch": 2.7072945521698983, + "grad_norm": 23.875, + "learning_rate": 6.595482078495411e-07, + "loss": 1.2243239879608154, + "step": 8796 + }, + { + "epoch": 2.7079101261926746, + "grad_norm": 11.125, + "learning_rate": 6.59300347338077e-07, + "loss": 1.1748735904693604, + "step": 8798 + }, + { + "epoch": 2.708525700215451, + "grad_norm": 13.5625, + "learning_rate": 6.590529906749554e-07, + "loss": 1.195931077003479, + "step": 8800 + }, + { + "epoch": 2.709141274238227, + "grad_norm": 16.375, + "learning_rate": 6.588061379694336e-07, + "loss": 1.6373540163040161, + "step": 8802 + }, + { + "epoch": 2.7097568482610033, + "grad_norm": 29.25, + "learning_rate": 6.585597893305473e-07, + "loss": 1.38240647315979, + "step": 8804 + }, + { + "epoch": 2.7103724222837795, + "grad_norm": 4.6875, + "learning_rate": 6.583139448671087e-07, + "loss": 1.1793478727340698, + "step": 8806 + }, + { + "epoch": 2.7109879963065557, + "grad_norm": 10.25, + "learning_rate": 6.580686046877075e-07, + "loss": 1.1160082817077637, + "step": 8808 + }, + { + "epoch": 2.711603570329332, + "grad_norm": 12.0, + "learning_rate": 6.578237689007108e-07, + "loss": 1.3985621929168701, + "step": 8810 + }, + { + "epoch": 2.712219144352108, + "grad_norm": 9.4375, + "learning_rate": 6.575794376142629e-07, + "loss": 1.3924959897994995, + "step": 8812 + }, + { + "epoch": 2.7128347183748844, + "grad_norm": 8.6875, + "learning_rate": 6.573356109362851e-07, + "loss": 1.2795300483703613, + "step": 8814 + }, + { + "epoch": 2.7134502923976607, + "grad_norm": 28.25, + "learning_rate": 6.570922889744762e-07, + "loss": 1.362342357635498, + "step": 8816 + }, + { + "epoch": 2.714065866420437, + "grad_norm": 27.125, + "learning_rate": 6.568494718363112e-07, + "loss": 1.2380298376083374, + "step": 8818 + }, + { + "epoch": 2.714681440443213, + "grad_norm": 18.875, + "learning_rate": 6.566071596290434e-07, + "loss": 0.7231048941612244, + "step": 8820 + }, + { + "epoch": 2.71529701446599, + "grad_norm": 18.375, + "learning_rate": 6.563653524597021e-07, + "loss": 1.4576283693313599, + "step": 8822 + }, + { + "epoch": 2.7159125884887656, + "grad_norm": 31.875, + "learning_rate": 6.561240504350935e-07, + "loss": 1.336194396018982, + "step": 8824 + }, + { + "epoch": 2.7165281625115423, + "grad_norm": 18.5, + "learning_rate": 6.558832536618015e-07, + "loss": 1.5954232215881348, + "step": 8826 + }, + { + "epoch": 2.717143736534318, + "grad_norm": 17.625, + "learning_rate": 6.55642962246186e-07, + "loss": 1.5177388191223145, + "step": 8828 + }, + { + "epoch": 2.7177593105570947, + "grad_norm": 9.25, + "learning_rate": 6.55403176294384e-07, + "loss": 1.209438443183899, + "step": 8830 + }, + { + "epoch": 2.7183748845798705, + "grad_norm": 7.1875, + "learning_rate": 6.551638959123095e-07, + "loss": 1.2927244901657104, + "step": 8832 + }, + { + "epoch": 2.718990458602647, + "grad_norm": 75.5, + "learning_rate": 6.549251212056525e-07, + "loss": 1.129912257194519, + "step": 8834 + }, + { + "epoch": 2.719606032625423, + "grad_norm": 13.375, + "learning_rate": 6.546868522798803e-07, + "loss": 1.4281249046325684, + "step": 8836 + }, + { + "epoch": 2.7202216066481997, + "grad_norm": 9.8125, + "learning_rate": 6.544490892402366e-07, + "loss": 1.14664626121521, + "step": 8838 + }, + { + "epoch": 2.7208371806709755, + "grad_norm": 20.625, + "learning_rate": 6.542118321917414e-07, + "loss": 1.4425570964813232, + "step": 8840 + }, + { + "epoch": 2.721452754693752, + "grad_norm": 13.5, + "learning_rate": 6.539750812391918e-07, + "loss": 1.6932718753814697, + "step": 8842 + }, + { + "epoch": 2.722068328716528, + "grad_norm": 20.75, + "learning_rate": 6.537388364871601e-07, + "loss": 1.110339641571045, + "step": 8844 + }, + { + "epoch": 2.7226839027393046, + "grad_norm": 13.3125, + "learning_rate": 6.535030980399971e-07, + "loss": 1.1894402503967285, + "step": 8846 + }, + { + "epoch": 2.7232994767620804, + "grad_norm": 11.0625, + "learning_rate": 6.532678660018281e-07, + "loss": 1.2754297256469727, + "step": 8848 + }, + { + "epoch": 2.723915050784857, + "grad_norm": 12.6875, + "learning_rate": 6.53033140476555e-07, + "loss": 1.4751468896865845, + "step": 8850 + }, + { + "epoch": 2.724530624807633, + "grad_norm": 10.5625, + "learning_rate": 6.527989215678571e-07, + "loss": 1.2689716815948486, + "step": 8852 + }, + { + "epoch": 2.7251461988304095, + "grad_norm": 38.75, + "learning_rate": 6.525652093791885e-07, + "loss": 1.395056962966919, + "step": 8854 + }, + { + "epoch": 2.7257617728531853, + "grad_norm": 9.1875, + "learning_rate": 6.523320040137805e-07, + "loss": 1.2616653442382812, + "step": 8856 + }, + { + "epoch": 2.726377346875962, + "grad_norm": 27.0, + "learning_rate": 6.520993055746403e-07, + "loss": 1.917528510093689, + "step": 8858 + }, + { + "epoch": 2.7269929208987382, + "grad_norm": 21.125, + "learning_rate": 6.518671141645506e-07, + "loss": 1.4512934684753418, + "step": 8860 + }, + { + "epoch": 2.7276084949215145, + "grad_norm": 14.875, + "learning_rate": 6.516354298860711e-07, + "loss": 1.3462598323822021, + "step": 8862 + }, + { + "epoch": 2.7282240689442907, + "grad_norm": 10.0, + "learning_rate": 6.514042528415366e-07, + "loss": 1.181436538696289, + "step": 8864 + }, + { + "epoch": 2.728839642967067, + "grad_norm": 30.375, + "learning_rate": 6.511735831330586e-07, + "loss": 1.5524072647094727, + "step": 8866 + }, + { + "epoch": 2.729455216989843, + "grad_norm": 10.625, + "learning_rate": 6.50943420862524e-07, + "loss": 1.4597262144088745, + "step": 8868 + }, + { + "epoch": 2.7300707910126194, + "grad_norm": 26.25, + "learning_rate": 6.507137661315956e-07, + "loss": 1.7072659730911255, + "step": 8870 + }, + { + "epoch": 2.7306863650353956, + "grad_norm": 12.0, + "learning_rate": 6.504846190417125e-07, + "loss": 1.2412455081939697, + "step": 8872 + }, + { + "epoch": 2.731301939058172, + "grad_norm": 15.875, + "learning_rate": 6.50255979694089e-07, + "loss": 1.5315024852752686, + "step": 8874 + }, + { + "epoch": 2.731917513080948, + "grad_norm": 45.0, + "learning_rate": 6.500278481897154e-07, + "loss": 1.7796781063079834, + "step": 8876 + }, + { + "epoch": 2.7325330871037243, + "grad_norm": 22.25, + "learning_rate": 6.498002246293578e-07, + "loss": 1.4514446258544922, + "step": 8878 + }, + { + "epoch": 2.7331486611265006, + "grad_norm": 16.0, + "learning_rate": 6.495731091135575e-07, + "loss": 1.5924737453460693, + "step": 8880 + }, + { + "epoch": 2.733764235149277, + "grad_norm": 10.75, + "learning_rate": 6.493465017426318e-07, + "loss": 1.1855311393737793, + "step": 8882 + }, + { + "epoch": 2.734379809172053, + "grad_norm": 12.1875, + "learning_rate": 6.491204026166737e-07, + "loss": 1.1828142404556274, + "step": 8884 + }, + { + "epoch": 2.7349953831948293, + "grad_norm": 23.375, + "learning_rate": 6.488948118355509e-07, + "loss": 1.6172746419906616, + "step": 8886 + }, + { + "epoch": 2.7356109572176055, + "grad_norm": 7.09375, + "learning_rate": 6.486697294989078e-07, + "loss": 1.1457502841949463, + "step": 8888 + }, + { + "epoch": 2.7362265312403817, + "grad_norm": 10.75, + "learning_rate": 6.48445155706163e-07, + "loss": 1.4178847074508667, + "step": 8890 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 32.25, + "learning_rate": 6.482210905565111e-07, + "loss": 1.2971978187561035, + "step": 8892 + }, + { + "epoch": 2.737457679285934, + "grad_norm": 23.75, + "learning_rate": 6.479975341489219e-07, + "loss": 1.2564046382904053, + "step": 8894 + }, + { + "epoch": 2.7380732533087104, + "grad_norm": 16.75, + "learning_rate": 6.477744865821406e-07, + "loss": 1.1586273908615112, + "step": 8896 + }, + { + "epoch": 2.7386888273314867, + "grad_norm": 7.875, + "learning_rate": 6.475519479546876e-07, + "loss": 1.404249906539917, + "step": 8898 + }, + { + "epoch": 2.739304401354263, + "grad_norm": 9.25, + "learning_rate": 6.473299183648585e-07, + "loss": 1.3985319137573242, + "step": 8900 + }, + { + "epoch": 2.739919975377039, + "grad_norm": 10.875, + "learning_rate": 6.471083979107239e-07, + "loss": 1.315010666847229, + "step": 8902 + }, + { + "epoch": 2.7405355493998154, + "grad_norm": 28.375, + "learning_rate": 6.468873866901298e-07, + "loss": 1.347687005996704, + "step": 8904 + }, + { + "epoch": 2.7411511234225916, + "grad_norm": 7.09375, + "learning_rate": 6.466668848006969e-07, + "loss": 1.2556222677230835, + "step": 8906 + }, + { + "epoch": 2.741766697445368, + "grad_norm": 16.625, + "learning_rate": 6.464468923398212e-07, + "loss": 1.4205504655838013, + "step": 8908 + }, + { + "epoch": 2.742382271468144, + "grad_norm": 4.65625, + "learning_rate": 6.462274094046739e-07, + "loss": 1.0294299125671387, + "step": 8910 + }, + { + "epoch": 2.7429978454909203, + "grad_norm": 10.9375, + "learning_rate": 6.460084360922002e-07, + "loss": 1.1584376096725464, + "step": 8912 + }, + { + "epoch": 2.7436134195136965, + "grad_norm": 22.5, + "learning_rate": 6.457899724991216e-07, + "loss": 1.7459558248519897, + "step": 8914 + }, + { + "epoch": 2.7442289935364728, + "grad_norm": 9.6875, + "learning_rate": 6.455720187219333e-07, + "loss": 1.2811490297317505, + "step": 8916 + }, + { + "epoch": 2.744844567559249, + "grad_norm": 11.4375, + "learning_rate": 6.453545748569058e-07, + "loss": 1.325385332107544, + "step": 8918 + }, + { + "epoch": 2.7454601415820252, + "grad_norm": 18.375, + "learning_rate": 6.451376410000846e-07, + "loss": 1.186486005783081, + "step": 8920 + }, + { + "epoch": 2.7460757156048015, + "grad_norm": 19.25, + "learning_rate": 6.449212172472891e-07, + "loss": 1.2240514755249023, + "step": 8922 + }, + { + "epoch": 2.7466912896275777, + "grad_norm": 22.375, + "learning_rate": 6.447053036941143e-07, + "loss": 0.8598315119743347, + "step": 8924 + }, + { + "epoch": 2.747306863650354, + "grad_norm": 13.3125, + "learning_rate": 6.444899004359295e-07, + "loss": 1.358074426651001, + "step": 8926 + }, + { + "epoch": 2.74792243767313, + "grad_norm": 10.4375, + "learning_rate": 6.442750075678782e-07, + "loss": 0.8594173192977905, + "step": 8928 + }, + { + "epoch": 2.7485380116959064, + "grad_norm": 13.875, + "learning_rate": 6.440606251848792e-07, + "loss": 1.2267684936523438, + "step": 8930 + }, + { + "epoch": 2.7491535857186826, + "grad_norm": 23.375, + "learning_rate": 6.438467533816253e-07, + "loss": 1.3598570823669434, + "step": 8932 + }, + { + "epoch": 2.749769159741459, + "grad_norm": 32.0, + "learning_rate": 6.436333922525838e-07, + "loss": 1.4652607440948486, + "step": 8934 + }, + { + "epoch": 2.750384733764235, + "grad_norm": 13.0, + "learning_rate": 6.434205418919967e-07, + "loss": 1.2682214975357056, + "step": 8936 + }, + { + "epoch": 2.7510003077870113, + "grad_norm": 7.25, + "learning_rate": 6.432082023938802e-07, + "loss": 1.2484195232391357, + "step": 8938 + }, + { + "epoch": 2.7516158818097876, + "grad_norm": 40.75, + "learning_rate": 6.429963738520251e-07, + "loss": 1.449970006942749, + "step": 8940 + }, + { + "epoch": 2.752231455832564, + "grad_norm": 20.125, + "learning_rate": 6.427850563599959e-07, + "loss": 1.4970884323120117, + "step": 8942 + }, + { + "epoch": 2.75284702985534, + "grad_norm": 19.75, + "learning_rate": 6.425742500111322e-07, + "loss": 1.2655609846115112, + "step": 8944 + }, + { + "epoch": 2.7534626038781163, + "grad_norm": 14.875, + "learning_rate": 6.423639548985471e-07, + "loss": 1.5221806764602661, + "step": 8946 + }, + { + "epoch": 2.7540781779008925, + "grad_norm": 3.1875, + "learning_rate": 6.421541711151284e-07, + "loss": 1.1110451221466064, + "step": 8948 + }, + { + "epoch": 2.7546937519236687, + "grad_norm": 4.6875, + "learning_rate": 6.419448987535376e-07, + "loss": 1.0029635429382324, + "step": 8950 + }, + { + "epoch": 2.755309325946445, + "grad_norm": 22.875, + "learning_rate": 6.417361379062107e-07, + "loss": 1.7326197624206543, + "step": 8952 + }, + { + "epoch": 2.755924899969221, + "grad_norm": 7.75, + "learning_rate": 6.415278886653577e-07, + "loss": 1.3342669010162354, + "step": 8954 + }, + { + "epoch": 2.7565404739919974, + "grad_norm": 16.625, + "learning_rate": 6.413201511229622e-07, + "loss": 1.4437904357910156, + "step": 8956 + }, + { + "epoch": 2.7571560480147737, + "grad_norm": 19.125, + "learning_rate": 6.411129253707827e-07, + "loss": 1.2913410663604736, + "step": 8958 + }, + { + "epoch": 2.75777162203755, + "grad_norm": 11.4375, + "learning_rate": 6.409062115003505e-07, + "loss": 1.433046817779541, + "step": 8960 + }, + { + "epoch": 2.758387196060326, + "grad_norm": 22.0, + "learning_rate": 6.407000096029715e-07, + "loss": 1.4304602146148682, + "step": 8962 + }, + { + "epoch": 2.7590027700831024, + "grad_norm": 19.875, + "learning_rate": 6.404943197697252e-07, + "loss": 1.5041424036026, + "step": 8964 + }, + { + "epoch": 2.7596183441058786, + "grad_norm": 45.0, + "learning_rate": 6.402891420914655e-07, + "loss": 1.2554550170898438, + "step": 8966 + }, + { + "epoch": 2.760233918128655, + "grad_norm": 5.40625, + "learning_rate": 6.40084476658819e-07, + "loss": 1.1710494756698608, + "step": 8968 + }, + { + "epoch": 2.760849492151431, + "grad_norm": 27.25, + "learning_rate": 6.398803235621871e-07, + "loss": 1.3762147426605225, + "step": 8970 + }, + { + "epoch": 2.7614650661742073, + "grad_norm": 32.25, + "learning_rate": 6.396766828917444e-07, + "loss": 1.7743159532546997, + "step": 8972 + }, + { + "epoch": 2.7620806401969835, + "grad_norm": 19.0, + "learning_rate": 6.39473554737439e-07, + "loss": 1.6314376592636108, + "step": 8974 + }, + { + "epoch": 2.76269621421976, + "grad_norm": 11.6875, + "learning_rate": 6.39270939188993e-07, + "loss": 1.3375153541564941, + "step": 8976 + }, + { + "epoch": 2.763311788242536, + "grad_norm": 9.25, + "learning_rate": 6.390688363359018e-07, + "loss": 1.412973403930664, + "step": 8978 + }, + { + "epoch": 2.7639273622653127, + "grad_norm": 15.3125, + "learning_rate": 6.388672462674345e-07, + "loss": 1.4405186176300049, + "step": 8980 + }, + { + "epoch": 2.7645429362880884, + "grad_norm": 42.75, + "learning_rate": 6.386661690726338e-07, + "loss": 1.7845385074615479, + "step": 8982 + }, + { + "epoch": 2.765158510310865, + "grad_norm": 13.1875, + "learning_rate": 6.384656048403156e-07, + "loss": 1.1434121131896973, + "step": 8984 + }, + { + "epoch": 2.765774084333641, + "grad_norm": 27.0, + "learning_rate": 6.382655536590695e-07, + "loss": 1.0858327150344849, + "step": 8986 + }, + { + "epoch": 2.7663896583564176, + "grad_norm": 21.0, + "learning_rate": 6.380660156172584e-07, + "loss": 1.6220226287841797, + "step": 8988 + }, + { + "epoch": 2.7670052323791934, + "grad_norm": 23.0, + "learning_rate": 6.378669908030179e-07, + "loss": 1.24774169921875, + "step": 8990 + }, + { + "epoch": 2.76762080640197, + "grad_norm": 11.625, + "learning_rate": 6.376684793042581e-07, + "loss": 0.8773247599601746, + "step": 8992 + }, + { + "epoch": 2.768236380424746, + "grad_norm": 14.5, + "learning_rate": 6.374704812086616e-07, + "loss": 1.4489095211029053, + "step": 8994 + }, + { + "epoch": 2.7688519544475225, + "grad_norm": 18.875, + "learning_rate": 6.372729966036841e-07, + "loss": 1.2934106588363647, + "step": 8996 + }, + { + "epoch": 2.7694675284702983, + "grad_norm": 5.0625, + "learning_rate": 6.370760255765553e-07, + "loss": 1.3208156824111938, + "step": 8998 + }, + { + "epoch": 2.770083102493075, + "grad_norm": 34.75, + "learning_rate": 6.368795682142769e-07, + "loss": 1.487781047821045, + "step": 9000 + }, + { + "epoch": 2.7706986765158508, + "grad_norm": 76.0, + "learning_rate": 6.36683624603625e-07, + "loss": 1.2479522228240967, + "step": 9002 + }, + { + "epoch": 2.7713142505386275, + "grad_norm": 25.875, + "learning_rate": 6.364881948311478e-07, + "loss": 1.1628947257995605, + "step": 9004 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 18.75, + "learning_rate": 6.362932789831665e-07, + "loss": 1.6086757183074951, + "step": 9006 + }, + { + "epoch": 2.77254539858418, + "grad_norm": 11.8125, + "learning_rate": 6.360988771457763e-07, + "loss": 1.2544701099395752, + "step": 9008 + }, + { + "epoch": 2.7731609726069557, + "grad_norm": 25.0, + "learning_rate": 6.359049894048445e-07, + "loss": 1.9421390295028687, + "step": 9010 + }, + { + "epoch": 2.7737765466297324, + "grad_norm": 20.125, + "learning_rate": 6.357116158460112e-07, + "loss": 1.416395664215088, + "step": 9012 + }, + { + "epoch": 2.7743921206525086, + "grad_norm": 8.5, + "learning_rate": 6.355187565546902e-07, + "loss": 1.0506492853164673, + "step": 9014 + }, + { + "epoch": 2.775007694675285, + "grad_norm": 8.75, + "learning_rate": 6.353264116160672e-07, + "loss": 1.2236146926879883, + "step": 9016 + }, + { + "epoch": 2.775623268698061, + "grad_norm": 31.5, + "learning_rate": 6.351345811151017e-07, + "loss": 1.3423714637756348, + "step": 9018 + }, + { + "epoch": 2.7762388427208373, + "grad_norm": 13.0, + "learning_rate": 6.349432651365252e-07, + "loss": 1.5946636199951172, + "step": 9020 + }, + { + "epoch": 2.7768544167436136, + "grad_norm": 12.3125, + "learning_rate": 6.347524637648418e-07, + "loss": 1.2847610712051392, + "step": 9022 + }, + { + "epoch": 2.77746999076639, + "grad_norm": 8.4375, + "learning_rate": 6.345621770843293e-07, + "loss": 1.281554102897644, + "step": 9024 + }, + { + "epoch": 2.778085564789166, + "grad_norm": 24.875, + "learning_rate": 6.343724051790371e-07, + "loss": 1.3122029304504395, + "step": 9026 + }, + { + "epoch": 2.7787011388119423, + "grad_norm": 7.5, + "learning_rate": 6.341831481327882e-07, + "loss": 1.4820513725280762, + "step": 9028 + }, + { + "epoch": 2.7793167128347185, + "grad_norm": 12.75, + "learning_rate": 6.339944060291772e-07, + "loss": 1.0853147506713867, + "step": 9030 + }, + { + "epoch": 2.7799322868574947, + "grad_norm": 13.8125, + "learning_rate": 6.338061789515717e-07, + "loss": 1.1276772022247314, + "step": 9032 + }, + { + "epoch": 2.780547860880271, + "grad_norm": 7.03125, + "learning_rate": 6.336184669831121e-07, + "loss": 1.0151764154434204, + "step": 9034 + }, + { + "epoch": 2.781163434903047, + "grad_norm": 15.125, + "learning_rate": 6.334312702067106e-07, + "loss": 1.247842788696289, + "step": 9036 + }, + { + "epoch": 2.7817790089258234, + "grad_norm": 13.25, + "learning_rate": 6.332445887050527e-07, + "loss": 0.9100526571273804, + "step": 9038 + }, + { + "epoch": 2.7823945829485996, + "grad_norm": 5.90625, + "learning_rate": 6.330584225605955e-07, + "loss": 1.2452267408370972, + "step": 9040 + }, + { + "epoch": 2.783010156971376, + "grad_norm": 9.8125, + "learning_rate": 6.328727718555688e-07, + "loss": 0.924896240234375, + "step": 9042 + }, + { + "epoch": 2.783625730994152, + "grad_norm": 9.6875, + "learning_rate": 6.326876366719749e-07, + "loss": 1.6312079429626465, + "step": 9044 + }, + { + "epoch": 2.7842413050169283, + "grad_norm": 20.0, + "learning_rate": 6.325030170915882e-07, + "loss": 1.1611487865447998, + "step": 9046 + }, + { + "epoch": 2.7848568790397046, + "grad_norm": 39.5, + "learning_rate": 6.323189131959552e-07, + "loss": 1.4214301109313965, + "step": 9048 + }, + { + "epoch": 2.785472453062481, + "grad_norm": 70.0, + "learning_rate": 6.321353250663951e-07, + "loss": 1.5114094018936157, + "step": 9050 + }, + { + "epoch": 2.786088027085257, + "grad_norm": 11.0, + "learning_rate": 6.319522527839986e-07, + "loss": 1.2237645387649536, + "step": 9052 + }, + { + "epoch": 2.7867036011080333, + "grad_norm": 15.6875, + "learning_rate": 6.317696964296293e-07, + "loss": 1.2461700439453125, + "step": 9054 + }, + { + "epoch": 2.7873191751308095, + "grad_norm": 16.5, + "learning_rate": 6.315876560839224e-07, + "loss": 1.46170973777771, + "step": 9056 + }, + { + "epoch": 2.7879347491535857, + "grad_norm": 17.625, + "learning_rate": 6.314061318272852e-07, + "loss": 1.203120231628418, + "step": 9058 + }, + { + "epoch": 2.788550323176362, + "grad_norm": 27.0, + "learning_rate": 6.312251237398975e-07, + "loss": 1.5163331031799316, + "step": 9060 + }, + { + "epoch": 2.789165897199138, + "grad_norm": 11.75, + "learning_rate": 6.310446319017104e-07, + "loss": 1.1536353826522827, + "step": 9062 + }, + { + "epoch": 2.7897814712219144, + "grad_norm": 16.875, + "learning_rate": 6.308646563924475e-07, + "loss": 1.4147756099700928, + "step": 9064 + }, + { + "epoch": 2.7903970452446907, + "grad_norm": 5.8125, + "learning_rate": 6.306851972916045e-07, + "loss": 1.1038388013839722, + "step": 9066 + }, + { + "epoch": 2.791012619267467, + "grad_norm": 15.0, + "learning_rate": 6.30506254678448e-07, + "loss": 1.3057680130004883, + "step": 9068 + }, + { + "epoch": 2.791628193290243, + "grad_norm": 45.25, + "learning_rate": 6.303278286320177e-07, + "loss": 1.4038832187652588, + "step": 9070 + }, + { + "epoch": 2.7922437673130194, + "grad_norm": 16.375, + "learning_rate": 6.301499192311246e-07, + "loss": 1.6461620330810547, + "step": 9072 + }, + { + "epoch": 2.7928593413357956, + "grad_norm": 34.0, + "learning_rate": 6.299725265543512e-07, + "loss": 1.4873437881469727, + "step": 9074 + }, + { + "epoch": 2.793474915358572, + "grad_norm": 16.75, + "learning_rate": 6.297956506800524e-07, + "loss": 1.1311352252960205, + "step": 9076 + }, + { + "epoch": 2.794090489381348, + "grad_norm": 9.375, + "learning_rate": 6.29619291686354e-07, + "loss": 0.941398024559021, + "step": 9078 + }, + { + "epoch": 2.7947060634041243, + "grad_norm": 13.0625, + "learning_rate": 6.294434496511543e-07, + "loss": 1.284990668296814, + "step": 9080 + }, + { + "epoch": 2.7953216374269005, + "grad_norm": 9.5, + "learning_rate": 6.292681246521231e-07, + "loss": 1.284984827041626, + "step": 9082 + }, + { + "epoch": 2.7959372114496768, + "grad_norm": 19.125, + "learning_rate": 6.290933167667011e-07, + "loss": 1.6055502891540527, + "step": 9084 + }, + { + "epoch": 2.796552785472453, + "grad_norm": 25.875, + "learning_rate": 6.289190260721016e-07, + "loss": 1.4686188697814941, + "step": 9086 + }, + { + "epoch": 2.7971683594952292, + "grad_norm": 10.9375, + "learning_rate": 6.287452526453089e-07, + "loss": 1.1350395679473877, + "step": 9088 + }, + { + "epoch": 2.7977839335180055, + "grad_norm": 11.0, + "learning_rate": 6.285719965630789e-07, + "loss": 1.3151172399520874, + "step": 9090 + }, + { + "epoch": 2.7983995075407817, + "grad_norm": 20.625, + "learning_rate": 6.283992579019388e-07, + "loss": 1.334385633468628, + "step": 9092 + }, + { + "epoch": 2.799015081563558, + "grad_norm": 13.75, + "learning_rate": 6.282270367381877e-07, + "loss": 1.3577499389648438, + "step": 9094 + }, + { + "epoch": 2.799630655586334, + "grad_norm": 13.625, + "learning_rate": 6.280553331478961e-07, + "loss": 1.3364747762680054, + "step": 9096 + }, + { + "epoch": 2.8002462296091104, + "grad_norm": 15.3125, + "learning_rate": 6.278841472069051e-07, + "loss": 1.4921326637268066, + "step": 9098 + }, + { + "epoch": 2.8008618036318866, + "grad_norm": 8.5, + "learning_rate": 6.277134789908282e-07, + "loss": 1.0777859687805176, + "step": 9100 + }, + { + "epoch": 2.801477377654663, + "grad_norm": 20.75, + "learning_rate": 6.275433285750497e-07, + "loss": 1.0497572422027588, + "step": 9102 + }, + { + "epoch": 2.802092951677439, + "grad_norm": 17.5, + "learning_rate": 6.273736960347249e-07, + "loss": 1.194733738899231, + "step": 9104 + }, + { + "epoch": 2.8027085257002153, + "grad_norm": 11.0625, + "learning_rate": 6.272045814447808e-07, + "loss": 1.3191816806793213, + "step": 9106 + }, + { + "epoch": 2.8033240997229916, + "grad_norm": 16.625, + "learning_rate": 6.27035984879916e-07, + "loss": 1.4231816530227661, + "step": 9108 + }, + { + "epoch": 2.803939673745768, + "grad_norm": 17.625, + "learning_rate": 6.268679064145992e-07, + "loss": 0.9059617519378662, + "step": 9110 + }, + { + "epoch": 2.804555247768544, + "grad_norm": 7.09375, + "learning_rate": 6.267003461230709e-07, + "loss": 1.060996413230896, + "step": 9112 + }, + { + "epoch": 2.8051708217913203, + "grad_norm": 8.75, + "learning_rate": 6.26533304079343e-07, + "loss": 1.1279702186584473, + "step": 9114 + }, + { + "epoch": 2.8057863958140965, + "grad_norm": 17.5, + "learning_rate": 6.263667803571981e-07, + "loss": 1.3636348247528076, + "step": 9116 + }, + { + "epoch": 2.8064019698368727, + "grad_norm": 12.4375, + "learning_rate": 6.262007750301896e-07, + "loss": 1.2173731327056885, + "step": 9118 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 10.1875, + "learning_rate": 6.260352881716425e-07, + "loss": 1.0125764608383179, + "step": 9120 + }, + { + "epoch": 2.807633117882425, + "grad_norm": 3.9375, + "learning_rate": 6.258703198546526e-07, + "loss": 1.1348586082458496, + "step": 9122 + }, + { + "epoch": 2.8082486919052014, + "grad_norm": 243.0, + "learning_rate": 6.257058701520865e-07, + "loss": 1.2904644012451172, + "step": 9124 + }, + { + "epoch": 2.808864265927978, + "grad_norm": 27.5, + "learning_rate": 6.255419391365816e-07, + "loss": 1.3715291023254395, + "step": 9126 + }, + { + "epoch": 2.809479839950754, + "grad_norm": 11.25, + "learning_rate": 6.253785268805469e-07, + "loss": 0.9206017255783081, + "step": 9128 + }, + { + "epoch": 2.8100954139735306, + "grad_norm": 5.65625, + "learning_rate": 6.252156334561611e-07, + "loss": 1.1874823570251465, + "step": 9130 + }, + { + "epoch": 2.8107109879963064, + "grad_norm": 31.75, + "learning_rate": 6.250532589353752e-07, + "loss": 1.0900100469589233, + "step": 9132 + }, + { + "epoch": 2.811326562019083, + "grad_norm": 9.875, + "learning_rate": 6.248914033899098e-07, + "loss": 1.2578953504562378, + "step": 9134 + }, + { + "epoch": 2.811942136041859, + "grad_norm": 10.375, + "learning_rate": 6.247300668912565e-07, + "loss": 1.4572657346725464, + "step": 9136 + }, + { + "epoch": 2.8125577100646355, + "grad_norm": 14.1875, + "learning_rate": 6.24569249510678e-07, + "loss": 1.3846606016159058, + "step": 9138 + }, + { + "epoch": 2.8131732840874113, + "grad_norm": 14.5, + "learning_rate": 6.244089513192079e-07, + "loss": 1.4546737670898438, + "step": 9140 + }, + { + "epoch": 2.813788858110188, + "grad_norm": 14.0625, + "learning_rate": 6.242491723876493e-07, + "loss": 1.3725277185440063, + "step": 9142 + }, + { + "epoch": 2.8144044321329638, + "grad_norm": 9.375, + "learning_rate": 6.240899127865775e-07, + "loss": 1.3328849077224731, + "step": 9144 + }, + { + "epoch": 2.8150200061557404, + "grad_norm": 22.5, + "learning_rate": 6.239311725863371e-07, + "loss": 1.1928489208221436, + "step": 9146 + }, + { + "epoch": 2.8156355801785162, + "grad_norm": 12.0, + "learning_rate": 6.237729518570442e-07, + "loss": 1.4977320432662964, + "step": 9148 + }, + { + "epoch": 2.816251154201293, + "grad_norm": 22.625, + "learning_rate": 6.236152506685849e-07, + "loss": 1.2532992362976074, + "step": 9150 + }, + { + "epoch": 2.8168667282240687, + "grad_norm": 17.25, + "learning_rate": 6.23458069090616e-07, + "loss": 1.485316514968872, + "step": 9152 + }, + { + "epoch": 2.8174823022468454, + "grad_norm": 29.875, + "learning_rate": 6.233014071925648e-07, + "loss": 1.481433629989624, + "step": 9154 + }, + { + "epoch": 2.818097876269621, + "grad_norm": 26.625, + "learning_rate": 6.231452650436291e-07, + "loss": 0.9959707260131836, + "step": 9156 + }, + { + "epoch": 2.818713450292398, + "grad_norm": 9.875, + "learning_rate": 6.22989642712777e-07, + "loss": 1.2736601829528809, + "step": 9158 + }, + { + "epoch": 2.8193290243151736, + "grad_norm": 22.0, + "learning_rate": 6.228345402687471e-07, + "loss": 1.4581197500228882, + "step": 9160 + }, + { + "epoch": 2.8199445983379503, + "grad_norm": 8.0625, + "learning_rate": 6.226799577800481e-07, + "loss": 1.2403430938720703, + "step": 9162 + }, + { + "epoch": 2.8205601723607265, + "grad_norm": 19.875, + "learning_rate": 6.225258953149598e-07, + "loss": 1.0801653861999512, + "step": 9164 + }, + { + "epoch": 2.8211757463835028, + "grad_norm": 10.625, + "learning_rate": 6.223723529415314e-07, + "loss": 1.0728917121887207, + "step": 9166 + }, + { + "epoch": 2.821791320406279, + "grad_norm": 30.375, + "learning_rate": 6.222193307275824e-07, + "loss": 1.2684574127197266, + "step": 9168 + }, + { + "epoch": 2.8224068944290552, + "grad_norm": 13.5625, + "learning_rate": 6.220668287407036e-07, + "loss": 1.4267816543579102, + "step": 9170 + }, + { + "epoch": 2.8230224684518315, + "grad_norm": 16.25, + "learning_rate": 6.219148470482549e-07, + "loss": 1.0929862260818481, + "step": 9172 + }, + { + "epoch": 2.8236380424746077, + "grad_norm": 7.375, + "learning_rate": 6.217633857173668e-07, + "loss": 1.4870045185089111, + "step": 9174 + }, + { + "epoch": 2.824253616497384, + "grad_norm": 9.125, + "learning_rate": 6.2161244481494e-07, + "loss": 1.235878348350525, + "step": 9176 + }, + { + "epoch": 2.82486919052016, + "grad_norm": 13.9375, + "learning_rate": 6.214620244076452e-07, + "loss": 1.0306167602539062, + "step": 9178 + }, + { + "epoch": 2.8254847645429364, + "grad_norm": 32.75, + "learning_rate": 6.213121245619233e-07, + "loss": 1.281254529953003, + "step": 9180 + }, + { + "epoch": 2.8261003385657126, + "grad_norm": 14.9375, + "learning_rate": 6.211627453439856e-07, + "loss": 1.2573875188827515, + "step": 9182 + }, + { + "epoch": 2.826715912588489, + "grad_norm": 16.375, + "learning_rate": 6.210138868198124e-07, + "loss": 1.3871865272521973, + "step": 9184 + }, + { + "epoch": 2.827331486611265, + "grad_norm": 11.9375, + "learning_rate": 6.208655490551551e-07, + "loss": 1.324197769165039, + "step": 9186 + }, + { + "epoch": 2.8279470606340413, + "grad_norm": 13.0625, + "learning_rate": 6.207177321155345e-07, + "loss": 1.4072306156158447, + "step": 9188 + }, + { + "epoch": 2.8285626346568176, + "grad_norm": 6.03125, + "learning_rate": 6.205704360662417e-07, + "loss": 1.0662217140197754, + "step": 9190 + }, + { + "epoch": 2.829178208679594, + "grad_norm": 20.375, + "learning_rate": 6.204236609723375e-07, + "loss": 1.5592257976531982, + "step": 9192 + }, + { + "epoch": 2.82979378270237, + "grad_norm": 13.3125, + "learning_rate": 6.202774068986524e-07, + "loss": 1.4568991661071777, + "step": 9194 + }, + { + "epoch": 2.8304093567251463, + "grad_norm": 20.0, + "learning_rate": 6.201316739097872e-07, + "loss": 1.1466706991195679, + "step": 9196 + }, + { + "epoch": 2.8310249307479225, + "grad_norm": 14.0625, + "learning_rate": 6.199864620701123e-07, + "loss": 1.2321534156799316, + "step": 9198 + }, + { + "epoch": 2.8316405047706987, + "grad_norm": 36.5, + "learning_rate": 6.198417714437677e-07, + "loss": 1.470842719078064, + "step": 9200 + }, + { + "epoch": 2.832256078793475, + "grad_norm": 12.25, + "learning_rate": 6.19697602094664e-07, + "loss": 1.2441885471343994, + "step": 9202 + }, + { + "epoch": 2.832871652816251, + "grad_norm": 11.4375, + "learning_rate": 6.195539540864802e-07, + "loss": 1.274795651435852, + "step": 9204 + }, + { + "epoch": 2.8334872268390274, + "grad_norm": 34.75, + "learning_rate": 6.194108274826663e-07, + "loss": 1.6561574935913086, + "step": 9206 + }, + { + "epoch": 2.8341028008618037, + "grad_norm": 22.75, + "learning_rate": 6.192682223464415e-07, + "loss": 1.5058321952819824, + "step": 9208 + }, + { + "epoch": 2.83471837488458, + "grad_norm": 60.5, + "learning_rate": 6.191261387407942e-07, + "loss": 1.4805090427398682, + "step": 9210 + }, + { + "epoch": 2.835333948907356, + "grad_norm": 22.625, + "learning_rate": 6.189845767284836e-07, + "loss": 0.9961310625076294, + "step": 9212 + }, + { + "epoch": 2.8359495229301324, + "grad_norm": 23.0, + "learning_rate": 6.188435363720372e-07, + "loss": 1.1998114585876465, + "step": 9214 + }, + { + "epoch": 2.8365650969529086, + "grad_norm": 21.375, + "learning_rate": 6.187030177337529e-07, + "loss": 1.7376620769500732, + "step": 9216 + }, + { + "epoch": 2.837180670975685, + "grad_norm": 15.5625, + "learning_rate": 6.18563020875698e-07, + "loss": 1.4271830320358276, + "step": 9218 + }, + { + "epoch": 2.837796244998461, + "grad_norm": 12.9375, + "learning_rate": 6.184235458597091e-07, + "loss": 1.3464505672454834, + "step": 9220 + }, + { + "epoch": 2.8384118190212373, + "grad_norm": 17.25, + "learning_rate": 6.182845927473927e-07, + "loss": 1.2641328573226929, + "step": 9222 + }, + { + "epoch": 2.8390273930440135, + "grad_norm": 4.5, + "learning_rate": 6.181461616001242e-07, + "loss": 1.2309045791625977, + "step": 9224 + }, + { + "epoch": 2.8396429670667898, + "grad_norm": 20.875, + "learning_rate": 6.180082524790492e-07, + "loss": 1.2517509460449219, + "step": 9226 + }, + { + "epoch": 2.840258541089566, + "grad_norm": 8.75, + "learning_rate": 6.178708654450823e-07, + "loss": 1.2566943168640137, + "step": 9228 + }, + { + "epoch": 2.8408741151123422, + "grad_norm": 26.375, + "learning_rate": 6.17734000558907e-07, + "loss": 1.3818330764770508, + "step": 9230 + }, + { + "epoch": 2.8414896891351185, + "grad_norm": 56.0, + "learning_rate": 6.175976578809773e-07, + "loss": 1.668084979057312, + "step": 9232 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 13.0, + "learning_rate": 6.174618374715157e-07, + "loss": 1.6795376539230347, + "step": 9234 + }, + { + "epoch": 2.842720837180671, + "grad_norm": 16.25, + "learning_rate": 6.173265393905139e-07, + "loss": 1.54259192943573, + "step": 9236 + }, + { + "epoch": 2.843336411203447, + "grad_norm": 10.9375, + "learning_rate": 6.171917636977337e-07, + "loss": 1.4081530570983887, + "step": 9238 + }, + { + "epoch": 2.8439519852262234, + "grad_norm": 15.1875, + "learning_rate": 6.170575104527053e-07, + "loss": 1.540208339691162, + "step": 9240 + }, + { + "epoch": 2.8445675592489996, + "grad_norm": 22.75, + "learning_rate": 6.169237797147289e-07, + "loss": 1.5846127271652222, + "step": 9242 + }, + { + "epoch": 2.845183133271776, + "grad_norm": 17.0, + "learning_rate": 6.167905715428731e-07, + "loss": 1.472456693649292, + "step": 9244 + }, + { + "epoch": 2.845798707294552, + "grad_norm": 26.875, + "learning_rate": 6.166578859959766e-07, + "loss": 1.312475323677063, + "step": 9246 + }, + { + "epoch": 2.8464142813173283, + "grad_norm": 28.75, + "learning_rate": 6.165257231326462e-07, + "loss": 1.7437453269958496, + "step": 9248 + }, + { + "epoch": 2.8470298553401046, + "grad_norm": 16.375, + "learning_rate": 6.163940830112587e-07, + "loss": 1.4031589031219482, + "step": 9250 + }, + { + "epoch": 2.847645429362881, + "grad_norm": 7.96875, + "learning_rate": 6.162629656899597e-07, + "loss": 1.2629880905151367, + "step": 9252 + }, + { + "epoch": 2.848261003385657, + "grad_norm": 4.25, + "learning_rate": 6.161323712266637e-07, + "loss": 0.9520352482795715, + "step": 9254 + }, + { + "epoch": 2.8488765774084333, + "grad_norm": 77.5, + "learning_rate": 6.160022996790547e-07, + "loss": 1.3207604885101318, + "step": 9256 + }, + { + "epoch": 2.8494921514312095, + "grad_norm": 9.5625, + "learning_rate": 6.158727511045853e-07, + "loss": 1.3743228912353516, + "step": 9258 + }, + { + "epoch": 2.8501077254539857, + "grad_norm": 20.75, + "learning_rate": 6.157437255604771e-07, + "loss": 1.3018256425857544, + "step": 9260 + }, + { + "epoch": 2.850723299476762, + "grad_norm": 13.5625, + "learning_rate": 6.15615223103721e-07, + "loss": 1.3166005611419678, + "step": 9262 + }, + { + "epoch": 2.851338873499538, + "grad_norm": 4.34375, + "learning_rate": 6.154872437910768e-07, + "loss": 1.047378420829773, + "step": 9264 + }, + { + "epoch": 2.8519544475223144, + "grad_norm": 15.3125, + "learning_rate": 6.153597876790729e-07, + "loss": 1.4092975854873657, + "step": 9266 + }, + { + "epoch": 2.8525700215450907, + "grad_norm": 15.6875, + "learning_rate": 6.152328548240068e-07, + "loss": 1.2166848182678223, + "step": 9268 + }, + { + "epoch": 2.853185595567867, + "grad_norm": 21.0, + "learning_rate": 6.15106445281945e-07, + "loss": 1.451936960220337, + "step": 9270 + }, + { + "epoch": 2.853801169590643, + "grad_norm": 20.75, + "learning_rate": 6.149805591087226e-07, + "loss": 1.4809799194335938, + "step": 9272 + }, + { + "epoch": 2.8544167436134193, + "grad_norm": 10.25, + "learning_rate": 6.14855196359944e-07, + "loss": 1.3496973514556885, + "step": 9274 + }, + { + "epoch": 2.855032317636196, + "grad_norm": 18.125, + "learning_rate": 6.147303570909818e-07, + "loss": 1.3943860530853271, + "step": 9276 + }, + { + "epoch": 2.855647891658972, + "grad_norm": 36.0, + "learning_rate": 6.146060413569776e-07, + "loss": 1.413816213607788, + "step": 9278 + }, + { + "epoch": 2.8562634656817485, + "grad_norm": 22.625, + "learning_rate": 6.14482249212842e-07, + "loss": 1.5197443962097168, + "step": 9280 + }, + { + "epoch": 2.8568790397045243, + "grad_norm": 8.0625, + "learning_rate": 6.143589807132539e-07, + "loss": 1.0143325328826904, + "step": 9282 + }, + { + "epoch": 2.857494613727301, + "grad_norm": 13.625, + "learning_rate": 6.142362359126613e-07, + "loss": 1.0320184230804443, + "step": 9284 + }, + { + "epoch": 2.8581101877500767, + "grad_norm": 11.6875, + "learning_rate": 6.141140148652807e-07, + "loss": 1.1835293769836426, + "step": 9286 + }, + { + "epoch": 2.8587257617728534, + "grad_norm": 20.25, + "learning_rate": 6.139923176250972e-07, + "loss": 1.2464087009429932, + "step": 9288 + }, + { + "epoch": 2.859341335795629, + "grad_norm": 11.875, + "learning_rate": 6.138711442458648e-07, + "loss": 0.8897020816802979, + "step": 9290 + }, + { + "epoch": 2.859956909818406, + "grad_norm": 52.25, + "learning_rate": 6.137504947811057e-07, + "loss": 1.041160225868225, + "step": 9292 + }, + { + "epoch": 2.8605724838411817, + "grad_norm": 12.875, + "learning_rate": 6.136303692841109e-07, + "loss": 1.2199876308441162, + "step": 9294 + }, + { + "epoch": 2.8611880578639584, + "grad_norm": 9.875, + "learning_rate": 6.135107678079403e-07, + "loss": 1.2565871477127075, + "step": 9296 + }, + { + "epoch": 2.861803631886734, + "grad_norm": 11.875, + "learning_rate": 6.133916904054215e-07, + "loss": 1.2938958406448364, + "step": 9298 + }, + { + "epoch": 2.862419205909511, + "grad_norm": 13.375, + "learning_rate": 6.132731371291515e-07, + "loss": 0.8824082612991333, + "step": 9300 + }, + { + "epoch": 2.8630347799322866, + "grad_norm": 12.6875, + "learning_rate": 6.131551080314953e-07, + "loss": 1.3084430694580078, + "step": 9302 + }, + { + "epoch": 2.8636503539550633, + "grad_norm": 13.1875, + "learning_rate": 6.130376031645865e-07, + "loss": 1.4461159706115723, + "step": 9304 + }, + { + "epoch": 2.864265927977839, + "grad_norm": 13.6875, + "learning_rate": 6.129206225803272e-07, + "loss": 1.3158795833587646, + "step": 9306 + }, + { + "epoch": 2.8648815020006158, + "grad_norm": 8.875, + "learning_rate": 6.128041663303877e-07, + "loss": 1.3527281284332275, + "step": 9308 + }, + { + "epoch": 2.8654970760233915, + "grad_norm": 9.3125, + "learning_rate": 6.126882344662072e-07, + "loss": 1.3457539081573486, + "step": 9310 + }, + { + "epoch": 2.866112650046168, + "grad_norm": 14.25, + "learning_rate": 6.125728270389925e-07, + "loss": 1.2900372743606567, + "step": 9312 + }, + { + "epoch": 2.8667282240689445, + "grad_norm": 12.5, + "learning_rate": 6.124579440997193e-07, + "loss": 1.3647406101226807, + "step": 9314 + }, + { + "epoch": 2.8673437980917207, + "grad_norm": 20.375, + "learning_rate": 6.123435856991316e-07, + "loss": 1.5858960151672363, + "step": 9316 + }, + { + "epoch": 2.867959372114497, + "grad_norm": 33.5, + "learning_rate": 6.122297518877417e-07, + "loss": 1.2413525581359863, + "step": 9318 + }, + { + "epoch": 2.868574946137273, + "grad_norm": 7.21875, + "learning_rate": 6.121164427158303e-07, + "loss": 1.42106294631958, + "step": 9320 + }, + { + "epoch": 2.8691905201600494, + "grad_norm": 10.0625, + "learning_rate": 6.120036582334457e-07, + "loss": 1.2894312143325806, + "step": 9322 + }, + { + "epoch": 2.8698060941828256, + "grad_norm": 6.375, + "learning_rate": 6.118913984904053e-07, + "loss": 1.0894622802734375, + "step": 9324 + }, + { + "epoch": 2.870421668205602, + "grad_norm": 9.0625, + "learning_rate": 6.117796635362942e-07, + "loss": 1.2433956861495972, + "step": 9326 + }, + { + "epoch": 2.871037242228378, + "grad_norm": 21.25, + "learning_rate": 6.116684534204659e-07, + "loss": 1.2344213724136353, + "step": 9328 + }, + { + "epoch": 2.8716528162511543, + "grad_norm": 16.25, + "learning_rate": 6.115577681920423e-07, + "loss": 1.0639562606811523, + "step": 9330 + }, + { + "epoch": 2.8722683902739305, + "grad_norm": 9.1875, + "learning_rate": 6.114476078999126e-07, + "loss": 1.2135043144226074, + "step": 9332 + }, + { + "epoch": 2.872883964296707, + "grad_norm": 14.375, + "learning_rate": 6.113379725927352e-07, + "loss": 1.5391733646392822, + "step": 9334 + }, + { + "epoch": 2.873499538319483, + "grad_norm": 14.25, + "learning_rate": 6.112288623189359e-07, + "loss": 1.6590864658355713, + "step": 9336 + }, + { + "epoch": 2.8741151123422592, + "grad_norm": 15.375, + "learning_rate": 6.111202771267092e-07, + "loss": 1.3919843435287476, + "step": 9338 + }, + { + "epoch": 2.8747306863650355, + "grad_norm": 21.25, + "learning_rate": 6.110122170640168e-07, + "loss": 1.5578798055648804, + "step": 9340 + }, + { + "epoch": 2.8753462603878117, + "grad_norm": 16.75, + "learning_rate": 6.109046821785893e-07, + "loss": 1.5761566162109375, + "step": 9342 + }, + { + "epoch": 2.875961834410588, + "grad_norm": 33.5, + "learning_rate": 6.107976725179247e-07, + "loss": 1.8075366020202637, + "step": 9344 + }, + { + "epoch": 2.876577408433364, + "grad_norm": 18.5, + "learning_rate": 6.106911881292898e-07, + "loss": 1.3936195373535156, + "step": 9346 + }, + { + "epoch": 2.8771929824561404, + "grad_norm": 18.75, + "learning_rate": 6.105852290597182e-07, + "loss": 1.6210025548934937, + "step": 9348 + }, + { + "epoch": 2.8778085564789166, + "grad_norm": 30.125, + "learning_rate": 6.104797953560127e-07, + "loss": 1.3459267616271973, + "step": 9350 + }, + { + "epoch": 2.878424130501693, + "grad_norm": 22.625, + "learning_rate": 6.103748870647431e-07, + "loss": 1.0890836715698242, + "step": 9352 + }, + { + "epoch": 2.879039704524469, + "grad_norm": 9.625, + "learning_rate": 6.102705042322478e-07, + "loss": 1.0718843936920166, + "step": 9354 + }, + { + "epoch": 2.8796552785472453, + "grad_norm": 4.03125, + "learning_rate": 6.101666469046329e-07, + "loss": 1.2060925960540771, + "step": 9356 + }, + { + "epoch": 2.8802708525700216, + "grad_norm": 14.625, + "learning_rate": 6.100633151277719e-07, + "loss": 1.3895275592803955, + "step": 9358 + }, + { + "epoch": 2.880886426592798, + "grad_norm": 19.75, + "learning_rate": 6.099605089473066e-07, + "loss": 1.490225076675415, + "step": 9360 + }, + { + "epoch": 2.881502000615574, + "grad_norm": 36.0, + "learning_rate": 6.098582284086471e-07, + "loss": 1.4311065673828125, + "step": 9362 + }, + { + "epoch": 2.8821175746383503, + "grad_norm": 48.0, + "learning_rate": 6.097564735569703e-07, + "loss": 0.726864218711853, + "step": 9364 + }, + { + "epoch": 2.8827331486611265, + "grad_norm": 20.0, + "learning_rate": 6.096552444372216e-07, + "loss": 1.4224977493286133, + "step": 9366 + }, + { + "epoch": 2.8833487226839027, + "grad_norm": 27.125, + "learning_rate": 6.095545410941143e-07, + "loss": 1.629457712173462, + "step": 9368 + }, + { + "epoch": 2.883964296706679, + "grad_norm": 23.75, + "learning_rate": 6.094543635721287e-07, + "loss": 1.1574146747589111, + "step": 9370 + }, + { + "epoch": 2.884579870729455, + "grad_norm": 18.5, + "learning_rate": 6.093547119155136e-07, + "loss": 1.085228681564331, + "step": 9372 + }, + { + "epoch": 2.8851954447522314, + "grad_norm": 12.5625, + "learning_rate": 6.092555861682855e-07, + "loss": 1.32918119430542, + "step": 9374 + }, + { + "epoch": 2.8858110187750077, + "grad_norm": 12.6875, + "learning_rate": 6.091569863742277e-07, + "loss": 1.2604478597640991, + "step": 9376 + }, + { + "epoch": 2.886426592797784, + "grad_norm": 9.9375, + "learning_rate": 6.090589125768923e-07, + "loss": 1.2702546119689941, + "step": 9378 + }, + { + "epoch": 2.88704216682056, + "grad_norm": 13.125, + "learning_rate": 6.089613648195987e-07, + "loss": 1.282178521156311, + "step": 9380 + }, + { + "epoch": 2.8876577408433364, + "grad_norm": 24.0, + "learning_rate": 6.088643431454336e-07, + "loss": 1.0952709913253784, + "step": 9382 + }, + { + "epoch": 2.8882733148661126, + "grad_norm": 9.4375, + "learning_rate": 6.087678475972516e-07, + "loss": 1.4014630317687988, + "step": 9384 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 31.25, + "learning_rate": 6.086718782176749e-07, + "loss": 1.0754542350769043, + "step": 9386 + }, + { + "epoch": 2.889504462911665, + "grad_norm": 21.625, + "learning_rate": 6.085764350490934e-07, + "loss": 0.689826488494873, + "step": 9388 + }, + { + "epoch": 2.8901200369344413, + "grad_norm": 8.5625, + "learning_rate": 6.084815181336646e-07, + "loss": 1.3719182014465332, + "step": 9390 + }, + { + "epoch": 2.8907356109572175, + "grad_norm": 11.75, + "learning_rate": 6.083871275133129e-07, + "loss": 1.2801945209503174, + "step": 9392 + }, + { + "epoch": 2.8913511849799938, + "grad_norm": 34.75, + "learning_rate": 6.082932632297312e-07, + "loss": 1.1227266788482666, + "step": 9394 + }, + { + "epoch": 2.89196675900277, + "grad_norm": 17.5, + "learning_rate": 6.081999253243793e-07, + "loss": 1.4343689680099487, + "step": 9396 + }, + { + "epoch": 2.8925823330255462, + "grad_norm": 24.125, + "learning_rate": 6.081071138384848e-07, + "loss": 1.5909650325775146, + "step": 9398 + }, + { + "epoch": 2.8931979070483225, + "grad_norm": 10.0625, + "learning_rate": 6.080148288130424e-07, + "loss": 1.1403911113739014, + "step": 9400 + }, + { + "epoch": 2.8938134810710987, + "grad_norm": 12.5625, + "learning_rate": 6.079230702888147e-07, + "loss": 1.381251335144043, + "step": 9402 + }, + { + "epoch": 2.894429055093875, + "grad_norm": 17.75, + "learning_rate": 6.078318383063312e-07, + "loss": 1.3560043573379517, + "step": 9404 + }, + { + "epoch": 2.895044629116651, + "grad_norm": 11.6875, + "learning_rate": 6.077411329058897e-07, + "loss": 1.2616386413574219, + "step": 9406 + }, + { + "epoch": 2.8956602031394274, + "grad_norm": 19.375, + "learning_rate": 6.076509541275545e-07, + "loss": 1.6452581882476807, + "step": 9408 + }, + { + "epoch": 2.8962757771622036, + "grad_norm": 16.125, + "learning_rate": 6.075613020111578e-07, + "loss": 1.3793082237243652, + "step": 9410 + }, + { + "epoch": 2.89689135118498, + "grad_norm": 21.75, + "learning_rate": 6.074721765962991e-07, + "loss": 1.2782071828842163, + "step": 9412 + }, + { + "epoch": 2.897506925207756, + "grad_norm": 22.125, + "learning_rate": 6.073835779223451e-07, + "loss": 1.5189945697784424, + "step": 9414 + }, + { + "epoch": 2.8981224992305323, + "grad_norm": 5.125, + "learning_rate": 6.072955060284299e-07, + "loss": 1.2915692329406738, + "step": 9416 + }, + { + "epoch": 2.8987380732533086, + "grad_norm": 16.5, + "learning_rate": 6.072079609534549e-07, + "loss": 1.546412706375122, + "step": 9418 + }, + { + "epoch": 2.899353647276085, + "grad_norm": 14.875, + "learning_rate": 6.071209427360892e-07, + "loss": 1.0842649936676025, + "step": 9420 + }, + { + "epoch": 2.899969221298861, + "grad_norm": 15.625, + "learning_rate": 6.070344514147685e-07, + "loss": 1.0759096145629883, + "step": 9422 + }, + { + "epoch": 2.9005847953216373, + "grad_norm": 15.8125, + "learning_rate": 6.069484870276964e-07, + "loss": 1.7201030254364014, + "step": 9424 + }, + { + "epoch": 2.901200369344414, + "grad_norm": 5.375, + "learning_rate": 6.068630496128431e-07, + "loss": 1.4501621723175049, + "step": 9426 + }, + { + "epoch": 2.9018159433671897, + "grad_norm": 20.25, + "learning_rate": 6.067781392079465e-07, + "loss": 1.4068629741668701, + "step": 9428 + }, + { + "epoch": 2.9024315173899664, + "grad_norm": 37.0, + "learning_rate": 6.066937558505121e-07, + "loss": 1.6690633296966553, + "step": 9430 + }, + { + "epoch": 2.903047091412742, + "grad_norm": 6.40625, + "learning_rate": 6.066098995778116e-07, + "loss": 1.0934585332870483, + "step": 9432 + }, + { + "epoch": 2.903662665435519, + "grad_norm": 10.375, + "learning_rate": 6.065265704268845e-07, + "loss": 1.3146367073059082, + "step": 9434 + }, + { + "epoch": 2.9042782394582947, + "grad_norm": 11.625, + "learning_rate": 6.064437684345375e-07, + "loss": 1.6005584001541138, + "step": 9436 + }, + { + "epoch": 2.9048938134810713, + "grad_norm": 9.8125, + "learning_rate": 6.063614936373442e-07, + "loss": 1.2471882104873657, + "step": 9438 + }, + { + "epoch": 2.905509387503847, + "grad_norm": 17.125, + "learning_rate": 6.062797460716457e-07, + "loss": 1.5874648094177246, + "step": 9440 + }, + { + "epoch": 2.906124961526624, + "grad_norm": 18.875, + "learning_rate": 6.061985257735498e-07, + "loss": 1.3863563537597656, + "step": 9442 + }, + { + "epoch": 2.9067405355493996, + "grad_norm": 11.5, + "learning_rate": 6.061178327789316e-07, + "loss": 1.272346019744873, + "step": 9444 + }, + { + "epoch": 2.9073561095721763, + "grad_norm": 28.625, + "learning_rate": 6.060376671234333e-07, + "loss": 1.717216968536377, + "step": 9446 + }, + { + "epoch": 2.907971683594952, + "grad_norm": 10.1875, + "learning_rate": 6.059580288424644e-07, + "loss": 1.3084156513214111, + "step": 9448 + }, + { + "epoch": 2.9085872576177287, + "grad_norm": 14.1875, + "learning_rate": 6.058789179712007e-07, + "loss": 0.8683636784553528, + "step": 9450 + }, + { + "epoch": 2.9092028316405045, + "grad_norm": 17.25, + "learning_rate": 6.058003345445861e-07, + "loss": 0.7504647970199585, + "step": 9452 + }, + { + "epoch": 2.909818405663281, + "grad_norm": 32.75, + "learning_rate": 6.057222785973308e-07, + "loss": 1.600705623626709, + "step": 9454 + }, + { + "epoch": 2.910433979686057, + "grad_norm": 12.125, + "learning_rate": 6.056447501639121e-07, + "loss": 1.3672009706497192, + "step": 9456 + }, + { + "epoch": 2.9110495537088337, + "grad_norm": 12.8125, + "learning_rate": 6.055677492785745e-07, + "loss": 1.138643741607666, + "step": 9458 + }, + { + "epoch": 2.9116651277316095, + "grad_norm": 22.875, + "learning_rate": 6.054912759753293e-07, + "loss": 1.6278893947601318, + "step": 9460 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 14.375, + "learning_rate": 6.05415330287955e-07, + "loss": 1.4144954681396484, + "step": 9462 + }, + { + "epoch": 2.9128962757771624, + "grad_norm": 12.5, + "learning_rate": 6.053399122499966e-07, + "loss": 1.3152668476104736, + "step": 9464 + }, + { + "epoch": 2.9135118497999386, + "grad_norm": 16.875, + "learning_rate": 6.052650218947665e-07, + "loss": 1.298598289489746, + "step": 9466 + }, + { + "epoch": 2.914127423822715, + "grad_norm": 11.0, + "learning_rate": 6.051906592553442e-07, + "loss": 1.3542667627334595, + "step": 9468 + }, + { + "epoch": 2.914742997845491, + "grad_norm": 16.375, + "learning_rate": 6.051168243645752e-07, + "loss": 1.3643980026245117, + "step": 9470 + }, + { + "epoch": 2.9153585718682673, + "grad_norm": 28.75, + "learning_rate": 6.050435172550727e-07, + "loss": 1.6208703517913818, + "step": 9472 + }, + { + "epoch": 2.9159741458910435, + "grad_norm": 23.0, + "learning_rate": 6.049707379592166e-07, + "loss": 1.3265775442123413, + "step": 9474 + }, + { + "epoch": 2.9165897199138198, + "grad_norm": 19.625, + "learning_rate": 6.048984865091536e-07, + "loss": 1.5905539989471436, + "step": 9476 + }, + { + "epoch": 2.917205293936596, + "grad_norm": 6.8125, + "learning_rate": 6.048267629367971e-07, + "loss": 1.0480585098266602, + "step": 9478 + }, + { + "epoch": 2.9178208679593722, + "grad_norm": 4.90625, + "learning_rate": 6.047555672738275e-07, + "loss": 1.0062479972839355, + "step": 9480 + }, + { + "epoch": 2.9184364419821485, + "grad_norm": 7.40625, + "learning_rate": 6.046848995516922e-07, + "loss": 1.1649580001831055, + "step": 9482 + }, + { + "epoch": 2.9190520160049247, + "grad_norm": 4.25, + "learning_rate": 6.046147598016049e-07, + "loss": 1.1874901056289673, + "step": 9484 + }, + { + "epoch": 2.919667590027701, + "grad_norm": 56.25, + "learning_rate": 6.045451480545469e-07, + "loss": 1.3118739128112793, + "step": 9486 + }, + { + "epoch": 2.920283164050477, + "grad_norm": 15.625, + "learning_rate": 6.044760643412653e-07, + "loss": 1.6022027730941772, + "step": 9488 + }, + { + "epoch": 2.9208987380732534, + "grad_norm": 36.0, + "learning_rate": 6.044075086922746e-07, + "loss": 1.4660375118255615, + "step": 9490 + }, + { + "epoch": 2.9215143120960296, + "grad_norm": 73.0, + "learning_rate": 6.043394811378559e-07, + "loss": 0.815237283706665, + "step": 9492 + }, + { + "epoch": 2.922129886118806, + "grad_norm": 15.875, + "learning_rate": 6.042719817080573e-07, + "loss": 1.4905991554260254, + "step": 9494 + }, + { + "epoch": 2.922745460141582, + "grad_norm": 85.5, + "learning_rate": 6.042050104326928e-07, + "loss": 1.7456505298614502, + "step": 9496 + }, + { + "epoch": 2.9233610341643583, + "grad_norm": 41.75, + "learning_rate": 6.041385673413444e-07, + "loss": 1.542734980583191, + "step": 9498 + }, + { + "epoch": 2.9239766081871346, + "grad_norm": 10.6875, + "learning_rate": 6.040726524633597e-07, + "loss": 1.2516852617263794, + "step": 9500 + }, + { + "epoch": 2.924592182209911, + "grad_norm": 13.3125, + "learning_rate": 6.040072658278534e-07, + "loss": 1.4482108354568481, + "step": 9502 + }, + { + "epoch": 2.925207756232687, + "grad_norm": 11.0, + "learning_rate": 6.039424074637067e-07, + "loss": 1.4160391092300415, + "step": 9504 + }, + { + "epoch": 2.9258233302554633, + "grad_norm": 19.75, + "learning_rate": 6.038780773995679e-07, + "loss": 1.4620568752288818, + "step": 9506 + }, + { + "epoch": 2.9264389042782395, + "grad_norm": 5.0, + "learning_rate": 6.038142756638518e-07, + "loss": 1.1898398399353027, + "step": 9508 + }, + { + "epoch": 2.9270544783010157, + "grad_norm": 6.71875, + "learning_rate": 6.037510022847392e-07, + "loss": 1.1003707647323608, + "step": 9510 + }, + { + "epoch": 2.927670052323792, + "grad_norm": 15.75, + "learning_rate": 6.036882572901782e-07, + "loss": 1.2336218357086182, + "step": 9512 + }, + { + "epoch": 2.928285626346568, + "grad_norm": 97.0, + "learning_rate": 6.036260407078835e-07, + "loss": 1.258795976638794, + "step": 9514 + }, + { + "epoch": 2.9289012003693444, + "grad_norm": 17.25, + "learning_rate": 6.035643525653363e-07, + "loss": 1.35211980342865, + "step": 9516 + }, + { + "epoch": 2.9295167743921207, + "grad_norm": 15.625, + "learning_rate": 6.035031928897839e-07, + "loss": 1.0009329319000244, + "step": 9518 + }, + { + "epoch": 2.930132348414897, + "grad_norm": 12.0625, + "learning_rate": 6.03442561708241e-07, + "loss": 1.1892321109771729, + "step": 9520 + }, + { + "epoch": 2.930747922437673, + "grad_norm": 9.375, + "learning_rate": 6.033824590474884e-07, + "loss": 1.3555978536605835, + "step": 9522 + }, + { + "epoch": 2.9313634964604494, + "grad_norm": 49.75, + "learning_rate": 6.033228849340733e-07, + "loss": 1.5087881088256836, + "step": 9524 + }, + { + "epoch": 2.9319790704832256, + "grad_norm": 23.875, + "learning_rate": 6.032638393943101e-07, + "loss": 1.3452261686325073, + "step": 9526 + }, + { + "epoch": 2.932594644506002, + "grad_norm": 13.3125, + "learning_rate": 6.032053224542786e-07, + "loss": 1.3585171699523926, + "step": 9528 + }, + { + "epoch": 2.933210218528778, + "grad_norm": 10.1875, + "learning_rate": 6.031473341398264e-07, + "loss": 1.3980622291564941, + "step": 9530 + }, + { + "epoch": 2.9338257925515543, + "grad_norm": 27.625, + "learning_rate": 6.030898744765667e-07, + "loss": 1.542459487915039, + "step": 9532 + }, + { + "epoch": 2.9344413665743305, + "grad_norm": 34.25, + "learning_rate": 6.030329434898795e-07, + "loss": 1.4522576332092285, + "step": 9534 + }, + { + "epoch": 2.9350569405971068, + "grad_norm": 18.625, + "learning_rate": 6.029765412049117e-07, + "loss": 1.3313522338867188, + "step": 9536 + }, + { + "epoch": 2.935672514619883, + "grad_norm": 29.125, + "learning_rate": 6.029206676465756e-07, + "loss": 1.4171909093856812, + "step": 9538 + }, + { + "epoch": 2.936288088642659, + "grad_norm": 11.5, + "learning_rate": 6.02865322839551e-07, + "loss": 1.6026504039764404, + "step": 9540 + }, + { + "epoch": 2.9369036626654355, + "grad_norm": 3.90625, + "learning_rate": 6.028105068082838e-07, + "loss": 1.0158360004425049, + "step": 9542 + }, + { + "epoch": 2.9375192366882117, + "grad_norm": 12.3125, + "learning_rate": 6.027562195769862e-07, + "loss": 1.1536140441894531, + "step": 9544 + }, + { + "epoch": 2.938134810710988, + "grad_norm": 21.375, + "learning_rate": 6.027024611696368e-07, + "loss": 1.3445171117782593, + "step": 9546 + }, + { + "epoch": 2.938750384733764, + "grad_norm": 9.625, + "learning_rate": 6.02649231609981e-07, + "loss": 1.2725205421447754, + "step": 9548 + }, + { + "epoch": 2.9393659587565404, + "grad_norm": 32.5, + "learning_rate": 6.025965309215302e-07, + "loss": 1.693610429763794, + "step": 9550 + }, + { + "epoch": 2.9399815327793166, + "grad_norm": 15.75, + "learning_rate": 6.025443591275625e-07, + "loss": 1.5363895893096924, + "step": 9552 + }, + { + "epoch": 2.940597106802093, + "grad_norm": 10.1875, + "learning_rate": 6.02492716251122e-07, + "loss": 1.305220365524292, + "step": 9554 + }, + { + "epoch": 2.941212680824869, + "grad_norm": 23.375, + "learning_rate": 6.024416023150197e-07, + "loss": 1.430680274963379, + "step": 9556 + }, + { + "epoch": 2.9418282548476453, + "grad_norm": 13.625, + "learning_rate": 6.023910173418323e-07, + "loss": 1.56490159034729, + "step": 9558 + }, + { + "epoch": 2.9424438288704216, + "grad_norm": 9.0, + "learning_rate": 6.023409613539036e-07, + "loss": 1.1676526069641113, + "step": 9560 + }, + { + "epoch": 2.943059402893198, + "grad_norm": 9.6875, + "learning_rate": 6.022914343733434e-07, + "loss": 1.792161226272583, + "step": 9562 + }, + { + "epoch": 2.943674976915974, + "grad_norm": 18.75, + "learning_rate": 6.022424364220275e-07, + "loss": 1.4854860305786133, + "step": 9564 + }, + { + "epoch": 2.9442905509387503, + "grad_norm": 10.5, + "learning_rate": 6.021939675215987e-07, + "loss": 1.162688970565796, + "step": 9566 + }, + { + "epoch": 2.9449061249615265, + "grad_norm": 13.6875, + "learning_rate": 6.021460276934656e-07, + "loss": 1.3605610132217407, + "step": 9568 + }, + { + "epoch": 2.9455216989843027, + "grad_norm": 14.75, + "learning_rate": 6.020986169588032e-07, + "loss": 1.1231822967529297, + "step": 9570 + }, + { + "epoch": 2.946137273007079, + "grad_norm": 14.9375, + "learning_rate": 6.02051735338553e-07, + "loss": 1.1760200262069702, + "step": 9572 + }, + { + "epoch": 2.946752847029855, + "grad_norm": 11.6875, + "learning_rate": 6.020053828534226e-07, + "loss": 1.4490975141525269, + "step": 9574 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 18.625, + "learning_rate": 6.019595595238861e-07, + "loss": 1.256340503692627, + "step": 9576 + }, + { + "epoch": 2.9479839950754076, + "grad_norm": 13.9375, + "learning_rate": 6.019142653701834e-07, + "loss": 1.6551717519760132, + "step": 9578 + }, + { + "epoch": 2.9485995690981843, + "grad_norm": 24.0, + "learning_rate": 6.018695004123214e-07, + "loss": 1.2904531955718994, + "step": 9580 + }, + { + "epoch": 2.94921514312096, + "grad_norm": 35.5, + "learning_rate": 6.018252646700724e-07, + "loss": 0.9481006860733032, + "step": 9582 + }, + { + "epoch": 2.949830717143737, + "grad_norm": 18.875, + "learning_rate": 6.017815581629757e-07, + "loss": 1.5369739532470703, + "step": 9584 + }, + { + "epoch": 2.9504462911665126, + "grad_norm": 16.875, + "learning_rate": 6.017383809103363e-07, + "loss": 1.1765402555465698, + "step": 9586 + }, + { + "epoch": 2.9510618651892893, + "grad_norm": 13.5625, + "learning_rate": 6.01695732931226e-07, + "loss": 1.171614170074463, + "step": 9588 + }, + { + "epoch": 2.951677439212065, + "grad_norm": 43.25, + "learning_rate": 6.01653614244482e-07, + "loss": 1.760496735572815, + "step": 9590 + }, + { + "epoch": 2.9522930132348417, + "grad_norm": 14.5, + "learning_rate": 6.016120248687082e-07, + "loss": 1.387416124343872, + "step": 9592 + }, + { + "epoch": 2.9529085872576175, + "grad_norm": 9.5625, + "learning_rate": 6.01570964822275e-07, + "loss": 1.355863332748413, + "step": 9594 + }, + { + "epoch": 2.953524161280394, + "grad_norm": 11.25, + "learning_rate": 6.015304341233187e-07, + "loss": 1.4700431823730469, + "step": 9596 + }, + { + "epoch": 2.95413973530317, + "grad_norm": 10.1875, + "learning_rate": 6.014904327897414e-07, + "loss": 1.1266649961471558, + "step": 9598 + }, + { + "epoch": 2.9547553093259467, + "grad_norm": 8.6875, + "learning_rate": 6.01450960839212e-07, + "loss": 1.219506859779358, + "step": 9600 + }, + { + "epoch": 2.9553708833487224, + "grad_norm": 56.25, + "learning_rate": 6.014120182891651e-07, + "loss": 1.198972463607788, + "step": 9602 + }, + { + "epoch": 2.955986457371499, + "grad_norm": 9.375, + "learning_rate": 6.013736051568018e-07, + "loss": 1.1335132122039795, + "step": 9604 + }, + { + "epoch": 2.956602031394275, + "grad_norm": 9.875, + "learning_rate": 6.013357214590893e-07, + "loss": 1.3339592218399048, + "step": 9606 + }, + { + "epoch": 2.9572176054170516, + "grad_norm": 6.28125, + "learning_rate": 6.012983672127608e-07, + "loss": 1.0773075819015503, + "step": 9608 + }, + { + "epoch": 2.9578331794398274, + "grad_norm": 33.5, + "learning_rate": 6.012615424343155e-07, + "loss": 1.5172120332717896, + "step": 9610 + }, + { + "epoch": 2.958448753462604, + "grad_norm": 16.625, + "learning_rate": 6.012252471400194e-07, + "loss": 1.3709399700164795, + "step": 9612 + }, + { + "epoch": 2.95906432748538, + "grad_norm": 8.5625, + "learning_rate": 6.011894813459037e-07, + "loss": 1.2132893800735474, + "step": 9614 + }, + { + "epoch": 2.9596799015081565, + "grad_norm": 12.0, + "learning_rate": 6.011542450677664e-07, + "loss": 1.3026639223098755, + "step": 9616 + }, + { + "epoch": 2.9602954755309328, + "grad_norm": 14.125, + "learning_rate": 6.011195383211716e-07, + "loss": 1.0850412845611572, + "step": 9618 + }, + { + "epoch": 2.960911049553709, + "grad_norm": 22.875, + "learning_rate": 6.01085361121449e-07, + "loss": 1.24189293384552, + "step": 9620 + }, + { + "epoch": 2.961526623576485, + "grad_norm": 11.4375, + "learning_rate": 6.010517134836948e-07, + "loss": 1.3805148601531982, + "step": 9622 + }, + { + "epoch": 2.9621421975992615, + "grad_norm": 17.0, + "learning_rate": 6.010185954227712e-07, + "loss": 1.5809800624847412, + "step": 9624 + }, + { + "epoch": 2.9627577716220377, + "grad_norm": 19.625, + "learning_rate": 6.009860069533068e-07, + "loss": 1.1562623977661133, + "step": 9626 + }, + { + "epoch": 2.963373345644814, + "grad_norm": 5.6875, + "learning_rate": 6.009539480896955e-07, + "loss": 0.9422524571418762, + "step": 9628 + }, + { + "epoch": 2.96398891966759, + "grad_norm": 16.375, + "learning_rate": 6.009224188460978e-07, + "loss": 1.0297799110412598, + "step": 9630 + }, + { + "epoch": 2.9646044936903664, + "grad_norm": 17.25, + "learning_rate": 6.008914192364404e-07, + "loss": 1.4425406455993652, + "step": 9632 + }, + { + "epoch": 2.9652200677131426, + "grad_norm": 7.46875, + "learning_rate": 6.008609492744159e-07, + "loss": 1.2678622007369995, + "step": 9634 + }, + { + "epoch": 2.965835641735919, + "grad_norm": 12.125, + "learning_rate": 6.008310089734825e-07, + "loss": 1.4951027631759644, + "step": 9636 + }, + { + "epoch": 2.966451215758695, + "grad_norm": 23.875, + "learning_rate": 6.008015983468653e-07, + "loss": 1.4566600322723389, + "step": 9638 + }, + { + "epoch": 2.9670667897814713, + "grad_norm": 7.875, + "learning_rate": 6.007727174075549e-07, + "loss": 1.620081901550293, + "step": 9640 + }, + { + "epoch": 2.9676823638042475, + "grad_norm": 80.0, + "learning_rate": 6.007443661683082e-07, + "loss": 1.240918755531311, + "step": 9642 + }, + { + "epoch": 2.968297937827024, + "grad_norm": 11.1875, + "learning_rate": 6.007165446416476e-07, + "loss": 1.126937747001648, + "step": 9644 + }, + { + "epoch": 2.9689135118498, + "grad_norm": 10.125, + "learning_rate": 6.006892528398622e-07, + "loss": 1.3524675369262695, + "step": 9646 + }, + { + "epoch": 2.9695290858725762, + "grad_norm": 35.25, + "learning_rate": 6.006624907750065e-07, + "loss": 1.562439203262329, + "step": 9648 + }, + { + "epoch": 2.9701446598953525, + "grad_norm": 32.0, + "learning_rate": 6.006362584589017e-07, + "loss": 1.5702564716339111, + "step": 9650 + }, + { + "epoch": 2.9707602339181287, + "grad_norm": 15.625, + "learning_rate": 6.006105559031345e-07, + "loss": 1.7766485214233398, + "step": 9652 + }, + { + "epoch": 2.971375807940905, + "grad_norm": 13.5625, + "learning_rate": 6.005853831190577e-07, + "loss": 1.5016021728515625, + "step": 9654 + }, + { + "epoch": 2.971991381963681, + "grad_norm": 8.5, + "learning_rate": 6.005607401177901e-07, + "loss": 1.2089016437530518, + "step": 9656 + }, + { + "epoch": 2.9726069559864574, + "grad_norm": 5.09375, + "learning_rate": 6.005366269102167e-07, + "loss": 1.0317680835723877, + "step": 9658 + }, + { + "epoch": 2.9732225300092336, + "grad_norm": 18.5, + "learning_rate": 6.005130435069883e-07, + "loss": 1.2018183469772339, + "step": 9660 + }, + { + "epoch": 2.97383810403201, + "grad_norm": 17.75, + "learning_rate": 6.004899899185216e-07, + "loss": 1.211350917816162, + "step": 9662 + }, + { + "epoch": 2.974453678054786, + "grad_norm": 21.0, + "learning_rate": 6.004674661549996e-07, + "loss": 1.4988772869110107, + "step": 9664 + }, + { + "epoch": 2.9750692520775623, + "grad_norm": 13.75, + "learning_rate": 6.004454722263708e-07, + "loss": 1.623139500617981, + "step": 9666 + }, + { + "epoch": 2.9756848261003386, + "grad_norm": 24.75, + "learning_rate": 6.004240081423502e-07, + "loss": 1.3296325206756592, + "step": 9668 + }, + { + "epoch": 2.976300400123115, + "grad_norm": 15.875, + "learning_rate": 6.004030739124183e-07, + "loss": 1.125887155532837, + "step": 9670 + }, + { + "epoch": 2.976915974145891, + "grad_norm": 21.75, + "learning_rate": 6.003826695458218e-07, + "loss": 1.6624093055725098, + "step": 9672 + }, + { + "epoch": 2.9775315481686673, + "grad_norm": 48.25, + "learning_rate": 6.003627950515737e-07, + "loss": 1.484586238861084, + "step": 9674 + }, + { + "epoch": 2.9781471221914435, + "grad_norm": 11.125, + "learning_rate": 6.003434504384521e-07, + "loss": 1.4605937004089355, + "step": 9676 + }, + { + "epoch": 2.9787626962142197, + "grad_norm": 41.5, + "learning_rate": 6.003246357150016e-07, + "loss": 1.7345576286315918, + "step": 9678 + }, + { + "epoch": 2.979378270236996, + "grad_norm": 17.5, + "learning_rate": 6.00306350889533e-07, + "loss": 1.9778789281845093, + "step": 9680 + }, + { + "epoch": 2.979993844259772, + "grad_norm": 35.5, + "learning_rate": 6.002885959701225e-07, + "loss": 1.897291660308838, + "step": 9682 + }, + { + "epoch": 2.9806094182825484, + "grad_norm": 25.875, + "learning_rate": 6.002713709646125e-07, + "loss": 1.3435924053192139, + "step": 9684 + }, + { + "epoch": 2.9812249923053247, + "grad_norm": 23.5, + "learning_rate": 6.002546758806114e-07, + "loss": 1.4819591045379639, + "step": 9686 + }, + { + "epoch": 2.981840566328101, + "grad_norm": 12.375, + "learning_rate": 6.002385107254931e-07, + "loss": 1.3283945322036743, + "step": 9688 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 13.5, + "learning_rate": 6.002228755063982e-07, + "loss": 0.7335997819900513, + "step": 9690 + }, + { + "epoch": 2.9830717143736534, + "grad_norm": 21.0, + "learning_rate": 6.002077702302326e-07, + "loss": 1.5746386051177979, + "step": 9692 + }, + { + "epoch": 2.9836872883964296, + "grad_norm": 34.75, + "learning_rate": 6.001931949036683e-07, + "loss": 1.1658272743225098, + "step": 9694 + }, + { + "epoch": 2.984302862419206, + "grad_norm": 7.90625, + "learning_rate": 6.001791495331432e-07, + "loss": 1.0803534984588623, + "step": 9696 + }, + { + "epoch": 2.984918436441982, + "grad_norm": 12.625, + "learning_rate": 6.001656341248613e-07, + "loss": 1.301381230354309, + "step": 9698 + }, + { + "epoch": 2.9855340104647583, + "grad_norm": 21.25, + "learning_rate": 6.001526486847923e-07, + "loss": 1.188427209854126, + "step": 9700 + }, + { + "epoch": 2.9861495844875345, + "grad_norm": 7.90625, + "learning_rate": 6.00140193218672e-07, + "loss": 0.9816202521324158, + "step": 9702 + }, + { + "epoch": 2.9867651585103108, + "grad_norm": 19.25, + "learning_rate": 6.001282677320017e-07, + "loss": 1.194831132888794, + "step": 9704 + }, + { + "epoch": 2.987380732533087, + "grad_norm": 14.25, + "learning_rate": 6.001168722300492e-07, + "loss": 1.100862979888916, + "step": 9706 + }, + { + "epoch": 2.9879963065558632, + "grad_norm": 9.9375, + "learning_rate": 6.001060067178477e-07, + "loss": 1.4596848487854004, + "step": 9708 + }, + { + "epoch": 2.9886118805786395, + "grad_norm": 11.5625, + "learning_rate": 6.000956712001966e-07, + "loss": 1.4575676918029785, + "step": 9710 + }, + { + "epoch": 2.9892274546014157, + "grad_norm": 3.921875, + "learning_rate": 6.000858656816612e-07, + "loss": 1.2540020942687988, + "step": 9712 + }, + { + "epoch": 2.989843028624192, + "grad_norm": 16.0, + "learning_rate": 6.000765901665723e-07, + "loss": 1.4371435642242432, + "step": 9714 + }, + { + "epoch": 2.990458602646968, + "grad_norm": 9.375, + "learning_rate": 6.000678446590272e-07, + "loss": 1.2159063816070557, + "step": 9716 + }, + { + "epoch": 2.9910741766697444, + "grad_norm": 12.25, + "learning_rate": 6.000596291628887e-07, + "loss": 1.1678316593170166, + "step": 9718 + }, + { + "epoch": 2.9916897506925206, + "grad_norm": 23.25, + "learning_rate": 6.000519436817857e-07, + "loss": 1.6382416486740112, + "step": 9720 + }, + { + "epoch": 2.992305324715297, + "grad_norm": 24.625, + "learning_rate": 6.000447882191127e-07, + "loss": 1.2942144870758057, + "step": 9722 + }, + { + "epoch": 2.992920898738073, + "grad_norm": 11.125, + "learning_rate": 6.000381627780304e-07, + "loss": 1.5667628049850464, + "step": 9724 + }, + { + "epoch": 2.9935364727608493, + "grad_norm": 15.5625, + "learning_rate": 6.000320673614653e-07, + "loss": 1.5609714984893799, + "step": 9726 + }, + { + "epoch": 2.9941520467836256, + "grad_norm": 18.625, + "learning_rate": 6.000265019721097e-07, + "loss": 1.583195686340332, + "step": 9728 + }, + { + "epoch": 2.9947676208064022, + "grad_norm": 17.75, + "learning_rate": 6.000214666124217e-07, + "loss": 1.3528969287872314, + "step": 9730 + }, + { + "epoch": 2.995383194829178, + "grad_norm": 27.5, + "learning_rate": 6.000169612846257e-07, + "loss": 0.8428146839141846, + "step": 9732 + }, + { + "epoch": 2.9959987688519547, + "grad_norm": 8.5, + "learning_rate": 6.000129859907115e-07, + "loss": 0.9794246554374695, + "step": 9734 + }, + { + "epoch": 2.9966143428747305, + "grad_norm": 12.125, + "learning_rate": 6.00009540732435e-07, + "loss": 0.9260591864585876, + "step": 9736 + }, + { + "epoch": 2.997229916897507, + "grad_norm": 7.28125, + "learning_rate": 6.000066255113181e-07, + "loss": 0.8063700795173645, + "step": 9738 + }, + { + "epoch": 2.997845490920283, + "grad_norm": 25.875, + "learning_rate": 6.000042403286483e-07, + "loss": 1.2744035720825195, + "step": 9740 + }, + { + "epoch": 2.9984610649430596, + "grad_norm": 14.9375, + "learning_rate": 6.000023851854793e-07, + "loss": 1.2116272449493408, + "step": 9742 + }, + { + "epoch": 2.9990766389658354, + "grad_norm": 12.125, + "learning_rate": 6.000010600826304e-07, + "loss": 1.378058671951294, + "step": 9744 + }, + { + "epoch": 2.999692212988612, + "grad_norm": 14.6875, + "learning_rate": 6.00000265020687e-07, + "loss": 1.3891949653625488, + "step": 9746 + }, + { + "epoch": 3.0, + "step": 9747, + "total_flos": 3.8653261653884273e+18, + "train_loss": 1.3544263215239651, + "train_runtime": 26899.9771, + "train_samples_per_second": 1.449, + "train_steps_per_second": 0.362 + } + ], + "logging_steps": 2, + "max_steps": 9747, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.8653261653884273e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}