{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 3276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002442002442002442, "grad_norm": 2.5476107597351074, "learning_rate": 3.0487804878048784e-08, "loss": 1.9267934560775757, "step": 2 }, { "epoch": 0.004884004884004884, "grad_norm": 2.350306749343872, "learning_rate": 9.146341463414634e-08, "loss": 2.0976288318634033, "step": 4 }, { "epoch": 0.007326007326007326, "grad_norm": 22.22303009033203, "learning_rate": 1.5243902439024392e-07, "loss": 2.4150097370147705, "step": 6 }, { "epoch": 0.009768009768009768, "grad_norm": 4.77632999420166, "learning_rate": 2.134146341463415e-07, "loss": 1.9595110416412354, "step": 8 }, { "epoch": 0.01221001221001221, "grad_norm": 6.901440620422363, "learning_rate": 2.7439024390243906e-07, "loss": 2.162900447845459, "step": 10 }, { "epoch": 0.014652014652014652, "grad_norm": 11.045926094055176, "learning_rate": 3.3536585365853663e-07, "loss": 1.9404582977294922, "step": 12 }, { "epoch": 0.017094017094017096, "grad_norm": 2.2156028747558594, "learning_rate": 3.963414634146342e-07, "loss": 1.6843563318252563, "step": 14 }, { "epoch": 0.019536019536019536, "grad_norm": 26.805221557617188, "learning_rate": 4.573170731707317e-07, "loss": 2.0623722076416016, "step": 16 }, { "epoch": 0.02197802197802198, "grad_norm": 17.565683364868164, "learning_rate": 5.182926829268293e-07, "loss": 2.7407174110412598, "step": 18 }, { "epoch": 0.02442002442002442, "grad_norm": 4.266391277313232, "learning_rate": 5.79268292682927e-07, "loss": 1.88368821144104, "step": 20 }, { "epoch": 0.026862026862026864, "grad_norm": 8.910398483276367, "learning_rate": 6.402439024390244e-07, "loss": 2.084914445877075, "step": 22 }, { "epoch": 0.029304029304029304, "grad_norm": 2.130563497543335, "learning_rate": 7.012195121951221e-07, "loss": 2.019660234451294, "step": 24 }, { "epoch": 0.031746031746031744, "grad_norm": 6.1367716789245605, "learning_rate": 7.621951219512196e-07, "loss": 1.9338700771331787, "step": 26 }, { "epoch": 0.03418803418803419, "grad_norm": 6.56151008605957, "learning_rate": 8.231707317073172e-07, "loss": 2.0060365200042725, "step": 28 }, { "epoch": 0.03663003663003663, "grad_norm": 9.909710884094238, "learning_rate": 8.841463414634147e-07, "loss": 1.8657618761062622, "step": 30 }, { "epoch": 0.03907203907203907, "grad_norm": 2.332340955734253, "learning_rate": 9.451219512195123e-07, "loss": 1.385891318321228, "step": 32 }, { "epoch": 0.04151404151404151, "grad_norm": 2.9691996574401855, "learning_rate": 1.0060975609756098e-06, "loss": 1.872510552406311, "step": 34 }, { "epoch": 0.04395604395604396, "grad_norm": 3.3227553367614746, "learning_rate": 1.0670731707317073e-06, "loss": 1.4552903175354004, "step": 36 }, { "epoch": 0.0463980463980464, "grad_norm": 2.721830368041992, "learning_rate": 1.128048780487805e-06, "loss": 1.8594551086425781, "step": 38 }, { "epoch": 0.04884004884004884, "grad_norm": 5.821812152862549, "learning_rate": 1.1890243902439024e-06, "loss": 1.697621464729309, "step": 40 }, { "epoch": 0.05128205128205128, "grad_norm": 1.486113429069519, "learning_rate": 1.25e-06, "loss": 1.7849284410476685, "step": 42 }, { "epoch": 0.05372405372405373, "grad_norm": 4.890027046203613, "learning_rate": 1.3109756097560978e-06, "loss": 1.7496923208236694, "step": 44 }, { "epoch": 0.05616605616605617, "grad_norm": 1.7662242650985718, "learning_rate": 1.3719512195121952e-06, "loss": 1.6558294296264648, "step": 46 }, { "epoch": 0.05860805860805861, "grad_norm": 19.32802963256836, "learning_rate": 1.4329268292682927e-06, "loss": 1.4527249336242676, "step": 48 }, { "epoch": 0.06105006105006105, "grad_norm": 9.77262020111084, "learning_rate": 1.4939024390243904e-06, "loss": 1.3586843013763428, "step": 50 }, { "epoch": 0.06349206349206349, "grad_norm": 6.361555576324463, "learning_rate": 1.5548780487804878e-06, "loss": 1.5780984163284302, "step": 52 }, { "epoch": 0.06593406593406594, "grad_norm": 15.675647735595703, "learning_rate": 1.6158536585365855e-06, "loss": 1.285346508026123, "step": 54 }, { "epoch": 0.06837606837606838, "grad_norm": 4.175439834594727, "learning_rate": 1.6768292682926832e-06, "loss": 1.5857115983963013, "step": 56 }, { "epoch": 0.07081807081807082, "grad_norm": 1.736680030822754, "learning_rate": 1.7378048780487804e-06, "loss": 1.5757516622543335, "step": 58 }, { "epoch": 0.07326007326007326, "grad_norm": 1.47886061668396, "learning_rate": 1.7987804878048781e-06, "loss": 1.6183691024780273, "step": 60 }, { "epoch": 0.0757020757020757, "grad_norm": 2.919388771057129, "learning_rate": 1.8597560975609758e-06, "loss": 1.164100170135498, "step": 62 }, { "epoch": 0.07814407814407814, "grad_norm": 1.240254282951355, "learning_rate": 1.9207317073170733e-06, "loss": 1.767830491065979, "step": 64 }, { "epoch": 0.08058608058608059, "grad_norm": 8.434248924255371, "learning_rate": 1.981707317073171e-06, "loss": 1.3761873245239258, "step": 66 }, { "epoch": 0.08302808302808302, "grad_norm": 1.5368638038635254, "learning_rate": 2.042682926829268e-06, "loss": 1.2026317119598389, "step": 68 }, { "epoch": 0.08547008547008547, "grad_norm": 0.9749501347541809, "learning_rate": 2.103658536585366e-06, "loss": 1.2645400762557983, "step": 70 }, { "epoch": 0.08791208791208792, "grad_norm": 1.7136712074279785, "learning_rate": 2.1646341463414635e-06, "loss": 1.5449546575546265, "step": 72 }, { "epoch": 0.09035409035409035, "grad_norm": 3.341733455657959, "learning_rate": 2.225609756097561e-06, "loss": 1.5047639608383179, "step": 74 }, { "epoch": 0.0927960927960928, "grad_norm": 5.009698390960693, "learning_rate": 2.286585365853659e-06, "loss": 1.4909131526947021, "step": 76 }, { "epoch": 0.09523809523809523, "grad_norm": 1.631039023399353, "learning_rate": 2.3475609756097563e-06, "loss": 1.3529361486434937, "step": 78 }, { "epoch": 0.09768009768009768, "grad_norm": 6.177618026733398, "learning_rate": 2.408536585365854e-06, "loss": 1.254205346107483, "step": 80 }, { "epoch": 0.10012210012210013, "grad_norm": 5.524102210998535, "learning_rate": 2.4695121951219513e-06, "loss": 1.165070652961731, "step": 82 }, { "epoch": 0.10256410256410256, "grad_norm": 2.264727830886841, "learning_rate": 2.530487804878049e-06, "loss": 1.1751306056976318, "step": 84 }, { "epoch": 0.10500610500610501, "grad_norm": 1.5993300676345825, "learning_rate": 2.5914634146341466e-06, "loss": 1.352165699005127, "step": 86 }, { "epoch": 0.10744810744810745, "grad_norm": 1.8832273483276367, "learning_rate": 2.652439024390244e-06, "loss": 1.5243136882781982, "step": 88 }, { "epoch": 0.10989010989010989, "grad_norm": 0.7285981178283691, "learning_rate": 2.713414634146342e-06, "loss": 1.2205549478530884, "step": 90 }, { "epoch": 0.11233211233211234, "grad_norm": 2.318856716156006, "learning_rate": 2.7743902439024394e-06, "loss": 1.6029253005981445, "step": 92 }, { "epoch": 0.11477411477411477, "grad_norm": 7.182040691375732, "learning_rate": 2.8353658536585365e-06, "loss": 1.1754858493804932, "step": 94 }, { "epoch": 0.11721611721611722, "grad_norm": 1.7051122188568115, "learning_rate": 2.8963414634146343e-06, "loss": 1.4834587574005127, "step": 96 }, { "epoch": 0.11965811965811966, "grad_norm": 3.0200533866882324, "learning_rate": 2.957317073170732e-06, "loss": 1.4276564121246338, "step": 98 }, { "epoch": 0.1221001221001221, "grad_norm": 1.8382214307785034, "learning_rate": 3.0182926829268293e-06, "loss": 1.1662065982818604, "step": 100 }, { "epoch": 0.12454212454212454, "grad_norm": 2.296553611755371, "learning_rate": 3.079268292682927e-06, "loss": 1.3122981786727905, "step": 102 }, { "epoch": 0.12698412698412698, "grad_norm": 3.1782186031341553, "learning_rate": 3.1402439024390246e-06, "loss": 1.0392099618911743, "step": 104 }, { "epoch": 0.12942612942612944, "grad_norm": 1.1442056894302368, "learning_rate": 3.201219512195122e-06, "loss": 0.9646719694137573, "step": 106 }, { "epoch": 0.13186813186813187, "grad_norm": 4.929725170135498, "learning_rate": 3.26219512195122e-06, "loss": 1.420979619026184, "step": 108 }, { "epoch": 0.1343101343101343, "grad_norm": 2.712373971939087, "learning_rate": 3.3231707317073174e-06, "loss": 1.6603320837020874, "step": 110 }, { "epoch": 0.13675213675213677, "grad_norm": 2.1611270904541016, "learning_rate": 3.3841463414634153e-06, "loss": 1.452590823173523, "step": 112 }, { "epoch": 0.1391941391941392, "grad_norm": 1.7481805086135864, "learning_rate": 3.4451219512195124e-06, "loss": 1.2166002988815308, "step": 114 }, { "epoch": 0.14163614163614163, "grad_norm": 1.9498414993286133, "learning_rate": 3.50609756097561e-06, "loss": 1.3791627883911133, "step": 116 }, { "epoch": 0.14407814407814407, "grad_norm": 5.3667497634887695, "learning_rate": 3.5670731707317073e-06, "loss": 1.2551401853561401, "step": 118 }, { "epoch": 0.14652014652014653, "grad_norm": 2.414433717727661, "learning_rate": 3.628048780487805e-06, "loss": 1.3578366041183472, "step": 120 }, { "epoch": 0.14896214896214896, "grad_norm": 1.8076027631759644, "learning_rate": 3.6890243902439026e-06, "loss": 1.3795714378356934, "step": 122 }, { "epoch": 0.1514041514041514, "grad_norm": 2.020355701446533, "learning_rate": 3.7500000000000005e-06, "loss": 1.3397819995880127, "step": 124 }, { "epoch": 0.15384615384615385, "grad_norm": 1.3748884201049805, "learning_rate": 3.810975609756098e-06, "loss": 1.1987930536270142, "step": 126 }, { "epoch": 0.1562881562881563, "grad_norm": 1.4875504970550537, "learning_rate": 3.8719512195121954e-06, "loss": 1.4347355365753174, "step": 128 }, { "epoch": 0.15873015873015872, "grad_norm": 1.2580325603485107, "learning_rate": 3.932926829268293e-06, "loss": 1.2884924411773682, "step": 130 }, { "epoch": 0.16117216117216118, "grad_norm": 4.422817707061768, "learning_rate": 3.99390243902439e-06, "loss": 1.0759848356246948, "step": 132 }, { "epoch": 0.16361416361416362, "grad_norm": 2.910273790359497, "learning_rate": 4.054878048780488e-06, "loss": 1.1693415641784668, "step": 134 }, { "epoch": 0.16605616605616605, "grad_norm": 2.8875091075897217, "learning_rate": 4.115853658536585e-06, "loss": 1.1773910522460938, "step": 136 }, { "epoch": 0.1684981684981685, "grad_norm": 3.0497472286224365, "learning_rate": 4.176829268292683e-06, "loss": 1.1259866952896118, "step": 138 }, { "epoch": 0.17094017094017094, "grad_norm": 2.7244997024536133, "learning_rate": 4.237804878048781e-06, "loss": 1.1096811294555664, "step": 140 }, { "epoch": 0.17338217338217338, "grad_norm": 1.9807188510894775, "learning_rate": 4.298780487804878e-06, "loss": 1.374996304512024, "step": 142 }, { "epoch": 0.17582417582417584, "grad_norm": 1.5548163652420044, "learning_rate": 4.359756097560976e-06, "loss": 1.116044521331787, "step": 144 }, { "epoch": 0.17826617826617827, "grad_norm": 2.0115115642547607, "learning_rate": 4.420731707317074e-06, "loss": 0.9985978603363037, "step": 146 }, { "epoch": 0.1807081807081807, "grad_norm": 1.5195460319519043, "learning_rate": 4.481707317073171e-06, "loss": 0.9752452373504639, "step": 148 }, { "epoch": 0.18315018315018314, "grad_norm": 1.3534411191940308, "learning_rate": 4.542682926829269e-06, "loss": 1.3346309661865234, "step": 150 }, { "epoch": 0.1855921855921856, "grad_norm": 2.0687193870544434, "learning_rate": 4.603658536585367e-06, "loss": 1.4687234163284302, "step": 152 }, { "epoch": 0.18803418803418803, "grad_norm": 1.2396481037139893, "learning_rate": 4.664634146341464e-06, "loss": 1.3415579795837402, "step": 154 }, { "epoch": 0.19047619047619047, "grad_norm": 2.4731335639953613, "learning_rate": 4.725609756097561e-06, "loss": 1.6359931230545044, "step": 156 }, { "epoch": 0.19291819291819293, "grad_norm": 3.7982375621795654, "learning_rate": 4.786585365853659e-06, "loss": 1.0939006805419922, "step": 158 }, { "epoch": 0.19536019536019536, "grad_norm": 1.8634134531021118, "learning_rate": 4.8475609756097565e-06, "loss": 0.9020692110061646, "step": 160 }, { "epoch": 0.1978021978021978, "grad_norm": 1.9401910305023193, "learning_rate": 4.908536585365854e-06, "loss": 1.3406171798706055, "step": 162 }, { "epoch": 0.20024420024420025, "grad_norm": 2.7686517238616943, "learning_rate": 4.9695121951219515e-06, "loss": 1.2336323261260986, "step": 164 }, { "epoch": 0.2026862026862027, "grad_norm": 2.1715619564056396, "learning_rate": 4.999998853502653e-06, "loss": 1.2935197353363037, "step": 166 }, { "epoch": 0.20512820512820512, "grad_norm": 1.8246636390686035, "learning_rate": 4.999989681530883e-06, "loss": 1.1559749841690063, "step": 168 }, { "epoch": 0.20757020757020758, "grad_norm": 4.966519832611084, "learning_rate": 4.999971337624732e-06, "loss": 0.929039478302002, "step": 170 }, { "epoch": 0.21001221001221002, "grad_norm": 1.679980993270874, "learning_rate": 4.999943821858978e-06, "loss": 1.0169018507003784, "step": 172 }, { "epoch": 0.21245421245421245, "grad_norm": 2.0559370517730713, "learning_rate": 4.999907134345786e-06, "loss": 1.3057047128677368, "step": 174 }, { "epoch": 0.2148962148962149, "grad_norm": 2.602260112762451, "learning_rate": 4.9998612752347116e-06, "loss": 1.2571014165878296, "step": 176 }, { "epoch": 0.21733821733821734, "grad_norm": 3.518317222595215, "learning_rate": 4.999806244712696e-06, "loss": 1.3580776453018188, "step": 178 }, { "epoch": 0.21978021978021978, "grad_norm": 1.0767415761947632, "learning_rate": 4.9997420430040665e-06, "loss": 0.9726645946502686, "step": 180 }, { "epoch": 0.2222222222222222, "grad_norm": 4.324513912200928, "learning_rate": 4.9996686703705395e-06, "loss": 0.8844138383865356, "step": 182 }, { "epoch": 0.22466422466422467, "grad_norm": 1.6926108598709106, "learning_rate": 4.999586127111211e-06, "loss": 1.2904834747314453, "step": 184 }, { "epoch": 0.2271062271062271, "grad_norm": 3.4072794914245605, "learning_rate": 4.9994944135625655e-06, "loss": 1.288368582725525, "step": 186 }, { "epoch": 0.22954822954822954, "grad_norm": 2.327322483062744, "learning_rate": 4.999393530098465e-06, "loss": 1.3512585163116455, "step": 188 }, { "epoch": 0.231990231990232, "grad_norm": 1.8644685745239258, "learning_rate": 4.999283477130157e-06, "loss": 1.3694134950637817, "step": 190 }, { "epoch": 0.23443223443223443, "grad_norm": 2.46710205078125, "learning_rate": 4.999164255106262e-06, "loss": 1.3137428760528564, "step": 192 }, { "epoch": 0.23687423687423687, "grad_norm": 2.8349263668060303, "learning_rate": 4.999035864512782e-06, "loss": 1.308716058731079, "step": 194 }, { "epoch": 0.23931623931623933, "grad_norm": 4.252539157867432, "learning_rate": 4.998898305873094e-06, "loss": 1.0035754442214966, "step": 196 }, { "epoch": 0.24175824175824176, "grad_norm": 1.9181326627731323, "learning_rate": 4.9987515797479455e-06, "loss": 1.283682942390442, "step": 198 }, { "epoch": 0.2442002442002442, "grad_norm": 2.797574996948242, "learning_rate": 4.998595686735457e-06, "loss": 1.3744878768920898, "step": 200 }, { "epoch": 0.24664224664224665, "grad_norm": 2.4476912021636963, "learning_rate": 4.998430627471114e-06, "loss": 1.3049349784851074, "step": 202 }, { "epoch": 0.2490842490842491, "grad_norm": 1.6749374866485596, "learning_rate": 4.998256402627771e-06, "loss": 0.9939874410629272, "step": 204 }, { "epoch": 0.2515262515262515, "grad_norm": 1.9039818048477173, "learning_rate": 4.998073012915644e-06, "loss": 1.26462721824646, "step": 206 }, { "epoch": 0.25396825396825395, "grad_norm": 1.5555559396743774, "learning_rate": 4.99788045908231e-06, "loss": 1.118224024772644, "step": 208 }, { "epoch": 0.2564102564102564, "grad_norm": 12.163622856140137, "learning_rate": 4.9976787419126995e-06, "loss": 0.9382672905921936, "step": 210 }, { "epoch": 0.2588522588522589, "grad_norm": 2.534181594848633, "learning_rate": 4.997467862229102e-06, "loss": 0.6328732967376709, "step": 212 }, { "epoch": 0.2612942612942613, "grad_norm": 1.489608645439148, "learning_rate": 4.997247820891152e-06, "loss": 1.0992366075515747, "step": 214 }, { "epoch": 0.26373626373626374, "grad_norm": 2.1970038414001465, "learning_rate": 4.997018618795836e-06, "loss": 1.2712618112564087, "step": 216 }, { "epoch": 0.2661782661782662, "grad_norm": 1.4587446451187134, "learning_rate": 4.996780256877479e-06, "loss": 1.1741327047348022, "step": 218 }, { "epoch": 0.2686202686202686, "grad_norm": 2.0022170543670654, "learning_rate": 4.996532736107749e-06, "loss": 1.3054232597351074, "step": 220 }, { "epoch": 0.27106227106227104, "grad_norm": 1.731757402420044, "learning_rate": 4.996276057495648e-06, "loss": 0.934091329574585, "step": 222 }, { "epoch": 0.27350427350427353, "grad_norm": 1.4423786401748657, "learning_rate": 4.996010222087509e-06, "loss": 0.9163894653320312, "step": 224 }, { "epoch": 0.27594627594627597, "grad_norm": 1.7184131145477295, "learning_rate": 4.9957352309669935e-06, "loss": 1.3263689279556274, "step": 226 }, { "epoch": 0.2783882783882784, "grad_norm": 2.437328338623047, "learning_rate": 4.9954510852550825e-06, "loss": 1.3698230981826782, "step": 228 }, { "epoch": 0.28083028083028083, "grad_norm": 2.120469093322754, "learning_rate": 4.995157786110078e-06, "loss": 1.343611717224121, "step": 230 }, { "epoch": 0.28327228327228327, "grad_norm": 6.02695369720459, "learning_rate": 4.9948553347275964e-06, "loss": 0.7583301663398743, "step": 232 }, { "epoch": 0.2857142857142857, "grad_norm": 1.9317870140075684, "learning_rate": 4.994543732340559e-06, "loss": 1.0170681476593018, "step": 234 }, { "epoch": 0.28815628815628813, "grad_norm": 1.3222551345825195, "learning_rate": 4.994222980219193e-06, "loss": 1.272110939025879, "step": 236 }, { "epoch": 0.2905982905982906, "grad_norm": 1.7373944520950317, "learning_rate": 4.993893079671023e-06, "loss": 1.2445218563079834, "step": 238 }, { "epoch": 0.29304029304029305, "grad_norm": 2.4315457344055176, "learning_rate": 4.993554032040867e-06, "loss": 1.1302506923675537, "step": 240 }, { "epoch": 0.2954822954822955, "grad_norm": 3.029109477996826, "learning_rate": 4.993205838710829e-06, "loss": 0.9910866022109985, "step": 242 }, { "epoch": 0.2979242979242979, "grad_norm": 1.9078646898269653, "learning_rate": 4.992848501100299e-06, "loss": 1.3285576105117798, "step": 244 }, { "epoch": 0.30036630036630035, "grad_norm": 1.1271051168441772, "learning_rate": 4.992482020665938e-06, "loss": 0.7790983319282532, "step": 246 }, { "epoch": 0.3028083028083028, "grad_norm": 2.9028432369232178, "learning_rate": 4.992106398901679e-06, "loss": 1.1949691772460938, "step": 248 }, { "epoch": 0.3052503052503053, "grad_norm": 3.402926445007324, "learning_rate": 4.9917216373387205e-06, "loss": 0.9305516481399536, "step": 250 }, { "epoch": 0.3076923076923077, "grad_norm": 4.722480773925781, "learning_rate": 4.991327737545517e-06, "loss": 1.0460638999938965, "step": 252 }, { "epoch": 0.31013431013431014, "grad_norm": 2.7775771617889404, "learning_rate": 4.990924701127776e-06, "loss": 1.2800921201705933, "step": 254 }, { "epoch": 0.3125763125763126, "grad_norm": 1.9031347036361694, "learning_rate": 4.990512529728448e-06, "loss": 1.2638157606124878, "step": 256 }, { "epoch": 0.315018315018315, "grad_norm": 1.927398443222046, "learning_rate": 4.990091225027721e-06, "loss": 1.3112692832946777, "step": 258 }, { "epoch": 0.31746031746031744, "grad_norm": 2.3837084770202637, "learning_rate": 4.9896607887430185e-06, "loss": 1.2674881219863892, "step": 260 }, { "epoch": 0.3199023199023199, "grad_norm": 4.82175874710083, "learning_rate": 4.989221222628985e-06, "loss": 1.4771348237991333, "step": 262 }, { "epoch": 0.32234432234432236, "grad_norm": 4.768642425537109, "learning_rate": 4.988772528477482e-06, "loss": 0.7117833495140076, "step": 264 }, { "epoch": 0.3247863247863248, "grad_norm": 3.3639814853668213, "learning_rate": 4.988314708117581e-06, "loss": 1.0419560670852661, "step": 266 }, { "epoch": 0.32722832722832723, "grad_norm": 4.912712574005127, "learning_rate": 4.987847763415557e-06, "loss": 1.3187146186828613, "step": 268 }, { "epoch": 0.32967032967032966, "grad_norm": 2.553563117980957, "learning_rate": 4.9873716962748805e-06, "loss": 0.9921520352363586, "step": 270 }, { "epoch": 0.3321123321123321, "grad_norm": 2.590106964111328, "learning_rate": 4.986886508636206e-06, "loss": 1.2800440788269043, "step": 272 }, { "epoch": 0.33455433455433453, "grad_norm": 5.722552299499512, "learning_rate": 4.986392202477369e-06, "loss": 0.9619787335395813, "step": 274 }, { "epoch": 0.336996336996337, "grad_norm": 2.612945556640625, "learning_rate": 4.985888779813377e-06, "loss": 1.0021531581878662, "step": 276 }, { "epoch": 0.33943833943833945, "grad_norm": 3.078714370727539, "learning_rate": 4.985376242696399e-06, "loss": 1.3929091691970825, "step": 278 }, { "epoch": 0.3418803418803419, "grad_norm": 1.928337812423706, "learning_rate": 4.984854593215759e-06, "loss": 1.2902088165283203, "step": 280 }, { "epoch": 0.3443223443223443, "grad_norm": 1.6284582614898682, "learning_rate": 4.984323833497925e-06, "loss": 1.2728163003921509, "step": 282 }, { "epoch": 0.34676434676434675, "grad_norm": 2.321744680404663, "learning_rate": 4.983783965706507e-06, "loss": 1.311239242553711, "step": 284 }, { "epoch": 0.3492063492063492, "grad_norm": 1.7774631977081299, "learning_rate": 4.983234992042237e-06, "loss": 1.1027390956878662, "step": 286 }, { "epoch": 0.3516483516483517, "grad_norm": 3.514158010482788, "learning_rate": 4.982676914742971e-06, "loss": 1.6526391506195068, "step": 288 }, { "epoch": 0.3540903540903541, "grad_norm": 5.824040412902832, "learning_rate": 4.982109736083676e-06, "loss": 0.9344091415405273, "step": 290 }, { "epoch": 0.35653235653235654, "grad_norm": 1.5543690919876099, "learning_rate": 4.981533458376416e-06, "loss": 1.292595386505127, "step": 292 }, { "epoch": 0.358974358974359, "grad_norm": 2.081808090209961, "learning_rate": 4.980948083970351e-06, "loss": 1.0262247323989868, "step": 294 }, { "epoch": 0.3614163614163614, "grad_norm": 2.1623454093933105, "learning_rate": 4.980353615251719e-06, "loss": 1.280896782875061, "step": 296 }, { "epoch": 0.36385836385836384, "grad_norm": 9.417366027832031, "learning_rate": 4.9797500546438344e-06, "loss": 1.4011857509613037, "step": 298 }, { "epoch": 0.3663003663003663, "grad_norm": 2.1483418941497803, "learning_rate": 4.979137404607072e-06, "loss": 1.243982195854187, "step": 300 }, { "epoch": 0.36874236874236876, "grad_norm": 2.855179786682129, "learning_rate": 4.978515667638858e-06, "loss": 0.8995228409767151, "step": 302 }, { "epoch": 0.3711843711843712, "grad_norm": 1.9090166091918945, "learning_rate": 4.9778848462736625e-06, "loss": 1.1892352104187012, "step": 304 }, { "epoch": 0.37362637362637363, "grad_norm": 1.4595932960510254, "learning_rate": 4.977244943082987e-06, "loss": 1.3153109550476074, "step": 306 }, { "epoch": 0.37606837606837606, "grad_norm": 2.5620715618133545, "learning_rate": 4.976595960675356e-06, "loss": 1.3017933368682861, "step": 308 }, { "epoch": 0.3785103785103785, "grad_norm": 2.5225541591644287, "learning_rate": 4.975937901696302e-06, "loss": 1.3250616788864136, "step": 310 }, { "epoch": 0.38095238095238093, "grad_norm": 4.8774895668029785, "learning_rate": 4.975270768828359e-06, "loss": 0.984774649143219, "step": 312 }, { "epoch": 0.3833943833943834, "grad_norm": 1.8592923879623413, "learning_rate": 4.974594564791051e-06, "loss": 1.3683158159255981, "step": 314 }, { "epoch": 0.38583638583638585, "grad_norm": 4.383054733276367, "learning_rate": 4.9739092923408784e-06, "loss": 0.6529649496078491, "step": 316 }, { "epoch": 0.3882783882783883, "grad_norm": 3.972773790359497, "learning_rate": 4.97321495427131e-06, "loss": 0.9518109560012817, "step": 318 }, { "epoch": 0.3907203907203907, "grad_norm": 5.475085735321045, "learning_rate": 4.972511553412768e-06, "loss": 1.334009051322937, "step": 320 }, { "epoch": 0.39316239316239315, "grad_norm": 2.0150842666625977, "learning_rate": 4.971799092632619e-06, "loss": 1.344587802886963, "step": 322 }, { "epoch": 0.3956043956043956, "grad_norm": 1.9884312152862549, "learning_rate": 4.971077574835165e-06, "loss": 1.3174562454223633, "step": 324 }, { "epoch": 0.398046398046398, "grad_norm": 2.862060308456421, "learning_rate": 4.970347002961623e-06, "loss": 1.244167447090149, "step": 326 }, { "epoch": 0.4004884004884005, "grad_norm": 1.4828734397888184, "learning_rate": 4.969607379990123e-06, "loss": 1.2446471452713013, "step": 328 }, { "epoch": 0.40293040293040294, "grad_norm": 5.2736592292785645, "learning_rate": 4.968858708935686e-06, "loss": 0.8940474987030029, "step": 330 }, { "epoch": 0.4053724053724054, "grad_norm": 3.302468776702881, "learning_rate": 4.968100992850223e-06, "loss": 0.6339259147644043, "step": 332 }, { "epoch": 0.4078144078144078, "grad_norm": 2.196411371231079, "learning_rate": 4.967334234822514e-06, "loss": 1.0478650331497192, "step": 334 }, { "epoch": 0.41025641025641024, "grad_norm": 1.72081458568573, "learning_rate": 4.966558437978196e-06, "loss": 1.349544882774353, "step": 336 }, { "epoch": 0.4126984126984127, "grad_norm": 1.245092511177063, "learning_rate": 4.965773605479754e-06, "loss": 0.9362432956695557, "step": 338 }, { "epoch": 0.41514041514041516, "grad_norm": 10.09897518157959, "learning_rate": 4.964979740526505e-06, "loss": 1.0755311250686646, "step": 340 }, { "epoch": 0.4175824175824176, "grad_norm": 2.285883903503418, "learning_rate": 4.964176846354588e-06, "loss": 1.6347922086715698, "step": 342 }, { "epoch": 0.42002442002442003, "grad_norm": 1.59197199344635, "learning_rate": 4.963364926236949e-06, "loss": 0.9156535863876343, "step": 344 }, { "epoch": 0.42246642246642246, "grad_norm": 1.6245992183685303, "learning_rate": 4.962543983483325e-06, "loss": 1.11324143409729, "step": 346 }, { "epoch": 0.4249084249084249, "grad_norm": 2.302391767501831, "learning_rate": 4.961714021440236e-06, "loss": 1.3008726835250854, "step": 348 }, { "epoch": 0.42735042735042733, "grad_norm": 2.053579092025757, "learning_rate": 4.960875043490967e-06, "loss": 0.8544071316719055, "step": 350 }, { "epoch": 0.4297924297924298, "grad_norm": 1.6109215021133423, "learning_rate": 4.960027053055557e-06, "loss": 1.0643997192382812, "step": 352 }, { "epoch": 0.43223443223443225, "grad_norm": 1.7497365474700928, "learning_rate": 4.959170053590781e-06, "loss": 1.2529405355453491, "step": 354 }, { "epoch": 0.4346764346764347, "grad_norm": 1.5827484130859375, "learning_rate": 4.958304048590143e-06, "loss": 0.8821004033088684, "step": 356 }, { "epoch": 0.4371184371184371, "grad_norm": 3.8849446773529053, "learning_rate": 4.957429041583855e-06, "loss": 1.127004623413086, "step": 358 }, { "epoch": 0.43956043956043955, "grad_norm": 1.9818916320800781, "learning_rate": 4.956545036138824e-06, "loss": 1.207819938659668, "step": 360 }, { "epoch": 0.442002442002442, "grad_norm": 3.0806636810302734, "learning_rate": 4.9556520358586394e-06, "loss": 1.0458451509475708, "step": 362 }, { "epoch": 0.4444444444444444, "grad_norm": 6.247749328613281, "learning_rate": 4.95475004438356e-06, "loss": 1.1090641021728516, "step": 364 }, { "epoch": 0.4468864468864469, "grad_norm": 4.639119625091553, "learning_rate": 4.953839065390494e-06, "loss": 1.37210214138031, "step": 366 }, { "epoch": 0.44932844932844934, "grad_norm": 3.0761399269104004, "learning_rate": 4.952919102592985e-06, "loss": 1.020755410194397, "step": 368 }, { "epoch": 0.4517704517704518, "grad_norm": 0.9568601846694946, "learning_rate": 4.9519901597412036e-06, "loss": 1.0233187675476074, "step": 370 }, { "epoch": 0.4542124542124542, "grad_norm": 1.6627336740493774, "learning_rate": 4.9510522406219215e-06, "loss": 1.2981936931610107, "step": 372 }, { "epoch": 0.45665445665445664, "grad_norm": 1.6876623630523682, "learning_rate": 4.9501053490585055e-06, "loss": 0.8830539584159851, "step": 374 }, { "epoch": 0.4590964590964591, "grad_norm": 2.635246515274048, "learning_rate": 4.9491494889108956e-06, "loss": 1.219455599784851, "step": 376 }, { "epoch": 0.46153846153846156, "grad_norm": 2.5608506202697754, "learning_rate": 4.948184664075594e-06, "loss": 0.9302881956100464, "step": 378 }, { "epoch": 0.463980463980464, "grad_norm": 2.0959465503692627, "learning_rate": 4.947210878485644e-06, "loss": 0.8517276048660278, "step": 380 }, { "epoch": 0.46642246642246643, "grad_norm": 1.482036828994751, "learning_rate": 4.94622813611062e-06, "loss": 1.2661558389663696, "step": 382 }, { "epoch": 0.46886446886446886, "grad_norm": 2.3324732780456543, "learning_rate": 4.945236440956604e-06, "loss": 1.0352469682693481, "step": 384 }, { "epoch": 0.4713064713064713, "grad_norm": 5.657218933105469, "learning_rate": 4.944235797066177e-06, "loss": 1.2012758255004883, "step": 386 }, { "epoch": 0.47374847374847373, "grad_norm": 3.256732225418091, "learning_rate": 4.943226208518398e-06, "loss": 1.5897492170333862, "step": 388 }, { "epoch": 0.47619047619047616, "grad_norm": 1.6683677434921265, "learning_rate": 4.942207679428788e-06, "loss": 0.7892211079597473, "step": 390 }, { "epoch": 0.47863247863247865, "grad_norm": 2.3011298179626465, "learning_rate": 4.941180213949314e-06, "loss": 0.8288873434066772, "step": 392 }, { "epoch": 0.4810744810744811, "grad_norm": 1.1541416645050049, "learning_rate": 4.94014381626837e-06, "loss": 0.9236152172088623, "step": 394 }, { "epoch": 0.4835164835164835, "grad_norm": 2.699540853500366, "learning_rate": 4.939098490610763e-06, "loss": 1.2205630540847778, "step": 396 }, { "epoch": 0.48595848595848595, "grad_norm": 3.6751928329467773, "learning_rate": 4.938044241237695e-06, "loss": 1.2720117568969727, "step": 398 }, { "epoch": 0.4884004884004884, "grad_norm": 2.8597030639648438, "learning_rate": 4.936981072446743e-06, "loss": 0.5283371210098267, "step": 400 }, { "epoch": 0.4908424908424908, "grad_norm": 2.1727144718170166, "learning_rate": 4.935908988571845e-06, "loss": 1.2206032276153564, "step": 402 }, { "epoch": 0.4932844932844933, "grad_norm": 1.26828932762146, "learning_rate": 4.934827993983279e-06, "loss": 1.3251525163650513, "step": 404 }, { "epoch": 0.49572649572649574, "grad_norm": 2.455037832260132, "learning_rate": 4.933738093087651e-06, "loss": 0.6017684936523438, "step": 406 }, { "epoch": 0.4981684981684982, "grad_norm": 2.95031476020813, "learning_rate": 4.932639290327866e-06, "loss": 0.8958187103271484, "step": 408 }, { "epoch": 0.5006105006105006, "grad_norm": 1.4214322566986084, "learning_rate": 4.931531590183123e-06, "loss": 1.255342721939087, "step": 410 }, { "epoch": 0.503052503052503, "grad_norm": 10.521769523620605, "learning_rate": 4.930414997168889e-06, "loss": 0.5480175614356995, "step": 412 }, { "epoch": 0.5054945054945055, "grad_norm": 1.6570682525634766, "learning_rate": 4.929289515836882e-06, "loss": 1.3151097297668457, "step": 414 }, { "epoch": 0.5079365079365079, "grad_norm": 2.6421968936920166, "learning_rate": 4.928155150775049e-06, "loss": 1.2698694467544556, "step": 416 }, { "epoch": 0.5103785103785103, "grad_norm": 2.2661855220794678, "learning_rate": 4.927011906607559e-06, "loss": 1.1845803260803223, "step": 418 }, { "epoch": 0.5128205128205128, "grad_norm": 0.9527170658111572, "learning_rate": 4.925859787994767e-06, "loss": 1.2397900819778442, "step": 420 }, { "epoch": 0.5152625152625152, "grad_norm": 1.2722523212432861, "learning_rate": 4.924698799633212e-06, "loss": 1.2302662134170532, "step": 422 }, { "epoch": 0.5177045177045178, "grad_norm": 1.8450767993927002, "learning_rate": 4.923528946255584e-06, "loss": 1.257878303527832, "step": 424 }, { "epoch": 0.5201465201465202, "grad_norm": 2.1251304149627686, "learning_rate": 4.922350232630715e-06, "loss": 1.0593935251235962, "step": 426 }, { "epoch": 0.5225885225885226, "grad_norm": 1.2513749599456787, "learning_rate": 4.9211626635635515e-06, "loss": 1.2507191896438599, "step": 428 }, { "epoch": 0.525030525030525, "grad_norm": 8.465970039367676, "learning_rate": 4.919966243895142e-06, "loss": 0.8818293809890747, "step": 430 }, { "epoch": 0.5274725274725275, "grad_norm": 6.935226917266846, "learning_rate": 4.918760978502611e-06, "loss": 0.5760735273361206, "step": 432 }, { "epoch": 0.5299145299145299, "grad_norm": 11.597949028015137, "learning_rate": 4.917546872299143e-06, "loss": 1.2672209739685059, "step": 434 }, { "epoch": 0.5323565323565324, "grad_norm": 1.8173397779464722, "learning_rate": 4.916323930233962e-06, "loss": 1.2190382480621338, "step": 436 }, { "epoch": 0.5347985347985348, "grad_norm": 3.132521867752075, "learning_rate": 4.915092157292313e-06, "loss": 1.2443459033966064, "step": 438 }, { "epoch": 0.5372405372405372, "grad_norm": 1.43805730342865, "learning_rate": 4.913851558495433e-06, "loss": 1.2091344594955444, "step": 440 }, { "epoch": 0.5396825396825397, "grad_norm": 11.365583419799805, "learning_rate": 4.912602138900545e-06, "loss": 1.0195097923278809, "step": 442 }, { "epoch": 0.5421245421245421, "grad_norm": 2.1645138263702393, "learning_rate": 4.911343903600823e-06, "loss": 0.8177242279052734, "step": 444 }, { "epoch": 0.5445665445665445, "grad_norm": 1.9511176347732544, "learning_rate": 4.91007685772538e-06, "loss": 0.9824368357658386, "step": 446 }, { "epoch": 0.5470085470085471, "grad_norm": 1.3720399141311646, "learning_rate": 4.908801006439247e-06, "loss": 1.08683443069458, "step": 448 }, { "epoch": 0.5494505494505495, "grad_norm": 2.627140760421753, "learning_rate": 4.9075163549433455e-06, "loss": 0.979245126247406, "step": 450 }, { "epoch": 0.5518925518925519, "grad_norm": 2.8625056743621826, "learning_rate": 4.906222908474474e-06, "loss": 0.7317221760749817, "step": 452 }, { "epoch": 0.5543345543345544, "grad_norm": 3.2224857807159424, "learning_rate": 4.90492067230528e-06, "loss": 1.258061170578003, "step": 454 }, { "epoch": 0.5567765567765568, "grad_norm": 2.8512461185455322, "learning_rate": 4.903609651744244e-06, "loss": 1.2263869047164917, "step": 456 }, { "epoch": 0.5592185592185592, "grad_norm": 1.08021080493927, "learning_rate": 4.902289852135655e-06, "loss": 0.6804142594337463, "step": 458 }, { "epoch": 0.5616605616605617, "grad_norm": 46.32835006713867, "learning_rate": 4.90096127885959e-06, "loss": 1.1406168937683105, "step": 460 }, { "epoch": 0.5641025641025641, "grad_norm": 3.643751621246338, "learning_rate": 4.899623937331887e-06, "loss": 1.1659770011901855, "step": 462 }, { "epoch": 0.5665445665445665, "grad_norm": 6.275250434875488, "learning_rate": 4.898277833004135e-06, "loss": 0.5430421829223633, "step": 464 }, { "epoch": 0.568986568986569, "grad_norm": 1.6964603662490845, "learning_rate": 4.896922971363635e-06, "loss": 1.487717628479004, "step": 466 }, { "epoch": 0.5714285714285714, "grad_norm": 1.5128223896026611, "learning_rate": 4.895559357933394e-06, "loss": 1.2990221977233887, "step": 468 }, { "epoch": 0.5738705738705738, "grad_norm": 2.9639620780944824, "learning_rate": 4.89418699827209e-06, "loss": 1.001917839050293, "step": 470 }, { "epoch": 0.5763125763125763, "grad_norm": 3.8676769733428955, "learning_rate": 4.892805897974059e-06, "loss": 1.2513344287872314, "step": 472 }, { "epoch": 0.5787545787545788, "grad_norm": 3.1442391872406006, "learning_rate": 4.891416062669262e-06, "loss": 0.8551501631736755, "step": 474 }, { "epoch": 0.5811965811965812, "grad_norm": 4.836908340454102, "learning_rate": 4.890017498023274e-06, "loss": 0.9901700615882874, "step": 476 }, { "epoch": 0.5836385836385837, "grad_norm": 3.409428358078003, "learning_rate": 4.888610209737249e-06, "loss": 1.1505521535873413, "step": 478 }, { "epoch": 0.5860805860805861, "grad_norm": 2.039818525314331, "learning_rate": 4.887194203547907e-06, "loss": 1.2868854999542236, "step": 480 }, { "epoch": 0.5885225885225885, "grad_norm": 3.3642868995666504, "learning_rate": 4.885769485227503e-06, "loss": 0.5171108245849609, "step": 482 }, { "epoch": 0.590964590964591, "grad_norm": 1.0177332162857056, "learning_rate": 4.8843360605838055e-06, "loss": 0.9433972239494324, "step": 484 }, { "epoch": 0.5934065934065934, "grad_norm": 0.7848219275474548, "learning_rate": 4.882893935460078e-06, "loss": 1.0055443048477173, "step": 486 }, { "epoch": 0.5958485958485958, "grad_norm": 1.8216912746429443, "learning_rate": 4.881443115735045e-06, "loss": 0.9295751452445984, "step": 488 }, { "epoch": 0.5982905982905983, "grad_norm": 2.449176073074341, "learning_rate": 4.879983607322881e-06, "loss": 0.9871832132339478, "step": 490 }, { "epoch": 0.6007326007326007, "grad_norm": 2.1226084232330322, "learning_rate": 4.878515416173174e-06, "loss": 0.7565707564353943, "step": 492 }, { "epoch": 0.6031746031746031, "grad_norm": 1.5631353855133057, "learning_rate": 4.877038548270907e-06, "loss": 0.9493947625160217, "step": 494 }, { "epoch": 0.6056166056166056, "grad_norm": 2.126840829849243, "learning_rate": 4.875553009636437e-06, "loss": 1.216259479522705, "step": 496 }, { "epoch": 0.608058608058608, "grad_norm": 6.31650447845459, "learning_rate": 4.874058806325463e-06, "loss": 0.5695387125015259, "step": 498 }, { "epoch": 0.6105006105006106, "grad_norm": 1.5655598640441895, "learning_rate": 4.872555944429006e-06, "loss": 0.8497368097305298, "step": 500 }, { "epoch": 0.612942612942613, "grad_norm": 2.7936275005340576, "learning_rate": 4.871044430073383e-06, "loss": 1.2087408304214478, "step": 502 }, { "epoch": 0.6153846153846154, "grad_norm": 5.054646015167236, "learning_rate": 4.869524269420183e-06, "loss": 1.2262006998062134, "step": 504 }, { "epoch": 0.6178266178266179, "grad_norm": 2.619194507598877, "learning_rate": 4.8679954686662404e-06, "loss": 1.2392239570617676, "step": 506 }, { "epoch": 0.6202686202686203, "grad_norm": 2.1149373054504395, "learning_rate": 4.866458034043611e-06, "loss": 1.2161999940872192, "step": 508 }, { "epoch": 0.6227106227106227, "grad_norm": 3.2427122592926025, "learning_rate": 4.864911971819545e-06, "loss": 1.2096397876739502, "step": 510 }, { "epoch": 0.6251526251526252, "grad_norm": 1.8958977460861206, "learning_rate": 4.863357288296463e-06, "loss": 1.0511081218719482, "step": 512 }, { "epoch": 0.6275946275946276, "grad_norm": 2.3320584297180176, "learning_rate": 4.861793989811929e-06, "loss": 1.039177417755127, "step": 514 }, { "epoch": 0.63003663003663, "grad_norm": 2.3830647468566895, "learning_rate": 4.860222082738628e-06, "loss": 0.9379343390464783, "step": 516 }, { "epoch": 0.6324786324786325, "grad_norm": 1.4862414598464966, "learning_rate": 4.858641573484334e-06, "loss": 1.2305572032928467, "step": 518 }, { "epoch": 0.6349206349206349, "grad_norm": 1.5390454530715942, "learning_rate": 4.8570524684918885e-06, "loss": 0.7816034555435181, "step": 520 }, { "epoch": 0.6373626373626373, "grad_norm": 3.6071054935455322, "learning_rate": 4.855454774239174e-06, "loss": 0.9562470316886902, "step": 522 }, { "epoch": 0.6398046398046398, "grad_norm": 19.057411193847656, "learning_rate": 4.8538484972390844e-06, "loss": 0.9935526847839355, "step": 524 }, { "epoch": 0.6422466422466423, "grad_norm": 5.404201030731201, "learning_rate": 4.852233644039503e-06, "loss": 1.2573553323745728, "step": 526 }, { "epoch": 0.6446886446886447, "grad_norm": 5.5111002922058105, "learning_rate": 4.8506102212232714e-06, "loss": 1.309897780418396, "step": 528 }, { "epoch": 0.6471306471306472, "grad_norm": 12.94403076171875, "learning_rate": 4.848978235408165e-06, "loss": 1.0515775680541992, "step": 530 }, { "epoch": 0.6495726495726496, "grad_norm": 2.2977471351623535, "learning_rate": 4.847337693246869e-06, "loss": 1.0648335218429565, "step": 532 }, { "epoch": 0.652014652014652, "grad_norm": 4.799200057983398, "learning_rate": 4.845688601426942e-06, "loss": 1.5188199281692505, "step": 534 }, { "epoch": 0.6544566544566545, "grad_norm": 1.9204214811325073, "learning_rate": 4.8440309666708006e-06, "loss": 0.8509761691093445, "step": 536 }, { "epoch": 0.6568986568986569, "grad_norm": 3.197026014328003, "learning_rate": 4.842364795735681e-06, "loss": 1.237154483795166, "step": 538 }, { "epoch": 0.6593406593406593, "grad_norm": 2.667442798614502, "learning_rate": 4.840690095413621e-06, "loss": 1.256026268005371, "step": 540 }, { "epoch": 0.6617826617826618, "grad_norm": 1.8451228141784668, "learning_rate": 4.8390068725314235e-06, "loss": 0.9330289959907532, "step": 542 }, { "epoch": 0.6642246642246642, "grad_norm": 2.3354480266571045, "learning_rate": 4.837315133950639e-06, "loss": 1.2343664169311523, "step": 544 }, { "epoch": 0.6666666666666666, "grad_norm": 2.377455472946167, "learning_rate": 4.835614886567523e-06, "loss": 1.1341302394866943, "step": 546 }, { "epoch": 0.6691086691086691, "grad_norm": 2.2880988121032715, "learning_rate": 4.833906137313027e-06, "loss": 1.2215226888656616, "step": 548 }, { "epoch": 0.6715506715506715, "grad_norm": 1.8074506521224976, "learning_rate": 4.8321888931527526e-06, "loss": 1.1459529399871826, "step": 550 }, { "epoch": 0.673992673992674, "grad_norm": 2.449573516845703, "learning_rate": 4.83046316108693e-06, "loss": 0.9938110709190369, "step": 552 }, { "epoch": 0.6764346764346765, "grad_norm": 1.4324339628219604, "learning_rate": 4.828728948150395e-06, "loss": 1.018953800201416, "step": 554 }, { "epoch": 0.6788766788766789, "grad_norm": 1.9855033159255981, "learning_rate": 4.826986261412551e-06, "loss": 0.7265840768814087, "step": 556 }, { "epoch": 0.6813186813186813, "grad_norm": 5.827937126159668, "learning_rate": 4.825235107977347e-06, "loss": 1.159310221672058, "step": 558 }, { "epoch": 0.6837606837606838, "grad_norm": 1.3800535202026367, "learning_rate": 4.82347549498324e-06, "loss": 1.1552388668060303, "step": 560 }, { "epoch": 0.6862026862026862, "grad_norm": 6.40132999420166, "learning_rate": 4.821707429603181e-06, "loss": 0.9877975583076477, "step": 562 }, { "epoch": 0.6886446886446886, "grad_norm": 5.481659889221191, "learning_rate": 4.8199309190445694e-06, "loss": 1.294710636138916, "step": 564 }, { "epoch": 0.6910866910866911, "grad_norm": 1.7129062414169312, "learning_rate": 4.818145970549233e-06, "loss": 1.0880193710327148, "step": 566 }, { "epoch": 0.6935286935286935, "grad_norm": 2.8719332218170166, "learning_rate": 4.816352591393398e-06, "loss": 1.286997675895691, "step": 568 }, { "epoch": 0.6959706959706959, "grad_norm": 3.2201790809631348, "learning_rate": 4.814550788887655e-06, "loss": 0.9447314143180847, "step": 570 }, { "epoch": 0.6984126984126984, "grad_norm": 9.533697128295898, "learning_rate": 4.812740570376933e-06, "loss": 0.9670330286026001, "step": 572 }, { "epoch": 0.7008547008547008, "grad_norm": 4.622603416442871, "learning_rate": 4.810921943240469e-06, "loss": 1.4118473529815674, "step": 574 }, { "epoch": 0.7032967032967034, "grad_norm": 1.7296289205551147, "learning_rate": 4.809094914891775e-06, "loss": 1.2039122581481934, "step": 576 }, { "epoch": 0.7057387057387058, "grad_norm": 1.5504480600357056, "learning_rate": 4.807259492778613e-06, "loss": 1.1822270154953003, "step": 578 }, { "epoch": 0.7081807081807082, "grad_norm": 1.588944435119629, "learning_rate": 4.805415684382959e-06, "loss": 1.142565131187439, "step": 580 }, { "epoch": 0.7106227106227107, "grad_norm": 1.771332859992981, "learning_rate": 4.803563497220976e-06, "loss": 1.1912704706192017, "step": 582 }, { "epoch": 0.7130647130647131, "grad_norm": 5.767767429351807, "learning_rate": 4.8017029388429845e-06, "loss": 1.0446151494979858, "step": 584 }, { "epoch": 0.7155067155067155, "grad_norm": 1.6248974800109863, "learning_rate": 4.799834016833425e-06, "loss": 1.287752628326416, "step": 586 }, { "epoch": 0.717948717948718, "grad_norm": 2.37131404876709, "learning_rate": 4.7979567388108376e-06, "loss": 1.061058759689331, "step": 588 }, { "epoch": 0.7203907203907204, "grad_norm": 1.6095200777053833, "learning_rate": 4.796071112427821e-06, "loss": 0.9337313771247864, "step": 590 }, { "epoch": 0.7228327228327228, "grad_norm": 1.546781063079834, "learning_rate": 4.794177145371006e-06, "loss": 0.8499547243118286, "step": 592 }, { "epoch": 0.7252747252747253, "grad_norm": 1.973464846611023, "learning_rate": 4.792274845361025e-06, "loss": 1.199100375175476, "step": 594 }, { "epoch": 0.7277167277167277, "grad_norm": 2.8694047927856445, "learning_rate": 4.790364220152477e-06, "loss": 0.9500537514686584, "step": 596 }, { "epoch": 0.7301587301587301, "grad_norm": 3.2333526611328125, "learning_rate": 4.788445277533902e-06, "loss": 0.9067592024803162, "step": 598 }, { "epoch": 0.7326007326007326, "grad_norm": 1.7088539600372314, "learning_rate": 4.786518025327742e-06, "loss": 1.1730542182922363, "step": 600 }, { "epoch": 0.7350427350427351, "grad_norm": 3.451296091079712, "learning_rate": 4.7845824713903115e-06, "loss": 1.319393515586853, "step": 602 }, { "epoch": 0.7374847374847375, "grad_norm": 2.3576159477233887, "learning_rate": 4.782638623611771e-06, "loss": 1.1339298486709595, "step": 604 }, { "epoch": 0.73992673992674, "grad_norm": 4.726226329803467, "learning_rate": 4.780686489916086e-06, "loss": 1.4150636196136475, "step": 606 }, { "epoch": 0.7423687423687424, "grad_norm": 1.4049737453460693, "learning_rate": 4.778726078261001e-06, "loss": 1.265529751777649, "step": 608 }, { "epoch": 0.7448107448107448, "grad_norm": 2.383326768875122, "learning_rate": 4.776757396638005e-06, "loss": 0.8798419237136841, "step": 610 }, { "epoch": 0.7472527472527473, "grad_norm": 1.5629072189331055, "learning_rate": 4.774780453072298e-06, "loss": 1.2364379167556763, "step": 612 }, { "epoch": 0.7496947496947497, "grad_norm": 1.7620823383331299, "learning_rate": 4.772795255622761e-06, "loss": 1.2224982976913452, "step": 614 }, { "epoch": 0.7521367521367521, "grad_norm": 1.2436916828155518, "learning_rate": 4.770801812381919e-06, "loss": 0.8438993096351624, "step": 616 }, { "epoch": 0.7545787545787546, "grad_norm": 1.9763593673706055, "learning_rate": 4.768800131475913e-06, "loss": 1.5313855409622192, "step": 618 }, { "epoch": 0.757020757020757, "grad_norm": 3.1992554664611816, "learning_rate": 4.7667902210644616e-06, "loss": 1.008560061454773, "step": 620 }, { "epoch": 0.7594627594627594, "grad_norm": 7.259356498718262, "learning_rate": 4.764772089340833e-06, "loss": 0.9306063652038574, "step": 622 }, { "epoch": 0.7619047619047619, "grad_norm": 1.7845251560211182, "learning_rate": 4.762745744531808e-06, "loss": 1.2115577459335327, "step": 624 }, { "epoch": 0.7643467643467643, "grad_norm": 1.6257309913635254, "learning_rate": 4.760711194897646e-06, "loss": 1.2677242755889893, "step": 626 }, { "epoch": 0.7667887667887668, "grad_norm": 3.1209113597869873, "learning_rate": 4.758668448732057e-06, "loss": 1.0252844095230103, "step": 628 }, { "epoch": 0.7692307692307693, "grad_norm": 1.4179201126098633, "learning_rate": 4.7566175143621575e-06, "loss": 1.2540860176086426, "step": 630 }, { "epoch": 0.7716727716727717, "grad_norm": 1.1921404600143433, "learning_rate": 4.754558400148449e-06, "loss": 0.8723937273025513, "step": 632 }, { "epoch": 0.7741147741147741, "grad_norm": 2.1802115440368652, "learning_rate": 4.752491114484773e-06, "loss": 0.8961063623428345, "step": 634 }, { "epoch": 0.7765567765567766, "grad_norm": 1.9183303117752075, "learning_rate": 4.7504156657982835e-06, "loss": 1.226144790649414, "step": 636 }, { "epoch": 0.778998778998779, "grad_norm": 1.9927711486816406, "learning_rate": 4.74833206254941e-06, "loss": 1.2360224723815918, "step": 638 }, { "epoch": 0.7814407814407814, "grad_norm": 4.131889343261719, "learning_rate": 4.746240313231823e-06, "loss": 0.9043057560920715, "step": 640 }, { "epoch": 0.7838827838827839, "grad_norm": 3.734161138534546, "learning_rate": 4.744140426372401e-06, "loss": 1.0058786869049072, "step": 642 }, { "epoch": 0.7863247863247863, "grad_norm": 1.5284754037857056, "learning_rate": 4.742032410531195e-06, "loss": 1.124707818031311, "step": 644 }, { "epoch": 0.7887667887667887, "grad_norm": 3.012943744659424, "learning_rate": 4.73991627430139e-06, "loss": 0.9144378304481506, "step": 646 }, { "epoch": 0.7912087912087912, "grad_norm": 2.2246816158294678, "learning_rate": 4.737792026309278e-06, "loss": 1.19635009765625, "step": 648 }, { "epoch": 0.7936507936507936, "grad_norm": 1.9574096202850342, "learning_rate": 4.735659675214215e-06, "loss": 0.5257167220115662, "step": 650 }, { "epoch": 0.796092796092796, "grad_norm": 2.1935298442840576, "learning_rate": 4.7335192297085895e-06, "loss": 0.7748251557350159, "step": 652 }, { "epoch": 0.7985347985347986, "grad_norm": 2.698744535446167, "learning_rate": 4.731370698517786e-06, "loss": 1.1536623239517212, "step": 654 }, { "epoch": 0.800976800976801, "grad_norm": 1.8622996807098389, "learning_rate": 4.729214090400149e-06, "loss": 1.200728178024292, "step": 656 }, { "epoch": 0.8034188034188035, "grad_norm": 9.128825187683105, "learning_rate": 4.727049414146952e-06, "loss": 0.4623393714427948, "step": 658 }, { "epoch": 0.8058608058608059, "grad_norm": 2.4344100952148438, "learning_rate": 4.724876678582352e-06, "loss": 1.0503042936325073, "step": 660 }, { "epoch": 0.8083028083028083, "grad_norm": 1.744651198387146, "learning_rate": 4.722695892563363e-06, "loss": 1.1860074996948242, "step": 662 }, { "epoch": 0.8107448107448108, "grad_norm": 1.6168715953826904, "learning_rate": 4.720507064979816e-06, "loss": 1.2846834659576416, "step": 664 }, { "epoch": 0.8131868131868132, "grad_norm": 2.017427444458008, "learning_rate": 4.7183102047543205e-06, "loss": 0.8671167492866516, "step": 666 }, { "epoch": 0.8156288156288156, "grad_norm": 2.0237629413604736, "learning_rate": 4.716105320842234e-06, "loss": 1.0235426425933838, "step": 668 }, { "epoch": 0.818070818070818, "grad_norm": 2.6489999294281006, "learning_rate": 4.713892422231619e-06, "loss": 1.3756883144378662, "step": 670 }, { "epoch": 0.8205128205128205, "grad_norm": 1.48908269405365, "learning_rate": 4.71167151794321e-06, "loss": 1.2407268285751343, "step": 672 }, { "epoch": 0.8229548229548229, "grad_norm": 1.3047491312026978, "learning_rate": 4.709442617030379e-06, "loss": 0.9855388402938843, "step": 674 }, { "epoch": 0.8253968253968254, "grad_norm": 1.8140162229537964, "learning_rate": 4.707205728579091e-06, "loss": 0.882321298122406, "step": 676 }, { "epoch": 0.8278388278388278, "grad_norm": 13.20935344696045, "learning_rate": 4.704960861707875e-06, "loss": 1.2027504444122314, "step": 678 }, { "epoch": 0.8302808302808303, "grad_norm": 1.8309438228607178, "learning_rate": 4.702708025567784e-06, "loss": 1.2264920473098755, "step": 680 }, { "epoch": 0.8327228327228328, "grad_norm": 5.225240707397461, "learning_rate": 4.700447229342353e-06, "loss": 1.1251945495605469, "step": 682 }, { "epoch": 0.8351648351648352, "grad_norm": 12.371641159057617, "learning_rate": 4.698178482247571e-06, "loss": 0.6810005307197571, "step": 684 }, { "epoch": 0.8376068376068376, "grad_norm": 1.8889416456222534, "learning_rate": 4.695901793531834e-06, "loss": 1.325577974319458, "step": 686 }, { "epoch": 0.8400488400488401, "grad_norm": 2.4011776447296143, "learning_rate": 4.693617172475914e-06, "loss": 1.2832276821136475, "step": 688 }, { "epoch": 0.8424908424908425, "grad_norm": 6.96901273727417, "learning_rate": 4.691324628392918e-06, "loss": 0.9534074664115906, "step": 690 }, { "epoch": 0.8449328449328449, "grad_norm": 3.2731733322143555, "learning_rate": 4.68902417062825e-06, "loss": 1.2452714443206787, "step": 692 }, { "epoch": 0.8473748473748474, "grad_norm": 1.0289312601089478, "learning_rate": 4.686715808559575e-06, "loss": 0.9713150858879089, "step": 694 }, { "epoch": 0.8498168498168498, "grad_norm": 4.545189380645752, "learning_rate": 4.684399551596778e-06, "loss": 1.130218744277954, "step": 696 }, { "epoch": 0.8522588522588522, "grad_norm": 87.15223693847656, "learning_rate": 4.682075409181928e-06, "loss": 0.9914512634277344, "step": 698 }, { "epoch": 0.8547008547008547, "grad_norm": 1.9779729843139648, "learning_rate": 4.6797433907892385e-06, "loss": 1.0588513612747192, "step": 700 }, { "epoch": 0.8571428571428571, "grad_norm": 1.3967251777648926, "learning_rate": 4.677403505925027e-06, "loss": 1.0360347032546997, "step": 702 }, { "epoch": 0.8595848595848596, "grad_norm": 2.3286092281341553, "learning_rate": 4.6750557641276805e-06, "loss": 0.9465680122375488, "step": 704 }, { "epoch": 0.8620268620268621, "grad_norm": 1.6770824193954468, "learning_rate": 4.672700174967613e-06, "loss": 0.4001966118812561, "step": 706 }, { "epoch": 0.8644688644688645, "grad_norm": 1.5843091011047363, "learning_rate": 4.6703367480472304e-06, "loss": 1.1794531345367432, "step": 708 }, { "epoch": 0.8669108669108669, "grad_norm": 7.7685017585754395, "learning_rate": 4.667965493000883e-06, "loss": 0.8692483901977539, "step": 710 }, { "epoch": 0.8693528693528694, "grad_norm": 7.2869977951049805, "learning_rate": 4.665586419494837e-06, "loss": 1.3455827236175537, "step": 712 }, { "epoch": 0.8717948717948718, "grad_norm": 2.4024009704589844, "learning_rate": 4.66319953722723e-06, "loss": 1.53640615940094, "step": 714 }, { "epoch": 0.8742368742368742, "grad_norm": 1.8035765886306763, "learning_rate": 4.660804855928029e-06, "loss": 1.400252103805542, "step": 716 }, { "epoch": 0.8766788766788767, "grad_norm": 2.3839004039764404, "learning_rate": 4.658402385358992e-06, "loss": 0.88499915599823, "step": 718 }, { "epoch": 0.8791208791208791, "grad_norm": 2.779142141342163, "learning_rate": 4.655992135313634e-06, "loss": 1.1850217580795288, "step": 720 }, { "epoch": 0.8815628815628815, "grad_norm": 5.73760986328125, "learning_rate": 4.6535741156171796e-06, "loss": 0.8437918424606323, "step": 722 }, { "epoch": 0.884004884004884, "grad_norm": 2.219005584716797, "learning_rate": 4.651148336126527e-06, "loss": 1.2156010866165161, "step": 724 }, { "epoch": 0.8864468864468864, "grad_norm": 2.8996713161468506, "learning_rate": 4.6487148067302065e-06, "loss": 1.1615610122680664, "step": 726 }, { "epoch": 0.8888888888888888, "grad_norm": 1.5215322971343994, "learning_rate": 4.646273537348337e-06, "loss": 1.15150785446167, "step": 728 }, { "epoch": 0.8913308913308914, "grad_norm": 1.62069833278656, "learning_rate": 4.643824537932595e-06, "loss": 1.3772497177124023, "step": 730 }, { "epoch": 0.8937728937728938, "grad_norm": 5.082972526550293, "learning_rate": 4.641367818466164e-06, "loss": 1.1609463691711426, "step": 732 }, { "epoch": 0.8962148962148963, "grad_norm": 2.289997100830078, "learning_rate": 4.6389033889637e-06, "loss": 0.9794567227363586, "step": 734 }, { "epoch": 0.8986568986568987, "grad_norm": 1.610153317451477, "learning_rate": 4.636431259471284e-06, "loss": 0.7476667165756226, "step": 736 }, { "epoch": 0.9010989010989011, "grad_norm": 3.150763511657715, "learning_rate": 4.633951440066391e-06, "loss": 0.8690844774246216, "step": 738 }, { "epoch": 0.9035409035409036, "grad_norm": 4.614825248718262, "learning_rate": 4.631463940857841e-06, "loss": 1.0103671550750732, "step": 740 }, { "epoch": 0.905982905982906, "grad_norm": 1.2555079460144043, "learning_rate": 4.6289687719857595e-06, "loss": 0.873469352722168, "step": 742 }, { "epoch": 0.9084249084249084, "grad_norm": 1.3857243061065674, "learning_rate": 4.626465943621538e-06, "loss": 0.8869038224220276, "step": 744 }, { "epoch": 0.9108669108669109, "grad_norm": 8.226287841796875, "learning_rate": 4.623955465967791e-06, "loss": 1.1199119091033936, "step": 746 }, { "epoch": 0.9133089133089133, "grad_norm": 2.548466920852661, "learning_rate": 4.621437349258316e-06, "loss": 0.8345762491226196, "step": 748 }, { "epoch": 0.9157509157509157, "grad_norm": 1.7079286575317383, "learning_rate": 4.618911603758047e-06, "loss": 1.3368088006973267, "step": 750 }, { "epoch": 0.9181929181929182, "grad_norm": 1.5761820077896118, "learning_rate": 4.616378239763021e-06, "loss": 1.2190864086151123, "step": 752 }, { "epoch": 0.9206349206349206, "grad_norm": 1.6565887928009033, "learning_rate": 4.613837267600328e-06, "loss": 1.2295691967010498, "step": 754 }, { "epoch": 0.9230769230769231, "grad_norm": 1.4082704782485962, "learning_rate": 4.611288697628074e-06, "loss": 1.2072789669036865, "step": 756 }, { "epoch": 0.9255189255189256, "grad_norm": 2.3025150299072266, "learning_rate": 4.608732540235336e-06, "loss": 1.2459933757781982, "step": 758 }, { "epoch": 0.927960927960928, "grad_norm": 2.0221993923187256, "learning_rate": 4.60616880584212e-06, "loss": 0.7390088438987732, "step": 760 }, { "epoch": 0.9304029304029304, "grad_norm": 3.3428783416748047, "learning_rate": 4.603597504899322e-06, "loss": 1.0999096632003784, "step": 762 }, { "epoch": 0.9328449328449329, "grad_norm": 1.6523158550262451, "learning_rate": 4.601018647888677e-06, "loss": 0.8729748129844666, "step": 764 }, { "epoch": 0.9352869352869353, "grad_norm": 62.29137420654297, "learning_rate": 4.598432245322729e-06, "loss": 0.5466877818107605, "step": 766 }, { "epoch": 0.9377289377289377, "grad_norm": 1.815721035003662, "learning_rate": 4.595838307744775e-06, "loss": 0.8582451939582825, "step": 768 }, { "epoch": 0.9401709401709402, "grad_norm": 1.5688364505767822, "learning_rate": 4.593236845728832e-06, "loss": 0.7072654962539673, "step": 770 }, { "epoch": 0.9426129426129426, "grad_norm": 1.4338949918746948, "learning_rate": 4.590627869879586e-06, "loss": 1.053293228149414, "step": 772 }, { "epoch": 0.945054945054945, "grad_norm": 2.2348110675811768, "learning_rate": 4.588011390832357e-06, "loss": 1.2199137210845947, "step": 774 }, { "epoch": 0.9474969474969475, "grad_norm": 2.0182247161865234, "learning_rate": 4.585387419253048e-06, "loss": 1.1575353145599365, "step": 776 }, { "epoch": 0.9499389499389499, "grad_norm": 2.0367462635040283, "learning_rate": 4.582755965838105e-06, "loss": 0.842775821685791, "step": 778 }, { "epoch": 0.9523809523809523, "grad_norm": 1.4695345163345337, "learning_rate": 4.580117041314476e-06, "loss": 0.9546113610267639, "step": 780 }, { "epoch": 0.9548229548229549, "grad_norm": 4.101283550262451, "learning_rate": 4.577470656439562e-06, "loss": 1.0789297819137573, "step": 782 }, { "epoch": 0.9572649572649573, "grad_norm": 1.6646188497543335, "learning_rate": 4.574816822001175e-06, "loss": 1.2004691362380981, "step": 784 }, { "epoch": 0.9597069597069597, "grad_norm": 1.4552080631256104, "learning_rate": 4.572155548817498e-06, "loss": 1.2365154027938843, "step": 786 }, { "epoch": 0.9621489621489622, "grad_norm": 2.4475789070129395, "learning_rate": 4.5694868477370325e-06, "loss": 1.330816388130188, "step": 788 }, { "epoch": 0.9645909645909646, "grad_norm": 2.0710349082946777, "learning_rate": 4.566810729638565e-06, "loss": 1.2555092573165894, "step": 790 }, { "epoch": 0.967032967032967, "grad_norm": 1.5041587352752686, "learning_rate": 4.564127205431112e-06, "loss": 1.0104345083236694, "step": 792 }, { "epoch": 0.9694749694749695, "grad_norm": 1.5308382511138916, "learning_rate": 4.5614362860538855e-06, "loss": 1.2593212127685547, "step": 794 }, { "epoch": 0.9719169719169719, "grad_norm": 1.3870609998703003, "learning_rate": 4.558737982476238e-06, "loss": 1.1537375450134277, "step": 796 }, { "epoch": 0.9743589743589743, "grad_norm": 1.61140775680542, "learning_rate": 4.556032305697628e-06, "loss": 1.154402732849121, "step": 798 }, { "epoch": 0.9768009768009768, "grad_norm": 2.04144024848938, "learning_rate": 4.553319266747566e-06, "loss": 1.2140703201293945, "step": 800 }, { "epoch": 0.9792429792429792, "grad_norm": 2.3697726726531982, "learning_rate": 4.550598876685578e-06, "loss": 0.895045280456543, "step": 802 }, { "epoch": 0.9816849816849816, "grad_norm": 1.659780502319336, "learning_rate": 4.547871146601154e-06, "loss": 1.1046396493911743, "step": 804 }, { "epoch": 0.9841269841269841, "grad_norm": 2.151529312133789, "learning_rate": 4.545136087613705e-06, "loss": 1.5022149085998535, "step": 806 }, { "epoch": 0.9865689865689866, "grad_norm": 2.602121353149414, "learning_rate": 4.5423937108725195e-06, "loss": 0.8793852925300598, "step": 808 }, { "epoch": 0.989010989010989, "grad_norm": 3.102078914642334, "learning_rate": 4.5396440275567135e-06, "loss": 1.256363034248352, "step": 810 }, { "epoch": 0.9914529914529915, "grad_norm": 2.6925265789031982, "learning_rate": 4.536887048875191e-06, "loss": 1.2054369449615479, "step": 812 }, { "epoch": 0.9938949938949939, "grad_norm": 2.5501744747161865, "learning_rate": 4.5341227860665935e-06, "loss": 1.278929591178894, "step": 814 }, { "epoch": 0.9963369963369964, "grad_norm": 1.176234483718872, "learning_rate": 4.531351250399254e-06, "loss": 1.1633700132369995, "step": 816 }, { "epoch": 0.9987789987789988, "grad_norm": 1.4937130212783813, "learning_rate": 4.5285724531711575e-06, "loss": 1.0776214599609375, "step": 818 }, { "epoch": 1.0012210012210012, "grad_norm": 3.934837818145752, "learning_rate": 4.525786405709885e-06, "loss": 0.9735159873962402, "step": 820 }, { "epoch": 1.0036630036630036, "grad_norm": 1.3186302185058594, "learning_rate": 4.5229931193725775e-06, "loss": 1.1400266885757446, "step": 822 }, { "epoch": 1.006105006105006, "grad_norm": 3.5487184524536133, "learning_rate": 4.520192605545879e-06, "loss": 0.522385835647583, "step": 824 }, { "epoch": 1.0085470085470085, "grad_norm": 1.5596842765808105, "learning_rate": 4.517384875645903e-06, "loss": 1.0808534622192383, "step": 826 }, { "epoch": 1.010989010989011, "grad_norm": 12.584797859191895, "learning_rate": 4.514569941118172e-06, "loss": 0.8573816418647766, "step": 828 }, { "epoch": 1.0134310134310134, "grad_norm": 2.6060709953308105, "learning_rate": 4.511747813437582e-06, "loss": 0.8253161907196045, "step": 830 }, { "epoch": 1.0158730158730158, "grad_norm": 6.2511420249938965, "learning_rate": 4.50891850410835e-06, "loss": 0.8468748331069946, "step": 832 }, { "epoch": 1.0183150183150182, "grad_norm": 1.8716773986816406, "learning_rate": 4.506082024663969e-06, "loss": 0.833984375, "step": 834 }, { "epoch": 1.0207570207570207, "grad_norm": 2.3107144832611084, "learning_rate": 4.503238386667159e-06, "loss": 1.1121944189071655, "step": 836 }, { "epoch": 1.0231990231990231, "grad_norm": 1.7429516315460205, "learning_rate": 4.500387601709824e-06, "loss": 1.1830196380615234, "step": 838 }, { "epoch": 1.0256410256410255, "grad_norm": 2.6718502044677734, "learning_rate": 4.497529681413001e-06, "loss": 0.8847705125808716, "step": 840 }, { "epoch": 1.028083028083028, "grad_norm": 3.5513010025024414, "learning_rate": 4.4946646374268105e-06, "loss": 0.9079216122627258, "step": 842 }, { "epoch": 1.0305250305250304, "grad_norm": 1.6211357116699219, "learning_rate": 4.491792481430419e-06, "loss": 1.1324057579040527, "step": 844 }, { "epoch": 1.032967032967033, "grad_norm": 4.688745975494385, "learning_rate": 4.488913225131977e-06, "loss": 0.8587746620178223, "step": 846 }, { "epoch": 1.0354090354090355, "grad_norm": 1.1522576808929443, "learning_rate": 4.4860268802685865e-06, "loss": 1.1451654434204102, "step": 848 }, { "epoch": 1.037851037851038, "grad_norm": 8.89565658569336, "learning_rate": 4.483133458606239e-06, "loss": 0.93172687292099, "step": 850 }, { "epoch": 1.0402930402930404, "grad_norm": 2.5761866569519043, "learning_rate": 4.480232971939777e-06, "loss": 0.6277377605438232, "step": 852 }, { "epoch": 1.0427350427350428, "grad_norm": 1.6280884742736816, "learning_rate": 4.477325432092845e-06, "loss": 1.103888750076294, "step": 854 }, { "epoch": 1.0451770451770452, "grad_norm": 6.281750202178955, "learning_rate": 4.474410850917835e-06, "loss": 0.8592535257339478, "step": 856 }, { "epoch": 1.0476190476190477, "grad_norm": 2.389967203140259, "learning_rate": 4.471489240295845e-06, "loss": 1.1699893474578857, "step": 858 }, { "epoch": 1.05006105006105, "grad_norm": 1.8374966382980347, "learning_rate": 4.4685606121366295e-06, "loss": 0.7583469748497009, "step": 860 }, { "epoch": 1.0525030525030525, "grad_norm": 1.7573800086975098, "learning_rate": 4.4656249783785465e-06, "loss": 1.051936149597168, "step": 862 }, { "epoch": 1.054945054945055, "grad_norm": 3.5590884685516357, "learning_rate": 4.462682350988513e-06, "loss": 1.1263583898544312, "step": 864 }, { "epoch": 1.0573870573870574, "grad_norm": 2.979887008666992, "learning_rate": 4.459732741961957e-06, "loss": 1.4335747957229614, "step": 866 }, { "epoch": 1.0598290598290598, "grad_norm": 2.712446689605713, "learning_rate": 4.456776163322761e-06, "loss": 0.39710327982902527, "step": 868 }, { "epoch": 1.0622710622710623, "grad_norm": 1.8534517288208008, "learning_rate": 4.453812627123227e-06, "loss": 0.9377206563949585, "step": 870 }, { "epoch": 1.0647130647130647, "grad_norm": 1.9186785221099854, "learning_rate": 4.450842145444012e-06, "loss": 1.0142245292663574, "step": 872 }, { "epoch": 1.0671550671550671, "grad_norm": 1.7083848714828491, "learning_rate": 4.4478647303940905e-06, "loss": 0.7915772199630737, "step": 874 }, { "epoch": 1.0695970695970696, "grad_norm": 2.7380499839782715, "learning_rate": 4.4448803941106964e-06, "loss": 1.10654878616333, "step": 876 }, { "epoch": 1.072039072039072, "grad_norm": 2.324699878692627, "learning_rate": 4.44188914875928e-06, "loss": 1.0548545122146606, "step": 878 }, { "epoch": 1.0744810744810744, "grad_norm": 1.84871506690979, "learning_rate": 4.438891006533456e-06, "loss": 0.747241735458374, "step": 880 }, { "epoch": 1.0769230769230769, "grad_norm": 1.8665426969528198, "learning_rate": 4.435885979654953e-06, "loss": 1.0984582901000977, "step": 882 }, { "epoch": 1.0793650793650793, "grad_norm": 3.5970113277435303, "learning_rate": 4.432874080373565e-06, "loss": 0.7559424638748169, "step": 884 }, { "epoch": 1.0818070818070817, "grad_norm": 8.55659294128418, "learning_rate": 4.4298553209671e-06, "loss": 0.6807610988616943, "step": 886 }, { "epoch": 1.0842490842490842, "grad_norm": 2.020759344100952, "learning_rate": 4.426829713741332e-06, "loss": 1.144335389137268, "step": 888 }, { "epoch": 1.0866910866910866, "grad_norm": 1.3028897047042847, "learning_rate": 4.4237972710299475e-06, "loss": 0.8821287751197815, "step": 890 }, { "epoch": 1.089133089133089, "grad_norm": 4.502945899963379, "learning_rate": 4.420758005194502e-06, "loss": 1.0961552858352661, "step": 892 }, { "epoch": 1.0915750915750915, "grad_norm": 2.57550048828125, "learning_rate": 4.417711928624358e-06, "loss": 1.079803705215454, "step": 894 }, { "epoch": 1.0940170940170941, "grad_norm": 1.6765140295028687, "learning_rate": 4.41465905373665e-06, "loss": 1.0386958122253418, "step": 896 }, { "epoch": 1.0964590964590966, "grad_norm": 1.710028052330017, "learning_rate": 4.411599392976217e-06, "loss": 0.8400865793228149, "step": 898 }, { "epoch": 1.098901098901099, "grad_norm": 1.6493473052978516, "learning_rate": 4.408532958815566e-06, "loss": 0.7645131945610046, "step": 900 }, { "epoch": 1.1013431013431014, "grad_norm": 5.944808006286621, "learning_rate": 4.405459763754814e-06, "loss": 0.5732899904251099, "step": 902 }, { "epoch": 1.1037851037851039, "grad_norm": 2.6218488216400146, "learning_rate": 4.402379820321636e-06, "loss": 0.6524146199226379, "step": 904 }, { "epoch": 1.1062271062271063, "grad_norm": 6.440639972686768, "learning_rate": 4.399293141071219e-06, "loss": 1.0054997205734253, "step": 906 }, { "epoch": 1.1086691086691087, "grad_norm": 2.3345463275909424, "learning_rate": 4.396199738586208e-06, "loss": 1.0879311561584473, "step": 908 }, { "epoch": 1.1111111111111112, "grad_norm": 1.7419912815093994, "learning_rate": 4.393099625476652e-06, "loss": 0.7915565371513367, "step": 910 }, { "epoch": 1.1135531135531136, "grad_norm": 7.168088912963867, "learning_rate": 4.389992814379959e-06, "loss": 1.140365719795227, "step": 912 }, { "epoch": 1.115995115995116, "grad_norm": 4.943615913391113, "learning_rate": 4.386879317960839e-06, "loss": 0.7865337133407593, "step": 914 }, { "epoch": 1.1184371184371185, "grad_norm": 2.3716390132904053, "learning_rate": 4.383759148911254e-06, "loss": 0.8161624670028687, "step": 916 }, { "epoch": 1.120879120879121, "grad_norm": 1.9857978820800781, "learning_rate": 4.380632319950368e-06, "loss": 1.118779182434082, "step": 918 }, { "epoch": 1.1233211233211233, "grad_norm": 1.886316180229187, "learning_rate": 4.377498843824491e-06, "loss": 1.064637541770935, "step": 920 }, { "epoch": 1.1257631257631258, "grad_norm": 1.9790380001068115, "learning_rate": 4.374358733307035e-06, "loss": 0.8831157684326172, "step": 922 }, { "epoch": 1.1282051282051282, "grad_norm": 2.9242825508117676, "learning_rate": 4.37121200119845e-06, "loss": 0.7275701761245728, "step": 924 }, { "epoch": 1.1306471306471306, "grad_norm": 3.6830453872680664, "learning_rate": 4.368058660326182e-06, "loss": 0.6424413323402405, "step": 926 }, { "epoch": 1.133089133089133, "grad_norm": 23.97179412841797, "learning_rate": 4.364898723544618e-06, "loss": 0.5658762454986572, "step": 928 }, { "epoch": 1.1355311355311355, "grad_norm": 1.438925862312317, "learning_rate": 4.361732203735032e-06, "loss": 1.0492331981658936, "step": 930 }, { "epoch": 1.137973137973138, "grad_norm": 2.813020706176758, "learning_rate": 4.358559113805531e-06, "loss": 1.0911149978637695, "step": 932 }, { "epoch": 1.1404151404151404, "grad_norm": 3.2957863807678223, "learning_rate": 4.355379466691008e-06, "loss": 0.9345967769622803, "step": 934 }, { "epoch": 1.1428571428571428, "grad_norm": 1.7585479021072388, "learning_rate": 4.3521932753530856e-06, "loss": 0.9201436042785645, "step": 936 }, { "epoch": 1.1452991452991452, "grad_norm": 3.857144594192505, "learning_rate": 4.34900055278006e-06, "loss": 1.0130703449249268, "step": 938 }, { "epoch": 1.1477411477411477, "grad_norm": 1.5104448795318604, "learning_rate": 4.345801311986855e-06, "loss": 1.0904299020767212, "step": 940 }, { "epoch": 1.15018315018315, "grad_norm": 1.7477518320083618, "learning_rate": 4.342595566014965e-06, "loss": 1.0437507629394531, "step": 942 }, { "epoch": 1.1526251526251525, "grad_norm": 2.9423069953918457, "learning_rate": 4.339383327932402e-06, "loss": 0.820652425289154, "step": 944 }, { "epoch": 1.155067155067155, "grad_norm": 1.767242670059204, "learning_rate": 4.33616461083364e-06, "loss": 0.788296103477478, "step": 946 }, { "epoch": 1.1575091575091574, "grad_norm": 10.629752159118652, "learning_rate": 4.33293942783957e-06, "loss": 0.6116584539413452, "step": 948 }, { "epoch": 1.1599511599511598, "grad_norm": 3.033034086227417, "learning_rate": 4.329707792097436e-06, "loss": 1.0888707637786865, "step": 950 }, { "epoch": 1.1623931623931625, "grad_norm": 1.710669755935669, "learning_rate": 4.326469716780787e-06, "loss": 1.091694712638855, "step": 952 }, { "epoch": 1.164835164835165, "grad_norm": 3.053687572479248, "learning_rate": 4.323225215089425e-06, "loss": 0.9434468746185303, "step": 954 }, { "epoch": 1.1672771672771673, "grad_norm": 1.6665173768997192, "learning_rate": 4.319974300249346e-06, "loss": 0.8895185589790344, "step": 956 }, { "epoch": 1.1697191697191698, "grad_norm": 5.074321269989014, "learning_rate": 4.3167169855126885e-06, "loss": 0.7732067108154297, "step": 958 }, { "epoch": 1.1721611721611722, "grad_norm": 3.9608309268951416, "learning_rate": 4.313453284157683e-06, "loss": 1.162553071975708, "step": 960 }, { "epoch": 1.1746031746031746, "grad_norm": 2.7939271926879883, "learning_rate": 4.310183209488592e-06, "loss": 0.9017969369888306, "step": 962 }, { "epoch": 1.177045177045177, "grad_norm": 2.613326072692871, "learning_rate": 4.306906774835658e-06, "loss": 0.7510517835617065, "step": 964 }, { "epoch": 1.1794871794871795, "grad_norm": 0.82303386926651, "learning_rate": 4.303623993555051e-06, "loss": 1.073706030845642, "step": 966 }, { "epoch": 1.181929181929182, "grad_norm": 4.074831962585449, "learning_rate": 4.300334879028813e-06, "loss": 1.135977029800415, "step": 968 }, { "epoch": 1.1843711843711844, "grad_norm": 3.945430040359497, "learning_rate": 4.2970394446648015e-06, "loss": 0.7781526446342468, "step": 970 }, { "epoch": 1.1868131868131868, "grad_norm": 1.8157840967178345, "learning_rate": 4.293737703896636e-06, "loss": 1.197265625, "step": 972 }, { "epoch": 1.1892551892551892, "grad_norm": 2.2199547290802, "learning_rate": 4.290429670183648e-06, "loss": 0.8672367334365845, "step": 974 }, { "epoch": 1.1916971916971917, "grad_norm": 11.338154792785645, "learning_rate": 4.287115357010816e-06, "loss": 0.7362724542617798, "step": 976 }, { "epoch": 1.1941391941391941, "grad_norm": 1.7665647268295288, "learning_rate": 4.283794777888718e-06, "loss": 0.8837488293647766, "step": 978 }, { "epoch": 1.1965811965811965, "grad_norm": 1.333461880683899, "learning_rate": 4.280467946353478e-06, "loss": 1.094375491142273, "step": 980 }, { "epoch": 1.199023199023199, "grad_norm": 1.3403749465942383, "learning_rate": 4.277134875966703e-06, "loss": 1.0798702239990234, "step": 982 }, { "epoch": 1.2014652014652014, "grad_norm": 1.700862169265747, "learning_rate": 4.273795580315437e-06, "loss": 1.195528507232666, "step": 984 }, { "epoch": 1.2039072039072038, "grad_norm": 8.78385066986084, "learning_rate": 4.270450073012095e-06, "loss": 0.7649343013763428, "step": 986 }, { "epoch": 1.2063492063492063, "grad_norm": 67.73321533203125, "learning_rate": 4.267098367694419e-06, "loss": 0.7331146001815796, "step": 988 }, { "epoch": 1.2087912087912087, "grad_norm": 3.189149856567383, "learning_rate": 4.263740478025412e-06, "loss": 0.8756888508796692, "step": 990 }, { "epoch": 1.2112332112332111, "grad_norm": 4.3046770095825195, "learning_rate": 4.2603764176932925e-06, "loss": 1.1108595132827759, "step": 992 }, { "epoch": 1.2136752136752136, "grad_norm": 2.0171821117401123, "learning_rate": 4.257006200411429e-06, "loss": 1.0103721618652344, "step": 994 }, { "epoch": 1.2161172161172162, "grad_norm": 2.713459014892578, "learning_rate": 4.25362983991829e-06, "loss": 0.9784596562385559, "step": 996 }, { "epoch": 1.2185592185592187, "grad_norm": 1.9199753999710083, "learning_rate": 4.250247349977385e-06, "loss": 1.062201738357544, "step": 998 }, { "epoch": 1.221001221001221, "grad_norm": 11.171542167663574, "learning_rate": 4.246858744377212e-06, "loss": 0.744211733341217, "step": 1000 }, { "epoch": 1.2234432234432235, "grad_norm": 2.0410537719726562, "learning_rate": 4.243464036931198e-06, "loss": 1.0498521327972412, "step": 1002 }, { "epoch": 1.225885225885226, "grad_norm": 1.5847947597503662, "learning_rate": 4.240063241477643e-06, "loss": 1.1089041233062744, "step": 1004 }, { "epoch": 1.2283272283272284, "grad_norm": 3.658682346343994, "learning_rate": 4.2366563718796664e-06, "loss": 0.8046331405639648, "step": 1006 }, { "epoch": 1.2307692307692308, "grad_norm": 1.940625548362732, "learning_rate": 4.233243442025145e-06, "loss": 0.7440409064292908, "step": 1008 }, { "epoch": 1.2332112332112333, "grad_norm": 1.9868489503860474, "learning_rate": 4.229824465826665e-06, "loss": 1.144100308418274, "step": 1010 }, { "epoch": 1.2356532356532357, "grad_norm": 1.2745945453643799, "learning_rate": 4.226399457221454e-06, "loss": 0.6603936553001404, "step": 1012 }, { "epoch": 1.2380952380952381, "grad_norm": 1.5920745134353638, "learning_rate": 4.222968430171336e-06, "loss": 1.1303434371948242, "step": 1014 }, { "epoch": 1.2405372405372406, "grad_norm": 4.021664619445801, "learning_rate": 4.219531398662665e-06, "loss": 1.0450407266616821, "step": 1016 }, { "epoch": 1.242979242979243, "grad_norm": 1.6237807273864746, "learning_rate": 4.216088376706274e-06, "loss": 1.0899841785430908, "step": 1018 }, { "epoch": 1.2454212454212454, "grad_norm": 2.2023823261260986, "learning_rate": 4.212639378337413e-06, "loss": 0.7024634480476379, "step": 1020 }, { "epoch": 1.2478632478632479, "grad_norm": 8.069097518920898, "learning_rate": 4.209184417615697e-06, "loss": 0.9512033462524414, "step": 1022 }, { "epoch": 1.2503052503052503, "grad_norm": 1.6683331727981567, "learning_rate": 4.2057235086250455e-06, "loss": 1.052414059638977, "step": 1024 }, { "epoch": 1.2527472527472527, "grad_norm": 3.130899667739868, "learning_rate": 4.2022566654736255e-06, "loss": 1.0695925951004028, "step": 1026 }, { "epoch": 1.2551892551892552, "grad_norm": 1.8631014823913574, "learning_rate": 4.198783902293794e-06, "loss": 0.9780709147453308, "step": 1028 }, { "epoch": 1.2576312576312576, "grad_norm": 2.728553295135498, "learning_rate": 4.1953052332420415e-06, "loss": 0.9186390042304993, "step": 1030 }, { "epoch": 1.26007326007326, "grad_norm": 1.7069987058639526, "learning_rate": 4.191820672498931e-06, "loss": 1.138177514076233, "step": 1032 }, { "epoch": 1.2625152625152625, "grad_norm": 3.96309494972229, "learning_rate": 4.188330234269046e-06, "loss": 1.230303406715393, "step": 1034 }, { "epoch": 1.264957264957265, "grad_norm": 0.6388441920280457, "learning_rate": 4.184833932780927e-06, "loss": 0.7601897716522217, "step": 1036 }, { "epoch": 1.2673992673992673, "grad_norm": 2.074471950531006, "learning_rate": 4.181331782287015e-06, "loss": 0.6320565938949585, "step": 1038 }, { "epoch": 1.2698412698412698, "grad_norm": 1.3992935419082642, "learning_rate": 4.177823797063597e-06, "loss": 0.7402109503746033, "step": 1040 }, { "epoch": 1.2722832722832722, "grad_norm": 1.8529661893844604, "learning_rate": 4.174309991410742e-06, "loss": 1.1013227701187134, "step": 1042 }, { "epoch": 1.2747252747252746, "grad_norm": 1.740545392036438, "learning_rate": 4.1707903796522474e-06, "loss": 0.9940573573112488, "step": 1044 }, { "epoch": 1.277167277167277, "grad_norm": 3.5190329551696777, "learning_rate": 4.1672649761355785e-06, "loss": 1.0399502515792847, "step": 1046 }, { "epoch": 1.2796092796092795, "grad_norm": 3.40808367729187, "learning_rate": 4.163733795231808e-06, "loss": 0.8423551321029663, "step": 1048 }, { "epoch": 1.282051282051282, "grad_norm": 8.643896102905273, "learning_rate": 4.160196851335564e-06, "loss": 0.3857470452785492, "step": 1050 }, { "epoch": 1.2844932844932844, "grad_norm": 2.840670347213745, "learning_rate": 4.156654158864964e-06, "loss": 1.0681036710739136, "step": 1052 }, { "epoch": 1.2869352869352868, "grad_norm": 3.3994009494781494, "learning_rate": 4.15310573226156e-06, "loss": 0.86181640625, "step": 1054 }, { "epoch": 1.2893772893772895, "grad_norm": 5.254836559295654, "learning_rate": 4.149551585990277e-06, "loss": 0.7644107937812805, "step": 1056 }, { "epoch": 1.291819291819292, "grad_norm": 2.2039105892181396, "learning_rate": 4.1459917345393614e-06, "loss": 1.2520135641098022, "step": 1058 }, { "epoch": 1.2942612942612943, "grad_norm": 1.7039287090301514, "learning_rate": 4.142426192420308e-06, "loss": 1.0944513082504272, "step": 1060 }, { "epoch": 1.2967032967032968, "grad_norm": 4.587660789489746, "learning_rate": 4.138854974167818e-06, "loss": 0.6725199222564697, "step": 1062 }, { "epoch": 1.2991452991452992, "grad_norm": 1.5989353656768799, "learning_rate": 4.135278094339725e-06, "loss": 1.1340867280960083, "step": 1064 }, { "epoch": 1.3015873015873016, "grad_norm": 2.533905029296875, "learning_rate": 4.131695567516943e-06, "loss": 0.7726882100105286, "step": 1066 }, { "epoch": 1.304029304029304, "grad_norm": 1.1167593002319336, "learning_rate": 4.1281074083034065e-06, "loss": 1.0258402824401855, "step": 1068 }, { "epoch": 1.3064713064713065, "grad_norm": 8.628117561340332, "learning_rate": 4.12451363132601e-06, "loss": 0.8221207857131958, "step": 1070 }, { "epoch": 1.308913308913309, "grad_norm": 3.239126443862915, "learning_rate": 4.120914251234548e-06, "loss": 1.0316239595413208, "step": 1072 }, { "epoch": 1.3113553113553114, "grad_norm": 3.97194504737854, "learning_rate": 4.117309282701655e-06, "loss": 0.7956058382987976, "step": 1074 }, { "epoch": 1.3137973137973138, "grad_norm": 2.8797948360443115, "learning_rate": 4.1136987404227476e-06, "loss": 0.7710628509521484, "step": 1076 }, { "epoch": 1.3162393162393162, "grad_norm": 6.195582389831543, "learning_rate": 4.110082639115963e-06, "loss": 1.073829174041748, "step": 1078 }, { "epoch": 1.3186813186813187, "grad_norm": 1.3067351579666138, "learning_rate": 4.106460993522101e-06, "loss": 0.9566723108291626, "step": 1080 }, { "epoch": 1.321123321123321, "grad_norm": 2.2232918739318848, "learning_rate": 4.102833818404557e-06, "loss": 0.9678391218185425, "step": 1082 }, { "epoch": 1.3235653235653235, "grad_norm": 2.109621047973633, "learning_rate": 4.099201128549275e-06, "loss": 1.1640703678131104, "step": 1084 }, { "epoch": 1.326007326007326, "grad_norm": 4.480690956115723, "learning_rate": 4.095562938764672e-06, "loss": 1.0956099033355713, "step": 1086 }, { "epoch": 1.3284493284493284, "grad_norm": 2.0747313499450684, "learning_rate": 4.091919263881592e-06, "loss": 1.097609281539917, "step": 1088 }, { "epoch": 1.3308913308913308, "grad_norm": 2.344632387161255, "learning_rate": 4.088270118753232e-06, "loss": 0.7443391680717468, "step": 1090 }, { "epoch": 1.3333333333333333, "grad_norm": 32.47975540161133, "learning_rate": 4.084615518255092e-06, "loss": 1.0534281730651855, "step": 1092 }, { "epoch": 1.3357753357753357, "grad_norm": 1.4418542385101318, "learning_rate": 4.08095547728491e-06, "loss": 1.1028659343719482, "step": 1094 }, { "epoch": 1.3382173382173383, "grad_norm": 6.136029243469238, "learning_rate": 4.077290010762602e-06, "loss": 0.47979384660720825, "step": 1096 }, { "epoch": 1.3406593406593408, "grad_norm": 2.139401435852051, "learning_rate": 4.0736191336301986e-06, "loss": 1.1901733875274658, "step": 1098 }, { "epoch": 1.3431013431013432, "grad_norm": 1.571408987045288, "learning_rate": 4.06994286085179e-06, "loss": 1.075485348701477, "step": 1100 }, { "epoch": 1.3455433455433456, "grad_norm": 1.0710482597351074, "learning_rate": 4.066261207413458e-06, "loss": 1.0476422309875488, "step": 1102 }, { "epoch": 1.347985347985348, "grad_norm": 2.6131324768066406, "learning_rate": 4.06257418832322e-06, "loss": 0.8847273588180542, "step": 1104 }, { "epoch": 1.3504273504273505, "grad_norm": 1.8128620386123657, "learning_rate": 4.058881818610966e-06, "loss": 1.1783521175384521, "step": 1106 }, { "epoch": 1.352869352869353, "grad_norm": 34.26594924926758, "learning_rate": 4.055184113328397e-06, "loss": 0.9166494011878967, "step": 1108 }, { "epoch": 1.3553113553113554, "grad_norm": 1.9319859743118286, "learning_rate": 4.051481087548966e-06, "loss": 1.1042914390563965, "step": 1110 }, { "epoch": 1.3577533577533578, "grad_norm": 2.550018072128296, "learning_rate": 4.047772756367811e-06, "loss": 1.0983607769012451, "step": 1112 }, { "epoch": 1.3601953601953602, "grad_norm": 3.659637212753296, "learning_rate": 4.044059134901701e-06, "loss": 1.0594271421432495, "step": 1114 }, { "epoch": 1.3626373626373627, "grad_norm": 4.164947986602783, "learning_rate": 4.0403402382889676e-06, "loss": 0.4707038700580597, "step": 1116 }, { "epoch": 1.3650793650793651, "grad_norm": 1.7244220972061157, "learning_rate": 4.036616081689447e-06, "loss": 1.137607216835022, "step": 1118 }, { "epoch": 1.3675213675213675, "grad_norm": 1.9371610879898071, "learning_rate": 4.032886680284419e-06, "loss": 1.1212375164031982, "step": 1120 }, { "epoch": 1.36996336996337, "grad_norm": 2.010833263397217, "learning_rate": 4.029152049276541e-06, "loss": 1.0424951314926147, "step": 1122 }, { "epoch": 1.3724053724053724, "grad_norm": 1.6150962114334106, "learning_rate": 4.025412203889791e-06, "loss": 0.9809345602989197, "step": 1124 }, { "epoch": 1.3748473748473748, "grad_norm": 2.5580382347106934, "learning_rate": 4.0216671593694e-06, "loss": 1.2934308052062988, "step": 1126 }, { "epoch": 1.3772893772893773, "grad_norm": 2.092132806777954, "learning_rate": 4.017916930981797e-06, "loss": 1.0607208013534546, "step": 1128 }, { "epoch": 1.3797313797313797, "grad_norm": 2.038407802581787, "learning_rate": 4.014161534014538e-06, "loss": 0.8067485094070435, "step": 1130 }, { "epoch": 1.3821733821733821, "grad_norm": 1.479718804359436, "learning_rate": 4.010400983776253e-06, "loss": 0.7700361609458923, "step": 1132 }, { "epoch": 1.3846153846153846, "grad_norm": 3.232928514480591, "learning_rate": 4.006635295596575e-06, "loss": 0.4854944348335266, "step": 1134 }, { "epoch": 1.387057387057387, "grad_norm": 2.037388563156128, "learning_rate": 4.002864484826083e-06, "loss": 0.9804095029830933, "step": 1136 }, { "epoch": 1.3894993894993894, "grad_norm": 1.7072653770446777, "learning_rate": 3.99908856683624e-06, "loss": 1.1063387393951416, "step": 1138 }, { "epoch": 1.3919413919413919, "grad_norm": 4.661365509033203, "learning_rate": 3.995307557019326e-06, "loss": 0.8346843719482422, "step": 1140 }, { "epoch": 1.3943833943833943, "grad_norm": 2.608985662460327, "learning_rate": 3.991521470788377e-06, "loss": 0.9450017213821411, "step": 1142 }, { "epoch": 1.3968253968253967, "grad_norm": 2.2186226844787598, "learning_rate": 3.987730323577123e-06, "loss": 0.6135491728782654, "step": 1144 }, { "epoch": 1.3992673992673992, "grad_norm": 1.9363148212432861, "learning_rate": 3.983934130839927e-06, "loss": 1.068377137184143, "step": 1146 }, { "epoch": 1.4017094017094016, "grad_norm": 6.124155521392822, "learning_rate": 3.980132908051717e-06, "loss": 0.8843311667442322, "step": 1148 }, { "epoch": 1.404151404151404, "grad_norm": 1.894343376159668, "learning_rate": 3.976326670707927e-06, "loss": 0.7890317440032959, "step": 1150 }, { "epoch": 1.4065934065934065, "grad_norm": 1.4660074710845947, "learning_rate": 3.972515434324432e-06, "loss": 0.8038425445556641, "step": 1152 }, { "epoch": 1.409035409035409, "grad_norm": 1.7170904874801636, "learning_rate": 3.9686992144374854e-06, "loss": 0.9780741930007935, "step": 1154 }, { "epoch": 1.4114774114774113, "grad_norm": 6.812156677246094, "learning_rate": 3.964878026603656e-06, "loss": 0.7489140629768372, "step": 1156 }, { "epoch": 1.4139194139194138, "grad_norm": 3.0899953842163086, "learning_rate": 3.961051886399763e-06, "loss": 1.009106159210205, "step": 1158 }, { "epoch": 1.4163614163614164, "grad_norm": 1.569420576095581, "learning_rate": 3.9572208094228155e-06, "loss": 1.0201953649520874, "step": 1160 }, { "epoch": 1.4188034188034189, "grad_norm": 2.1486785411834717, "learning_rate": 3.9533848112899455e-06, "loss": 0.7411532402038574, "step": 1162 }, { "epoch": 1.4212454212454213, "grad_norm": 13.017099380493164, "learning_rate": 3.949543907638345e-06, "loss": 0.7296299934387207, "step": 1164 }, { "epoch": 1.4236874236874237, "grad_norm": 1.9764689207077026, "learning_rate": 3.945698114125207e-06, "loss": 1.1636407375335693, "step": 1166 }, { "epoch": 1.4261294261294262, "grad_norm": 0.6818609833717346, "learning_rate": 3.941847446427651e-06, "loss": 0.9746972322463989, "step": 1168 }, { "epoch": 1.4285714285714286, "grad_norm": 2.446106433868408, "learning_rate": 3.937991920242671e-06, "loss": 0.8085231184959412, "step": 1170 }, { "epoch": 1.431013431013431, "grad_norm": 2.190028190612793, "learning_rate": 3.934131551287067e-06, "loss": 1.1608608961105347, "step": 1172 }, { "epoch": 1.4334554334554335, "grad_norm": 1.8594470024108887, "learning_rate": 3.930266355297375e-06, "loss": 1.1073782444000244, "step": 1174 }, { "epoch": 1.435897435897436, "grad_norm": 3.316195487976074, "learning_rate": 3.926396348029814e-06, "loss": 1.1658706665039062, "step": 1176 }, { "epoch": 1.4383394383394383, "grad_norm": 2.6010489463806152, "learning_rate": 3.922521545260211e-06, "loss": 0.9183681011199951, "step": 1178 }, { "epoch": 1.4407814407814408, "grad_norm": 5.369879245758057, "learning_rate": 3.918641962783945e-06, "loss": 1.037269949913025, "step": 1180 }, { "epoch": 1.4432234432234432, "grad_norm": 3.0808987617492676, "learning_rate": 3.914757616415877e-06, "loss": 0.8047484755516052, "step": 1182 }, { "epoch": 1.4456654456654456, "grad_norm": 1.6899147033691406, "learning_rate": 3.910868521990289e-06, "loss": 1.117107629776001, "step": 1184 }, { "epoch": 1.448107448107448, "grad_norm": 1.6038181781768799, "learning_rate": 3.906974695360818e-06, "loss": 1.0371313095092773, "step": 1186 }, { "epoch": 1.4505494505494505, "grad_norm": 2.300448179244995, "learning_rate": 3.90307615240039e-06, "loss": 0.8785613179206848, "step": 1188 }, { "epoch": 1.452991452991453, "grad_norm": 1.9171602725982666, "learning_rate": 3.8991729090011585e-06, "loss": 1.0834622383117676, "step": 1190 }, { "epoch": 1.4554334554334554, "grad_norm": 2.6901988983154297, "learning_rate": 3.895264981074438e-06, "loss": 0.8501840829849243, "step": 1192 }, { "epoch": 1.4578754578754578, "grad_norm": 1.8914860486984253, "learning_rate": 3.891352384550639e-06, "loss": 0.8218003511428833, "step": 1194 }, { "epoch": 1.4603174603174602, "grad_norm": 2.6401541233062744, "learning_rate": 3.887435135379202e-06, "loss": 0.7749768495559692, "step": 1196 }, { "epoch": 1.462759462759463, "grad_norm": 3.5819826126098633, "learning_rate": 3.8835132495285344e-06, "loss": 0.9986313581466675, "step": 1198 }, { "epoch": 1.4652014652014653, "grad_norm": 2.515784502029419, "learning_rate": 3.879586742985945e-06, "loss": 1.154970645904541, "step": 1200 }, { "epoch": 1.4676434676434678, "grad_norm": 2.7575578689575195, "learning_rate": 3.875655631757579e-06, "loss": 1.0889326333999634, "step": 1202 }, { "epoch": 1.4700854700854702, "grad_norm": 1.673169493675232, "learning_rate": 3.871719931868352e-06, "loss": 1.109386920928955, "step": 1204 }, { "epoch": 1.4725274725274726, "grad_norm": 3.21140193939209, "learning_rate": 3.867779659361885e-06, "loss": 0.9718731641769409, "step": 1206 }, { "epoch": 1.474969474969475, "grad_norm": 2.298818588256836, "learning_rate": 3.863834830300437e-06, "loss": 0.8030334115028381, "step": 1208 }, { "epoch": 1.4774114774114775, "grad_norm": 3.9100306034088135, "learning_rate": 3.859885460764845e-06, "loss": 0.9156997203826904, "step": 1210 }, { "epoch": 1.47985347985348, "grad_norm": 1.3137868642807007, "learning_rate": 3.855931566854451e-06, "loss": 1.0059466361999512, "step": 1212 }, { "epoch": 1.4822954822954824, "grad_norm": 1.9000264406204224, "learning_rate": 3.851973164687046e-06, "loss": 1.1118829250335693, "step": 1214 }, { "epoch": 1.4847374847374848, "grad_norm": 1.584736943244934, "learning_rate": 3.848010270398792e-06, "loss": 1.0681581497192383, "step": 1216 }, { "epoch": 1.4871794871794872, "grad_norm": 1.8261507749557495, "learning_rate": 3.844042900144167e-06, "loss": 0.2508808970451355, "step": 1218 }, { "epoch": 1.4896214896214897, "grad_norm": 1.896042823791504, "learning_rate": 3.8400710700958945e-06, "loss": 0.6199178695678711, "step": 1220 }, { "epoch": 1.492063492063492, "grad_norm": 2.0678446292877197, "learning_rate": 3.836094796444875e-06, "loss": 1.0399789810180664, "step": 1222 }, { "epoch": 1.4945054945054945, "grad_norm": 6.400730133056641, "learning_rate": 3.832114095400129e-06, "loss": 0.8569754362106323, "step": 1224 }, { "epoch": 1.496947496947497, "grad_norm": 2.1547770500183105, "learning_rate": 3.8281289831887185e-06, "loss": 1.1074395179748535, "step": 1226 }, { "epoch": 1.4993894993894994, "grad_norm": 1.7979967594146729, "learning_rate": 3.824139476055692e-06, "loss": 0.36593061685562134, "step": 1228 }, { "epoch": 1.5018315018315018, "grad_norm": 35.071903228759766, "learning_rate": 3.820145590264012e-06, "loss": 0.8221673965454102, "step": 1230 }, { "epoch": 1.5042735042735043, "grad_norm": 0.9250247478485107, "learning_rate": 3.81614734209449e-06, "loss": 0.6617715954780579, "step": 1232 }, { "epoch": 1.5067155067155067, "grad_norm": 4.005329132080078, "learning_rate": 3.812144747845719e-06, "loss": 1.1474699974060059, "step": 1234 }, { "epoch": 1.5091575091575091, "grad_norm": 2.440639019012451, "learning_rate": 3.808137823834012e-06, "loss": 0.8988032937049866, "step": 1236 }, { "epoch": 1.5115995115995116, "grad_norm": 1.8108290433883667, "learning_rate": 3.80412658639333e-06, "loss": 0.8774833679199219, "step": 1238 }, { "epoch": 1.514041514041514, "grad_norm": 1.4303427934646606, "learning_rate": 3.800111051875217e-06, "loss": 1.0372514724731445, "step": 1240 }, { "epoch": 1.5164835164835164, "grad_norm": 1.5728963613510132, "learning_rate": 3.7960912366487353e-06, "loss": 1.0711747407913208, "step": 1242 }, { "epoch": 1.5189255189255189, "grad_norm": 9.220934867858887, "learning_rate": 3.7920671571003953e-06, "loss": 0.686614453792572, "step": 1244 }, { "epoch": 1.5213675213675213, "grad_norm": 1.5577303171157837, "learning_rate": 3.7880388296340924e-06, "loss": 0.7836710810661316, "step": 1246 }, { "epoch": 1.5238095238095237, "grad_norm": 1.9703376293182373, "learning_rate": 3.7840062706710362e-06, "loss": 0.8961681127548218, "step": 1248 }, { "epoch": 1.5262515262515262, "grad_norm": 2.641063690185547, "learning_rate": 3.7799694966496888e-06, "loss": 1.1727888584136963, "step": 1250 }, { "epoch": 1.5286935286935286, "grad_norm": 5.275555610656738, "learning_rate": 3.775928524025691e-06, "loss": 0.875237226486206, "step": 1252 }, { "epoch": 1.531135531135531, "grad_norm": 1.5248931646347046, "learning_rate": 3.771883369271803e-06, "loss": 1.040828824043274, "step": 1254 }, { "epoch": 1.5335775335775335, "grad_norm": 2.20690655708313, "learning_rate": 3.7678340488778302e-06, "loss": 1.1615933179855347, "step": 1256 }, { "epoch": 1.536019536019536, "grad_norm": 1.435325026512146, "learning_rate": 3.763780579350559e-06, "loss": 0.40704652667045593, "step": 1258 }, { "epoch": 1.5384615384615383, "grad_norm": 14.3430814743042, "learning_rate": 3.759722977213691e-06, "loss": 0.8075951337814331, "step": 1260 }, { "epoch": 1.5409035409035408, "grad_norm": 16.239559173583984, "learning_rate": 3.755661259007774e-06, "loss": 0.6135749816894531, "step": 1262 }, { "epoch": 1.5433455433455432, "grad_norm": 2.538618803024292, "learning_rate": 3.751595441290133e-06, "loss": 0.8490422964096069, "step": 1264 }, { "epoch": 1.5457875457875456, "grad_norm": 2.3163981437683105, "learning_rate": 3.7475255406348067e-06, "loss": 0.8143582940101624, "step": 1266 }, { "epoch": 1.5482295482295483, "grad_norm": 1.8422861099243164, "learning_rate": 3.7434515736324746e-06, "loss": 1.0519959926605225, "step": 1268 }, { "epoch": 1.5506715506715507, "grad_norm": 9.199726104736328, "learning_rate": 3.7393735568903955e-06, "loss": 0.4911290109157562, "step": 1270 }, { "epoch": 1.5531135531135531, "grad_norm": 2.1301679611206055, "learning_rate": 3.7352915070323366e-06, "loss": 1.189732313156128, "step": 1272 }, { "epoch": 1.5555555555555556, "grad_norm": 1.937249779701233, "learning_rate": 3.731205440698501e-06, "loss": 0.9045177102088928, "step": 1274 }, { "epoch": 1.557997557997558, "grad_norm": 2.8137459754943848, "learning_rate": 3.7271153745454726e-06, "loss": 1.390211582183838, "step": 1276 }, { "epoch": 1.5604395604395604, "grad_norm": 2.1598775386810303, "learning_rate": 3.723021325246132e-06, "loss": 0.737874448299408, "step": 1278 }, { "epoch": 1.5628815628815629, "grad_norm": 2.4186580181121826, "learning_rate": 3.7189233094896044e-06, "loss": 1.0836533308029175, "step": 1280 }, { "epoch": 1.5653235653235653, "grad_norm": 2.439676284790039, "learning_rate": 3.714821343981179e-06, "loss": 0.7069857120513916, "step": 1282 }, { "epoch": 1.5677655677655677, "grad_norm": 1.5403668880462646, "learning_rate": 3.7107154454422456e-06, "loss": 1.0703009366989136, "step": 1284 }, { "epoch": 1.5702075702075702, "grad_norm": 3.893155097961426, "learning_rate": 3.706605630610231e-06, "loss": 1.1834505796432495, "step": 1286 }, { "epoch": 1.5726495726495726, "grad_norm": 5.153315544128418, "learning_rate": 3.7024919162385232e-06, "loss": 0.5492372512817383, "step": 1288 }, { "epoch": 1.575091575091575, "grad_norm": 1.3920317888259888, "learning_rate": 3.6983743190964077e-06, "loss": 0.8411808013916016, "step": 1290 }, { "epoch": 1.5775335775335775, "grad_norm": 9.354891777038574, "learning_rate": 3.6942528559689965e-06, "loss": 0.36394214630126953, "step": 1292 }, { "epoch": 1.5799755799755801, "grad_norm": 2.3740155696868896, "learning_rate": 3.690127543657162e-06, "loss": 0.7142713069915771, "step": 1294 }, { "epoch": 1.5824175824175826, "grad_norm": 37.80674362182617, "learning_rate": 3.685998398977468e-06, "loss": 1.0909113883972168, "step": 1296 }, { "epoch": 1.584859584859585, "grad_norm": 1.855957269668579, "learning_rate": 3.6818654387620993e-06, "loss": 1.1598751544952393, "step": 1298 }, { "epoch": 1.5873015873015874, "grad_norm": 2.314946174621582, "learning_rate": 3.677728679858797e-06, "loss": 0.9421340823173523, "step": 1300 }, { "epoch": 1.5897435897435899, "grad_norm": 5.468100070953369, "learning_rate": 3.673588139130784e-06, "loss": 1.2048614025115967, "step": 1302 }, { "epoch": 1.5921855921855923, "grad_norm": 3.331906795501709, "learning_rate": 3.6694438334567024e-06, "loss": 1.1039568185806274, "step": 1304 }, { "epoch": 1.5946275946275947, "grad_norm": 1.5079933404922485, "learning_rate": 3.6652957797305387e-06, "loss": 0.6897386908531189, "step": 1306 }, { "epoch": 1.5970695970695972, "grad_norm": 2.3638577461242676, "learning_rate": 3.661143994861563e-06, "loss": 1.1327297687530518, "step": 1308 }, { "epoch": 1.5995115995115996, "grad_norm": 2.4536283016204834, "learning_rate": 3.6569884957742497e-06, "loss": 1.0871834754943848, "step": 1310 }, { "epoch": 1.601953601953602, "grad_norm": 1.548901915550232, "learning_rate": 3.652829299408217e-06, "loss": 1.0074199438095093, "step": 1312 }, { "epoch": 1.6043956043956045, "grad_norm": 1.3679847717285156, "learning_rate": 3.648666422718155e-06, "loss": 1.1029393672943115, "step": 1314 }, { "epoch": 1.606837606837607, "grad_norm": 2.071131706237793, "learning_rate": 3.644499882673756e-06, "loss": 1.1430408954620361, "step": 1316 }, { "epoch": 1.6092796092796093, "grad_norm": 2.4289538860321045, "learning_rate": 3.6403296962596442e-06, "loss": 1.0161014795303345, "step": 1318 }, { "epoch": 1.6117216117216118, "grad_norm": 1.8402098417282104, "learning_rate": 3.6361558804753088e-06, "loss": 1.2254347801208496, "step": 1320 }, { "epoch": 1.6141636141636142, "grad_norm": 1.484537124633789, "learning_rate": 3.631978452335036e-06, "loss": 1.116368293762207, "step": 1322 }, { "epoch": 1.6166056166056166, "grad_norm": 1.7078075408935547, "learning_rate": 3.6277974288678354e-06, "loss": 1.0890535116195679, "step": 1324 }, { "epoch": 1.619047619047619, "grad_norm": 4.279214382171631, "learning_rate": 3.6236128271173716e-06, "loss": 0.8000863790512085, "step": 1326 }, { "epoch": 1.6214896214896215, "grad_norm": 1.6943376064300537, "learning_rate": 3.6194246641418993e-06, "loss": 1.1035950183868408, "step": 1328 }, { "epoch": 1.623931623931624, "grad_norm": 3.024909257888794, "learning_rate": 3.6152329570141863e-06, "loss": 1.078392744064331, "step": 1330 }, { "epoch": 1.6263736263736264, "grad_norm": 4.725790977478027, "learning_rate": 3.611037722821452e-06, "loss": 0.8447167277336121, "step": 1332 }, { "epoch": 1.6288156288156288, "grad_norm": 1.9349464178085327, "learning_rate": 3.6068389786652915e-06, "loss": 1.1011463403701782, "step": 1334 }, { "epoch": 1.6312576312576312, "grad_norm": 1.9638590812683105, "learning_rate": 3.6026367416616054e-06, "loss": 0.7226951718330383, "step": 1336 }, { "epoch": 1.6336996336996337, "grad_norm": 3.807051420211792, "learning_rate": 3.598431028940539e-06, "loss": 1.0683143138885498, "step": 1338 }, { "epoch": 1.636141636141636, "grad_norm": 2.799273729324341, "learning_rate": 3.594221857646399e-06, "loss": 0.5557500720024109, "step": 1340 }, { "epoch": 1.6385836385836385, "grad_norm": 1.5128666162490845, "learning_rate": 3.5900092449375977e-06, "loss": 0.391013503074646, "step": 1342 }, { "epoch": 1.641025641025641, "grad_norm": 2.4419357776641846, "learning_rate": 3.5857932079865703e-06, "loss": 1.2627594470977783, "step": 1344 }, { "epoch": 1.6434676434676434, "grad_norm": 1.5012274980545044, "learning_rate": 3.5815737639797143e-06, "loss": 1.1198487281799316, "step": 1346 }, { "epoch": 1.6459096459096458, "grad_norm": 1.7359366416931152, "learning_rate": 3.5773509301173136e-06, "loss": 0.7089607119560242, "step": 1348 }, { "epoch": 1.6483516483516483, "grad_norm": 1.7854307889938354, "learning_rate": 3.573124723613473e-06, "loss": 0.7905706763267517, "step": 1350 }, { "epoch": 1.6507936507936507, "grad_norm": 2.4434316158294678, "learning_rate": 3.568895161696042e-06, "loss": 1.0632576942443848, "step": 1352 }, { "epoch": 1.6532356532356531, "grad_norm": 1.7432414293289185, "learning_rate": 3.5646622616065537e-06, "loss": 1.170975685119629, "step": 1354 }, { "epoch": 1.6556776556776556, "grad_norm": 1.8956907987594604, "learning_rate": 3.560426040600143e-06, "loss": 1.0797570943832397, "step": 1356 }, { "epoch": 1.658119658119658, "grad_norm": 1.6335842609405518, "learning_rate": 3.556186515945486e-06, "loss": 0.5945901870727539, "step": 1358 }, { "epoch": 1.6605616605616604, "grad_norm": 2.311692714691162, "learning_rate": 3.5519437049247257e-06, "loss": 0.8245255947113037, "step": 1360 }, { "epoch": 1.6630036630036629, "grad_norm": 2.2353930473327637, "learning_rate": 3.547697624833401e-06, "loss": 1.1110084056854248, "step": 1362 }, { "epoch": 1.6654456654456653, "grad_norm": 1.7413452863693237, "learning_rate": 3.543448292980376e-06, "loss": 1.1027268171310425, "step": 1364 }, { "epoch": 1.6678876678876677, "grad_norm": 1.9247740507125854, "learning_rate": 3.5391957266877724e-06, "loss": 1.0763671398162842, "step": 1366 }, { "epoch": 1.6703296703296702, "grad_norm": 1.415798544883728, "learning_rate": 3.534939943290896e-06, "loss": 1.0487414598464966, "step": 1368 }, { "epoch": 1.6727716727716728, "grad_norm": 2.411515235900879, "learning_rate": 3.530680960138166e-06, "loss": 1.142496109008789, "step": 1370 }, { "epoch": 1.6752136752136753, "grad_norm": 1.571021556854248, "learning_rate": 3.5264187945910465e-06, "loss": 0.6615177392959595, "step": 1372 }, { "epoch": 1.6776556776556777, "grad_norm": 1.4412907361984253, "learning_rate": 3.5221534640239745e-06, "loss": 0.29376649856567383, "step": 1374 }, { "epoch": 1.6800976800976801, "grad_norm": 6.718142509460449, "learning_rate": 3.5178849858242874e-06, "loss": 1.1929081678390503, "step": 1376 }, { "epoch": 1.6825396825396826, "grad_norm": 4.863142013549805, "learning_rate": 3.5136133773921553e-06, "loss": 1.202161192893982, "step": 1378 }, { "epoch": 1.684981684981685, "grad_norm": 0.7358537912368774, "learning_rate": 3.509338656140508e-06, "loss": 0.9144766330718994, "step": 1380 }, { "epoch": 1.6874236874236874, "grad_norm": 4.494753837585449, "learning_rate": 3.505060839494964e-06, "loss": 0.978439211845398, "step": 1382 }, { "epoch": 1.6898656898656899, "grad_norm": 1.7089729309082031, "learning_rate": 3.5007799448937617e-06, "loss": 1.1718627214431763, "step": 1384 }, { "epoch": 1.6923076923076923, "grad_norm": 1.796030879020691, "learning_rate": 3.496495989787683e-06, "loss": 1.0744086503982544, "step": 1386 }, { "epoch": 1.6947496947496947, "grad_norm": 1.5995069742202759, "learning_rate": 3.49220899163999e-06, "loss": 1.1244831085205078, "step": 1388 }, { "epoch": 1.6971916971916972, "grad_norm": 3.2209115028381348, "learning_rate": 3.4879189679263474e-06, "loss": 0.3722049295902252, "step": 1390 }, { "epoch": 1.6996336996336996, "grad_norm": 1.2462571859359741, "learning_rate": 3.4836259361347524e-06, "loss": 1.0250697135925293, "step": 1392 }, { "epoch": 1.702075702075702, "grad_norm": 2.99985408782959, "learning_rate": 3.479329913765467e-06, "loss": 0.946092426776886, "step": 1394 }, { "epoch": 1.7045177045177047, "grad_norm": 3.67580246925354, "learning_rate": 3.475030918330942e-06, "loss": 1.217712163925171, "step": 1396 }, { "epoch": 1.7069597069597071, "grad_norm": 3.188765525817871, "learning_rate": 3.4707289673557486e-06, "loss": 0.9007408022880554, "step": 1398 }, { "epoch": 1.7094017094017095, "grad_norm": 5.768331050872803, "learning_rate": 3.4664240783765064e-06, "loss": 0.4004557728767395, "step": 1400 }, { "epoch": 1.711843711843712, "grad_norm": 5.148880958557129, "learning_rate": 3.4621162689418104e-06, "loss": 0.9390780329704285, "step": 1402 }, { "epoch": 1.7142857142857144, "grad_norm": 1.9988371133804321, "learning_rate": 3.4578055566121617e-06, "loss": 1.065889596939087, "step": 1404 }, { "epoch": 1.7167277167277168, "grad_norm": 4.718473434448242, "learning_rate": 3.453491958959894e-06, "loss": 0.5322512984275818, "step": 1406 }, { "epoch": 1.7191697191697193, "grad_norm": 3.3976686000823975, "learning_rate": 3.449175493569103e-06, "loss": 1.1359853744506836, "step": 1408 }, { "epoch": 1.7216117216117217, "grad_norm": 6.322020530700684, "learning_rate": 3.4448561780355766e-06, "loss": 0.7464244961738586, "step": 1410 }, { "epoch": 1.7240537240537241, "grad_norm": 11.572935104370117, "learning_rate": 3.4405340299667183e-06, "loss": 0.8479959964752197, "step": 1412 }, { "epoch": 1.7264957264957266, "grad_norm": 1.7882882356643677, "learning_rate": 3.436209066981479e-06, "loss": 1.0817737579345703, "step": 1414 }, { "epoch": 1.728937728937729, "grad_norm": 5.552520275115967, "learning_rate": 3.4318813067102853e-06, "loss": 0.9852099418640137, "step": 1416 }, { "epoch": 1.7313797313797314, "grad_norm": 1.9042245149612427, "learning_rate": 3.4275507667949658e-06, "loss": 1.1091506481170654, "step": 1418 }, { "epoch": 1.7338217338217339, "grad_norm": 2.391268491744995, "learning_rate": 3.423217464888681e-06, "loss": 0.8407750725746155, "step": 1420 }, { "epoch": 1.7362637362637363, "grad_norm": 2.475590944290161, "learning_rate": 3.41888141865585e-06, "loss": 0.9131081104278564, "step": 1422 }, { "epoch": 1.7387057387057387, "grad_norm": 5.156746864318848, "learning_rate": 3.4145426457720787e-06, "loss": 0.7782116532325745, "step": 1424 }, { "epoch": 1.7411477411477412, "grad_norm": 7.184075355529785, "learning_rate": 3.4102011639240884e-06, "loss": 0.7344411611557007, "step": 1426 }, { "epoch": 1.7435897435897436, "grad_norm": 2.159703016281128, "learning_rate": 3.4058569908096436e-06, "loss": 1.132224202156067, "step": 1428 }, { "epoch": 1.746031746031746, "grad_norm": 1.8462954759597778, "learning_rate": 3.4015101441374776e-06, "loss": 1.173128366470337, "step": 1430 }, { "epoch": 1.7484737484737485, "grad_norm": 2.538024425506592, "learning_rate": 3.397160641627226e-06, "loss": 0.7561154961585999, "step": 1432 }, { "epoch": 1.750915750915751, "grad_norm": 1.7686879634857178, "learning_rate": 3.392808501009347e-06, "loss": 0.6580084562301636, "step": 1434 }, { "epoch": 1.7533577533577533, "grad_norm": 10.234268188476562, "learning_rate": 3.3884537400250554e-06, "loss": 0.6667467951774597, "step": 1436 }, { "epoch": 1.7557997557997558, "grad_norm": 1.436072826385498, "learning_rate": 3.384096376426247e-06, "loss": 0.5105250477790833, "step": 1438 }, { "epoch": 1.7582417582417582, "grad_norm": 1.6276432275772095, "learning_rate": 3.379736427975425e-06, "loss": 1.0976946353912354, "step": 1440 }, { "epoch": 1.7606837606837606, "grad_norm": 3.592867136001587, "learning_rate": 3.3753739124456343e-06, "loss": 0.8957812786102295, "step": 1442 }, { "epoch": 1.763125763125763, "grad_norm": 4.000123023986816, "learning_rate": 3.371008847620379e-06, "loss": 0.7372997403144836, "step": 1444 }, { "epoch": 1.7655677655677655, "grad_norm": 3.1201529502868652, "learning_rate": 3.366641251293559e-06, "loss": 1.102899193763733, "step": 1446 }, { "epoch": 1.768009768009768, "grad_norm": 1.768283486366272, "learning_rate": 3.3622711412693914e-06, "loss": 1.124794602394104, "step": 1448 }, { "epoch": 1.7704517704517704, "grad_norm": 2.403294801712036, "learning_rate": 3.3578985353623416e-06, "loss": 0.9902628660202026, "step": 1450 }, { "epoch": 1.7728937728937728, "grad_norm": 3.0186891555786133, "learning_rate": 3.3535234513970494e-06, "loss": 0.399064302444458, "step": 1452 }, { "epoch": 1.7753357753357752, "grad_norm": 1.5962026119232178, "learning_rate": 3.349145907208255e-06, "loss": 0.7983530163764954, "step": 1454 }, { "epoch": 1.7777777777777777, "grad_norm": 3.296353816986084, "learning_rate": 3.3447659206407285e-06, "loss": 0.5403007864952087, "step": 1456 }, { "epoch": 1.7802197802197801, "grad_norm": 1.5648705959320068, "learning_rate": 3.3403835095491967e-06, "loss": 1.0592517852783203, "step": 1458 }, { "epoch": 1.7826617826617825, "grad_norm": 3.352639675140381, "learning_rate": 3.3359986917982675e-06, "loss": 1.0402568578720093, "step": 1460 }, { "epoch": 1.785103785103785, "grad_norm": 3.2459142208099365, "learning_rate": 3.3316114852623617e-06, "loss": 0.9993575811386108, "step": 1462 }, { "epoch": 1.7875457875457874, "grad_norm": 2.1725311279296875, "learning_rate": 3.327221907825638e-06, "loss": 0.8232885599136353, "step": 1464 }, { "epoch": 1.7899877899877898, "grad_norm": 2.444363594055176, "learning_rate": 3.3228299773819165e-06, "loss": 0.8555684685707092, "step": 1466 }, { "epoch": 1.7924297924297923, "grad_norm": 4.547183990478516, "learning_rate": 3.318435711834615e-06, "loss": 0.8133440017700195, "step": 1468 }, { "epoch": 1.7948717948717947, "grad_norm": 3.024049758911133, "learning_rate": 3.3140391290966646e-06, "loss": 1.0311592817306519, "step": 1470 }, { "epoch": 1.7973137973137974, "grad_norm": 4.397846221923828, "learning_rate": 3.309640247090445e-06, "loss": 1.0561209917068481, "step": 1472 }, { "epoch": 1.7997557997557998, "grad_norm": 2.594501256942749, "learning_rate": 3.3052390837477087e-06, "loss": 0.6757609248161316, "step": 1474 }, { "epoch": 1.8021978021978022, "grad_norm": 2.992253541946411, "learning_rate": 3.300835657009507e-06, "loss": 0.7614642977714539, "step": 1476 }, { "epoch": 1.8046398046398047, "grad_norm": 5.074526786804199, "learning_rate": 3.2964299848261187e-06, "loss": 0.8146823048591614, "step": 1478 }, { "epoch": 1.807081807081807, "grad_norm": 3.6561779975891113, "learning_rate": 3.2920220851569746e-06, "loss": 0.4933128356933594, "step": 1480 }, { "epoch": 1.8095238095238095, "grad_norm": 5.129440784454346, "learning_rate": 3.2876119759705884e-06, "loss": 0.8365576267242432, "step": 1482 }, { "epoch": 1.811965811965812, "grad_norm": 1.3081094026565552, "learning_rate": 3.2831996752444774e-06, "loss": 1.174236536026001, "step": 1484 }, { "epoch": 1.8144078144078144, "grad_norm": 1.9769134521484375, "learning_rate": 3.2787852009650945e-06, "loss": 1.0928758382797241, "step": 1486 }, { "epoch": 1.8168498168498168, "grad_norm": 3.200984001159668, "learning_rate": 3.2743685711277533e-06, "loss": 0.7248603701591492, "step": 1488 }, { "epoch": 1.8192918192918193, "grad_norm": 1.9529130458831787, "learning_rate": 3.269949803736554e-06, "loss": 0.8898839950561523, "step": 1490 }, { "epoch": 1.8217338217338217, "grad_norm": 2.331352949142456, "learning_rate": 3.265528916804308e-06, "loss": 1.097998857498169, "step": 1492 }, { "epoch": 1.8241758241758241, "grad_norm": 7.352150917053223, "learning_rate": 3.261105928352472e-06, "loss": 0.7203211784362793, "step": 1494 }, { "epoch": 1.8266178266178266, "grad_norm": 8.535738945007324, "learning_rate": 3.2566808564110635e-06, "loss": 0.8137180209159851, "step": 1496 }, { "epoch": 1.8290598290598292, "grad_norm": 1.5943210124969482, "learning_rate": 3.252253719018599e-06, "loss": 1.4954842329025269, "step": 1498 }, { "epoch": 1.8315018315018317, "grad_norm": 2.4209067821502686, "learning_rate": 3.2478245342220094e-06, "loss": 1.2031804323196411, "step": 1500 }, { "epoch": 1.833943833943834, "grad_norm": 3.7259180545806885, "learning_rate": 3.243393320076575e-06, "loss": 0.8611478805541992, "step": 1502 }, { "epoch": 1.8363858363858365, "grad_norm": 0.5359264612197876, "learning_rate": 3.238960094645848e-06, "loss": 0.9046647548675537, "step": 1504 }, { "epoch": 1.838827838827839, "grad_norm": 2.4440624713897705, "learning_rate": 3.2345248760015777e-06, "loss": 0.7731856107711792, "step": 1506 }, { "epoch": 1.8412698412698414, "grad_norm": 1.7057727575302124, "learning_rate": 3.2300876822236427e-06, "loss": 0.8238407373428345, "step": 1508 }, { "epoch": 1.8437118437118438, "grad_norm": 2.0124754905700684, "learning_rate": 3.225648531399968e-06, "loss": 1.0737024545669556, "step": 1510 }, { "epoch": 1.8461538461538463, "grad_norm": 1.904160499572754, "learning_rate": 3.22120744162646e-06, "loss": 1.0689663887023926, "step": 1512 }, { "epoch": 1.8485958485958487, "grad_norm": 1.249457836151123, "learning_rate": 3.2167644310069276e-06, "loss": 1.0780993700027466, "step": 1514 }, { "epoch": 1.8510378510378511, "grad_norm": 3.9271388053894043, "learning_rate": 3.2123195176530104e-06, "loss": 0.833716094493866, "step": 1516 }, { "epoch": 1.8534798534798536, "grad_norm": 1.8167206048965454, "learning_rate": 3.207872719684104e-06, "loss": 1.1510157585144043, "step": 1518 }, { "epoch": 1.855921855921856, "grad_norm": 3.824442148208618, "learning_rate": 3.203424055227287e-06, "loss": 0.9223167896270752, "step": 1520 }, { "epoch": 1.8583638583638584, "grad_norm": 4.120997905731201, "learning_rate": 3.1989735424172456e-06, "loss": 0.9817994832992554, "step": 1522 }, { "epoch": 1.8608058608058609, "grad_norm": 2.414776563644409, "learning_rate": 3.1945211993962035e-06, "loss": 0.9063418507575989, "step": 1524 }, { "epoch": 1.8632478632478633, "grad_norm": 4.998463153839111, "learning_rate": 3.190067044313841e-06, "loss": 0.9489470720291138, "step": 1526 }, { "epoch": 1.8656898656898657, "grad_norm": 1.9804654121398926, "learning_rate": 3.185611095327227e-06, "loss": 0.7647035121917725, "step": 1528 }, { "epoch": 1.8681318681318682, "grad_norm": 1.3335086107254028, "learning_rate": 3.181153370600745e-06, "loss": 0.9383209943771362, "step": 1530 }, { "epoch": 1.8705738705738706, "grad_norm": 4.721079349517822, "learning_rate": 3.176693888306014e-06, "loss": 0.77753746509552, "step": 1532 }, { "epoch": 1.873015873015873, "grad_norm": 2.030644655227661, "learning_rate": 3.1722326666218213e-06, "loss": 0.8778524994850159, "step": 1534 }, { "epoch": 1.8754578754578755, "grad_norm": 1.5334826707839966, "learning_rate": 3.16776972373404e-06, "loss": 1.1086459159851074, "step": 1536 }, { "epoch": 1.877899877899878, "grad_norm": 1.6864469051361084, "learning_rate": 3.1633050778355624e-06, "loss": 1.0293059349060059, "step": 1538 }, { "epoch": 1.8803418803418803, "grad_norm": 2.2873408794403076, "learning_rate": 3.158838747126224e-06, "loss": 1.0864299535751343, "step": 1540 }, { "epoch": 1.8827838827838828, "grad_norm": 1.5731513500213623, "learning_rate": 3.1543707498127267e-06, "loss": 1.0680838823318481, "step": 1542 }, { "epoch": 1.8852258852258852, "grad_norm": 2.0635628700256348, "learning_rate": 3.1499011041085662e-06, "loss": 0.9070185422897339, "step": 1544 }, { "epoch": 1.8876678876678876, "grad_norm": 2.2307991981506348, "learning_rate": 3.145429828233959e-06, "loss": 1.060643196105957, "step": 1546 }, { "epoch": 1.89010989010989, "grad_norm": 3.084059476852417, "learning_rate": 3.1409569404157646e-06, "loss": 1.0800150632858276, "step": 1548 }, { "epoch": 1.8925518925518925, "grad_norm": 2.034463882446289, "learning_rate": 3.136482458887416e-06, "loss": 0.6771202087402344, "step": 1550 }, { "epoch": 1.894993894993895, "grad_norm": 2.416832447052002, "learning_rate": 3.132006401888841e-06, "loss": 1.1564983129501343, "step": 1552 }, { "epoch": 1.8974358974358974, "grad_norm": 2.9857418537139893, "learning_rate": 3.1275287876663905e-06, "loss": 0.8453341126441956, "step": 1554 }, { "epoch": 1.8998778998778998, "grad_norm": 1.9065909385681152, "learning_rate": 3.123049634472764e-06, "loss": 1.206203818321228, "step": 1556 }, { "epoch": 1.9023199023199022, "grad_norm": 1.7331615686416626, "learning_rate": 3.118568960566933e-06, "loss": 0.9110676050186157, "step": 1558 }, { "epoch": 1.9047619047619047, "grad_norm": 2.49706768989563, "learning_rate": 3.114086784214069e-06, "loss": 0.6509535908699036, "step": 1560 }, { "epoch": 1.907203907203907, "grad_norm": 1.9002443552017212, "learning_rate": 3.109603123685468e-06, "loss": 1.080418586730957, "step": 1562 }, { "epoch": 1.9096459096459095, "grad_norm": 3.7310116291046143, "learning_rate": 3.1051179972584756e-06, "loss": 0.7549952268600464, "step": 1564 }, { "epoch": 1.912087912087912, "grad_norm": 1.4353991746902466, "learning_rate": 3.1006314232164146e-06, "loss": 1.083061695098877, "step": 1566 }, { "epoch": 1.9145299145299144, "grad_norm": 2.5150792598724365, "learning_rate": 3.0961434198485067e-06, "loss": 0.9303537607192993, "step": 1568 }, { "epoch": 1.9169719169719168, "grad_norm": 1.2595463991165161, "learning_rate": 3.0916540054498028e-06, "loss": 0.7716434001922607, "step": 1570 }, { "epoch": 1.9194139194139193, "grad_norm": 1.386602759361267, "learning_rate": 3.087163198321103e-06, "loss": 1.1206477880477905, "step": 1572 }, { "epoch": 1.9218559218559217, "grad_norm": 2.0977489948272705, "learning_rate": 3.0826710167688866e-06, "loss": 0.7714415788650513, "step": 1574 }, { "epoch": 1.9242979242979243, "grad_norm": 3.282386302947998, "learning_rate": 3.0781774791052347e-06, "loss": 1.0669711828231812, "step": 1576 }, { "epoch": 1.9267399267399268, "grad_norm": 2.187236785888672, "learning_rate": 3.073682603647758e-06, "loss": 0.7885124683380127, "step": 1578 }, { "epoch": 1.9291819291819292, "grad_norm": 2.4865806102752686, "learning_rate": 3.0691864087195172e-06, "loss": 1.084753394126892, "step": 1580 }, { "epoch": 1.9316239316239316, "grad_norm": 3.804330348968506, "learning_rate": 3.064688912648957e-06, "loss": 0.3611922860145569, "step": 1582 }, { "epoch": 1.934065934065934, "grad_norm": 18.454357147216797, "learning_rate": 3.0601901337698213e-06, "loss": 0.5478751063346863, "step": 1584 }, { "epoch": 1.9365079365079365, "grad_norm": 9.308585166931152, "learning_rate": 3.055690090421085e-06, "loss": 0.6894688606262207, "step": 1586 }, { "epoch": 1.938949938949939, "grad_norm": 4.380536079406738, "learning_rate": 3.0511888009468792e-06, "loss": 1.172979474067688, "step": 1588 }, { "epoch": 1.9413919413919414, "grad_norm": 1.1702888011932373, "learning_rate": 3.0466862836964117e-06, "loss": 1.1025750637054443, "step": 1590 }, { "epoch": 1.9438339438339438, "grad_norm": 2.2686538696289062, "learning_rate": 3.0421825570238978e-06, "loss": 1.0041526556015015, "step": 1592 }, { "epoch": 1.9462759462759462, "grad_norm": 1.5547155141830444, "learning_rate": 3.037677639288481e-06, "loss": 0.7530244588851929, "step": 1594 }, { "epoch": 1.9487179487179487, "grad_norm": 1.6241923570632935, "learning_rate": 3.0331715488541626e-06, "loss": 0.6593371629714966, "step": 1596 }, { "epoch": 1.9511599511599511, "grad_norm": 1.3635199069976807, "learning_rate": 3.0286643040897203e-06, "loss": 0.7976773381233215, "step": 1598 }, { "epoch": 1.9536019536019538, "grad_norm": 1.5380146503448486, "learning_rate": 3.0241559233686424e-06, "loss": 0.8483846187591553, "step": 1600 }, { "epoch": 1.9560439560439562, "grad_norm": 1.5258017778396606, "learning_rate": 3.0196464250690434e-06, "loss": 1.0973600149154663, "step": 1602 }, { "epoch": 1.9584859584859586, "grad_norm": 5.223465442657471, "learning_rate": 3.0151358275735965e-06, "loss": 1.2270939350128174, "step": 1604 }, { "epoch": 1.960927960927961, "grad_norm": 4.014069080352783, "learning_rate": 3.0106241492694533e-06, "loss": 1.3512402772903442, "step": 1606 }, { "epoch": 1.9633699633699635, "grad_norm": 1.4490033388137817, "learning_rate": 3.0061114085481745e-06, "loss": 1.1516140699386597, "step": 1608 }, { "epoch": 1.965811965811966, "grad_norm": 1.19436776638031, "learning_rate": 3.0015976238056475e-06, "loss": 1.0787304639816284, "step": 1610 }, { "epoch": 1.9682539682539684, "grad_norm": 6.923144817352295, "learning_rate": 2.9970828134420198e-06, "loss": 0.9626544713973999, "step": 1612 }, { "epoch": 1.9706959706959708, "grad_norm": 1.686660885810852, "learning_rate": 2.992566995861616e-06, "loss": 1.1870635747909546, "step": 1614 }, { "epoch": 1.9731379731379732, "grad_norm": 2.969782829284668, "learning_rate": 2.988050189472869e-06, "loss": 0.9546635150909424, "step": 1616 }, { "epoch": 1.9755799755799757, "grad_norm": 2.04162335395813, "learning_rate": 2.983532412688242e-06, "loss": 1.0080379247665405, "step": 1618 }, { "epoch": 1.978021978021978, "grad_norm": 1.1154638528823853, "learning_rate": 2.979013683924154e-06, "loss": 1.1551849842071533, "step": 1620 }, { "epoch": 1.9804639804639805, "grad_norm": 1.3147307634353638, "learning_rate": 2.9744940216009037e-06, "loss": 0.8124474287033081, "step": 1622 }, { "epoch": 1.982905982905983, "grad_norm": 3.960902690887451, "learning_rate": 2.969973444142597e-06, "loss": 0.5901971459388733, "step": 1624 }, { "epoch": 1.9853479853479854, "grad_norm": 2.4836363792419434, "learning_rate": 2.965451969977069e-06, "loss": 0.8430943489074707, "step": 1626 }, { "epoch": 1.9877899877899878, "grad_norm": 5.949784278869629, "learning_rate": 2.9609296175358102e-06, "loss": 0.9984661340713501, "step": 1628 }, { "epoch": 1.9902319902319903, "grad_norm": 2.4892053604125977, "learning_rate": 2.9564064052538926e-06, "loss": 1.1695860624313354, "step": 1630 }, { "epoch": 1.9926739926739927, "grad_norm": 1.6151142120361328, "learning_rate": 2.951882351569892e-06, "loss": 1.063124179840088, "step": 1632 }, { "epoch": 1.9951159951159951, "grad_norm": 2.0610530376434326, "learning_rate": 2.9473574749258143e-06, "loss": 0.8075814247131348, "step": 1634 }, { "epoch": 1.9975579975579976, "grad_norm": 2.036194086074829, "learning_rate": 2.94283179376702e-06, "loss": 1.161010980606079, "step": 1636 }, { "epoch": 2.0, "grad_norm": 1.6809015274047852, "learning_rate": 2.9383053265421514e-06, "loss": 1.0740622282028198, "step": 1638 }, { "epoch": 2.0024420024420024, "grad_norm": 7.186413288116455, "learning_rate": 2.9337780917030513e-06, "loss": 0.9597793221473694, "step": 1640 }, { "epoch": 2.004884004884005, "grad_norm": 2.5799577236175537, "learning_rate": 2.929250107704694e-06, "loss": 0.7062101364135742, "step": 1642 }, { "epoch": 2.0073260073260073, "grad_norm": 0.9430143237113953, "learning_rate": 2.924721393005109e-06, "loss": 0.9560756087303162, "step": 1644 }, { "epoch": 2.0097680097680097, "grad_norm": 2.4356815814971924, "learning_rate": 2.9201919660653e-06, "loss": 0.7125204801559448, "step": 1646 }, { "epoch": 2.012210012210012, "grad_norm": 2.3169310092926025, "learning_rate": 2.9156618453491786e-06, "loss": 0.8216168880462646, "step": 1648 }, { "epoch": 2.0146520146520146, "grad_norm": 11.127049446105957, "learning_rate": 2.911131049323483e-06, "loss": 0.8026351928710938, "step": 1650 }, { "epoch": 2.017094017094017, "grad_norm": 2.923428535461426, "learning_rate": 2.9065995964577028e-06, "loss": 0.7188471555709839, "step": 1652 }, { "epoch": 2.0195360195360195, "grad_norm": 4.269984722137451, "learning_rate": 2.902067505224008e-06, "loss": 1.2672061920166016, "step": 1654 }, { "epoch": 2.021978021978022, "grad_norm": 1.2916280031204224, "learning_rate": 2.897534794097167e-06, "loss": 0.5318281054496765, "step": 1656 }, { "epoch": 2.0244200244200243, "grad_norm": 2.5028984546661377, "learning_rate": 2.89300148155448e-06, "loss": 0.9953727126121521, "step": 1658 }, { "epoch": 2.0268620268620268, "grad_norm": 2.887450695037842, "learning_rate": 2.8884675860756946e-06, "loss": 0.9623196125030518, "step": 1660 }, { "epoch": 2.029304029304029, "grad_norm": 2.6880152225494385, "learning_rate": 2.883933126142937e-06, "loss": 1.0482466220855713, "step": 1662 }, { "epoch": 2.0317460317460316, "grad_norm": 1.8128950595855713, "learning_rate": 2.8793981202406335e-06, "loss": 0.4340633749961853, "step": 1664 }, { "epoch": 2.034188034188034, "grad_norm": 3.808696985244751, "learning_rate": 2.874862586855437e-06, "loss": 0.7226059436798096, "step": 1666 }, { "epoch": 2.0366300366300365, "grad_norm": 1.5693755149841309, "learning_rate": 2.870326544476148e-06, "loss": 1.0041981935501099, "step": 1668 }, { "epoch": 2.039072039072039, "grad_norm": 3.0417141914367676, "learning_rate": 2.8657900115936465e-06, "loss": 0.7336680889129639, "step": 1670 }, { "epoch": 2.0415140415140414, "grad_norm": 3.467229127883911, "learning_rate": 2.8612530067008067e-06, "loss": 0.9192556142807007, "step": 1672 }, { "epoch": 2.043956043956044, "grad_norm": 3.149291515350342, "learning_rate": 2.8567155482924315e-06, "loss": 0.9109829068183899, "step": 1674 }, { "epoch": 2.0463980463980462, "grad_norm": 1.5668519735336304, "learning_rate": 2.8521776548651692e-06, "loss": 0.6515228748321533, "step": 1676 }, { "epoch": 2.0488400488400487, "grad_norm": 3.5928568840026855, "learning_rate": 2.8476393449174426e-06, "loss": 1.0088976621627808, "step": 1678 }, { "epoch": 2.051282051282051, "grad_norm": 2.0251355171203613, "learning_rate": 2.843100636949374e-06, "loss": 1.004931092262268, "step": 1680 }, { "epoch": 2.0537240537240535, "grad_norm": 3.476871967315674, "learning_rate": 2.838561549462705e-06, "loss": 0.7845253348350525, "step": 1682 }, { "epoch": 2.056166056166056, "grad_norm": 8.491005897521973, "learning_rate": 2.8340221009607272e-06, "loss": 0.7041101455688477, "step": 1684 }, { "epoch": 2.0586080586080584, "grad_norm": 7.643034934997559, "learning_rate": 2.829482309948203e-06, "loss": 0.8947182297706604, "step": 1686 }, { "epoch": 2.061050061050061, "grad_norm": 2.488511323928833, "learning_rate": 2.824942194931289e-06, "loss": 0.9186074137687683, "step": 1688 }, { "epoch": 2.0634920634920633, "grad_norm": 10.357978820800781, "learning_rate": 2.820401774417466e-06, "loss": 0.7940126061439514, "step": 1690 }, { "epoch": 2.065934065934066, "grad_norm": 1.5219630002975464, "learning_rate": 2.815861066915458e-06, "loss": 0.7649714350700378, "step": 1692 }, { "epoch": 2.0683760683760686, "grad_norm": 1.8372576236724854, "learning_rate": 2.811320090935159e-06, "loss": 0.7867807149887085, "step": 1694 }, { "epoch": 2.070818070818071, "grad_norm": 2.9736199378967285, "learning_rate": 2.806778864987558e-06, "loss": 1.0023208856582642, "step": 1696 }, { "epoch": 2.0732600732600734, "grad_norm": 4.48581075668335, "learning_rate": 2.802237407584663e-06, "loss": 0.9700354337692261, "step": 1698 }, { "epoch": 2.075702075702076, "grad_norm": 2.3087658882141113, "learning_rate": 2.797695737239425e-06, "loss": 0.9603742361068726, "step": 1700 }, { "epoch": 2.0781440781440783, "grad_norm": 3.8156135082244873, "learning_rate": 2.7931538724656625e-06, "loss": 0.4553748667240143, "step": 1702 }, { "epoch": 2.0805860805860807, "grad_norm": 1.0407174825668335, "learning_rate": 2.788611831777989e-06, "loss": 0.5665370225906372, "step": 1704 }, { "epoch": 2.083028083028083, "grad_norm": 5.208148956298828, "learning_rate": 2.784069633691732e-06, "loss": 0.41125673055648804, "step": 1706 }, { "epoch": 2.0854700854700856, "grad_norm": 4.539152145385742, "learning_rate": 2.779527296722863e-06, "loss": 0.9381171464920044, "step": 1708 }, { "epoch": 2.087912087912088, "grad_norm": 2.998134136199951, "learning_rate": 2.774984839387918e-06, "loss": 0.7079961895942688, "step": 1710 }, { "epoch": 2.0903540903540905, "grad_norm": 3.755718231201172, "learning_rate": 2.7704422802039255e-06, "loss": 0.7328172922134399, "step": 1712 }, { "epoch": 2.092796092796093, "grad_norm": 1.5156927108764648, "learning_rate": 2.765899637688327e-06, "loss": 0.6182104349136353, "step": 1714 }, { "epoch": 2.0952380952380953, "grad_norm": 2.6270971298217773, "learning_rate": 2.7613569303589054e-06, "loss": 0.7295227646827698, "step": 1716 }, { "epoch": 2.0976800976800978, "grad_norm": 2.0563015937805176, "learning_rate": 2.756814176733707e-06, "loss": 0.9640318155288696, "step": 1718 }, { "epoch": 2.1001221001221, "grad_norm": 2.9778478145599365, "learning_rate": 2.752271395330967e-06, "loss": 0.9460858106613159, "step": 1720 }, { "epoch": 2.1025641025641026, "grad_norm": 2.579092025756836, "learning_rate": 2.7477286046690336e-06, "loss": 0.9912809133529663, "step": 1722 }, { "epoch": 2.105006105006105, "grad_norm": 2.132593870162964, "learning_rate": 2.743185823266294e-06, "loss": 0.657219648361206, "step": 1724 }, { "epoch": 2.1074481074481075, "grad_norm": 3.7171902656555176, "learning_rate": 2.7386430696410953e-06, "loss": 0.6395490765571594, "step": 1726 }, { "epoch": 2.10989010989011, "grad_norm": 1.617601752281189, "learning_rate": 2.7341003623116743e-06, "loss": 0.5296671986579895, "step": 1728 }, { "epoch": 2.1123321123321124, "grad_norm": 2.1062819957733154, "learning_rate": 2.729557719796076e-06, "loss": 0.8426005840301514, "step": 1730 }, { "epoch": 2.114774114774115, "grad_norm": 3.001302480697632, "learning_rate": 2.7250151606120826e-06, "loss": 0.565944254398346, "step": 1732 }, { "epoch": 2.1172161172161172, "grad_norm": 0.658115565776825, "learning_rate": 2.7204727032771376e-06, "loss": 0.3656719923019409, "step": 1734 }, { "epoch": 2.1196581196581197, "grad_norm": 2.848242998123169, "learning_rate": 2.7159303663082687e-06, "loss": 0.9933385252952576, "step": 1736 }, { "epoch": 2.122100122100122, "grad_norm": 4.3910417556762695, "learning_rate": 2.7113881682220123e-06, "loss": 0.9253290891647339, "step": 1738 }, { "epoch": 2.1245421245421245, "grad_norm": 1.5515904426574707, "learning_rate": 2.7068461275343382e-06, "loss": 0.8804880976676941, "step": 1740 }, { "epoch": 2.126984126984127, "grad_norm": 5.038269996643066, "learning_rate": 2.7023042627605754e-06, "loss": 1.0033385753631592, "step": 1742 }, { "epoch": 2.1294261294261294, "grad_norm": 2.507053852081299, "learning_rate": 2.6977625924153376e-06, "loss": 0.671730637550354, "step": 1744 }, { "epoch": 2.131868131868132, "grad_norm": 2.777392625808716, "learning_rate": 2.6932211350124425e-06, "loss": 1.001034140586853, "step": 1746 }, { "epoch": 2.1343101343101343, "grad_norm": 5.460464954376221, "learning_rate": 2.6886799090648417e-06, "loss": 0.38881126046180725, "step": 1748 }, { "epoch": 2.1367521367521367, "grad_norm": 6.266025543212891, "learning_rate": 2.684138933084543e-06, "loss": 1.1089563369750977, "step": 1750 }, { "epoch": 2.139194139194139, "grad_norm": 2.3073322772979736, "learning_rate": 2.6795982255825354e-06, "loss": 0.9409431219100952, "step": 1752 }, { "epoch": 2.1416361416361416, "grad_norm": 3.6696202754974365, "learning_rate": 2.6750578050687115e-06, "loss": 0.8869442939758301, "step": 1754 }, { "epoch": 2.144078144078144, "grad_norm": 4.61408805847168, "learning_rate": 2.6705176900517983e-06, "loss": 1.01822030544281, "step": 1756 }, { "epoch": 2.1465201465201464, "grad_norm": 2.669914484024048, "learning_rate": 2.665977899039274e-06, "loss": 0.48161619901657104, "step": 1758 }, { "epoch": 2.148962148962149, "grad_norm": 2.6554932594299316, "learning_rate": 2.661438450537296e-06, "loss": 0.8899593353271484, "step": 1760 }, { "epoch": 2.1514041514041513, "grad_norm": 1.00688636302948, "learning_rate": 2.656899363050628e-06, "loss": 0.6889787912368774, "step": 1762 }, { "epoch": 2.1538461538461537, "grad_norm": 5.138449192047119, "learning_rate": 2.6523606550825577e-06, "loss": 0.6849108934402466, "step": 1764 }, { "epoch": 2.156288156288156, "grad_norm": 1.4361852407455444, "learning_rate": 2.647822345134832e-06, "loss": 0.5109698176383972, "step": 1766 }, { "epoch": 2.1587301587301586, "grad_norm": 4.641076564788818, "learning_rate": 2.6432844517075696e-06, "loss": 0.7529181838035583, "step": 1768 }, { "epoch": 2.161172161172161, "grad_norm": 2.5101401805877686, "learning_rate": 2.638746993299194e-06, "loss": 0.6117711067199707, "step": 1770 }, { "epoch": 2.1636141636141635, "grad_norm": 1.5911965370178223, "learning_rate": 2.6342099884063542e-06, "loss": 0.9727715849876404, "step": 1772 }, { "epoch": 2.166056166056166, "grad_norm": 4.569766044616699, "learning_rate": 2.6296734555238517e-06, "loss": 0.8418964147567749, "step": 1774 }, { "epoch": 2.1684981684981683, "grad_norm": 2.1551764011383057, "learning_rate": 2.625137413144564e-06, "loss": 1.0541213750839233, "step": 1776 }, { "epoch": 2.1709401709401708, "grad_norm": 6.51698112487793, "learning_rate": 2.6206018797593672e-06, "loss": 0.6803760528564453, "step": 1778 }, { "epoch": 2.173382173382173, "grad_norm": 2.092607021331787, "learning_rate": 2.6160668738570638e-06, "loss": 0.9858105182647705, "step": 1780 }, { "epoch": 2.1758241758241756, "grad_norm": 8.542030334472656, "learning_rate": 2.6115324139243065e-06, "loss": 0.7755582332611084, "step": 1782 }, { "epoch": 2.178266178266178, "grad_norm": 2.45867919921875, "learning_rate": 2.606998518445521e-06, "loss": 0.9509971141815186, "step": 1784 }, { "epoch": 2.1807081807081805, "grad_norm": 5.667660236358643, "learning_rate": 2.6024652059028337e-06, "loss": 0.8328191041946411, "step": 1786 }, { "epoch": 2.183150183150183, "grad_norm": 15.772777557373047, "learning_rate": 2.5979324947759936e-06, "loss": 0.9569545388221741, "step": 1788 }, { "epoch": 2.185592185592186, "grad_norm": 4.427873134613037, "learning_rate": 2.5934004035422983e-06, "loss": 0.897070050239563, "step": 1790 }, { "epoch": 2.1880341880341883, "grad_norm": 4.582241535186768, "learning_rate": 2.5888689506765186e-06, "loss": 0.9291706681251526, "step": 1792 }, { "epoch": 2.1904761904761907, "grad_norm": 2.202183485031128, "learning_rate": 2.5843381546508217e-06, "loss": 0.545952320098877, "step": 1794 }, { "epoch": 2.192918192918193, "grad_norm": 1.6277117729187012, "learning_rate": 2.579808033934701e-06, "loss": 0.6887462735176086, "step": 1796 }, { "epoch": 2.1953601953601956, "grad_norm": 4.229698657989502, "learning_rate": 2.5752786069948925e-06, "loss": 0.8135143518447876, "step": 1798 }, { "epoch": 2.197802197802198, "grad_norm": 2.0216007232666016, "learning_rate": 2.5707498922953065e-06, "loss": 0.9676254391670227, "step": 1800 }, { "epoch": 2.2002442002442004, "grad_norm": 3.828848361968994, "learning_rate": 2.5662219082969502e-06, "loss": 0.9208850264549255, "step": 1802 }, { "epoch": 2.202686202686203, "grad_norm": 2.4354822635650635, "learning_rate": 2.561694673457849e-06, "loss": 0.6844379305839539, "step": 1804 }, { "epoch": 2.2051282051282053, "grad_norm": 1.510022521018982, "learning_rate": 2.55716820623298e-06, "loss": 0.938340961933136, "step": 1806 }, { "epoch": 2.2075702075702077, "grad_norm": 1.947124719619751, "learning_rate": 2.5526425250741864e-06, "loss": 0.9929482936859131, "step": 1808 }, { "epoch": 2.21001221001221, "grad_norm": 2.3316943645477295, "learning_rate": 2.548117648430109e-06, "loss": 0.7233268618583679, "step": 1810 }, { "epoch": 2.2124542124542126, "grad_norm": 2.0162341594696045, "learning_rate": 2.543593594746108e-06, "loss": 0.6767272353172302, "step": 1812 }, { "epoch": 2.214896214896215, "grad_norm": 0.9409213662147522, "learning_rate": 2.539070382464191e-06, "loss": 0.435127854347229, "step": 1814 }, { "epoch": 2.2173382173382175, "grad_norm": 1.5501841306686401, "learning_rate": 2.5345480300229313e-06, "loss": 0.9680942893028259, "step": 1816 }, { "epoch": 2.21978021978022, "grad_norm": 2.132582426071167, "learning_rate": 2.5300265558574034e-06, "loss": 0.890035092830658, "step": 1818 }, { "epoch": 2.2222222222222223, "grad_norm": 1.4618556499481201, "learning_rate": 2.525505978399097e-06, "loss": 0.8848022818565369, "step": 1820 }, { "epoch": 2.2246642246642248, "grad_norm": 1.61579167842865, "learning_rate": 2.5209863160758467e-06, "loss": 0.5495251417160034, "step": 1822 }, { "epoch": 2.227106227106227, "grad_norm": 2.571030378341675, "learning_rate": 2.5164675873117588e-06, "loss": 0.79774409532547, "step": 1824 }, { "epoch": 2.2295482295482296, "grad_norm": 3.0071425437927246, "learning_rate": 2.511949810527131e-06, "loss": 0.7262362241744995, "step": 1826 }, { "epoch": 2.231990231990232, "grad_norm": 5.134605884552002, "learning_rate": 2.507433004138385e-06, "loss": 0.6448302865028381, "step": 1828 }, { "epoch": 2.2344322344322345, "grad_norm": 7.841651916503906, "learning_rate": 2.5029171865579813e-06, "loss": 0.8010722398757935, "step": 1830 }, { "epoch": 2.236874236874237, "grad_norm": 2.221493721008301, "learning_rate": 2.4984023761943532e-06, "loss": 0.9125744104385376, "step": 1832 }, { "epoch": 2.2393162393162394, "grad_norm": 2.1063764095306396, "learning_rate": 2.493888591451826e-06, "loss": 0.9146173000335693, "step": 1834 }, { "epoch": 2.241758241758242, "grad_norm": 2.964050054550171, "learning_rate": 2.4893758507305465e-06, "loss": 1.0444574356079102, "step": 1836 }, { "epoch": 2.244200244200244, "grad_norm": 3.5007026195526123, "learning_rate": 2.4848641724264046e-06, "loss": 1.0267515182495117, "step": 1838 }, { "epoch": 2.2466422466422467, "grad_norm": 3.2701382637023926, "learning_rate": 2.4803535749309578e-06, "loss": 0.44911229610443115, "step": 1840 }, { "epoch": 2.249084249084249, "grad_norm": 3.629748821258545, "learning_rate": 2.4758440766313583e-06, "loss": 1.014188528060913, "step": 1842 }, { "epoch": 2.2515262515262515, "grad_norm": 2.0354208946228027, "learning_rate": 2.4713356959102804e-06, "loss": 1.1367512941360474, "step": 1844 }, { "epoch": 2.253968253968254, "grad_norm": 7.688552379608154, "learning_rate": 2.4668284511458385e-06, "loss": 0.641595721244812, "step": 1846 }, { "epoch": 2.2564102564102564, "grad_norm": 0.6762326955795288, "learning_rate": 2.4623223607115195e-06, "loss": 0.6372429132461548, "step": 1848 }, { "epoch": 2.258852258852259, "grad_norm": 8.27773666381836, "learning_rate": 2.457817442976103e-06, "loss": 0.8019550442695618, "step": 1850 }, { "epoch": 2.2612942612942613, "grad_norm": 4.783394813537598, "learning_rate": 2.453313716303589e-06, "loss": 0.896358072757721, "step": 1852 }, { "epoch": 2.2637362637362637, "grad_norm": 1.9131306409835815, "learning_rate": 2.4488111990531223e-06, "loss": 0.9868752360343933, "step": 1854 }, { "epoch": 2.266178266178266, "grad_norm": 1.9729207754135132, "learning_rate": 2.4443099095789147e-06, "loss": 1.0031776428222656, "step": 1856 }, { "epoch": 2.2686202686202686, "grad_norm": 1.891891360282898, "learning_rate": 2.4398098662301794e-06, "loss": 0.8341459631919861, "step": 1858 }, { "epoch": 2.271062271062271, "grad_norm": 4.215878486633301, "learning_rate": 2.435311087351044e-06, "loss": 1.061068058013916, "step": 1860 }, { "epoch": 2.2735042735042734, "grad_norm": 3.9982106685638428, "learning_rate": 2.430813591280483e-06, "loss": 0.6541799902915955, "step": 1862 }, { "epoch": 2.275946275946276, "grad_norm": 1.4903755187988281, "learning_rate": 2.426317396352243e-06, "loss": 0.6281458139419556, "step": 1864 }, { "epoch": 2.2783882783882783, "grad_norm": 2.3184974193573, "learning_rate": 2.421822520894766e-06, "loss": 0.9676836729049683, "step": 1866 }, { "epoch": 2.2808302808302807, "grad_norm": 6.018508434295654, "learning_rate": 2.4173289832311137e-06, "loss": 0.9747646450996399, "step": 1868 }, { "epoch": 2.283272283272283, "grad_norm": 3.8310694694519043, "learning_rate": 2.4128368016788973e-06, "loss": 0.9547437429428101, "step": 1870 }, { "epoch": 2.2857142857142856, "grad_norm": 2.1508607864379883, "learning_rate": 2.408345994550198e-06, "loss": 0.9991099834442139, "step": 1872 }, { "epoch": 2.288156288156288, "grad_norm": 11.147876739501953, "learning_rate": 2.403856580151494e-06, "loss": 0.2010164111852646, "step": 1874 }, { "epoch": 2.2905982905982905, "grad_norm": 1.8129445314407349, "learning_rate": 2.3993685767835866e-06, "loss": 0.9613729119300842, "step": 1876 }, { "epoch": 2.293040293040293, "grad_norm": 6.837078094482422, "learning_rate": 2.3948820027415247e-06, "loss": 0.6076623201370239, "step": 1878 }, { "epoch": 2.2954822954822953, "grad_norm": 10.136993408203125, "learning_rate": 2.390396876314533e-06, "loss": 0.819178581237793, "step": 1880 }, { "epoch": 2.2979242979242978, "grad_norm": 4.178940773010254, "learning_rate": 2.3859132157859323e-06, "loss": 0.963537335395813, "step": 1882 }, { "epoch": 2.3003663003663, "grad_norm": 3.2244484424591064, "learning_rate": 2.3814310394330683e-06, "loss": 0.6918718218803406, "step": 1884 }, { "epoch": 2.3028083028083026, "grad_norm": 2.9510533809661865, "learning_rate": 2.3769503655272375e-06, "loss": 1.1837718486785889, "step": 1886 }, { "epoch": 2.305250305250305, "grad_norm": 2.189448833465576, "learning_rate": 2.3724712123336098e-06, "loss": 0.953423798084259, "step": 1888 }, { "epoch": 2.3076923076923075, "grad_norm": 1.599765419960022, "learning_rate": 2.3679935981111594e-06, "loss": 0.9839805960655212, "step": 1890 }, { "epoch": 2.31013431013431, "grad_norm": 3.1475296020507812, "learning_rate": 2.363517541112585e-06, "loss": 0.9580415487289429, "step": 1892 }, { "epoch": 2.3125763125763124, "grad_norm": 4.1370744705200195, "learning_rate": 2.359043059584236e-06, "loss": 1.0927242040634155, "step": 1894 }, { "epoch": 2.315018315018315, "grad_norm": 2.406658411026001, "learning_rate": 2.354570171766042e-06, "loss": 0.9985021948814392, "step": 1896 }, { "epoch": 2.317460317460317, "grad_norm": 2.142878770828247, "learning_rate": 2.350098895891434e-06, "loss": 1.1066349744796753, "step": 1898 }, { "epoch": 2.3199023199023197, "grad_norm": 1.68159818649292, "learning_rate": 2.345629250187274e-06, "loss": 0.9075395464897156, "step": 1900 }, { "epoch": 2.3223443223443225, "grad_norm": 2.389622688293457, "learning_rate": 2.3411612528737765e-06, "loss": 1.001306414604187, "step": 1902 }, { "epoch": 2.324786324786325, "grad_norm": 17.086624145507812, "learning_rate": 2.3366949221644387e-06, "loss": 0.5735141038894653, "step": 1904 }, { "epoch": 2.3272283272283274, "grad_norm": 2.313629388809204, "learning_rate": 2.3322302762659616e-06, "loss": 0.45153266191482544, "step": 1906 }, { "epoch": 2.32967032967033, "grad_norm": 2.6861674785614014, "learning_rate": 2.3277673333781803e-06, "loss": 0.6503361463546753, "step": 1908 }, { "epoch": 2.3321123321123323, "grad_norm": 2.286579132080078, "learning_rate": 2.323306111693986e-06, "loss": 0.4222344160079956, "step": 1910 }, { "epoch": 2.3345543345543347, "grad_norm": 6.489071846008301, "learning_rate": 2.3188466293992555e-06, "loss": 0.816202700138092, "step": 1912 }, { "epoch": 2.336996336996337, "grad_norm": 1.8680096864700317, "learning_rate": 2.3143889046727735e-06, "loss": 0.865801990032196, "step": 1914 }, { "epoch": 2.3394383394383396, "grad_norm": 3.2620620727539062, "learning_rate": 2.3099329556861605e-06, "loss": 0.9299424290657043, "step": 1916 }, { "epoch": 2.341880341880342, "grad_norm": 2.1896071434020996, "learning_rate": 2.305478800603798e-06, "loss": 0.7136389017105103, "step": 1918 }, { "epoch": 2.3443223443223444, "grad_norm": 3.63539457321167, "learning_rate": 2.301026457582754e-06, "loss": 0.32393085956573486, "step": 1920 }, { "epoch": 2.346764346764347, "grad_norm": 1.8691846132278442, "learning_rate": 2.2965759447727136e-06, "loss": 0.7247822284698486, "step": 1922 }, { "epoch": 2.3492063492063493, "grad_norm": 3.239156723022461, "learning_rate": 2.2921272803158966e-06, "loss": 0.5720818638801575, "step": 1924 }, { "epoch": 2.3516483516483517, "grad_norm": 2.6364552974700928, "learning_rate": 2.2876804823469907e-06, "loss": 0.977821946144104, "step": 1926 }, { "epoch": 2.354090354090354, "grad_norm": 2.6305339336395264, "learning_rate": 2.2832355689930736e-06, "loss": 0.4369853138923645, "step": 1928 }, { "epoch": 2.3565323565323566, "grad_norm": 1.0626336336135864, "learning_rate": 2.2787925583735403e-06, "loss": 0.513285219669342, "step": 1930 }, { "epoch": 2.358974358974359, "grad_norm": 2.138998031616211, "learning_rate": 2.274351468600033e-06, "loss": 0.7080082297325134, "step": 1932 }, { "epoch": 2.3614163614163615, "grad_norm": 1.5141674280166626, "learning_rate": 2.2699123177763584e-06, "loss": 0.9225776195526123, "step": 1934 }, { "epoch": 2.363858363858364, "grad_norm": 1.8718771934509277, "learning_rate": 2.265475123998423e-06, "loss": 0.5893734693527222, "step": 1936 }, { "epoch": 2.3663003663003663, "grad_norm": 1.9005098342895508, "learning_rate": 2.2610399053541536e-06, "loss": 0.9091716408729553, "step": 1938 }, { "epoch": 2.3687423687423688, "grad_norm": 2.963907241821289, "learning_rate": 2.2566066799234255e-06, "loss": 0.7350085377693176, "step": 1940 }, { "epoch": 2.371184371184371, "grad_norm": 0.5085861682891846, "learning_rate": 2.252175465777991e-06, "loss": 0.7246252298355103, "step": 1942 }, { "epoch": 2.3736263736263736, "grad_norm": 3.5920464992523193, "learning_rate": 2.2477462809814023e-06, "loss": 0.8181778788566589, "step": 1944 }, { "epoch": 2.376068376068376, "grad_norm": 4.16288948059082, "learning_rate": 2.2433191435889368e-06, "loss": 0.2666274309158325, "step": 1946 }, { "epoch": 2.3785103785103785, "grad_norm": 1.2004927396774292, "learning_rate": 2.2388940716475292e-06, "loss": 0.6288062334060669, "step": 1948 }, { "epoch": 2.380952380952381, "grad_norm": 5.698200225830078, "learning_rate": 2.234471083195692e-06, "loss": 0.5255064964294434, "step": 1950 }, { "epoch": 2.3833943833943834, "grad_norm": 2.2315514087677, "learning_rate": 2.2300501962634474e-06, "loss": 0.5431297421455383, "step": 1952 }, { "epoch": 2.385836385836386, "grad_norm": 6.294608116149902, "learning_rate": 2.2256314288722474e-06, "loss": 0.7784007787704468, "step": 1954 }, { "epoch": 2.3882783882783882, "grad_norm": 2.0798985958099365, "learning_rate": 2.2212147990349062e-06, "loss": 1.0333225727081299, "step": 1956 }, { "epoch": 2.3907203907203907, "grad_norm": 1.6888413429260254, "learning_rate": 2.2168003247555238e-06, "loss": 0.7074629068374634, "step": 1958 }, { "epoch": 2.393162393162393, "grad_norm": 1.4450297355651855, "learning_rate": 2.2123880240294127e-06, "loss": 1.10811448097229, "step": 1960 }, { "epoch": 2.3956043956043955, "grad_norm": 2.086488723754883, "learning_rate": 2.2079779148430265e-06, "loss": 0.6509331464767456, "step": 1962 }, { "epoch": 2.398046398046398, "grad_norm": 1.7849246263504028, "learning_rate": 2.203570015173882e-06, "loss": 0.966160774230957, "step": 1964 }, { "epoch": 2.4004884004884004, "grad_norm": 3.1780035495758057, "learning_rate": 2.199164342990494e-06, "loss": 0.5994513034820557, "step": 1966 }, { "epoch": 2.402930402930403, "grad_norm": 2.7655558586120605, "learning_rate": 2.1947609162522924e-06, "loss": 0.6144997477531433, "step": 1968 }, { "epoch": 2.4053724053724053, "grad_norm": 9.948949813842773, "learning_rate": 2.190359752909556e-06, "loss": 0.4493882656097412, "step": 1970 }, { "epoch": 2.4078144078144077, "grad_norm": 3.8871443271636963, "learning_rate": 2.1859608709033357e-06, "loss": 0.22239239513874054, "step": 1972 }, { "epoch": 2.41025641025641, "grad_norm": 1.7906450033187866, "learning_rate": 2.1815642881653858e-06, "loss": 0.23173484206199646, "step": 1974 }, { "epoch": 2.4126984126984126, "grad_norm": 7.670548915863037, "learning_rate": 2.177170022618084e-06, "loss": 0.38976311683654785, "step": 1976 }, { "epoch": 2.415140415140415, "grad_norm": 2.966620922088623, "learning_rate": 2.1727780921743633e-06, "loss": 0.9863390922546387, "step": 1978 }, { "epoch": 2.4175824175824174, "grad_norm": 1.4998663663864136, "learning_rate": 2.1683885147376394e-06, "loss": 0.47463205456733704, "step": 1980 }, { "epoch": 2.42002442002442, "grad_norm": 2.3728787899017334, "learning_rate": 2.1640013082017332e-06, "loss": 1.125450849533081, "step": 1982 }, { "epoch": 2.4224664224664223, "grad_norm": 1.6976672410964966, "learning_rate": 2.1596164904508044e-06, "loss": 0.5219910740852356, "step": 1984 }, { "epoch": 2.4249084249084247, "grad_norm": 2.192134380340576, "learning_rate": 2.1552340793592718e-06, "loss": 1.040833830833435, "step": 1986 }, { "epoch": 2.427350427350427, "grad_norm": 2.2941408157348633, "learning_rate": 2.1508540927917458e-06, "loss": 0.9751767516136169, "step": 1988 }, { "epoch": 2.42979242979243, "grad_norm": 2.0631191730499268, "learning_rate": 2.1464765486029517e-06, "loss": 1.166698932647705, "step": 1990 }, { "epoch": 2.4322344322344325, "grad_norm": 9.183150291442871, "learning_rate": 2.1421014646376583e-06, "loss": 1.0005483627319336, "step": 1992 }, { "epoch": 2.434676434676435, "grad_norm": 2.409327268600464, "learning_rate": 2.137728858730609e-06, "loss": 0.9616595506668091, "step": 1994 }, { "epoch": 2.4371184371184373, "grad_norm": 1.7600177526474, "learning_rate": 2.133358748706442e-06, "loss": 0.7983999848365784, "step": 1996 }, { "epoch": 2.4395604395604398, "grad_norm": 1.8082541227340698, "learning_rate": 2.128991152379622e-06, "loss": 0.9734374284744263, "step": 1998 }, { "epoch": 2.442002442002442, "grad_norm": 1.7054754495620728, "learning_rate": 2.1246260875543672e-06, "loss": 0.6818905472755432, "step": 2000 }, { "epoch": 2.4444444444444446, "grad_norm": 3.318437099456787, "learning_rate": 2.1202635720245744e-06, "loss": 1.0553401708602905, "step": 2002 }, { "epoch": 2.446886446886447, "grad_norm": 1.892685055732727, "learning_rate": 2.115903623573754e-06, "loss": 0.637603759765625, "step": 2004 }, { "epoch": 2.4493284493284495, "grad_norm": 3.667452573776245, "learning_rate": 2.1115462599749453e-06, "loss": 0.6911687254905701, "step": 2006 }, { "epoch": 2.451770451770452, "grad_norm": 1.999451994895935, "learning_rate": 2.107191498990654e-06, "loss": 1.1354289054870605, "step": 2008 }, { "epoch": 2.4542124542124544, "grad_norm": 2.8429207801818848, "learning_rate": 2.1028393583727752e-06, "loss": 0.6011534929275513, "step": 2010 }, { "epoch": 2.456654456654457, "grad_norm": 3.9235146045684814, "learning_rate": 2.0984898558625227e-06, "loss": 0.6388018131256104, "step": 2012 }, { "epoch": 2.4590964590964592, "grad_norm": 2.5842745304107666, "learning_rate": 2.0941430091903576e-06, "loss": 1.0912564992904663, "step": 2014 }, { "epoch": 2.4615384615384617, "grad_norm": 2.1695728302001953, "learning_rate": 2.0897988360759127e-06, "loss": 0.90839684009552, "step": 2016 }, { "epoch": 2.463980463980464, "grad_norm": 1.3284540176391602, "learning_rate": 2.0854573542279216e-06, "loss": 1.0240721702575684, "step": 2018 }, { "epoch": 2.4664224664224665, "grad_norm": 5.811964511871338, "learning_rate": 2.081118581344151e-06, "loss": 0.7707440257072449, "step": 2020 }, { "epoch": 2.468864468864469, "grad_norm": 1.8133083581924438, "learning_rate": 2.0767825351113192e-06, "loss": 0.6514004468917847, "step": 2022 }, { "epoch": 2.4713064713064714, "grad_norm": 4.037316799163818, "learning_rate": 2.072449233205035e-06, "loss": 0.7341061234474182, "step": 2024 }, { "epoch": 2.473748473748474, "grad_norm": 1.9994157552719116, "learning_rate": 2.068118693289715e-06, "loss": 0.9125716090202332, "step": 2026 }, { "epoch": 2.4761904761904763, "grad_norm": 0.7720515727996826, "learning_rate": 2.0637909330185217e-06, "loss": 0.6419773101806641, "step": 2028 }, { "epoch": 2.4786324786324787, "grad_norm": 1.6481854915618896, "learning_rate": 2.0594659700332833e-06, "loss": 0.9903475046157837, "step": 2030 }, { "epoch": 2.481074481074481, "grad_norm": 2.603499174118042, "learning_rate": 2.055143821964424e-06, "loss": 1.1345065832138062, "step": 2032 }, { "epoch": 2.4835164835164836, "grad_norm": 2.5555107593536377, "learning_rate": 2.0508245064308968e-06, "loss": 0.5736313462257385, "step": 2034 }, { "epoch": 2.485958485958486, "grad_norm": 2.2995779514312744, "learning_rate": 2.046508041040107e-06, "loss": 1.004111409187317, "step": 2036 }, { "epoch": 2.4884004884004884, "grad_norm": 1.261184573173523, "learning_rate": 2.04219444338784e-06, "loss": 0.6451095342636108, "step": 2038 }, { "epoch": 2.490842490842491, "grad_norm": 9.273902893066406, "learning_rate": 2.0378837310581907e-06, "loss": 0.769629955291748, "step": 2040 }, { "epoch": 2.4932844932844933, "grad_norm": 5.710522174835205, "learning_rate": 2.0335759216234947e-06, "loss": 0.9529898166656494, "step": 2042 }, { "epoch": 2.4957264957264957, "grad_norm": 1.7338002920150757, "learning_rate": 2.0292710326442517e-06, "loss": 0.7281374931335449, "step": 2044 }, { "epoch": 2.498168498168498, "grad_norm": 7.91054630279541, "learning_rate": 2.0249690816690583e-06, "loss": 0.5946838855743408, "step": 2046 }, { "epoch": 2.5006105006105006, "grad_norm": 2.516921281814575, "learning_rate": 2.0206700862345334e-06, "loss": 0.719270646572113, "step": 2048 }, { "epoch": 2.503052503052503, "grad_norm": 9.922062873840332, "learning_rate": 2.016374063865248e-06, "loss": 0.8115828037261963, "step": 2050 }, { "epoch": 2.5054945054945055, "grad_norm": 2.6232059001922607, "learning_rate": 2.0120810320736537e-06, "loss": 1.1120948791503906, "step": 2052 }, { "epoch": 2.507936507936508, "grad_norm": 1.4087735414505005, "learning_rate": 2.00779100836001e-06, "loss": 0.7034242153167725, "step": 2054 }, { "epoch": 2.5103785103785103, "grad_norm": 1.9080172777175903, "learning_rate": 2.003504010212317e-06, "loss": 1.0267211198806763, "step": 2056 }, { "epoch": 2.5128205128205128, "grad_norm": 8.740047454833984, "learning_rate": 1.99922005510624e-06, "loss": 0.31465768814086914, "step": 2058 }, { "epoch": 2.515262515262515, "grad_norm": 3.2588388919830322, "learning_rate": 1.9949391605050365e-06, "loss": 0.2918320596218109, "step": 2060 }, { "epoch": 2.5177045177045176, "grad_norm": 6.833197116851807, "learning_rate": 1.990661343859493e-06, "loss": 0.7108557224273682, "step": 2062 }, { "epoch": 2.52014652014652, "grad_norm": 6.0173821449279785, "learning_rate": 1.986386622607845e-06, "loss": 0.8981122374534607, "step": 2064 }, { "epoch": 2.5225885225885225, "grad_norm": 1.5162910223007202, "learning_rate": 1.9821150141757133e-06, "loss": 0.6950556039810181, "step": 2066 }, { "epoch": 2.525030525030525, "grad_norm": 3.4228689670562744, "learning_rate": 1.977846535976026e-06, "loss": 0.7832509875297546, "step": 2068 }, { "epoch": 2.5274725274725274, "grad_norm": 2.2627415657043457, "learning_rate": 1.9735812054089542e-06, "loss": 1.0561403036117554, "step": 2070 }, { "epoch": 2.52991452991453, "grad_norm": 14.761664390563965, "learning_rate": 1.969319039861835e-06, "loss": 0.6642997860908508, "step": 2072 }, { "epoch": 2.5323565323565322, "grad_norm": 1.6336398124694824, "learning_rate": 1.965060056709105e-06, "loss": 1.0829975605010986, "step": 2074 }, { "epoch": 2.5347985347985347, "grad_norm": 2.2617239952087402, "learning_rate": 1.960804273312228e-06, "loss": 0.8906936645507812, "step": 2076 }, { "epoch": 2.537240537240537, "grad_norm": 7.154871463775635, "learning_rate": 1.9565517070196248e-06, "loss": 1.0117489099502563, "step": 2078 }, { "epoch": 2.5396825396825395, "grad_norm": 7.616284370422363, "learning_rate": 1.9523023751665997e-06, "loss": 0.6691079139709473, "step": 2080 }, { "epoch": 2.542124542124542, "grad_norm": 2.257676839828491, "learning_rate": 1.9480562950752745e-06, "loss": 0.9914268255233765, "step": 2082 }, { "epoch": 2.5445665445665444, "grad_norm": 1.5886762142181396, "learning_rate": 1.9438134840545147e-06, "loss": 1.0071735382080078, "step": 2084 }, { "epoch": 2.547008547008547, "grad_norm": 1.6075587272644043, "learning_rate": 1.939573959399858e-06, "loss": 0.9144080281257629, "step": 2086 }, { "epoch": 2.5494505494505493, "grad_norm": 1.8543524742126465, "learning_rate": 1.9353377383934475e-06, "loss": 0.912468433380127, "step": 2088 }, { "epoch": 2.5518925518925517, "grad_norm": 2.2457010746002197, "learning_rate": 1.931104838303958e-06, "loss": 0.7604387998580933, "step": 2090 }, { "epoch": 2.554334554334554, "grad_norm": 1.9843976497650146, "learning_rate": 1.9268752763865285e-06, "loss": 0.691798210144043, "step": 2092 }, { "epoch": 2.5567765567765566, "grad_norm": 1.6185815334320068, "learning_rate": 1.9226490698826876e-06, "loss": 0.7290869951248169, "step": 2094 }, { "epoch": 2.559218559218559, "grad_norm": 1.5648012161254883, "learning_rate": 1.918426236020286e-06, "loss": 0.8694143295288086, "step": 2096 }, { "epoch": 2.5616605616605614, "grad_norm": 0.736847460269928, "learning_rate": 1.91420679201343e-06, "loss": 0.023200487717986107, "step": 2098 }, { "epoch": 2.564102564102564, "grad_norm": 2.0540359020233154, "learning_rate": 1.9099907550624034e-06, "loss": 0.6316545009613037, "step": 2100 }, { "epoch": 2.5665445665445663, "grad_norm": 4.105884075164795, "learning_rate": 1.9057781423536015e-06, "loss": 0.9644788503646851, "step": 2102 }, { "epoch": 2.5689865689865687, "grad_norm": 6.087828159332275, "learning_rate": 1.9015689710594627e-06, "loss": 0.6429115533828735, "step": 2104 }, { "epoch": 2.571428571428571, "grad_norm": 7.242172718048096, "learning_rate": 1.897363258338395e-06, "loss": 0.8835878968238831, "step": 2106 }, { "epoch": 2.5738705738705736, "grad_norm": 3.802177906036377, "learning_rate": 1.8931610213347096e-06, "loss": 0.6208938360214233, "step": 2108 }, { "epoch": 2.576312576312576, "grad_norm": 4.764630317687988, "learning_rate": 1.888962277178548e-06, "loss": 0.5702378749847412, "step": 2110 }, { "epoch": 2.578754578754579, "grad_norm": 1.3646184206008911, "learning_rate": 1.884767042985814e-06, "loss": 1.0015933513641357, "step": 2112 }, { "epoch": 2.5811965811965814, "grad_norm": 7.507686614990234, "learning_rate": 1.880575335858102e-06, "loss": 0.297787070274353, "step": 2114 }, { "epoch": 2.583638583638584, "grad_norm": 1.5217941999435425, "learning_rate": 1.8763871728826282e-06, "loss": 0.7149800658226013, "step": 2116 }, { "epoch": 2.586080586080586, "grad_norm": 1.9740521907806396, "learning_rate": 1.8722025711321657e-06, "loss": 0.998376190662384, "step": 2118 }, { "epoch": 2.5885225885225887, "grad_norm": 1.2570465803146362, "learning_rate": 1.8680215476649643e-06, "loss": 0.665241539478302, "step": 2120 }, { "epoch": 2.590964590964591, "grad_norm": 4.761254787445068, "learning_rate": 1.8638441195246915e-06, "loss": 0.9342296719551086, "step": 2122 }, { "epoch": 2.5934065934065935, "grad_norm": 2.0453293323516846, "learning_rate": 1.8596703037403573e-06, "loss": 0.8592435121536255, "step": 2124 }, { "epoch": 2.595848595848596, "grad_norm": 1.4386849403381348, "learning_rate": 1.8555001173262449e-06, "loss": 0.4735715985298157, "step": 2126 }, { "epoch": 2.5982905982905984, "grad_norm": 1.8832699060440063, "learning_rate": 1.8513335772818452e-06, "loss": 0.9801812171936035, "step": 2128 }, { "epoch": 2.600732600732601, "grad_norm": 2.282724380493164, "learning_rate": 1.8471707005917833e-06, "loss": 0.6964608430862427, "step": 2130 }, { "epoch": 2.6031746031746033, "grad_norm": 1.7532885074615479, "learning_rate": 1.8430115042257518e-06, "loss": 0.5790331959724426, "step": 2132 }, { "epoch": 2.6056166056166057, "grad_norm": 2.6640255451202393, "learning_rate": 1.838856005138438e-06, "loss": 0.9573779106140137, "step": 2134 }, { "epoch": 2.608058608058608, "grad_norm": 3.7161571979522705, "learning_rate": 1.8347042202694616e-06, "loss": 0.6422839760780334, "step": 2136 }, { "epoch": 2.6105006105006106, "grad_norm": 3.1850647926330566, "learning_rate": 1.8305561665432987e-06, "loss": 0.7944685816764832, "step": 2138 }, { "epoch": 2.612942612942613, "grad_norm": 1.816838264465332, "learning_rate": 1.8264118608692166e-06, "loss": 0.6552348136901855, "step": 2140 }, { "epoch": 2.6153846153846154, "grad_norm": 2.1410024166107178, "learning_rate": 1.8222713201412034e-06, "loss": 0.9763152599334717, "step": 2142 }, { "epoch": 2.617826617826618, "grad_norm": 1.5893546342849731, "learning_rate": 1.818134561237901e-06, "loss": 0.9420812726020813, "step": 2144 }, { "epoch": 2.6202686202686203, "grad_norm": 8.81820297241211, "learning_rate": 1.814001601022533e-06, "loss": 0.9948893785476685, "step": 2146 }, { "epoch": 2.6227106227106227, "grad_norm": 1.6226983070373535, "learning_rate": 1.8098724563428383e-06, "loss": 0.6544241309165955, "step": 2148 }, { "epoch": 2.625152625152625, "grad_norm": 1.8730573654174805, "learning_rate": 1.8057471440310048e-06, "loss": 1.034470796585083, "step": 2150 }, { "epoch": 2.6275946275946276, "grad_norm": 1.7522205114364624, "learning_rate": 1.8016256809035932e-06, "loss": 1.0132882595062256, "step": 2152 }, { "epoch": 2.63003663003663, "grad_norm": 2.8230953216552734, "learning_rate": 1.7975080837614777e-06, "loss": 0.989703357219696, "step": 2154 }, { "epoch": 2.6324786324786325, "grad_norm": 2.739607334136963, "learning_rate": 1.79339436938977e-06, "loss": 0.9275919198989868, "step": 2156 }, { "epoch": 2.634920634920635, "grad_norm": 2.475738048553467, "learning_rate": 1.7892845545577547e-06, "loss": 0.7446354627609253, "step": 2158 }, { "epoch": 2.6373626373626373, "grad_norm": 3.7959513664245605, "learning_rate": 1.7851786560188223e-06, "loss": 0.5423752069473267, "step": 2160 }, { "epoch": 2.6398046398046398, "grad_norm": 2.074728488922119, "learning_rate": 1.7810766905103972e-06, "loss": 0.7950323820114136, "step": 2162 }, { "epoch": 2.642246642246642, "grad_norm": 1.902423620223999, "learning_rate": 1.776978674753868e-06, "loss": 0.48773831129074097, "step": 2164 }, { "epoch": 2.6446886446886446, "grad_norm": 1.4304314851760864, "learning_rate": 1.7728846254545285e-06, "loss": 0.9862061738967896, "step": 2166 }, { "epoch": 2.647130647130647, "grad_norm": 1.1472234725952148, "learning_rate": 1.7687945593014988e-06, "loss": 0.735059916973114, "step": 2168 }, { "epoch": 2.6495726495726495, "grad_norm": 1.9486029148101807, "learning_rate": 1.764708492967665e-06, "loss": 1.0259606838226318, "step": 2170 }, { "epoch": 2.652014652014652, "grad_norm": 1.9149264097213745, "learning_rate": 1.7606264431096048e-06, "loss": 1.0802158117294312, "step": 2172 }, { "epoch": 2.6544566544566544, "grad_norm": 6.957102298736572, "learning_rate": 1.7565484263675258e-06, "loss": 0.9875915050506592, "step": 2174 }, { "epoch": 2.656898656898657, "grad_norm": 2.418081283569336, "learning_rate": 1.7524744593651948e-06, "loss": 0.7961604595184326, "step": 2176 }, { "epoch": 2.659340659340659, "grad_norm": 2.5019962787628174, "learning_rate": 1.7484045587098681e-06, "loss": 1.029079556465149, "step": 2178 }, { "epoch": 2.6617826617826617, "grad_norm": 39.45793151855469, "learning_rate": 1.7443387409922266e-06, "loss": 1.0245277881622314, "step": 2180 }, { "epoch": 2.664224664224664, "grad_norm": 1.2770639657974243, "learning_rate": 1.740277022786309e-06, "loss": 1.0204907655715942, "step": 2182 }, { "epoch": 2.6666666666666665, "grad_norm": 2.364436149597168, "learning_rate": 1.7362194206494421e-06, "loss": 0.6930133700370789, "step": 2184 }, { "epoch": 2.669108669108669, "grad_norm": 2.3246958255767822, "learning_rate": 1.732165951122171e-06, "loss": 1.0231374502182007, "step": 2186 }, { "epoch": 2.6715506715506714, "grad_norm": 1.607748031616211, "learning_rate": 1.7281166307281972e-06, "loss": 1.094809651374817, "step": 2188 }, { "epoch": 2.6739926739926743, "grad_norm": 2.158128261566162, "learning_rate": 1.7240714759743084e-06, "loss": 1.021047830581665, "step": 2190 }, { "epoch": 2.6764346764346767, "grad_norm": 8.213944435119629, "learning_rate": 1.7200305033503123e-06, "loss": 0.9594013094902039, "step": 2192 }, { "epoch": 2.678876678876679, "grad_norm": 4.63560676574707, "learning_rate": 1.7159937293289639e-06, "loss": 0.3299452066421509, "step": 2194 }, { "epoch": 2.6813186813186816, "grad_norm": 2.0286736488342285, "learning_rate": 1.711961170365909e-06, "loss": 1.214423418045044, "step": 2196 }, { "epoch": 2.683760683760684, "grad_norm": 1.665736198425293, "learning_rate": 1.707932842899605e-06, "loss": 0.9360992908477783, "step": 2198 }, { "epoch": 2.6862026862026864, "grad_norm": 1.7861151695251465, "learning_rate": 1.7039087633512652e-06, "loss": 0.9141231179237366, "step": 2200 }, { "epoch": 2.688644688644689, "grad_norm": 4.079377174377441, "learning_rate": 1.6998889481247827e-06, "loss": 0.6146577596664429, "step": 2202 }, { "epoch": 2.6910866910866913, "grad_norm": 3.120830535888672, "learning_rate": 1.6958734136066708e-06, "loss": 0.7842304110527039, "step": 2204 }, { "epoch": 2.6935286935286937, "grad_norm": 9.338458061218262, "learning_rate": 1.6918621761659885e-06, "loss": 0.4128279983997345, "step": 2206 }, { "epoch": 2.695970695970696, "grad_norm": 5.846481800079346, "learning_rate": 1.6878552521542825e-06, "loss": 0.909477710723877, "step": 2208 }, { "epoch": 2.6984126984126986, "grad_norm": 2.9776010513305664, "learning_rate": 1.6838526579055108e-06, "loss": 0.6446021795272827, "step": 2210 }, { "epoch": 2.700854700854701, "grad_norm": 2.117492914199829, "learning_rate": 1.679854409735989e-06, "loss": 0.9117352962493896, "step": 2212 }, { "epoch": 2.7032967032967035, "grad_norm": 2.0278072357177734, "learning_rate": 1.6758605239443083e-06, "loss": 0.6256328225135803, "step": 2214 }, { "epoch": 2.705738705738706, "grad_norm": 5.5183563232421875, "learning_rate": 1.6718710168112824e-06, "loss": 0.5338436365127563, "step": 2216 }, { "epoch": 2.7081807081807083, "grad_norm": 3.22253155708313, "learning_rate": 1.6678859045998724e-06, "loss": 0.6465069651603699, "step": 2218 }, { "epoch": 2.7106227106227108, "grad_norm": 8.712440490722656, "learning_rate": 1.663905203555125e-06, "loss": 0.3656350374221802, "step": 2220 }, { "epoch": 2.713064713064713, "grad_norm": 2.39136004447937, "learning_rate": 1.6599289299041067e-06, "loss": 0.5852014422416687, "step": 2222 }, { "epoch": 2.7155067155067156, "grad_norm": 3.29854416847229, "learning_rate": 1.6559570998558339e-06, "loss": 0.7199364900588989, "step": 2224 }, { "epoch": 2.717948717948718, "grad_norm": 1.553189754486084, "learning_rate": 1.6519897296012089e-06, "loss": 0.7559410333633423, "step": 2226 }, { "epoch": 2.7203907203907205, "grad_norm": 5.676231384277344, "learning_rate": 1.648026835312954e-06, "loss": 0.7857324481010437, "step": 2228 }, { "epoch": 2.722832722832723, "grad_norm": 2.2479665279388428, "learning_rate": 1.644068433145548e-06, "loss": 0.9991781711578369, "step": 2230 }, { "epoch": 2.7252747252747254, "grad_norm": 19.45795249938965, "learning_rate": 1.640114539235156e-06, "loss": 0.6020703911781311, "step": 2232 }, { "epoch": 2.727716727716728, "grad_norm": 1.4817429780960083, "learning_rate": 1.6361651696995633e-06, "loss": 1.0305383205413818, "step": 2234 }, { "epoch": 2.7301587301587302, "grad_norm": 3.4105312824249268, "learning_rate": 1.6322203406381158e-06, "loss": 1.0053908824920654, "step": 2236 }, { "epoch": 2.7326007326007327, "grad_norm": 1.9684903621673584, "learning_rate": 1.6282800681316485e-06, "loss": 0.9223586320877075, "step": 2238 }, { "epoch": 2.735042735042735, "grad_norm": 3.927523374557495, "learning_rate": 1.6243443682424211e-06, "loss": 0.6888905167579651, "step": 2240 }, { "epoch": 2.7374847374847375, "grad_norm": 9.635194778442383, "learning_rate": 1.6204132570140551e-06, "loss": 0.9834311008453369, "step": 2242 }, { "epoch": 2.73992673992674, "grad_norm": 2.742316722869873, "learning_rate": 1.616486750471466e-06, "loss": 0.5603131055831909, "step": 2244 }, { "epoch": 2.7423687423687424, "grad_norm": 2.2433788776397705, "learning_rate": 1.6125648646207992e-06, "loss": 0.7219388484954834, "step": 2246 }, { "epoch": 2.744810744810745, "grad_norm": 3.132955312728882, "learning_rate": 1.608647615449362e-06, "loss": 0.8298469185829163, "step": 2248 }, { "epoch": 2.7472527472527473, "grad_norm": 2.522810697555542, "learning_rate": 1.604735018925563e-06, "loss": 0.9102773070335388, "step": 2250 }, { "epoch": 2.7496947496947497, "grad_norm": 2.429370164871216, "learning_rate": 1.6008270909988414e-06, "loss": 0.9825899600982666, "step": 2252 }, { "epoch": 2.752136752136752, "grad_norm": 1.3979560136795044, "learning_rate": 1.596923847599611e-06, "loss": 0.694176197052002, "step": 2254 }, { "epoch": 2.7545787545787546, "grad_norm": 3.7129030227661133, "learning_rate": 1.593025304639183e-06, "loss": 0.7678108811378479, "step": 2256 }, { "epoch": 2.757020757020757, "grad_norm": 58.61724090576172, "learning_rate": 1.5891314780097123e-06, "loss": 0.9679561853408813, "step": 2258 }, { "epoch": 2.7594627594627594, "grad_norm": 3.2823469638824463, "learning_rate": 1.585242383584124e-06, "loss": 1.0787243843078613, "step": 2260 }, { "epoch": 2.761904761904762, "grad_norm": 4.105648517608643, "learning_rate": 1.5813580372160558e-06, "loss": 1.0055099725723267, "step": 2262 }, { "epoch": 2.7643467643467643, "grad_norm": 1.8101457357406616, "learning_rate": 1.5774784547397898e-06, "loss": 0.9336439967155457, "step": 2264 }, { "epoch": 2.7667887667887667, "grad_norm": 3.130258798599243, "learning_rate": 1.5736036519701876e-06, "loss": 0.912263035774231, "step": 2266 }, { "epoch": 2.769230769230769, "grad_norm": 0.9224950075149536, "learning_rate": 1.5697336447026257e-06, "loss": 0.7292864918708801, "step": 2268 }, { "epoch": 2.7716727716727716, "grad_norm": 1.59903085231781, "learning_rate": 1.565868448712935e-06, "loss": 0.593657374382019, "step": 2270 }, { "epoch": 2.774114774114774, "grad_norm": 1.2354272603988647, "learning_rate": 1.562008079757329e-06, "loss": 0.2578456699848175, "step": 2272 }, { "epoch": 2.7765567765567765, "grad_norm": 1.8744750022888184, "learning_rate": 1.5581525535723502e-06, "loss": 1.0456628799438477, "step": 2274 }, { "epoch": 2.778998778998779, "grad_norm": 1.6361182928085327, "learning_rate": 1.5543018858747943e-06, "loss": 0.9015727043151855, "step": 2276 }, { "epoch": 2.7814407814407813, "grad_norm": 2.5508196353912354, "learning_rate": 1.550456092361655e-06, "loss": 0.5647008419036865, "step": 2278 }, { "epoch": 2.7838827838827838, "grad_norm": 5.802591323852539, "learning_rate": 1.546615188710055e-06, "loss": 0.341159850358963, "step": 2280 }, { "epoch": 2.786324786324786, "grad_norm": 2.65018630027771, "learning_rate": 1.5427791905771843e-06, "loss": 1.0216097831726074, "step": 2282 }, { "epoch": 2.7887667887667886, "grad_norm": 4.260854721069336, "learning_rate": 1.538948113600237e-06, "loss": 0.8784246444702148, "step": 2284 }, { "epoch": 2.791208791208791, "grad_norm": 3.1193864345550537, "learning_rate": 1.5351219733963453e-06, "loss": 0.9552139043807983, "step": 2286 }, { "epoch": 2.7936507936507935, "grad_norm": 1.618998646736145, "learning_rate": 1.5313007855625153e-06, "loss": 0.9732692241668701, "step": 2288 }, { "epoch": 2.796092796092796, "grad_norm": 6.491037368774414, "learning_rate": 1.5274845656755687e-06, "loss": 0.3624776303768158, "step": 2290 }, { "epoch": 2.7985347985347984, "grad_norm": 2.8097808361053467, "learning_rate": 1.5236733292920735e-06, "loss": 0.8098872303962708, "step": 2292 }, { "epoch": 2.800976800976801, "grad_norm": 2.353226900100708, "learning_rate": 1.5198670919482839e-06, "loss": 0.7608856558799744, "step": 2294 }, { "epoch": 2.8034188034188032, "grad_norm": 1.7207751274108887, "learning_rate": 1.5160658691600737e-06, "loss": 0.8960850834846497, "step": 2296 }, { "epoch": 2.8058608058608057, "grad_norm": 2.5294456481933594, "learning_rate": 1.5122696764228772e-06, "loss": 0.40981659293174744, "step": 2298 }, { "epoch": 2.808302808302808, "grad_norm": 2.1142632961273193, "learning_rate": 1.5084785292116244e-06, "loss": 0.6546359658241272, "step": 2300 }, { "epoch": 2.8107448107448105, "grad_norm": 2.852811574935913, "learning_rate": 1.5046924429806747e-06, "loss": 1.049178123474121, "step": 2302 }, { "epoch": 2.813186813186813, "grad_norm": 0.45758649706840515, "learning_rate": 1.50091143316376e-06, "loss": 0.5754284262657166, "step": 2304 }, { "epoch": 2.8156288156288154, "grad_norm": 1.5217318534851074, "learning_rate": 1.497135515173917e-06, "loss": 0.7435483336448669, "step": 2306 }, { "epoch": 2.818070818070818, "grad_norm": 2.425044298171997, "learning_rate": 1.4933647044034264e-06, "loss": 0.6329599618911743, "step": 2308 }, { "epoch": 2.8205128205128203, "grad_norm": 2.1778621673583984, "learning_rate": 1.489599016223748e-06, "loss": 1.040429949760437, "step": 2310 }, { "epoch": 2.8229548229548227, "grad_norm": 1.7928675413131714, "learning_rate": 1.485838465985463e-06, "loss": 0.6599953770637512, "step": 2312 }, { "epoch": 2.825396825396825, "grad_norm": 6.531580924987793, "learning_rate": 1.482083069018203e-06, "loss": 0.7039975523948669, "step": 2314 }, { "epoch": 2.8278388278388276, "grad_norm": 2.4885547161102295, "learning_rate": 1.4783328406306002e-06, "loss": 0.7224160432815552, "step": 2316 }, { "epoch": 2.8302808302808304, "grad_norm": 2.634704351425171, "learning_rate": 1.4745877961102096e-06, "loss": 1.0425044298171997, "step": 2318 }, { "epoch": 2.832722832722833, "grad_norm": 1.6846414804458618, "learning_rate": 1.4708479507234596e-06, "loss": 0.6160850524902344, "step": 2320 }, { "epoch": 2.8351648351648353, "grad_norm": 6.159206390380859, "learning_rate": 1.4671133197155817e-06, "loss": 0.6913861036300659, "step": 2322 }, { "epoch": 2.8376068376068377, "grad_norm": 0.6180989146232605, "learning_rate": 1.4633839183105531e-06, "loss": 0.19272488355636597, "step": 2324 }, { "epoch": 2.84004884004884, "grad_norm": 1.4327154159545898, "learning_rate": 1.4596597617110327e-06, "loss": 0.8577545285224915, "step": 2326 }, { "epoch": 2.8424908424908426, "grad_norm": 2.3003664016723633, "learning_rate": 1.4559408650982999e-06, "loss": 0.8021556735038757, "step": 2328 }, { "epoch": 2.844932844932845, "grad_norm": 1.9625604152679443, "learning_rate": 1.4522272436321893e-06, "loss": 0.8357652425765991, "step": 2330 }, { "epoch": 2.8473748473748475, "grad_norm": 1.8401294946670532, "learning_rate": 1.4485189124510355e-06, "loss": 1.0011165142059326, "step": 2332 }, { "epoch": 2.84981684981685, "grad_norm": 2.622108221054077, "learning_rate": 1.4448158866716028e-06, "loss": 0.15081661939620972, "step": 2334 }, { "epoch": 2.8522588522588523, "grad_norm": 2.198842763900757, "learning_rate": 1.441118181389035e-06, "loss": 1.0237456560134888, "step": 2336 }, { "epoch": 2.8547008547008548, "grad_norm": 1.9740854501724243, "learning_rate": 1.437425811676781e-06, "loss": 0.6290860176086426, "step": 2338 }, { "epoch": 2.857142857142857, "grad_norm": 1.7903653383255005, "learning_rate": 1.4337387925865435e-06, "loss": 1.0167012214660645, "step": 2340 }, { "epoch": 2.8595848595848596, "grad_norm": 1.402300477027893, "learning_rate": 1.430057139148211e-06, "loss": 1.0619688034057617, "step": 2342 }, { "epoch": 2.862026862026862, "grad_norm": 5.845098972320557, "learning_rate": 1.4263808663698015e-06, "loss": 0.3327184319496155, "step": 2344 }, { "epoch": 2.8644688644688645, "grad_norm": 3.894296169281006, "learning_rate": 1.4227099892373986e-06, "loss": 0.9415085911750793, "step": 2346 }, { "epoch": 2.866910866910867, "grad_norm": 0.5116417407989502, "learning_rate": 1.4190445227150907e-06, "loss": 0.5154658555984497, "step": 2348 }, { "epoch": 2.8693528693528694, "grad_norm": 1.3756489753723145, "learning_rate": 1.4153844817449087e-06, "loss": 0.6424716114997864, "step": 2350 }, { "epoch": 2.871794871794872, "grad_norm": 1.572013020515442, "learning_rate": 1.4117298812467687e-06, "loss": 0.7699521780014038, "step": 2352 }, { "epoch": 2.8742368742368742, "grad_norm": 51.821266174316406, "learning_rate": 1.4080807361184088e-06, "loss": 0.5482099652290344, "step": 2354 }, { "epoch": 2.8766788766788767, "grad_norm": 1.6499661207199097, "learning_rate": 1.4044370612353281e-06, "loss": 0.906887412071228, "step": 2356 }, { "epoch": 2.879120879120879, "grad_norm": 4.545080661773682, "learning_rate": 1.400798871450726e-06, "loss": 0.784338653087616, "step": 2358 }, { "epoch": 2.8815628815628815, "grad_norm": 2.03543758392334, "learning_rate": 1.397166181595443e-06, "loss": 0.5577901005744934, "step": 2360 }, { "epoch": 2.884004884004884, "grad_norm": 5.208932399749756, "learning_rate": 1.3935390064779008e-06, "loss": 0.7476451992988586, "step": 2362 }, { "epoch": 2.8864468864468864, "grad_norm": 2.228670835494995, "learning_rate": 1.3899173608840378e-06, "loss": 1.051893949508667, "step": 2364 }, { "epoch": 2.888888888888889, "grad_norm": 1.407289743423462, "learning_rate": 1.3863012595772531e-06, "loss": 1.076530933380127, "step": 2366 }, { "epoch": 2.8913308913308913, "grad_norm": 22.543790817260742, "learning_rate": 1.3826907172983456e-06, "loss": 0.846904993057251, "step": 2368 }, { "epoch": 2.8937728937728937, "grad_norm": 1.6555728912353516, "learning_rate": 1.3790857487654535e-06, "loss": 1.1604909896850586, "step": 2370 }, { "epoch": 2.896214896214896, "grad_norm": 2.013773202896118, "learning_rate": 1.3754863686739906e-06, "loss": 0.915320634841919, "step": 2372 }, { "epoch": 2.8986568986568986, "grad_norm": 2.000242233276367, "learning_rate": 1.3718925916965945e-06, "loss": 0.7186045050621033, "step": 2374 }, { "epoch": 2.901098901098901, "grad_norm": 1.6389049291610718, "learning_rate": 1.3683044324830573e-06, "loss": 0.9410088658332825, "step": 2376 }, { "epoch": 2.9035409035409034, "grad_norm": 2.346830129623413, "learning_rate": 1.3647219056602757e-06, "loss": 1.0101977586746216, "step": 2378 }, { "epoch": 2.905982905982906, "grad_norm": 1.5173600912094116, "learning_rate": 1.361145025832182e-06, "loss": 0.8229511976242065, "step": 2380 }, { "epoch": 2.9084249084249083, "grad_norm": 2.14473557472229, "learning_rate": 1.3575738075796923e-06, "loss": 0.9482402801513672, "step": 2382 }, { "epoch": 2.9108669108669107, "grad_norm": 1.9663830995559692, "learning_rate": 1.35400826546064e-06, "loss": 0.9494956135749817, "step": 2384 }, { "epoch": 2.913308913308913, "grad_norm": 5.7204909324646, "learning_rate": 1.350448414009723e-06, "loss": 1.0107911825180054, "step": 2386 }, { "epoch": 2.9157509157509156, "grad_norm": 2.0800321102142334, "learning_rate": 1.3468942677384408e-06, "loss": 0.8393886089324951, "step": 2388 }, { "epoch": 2.918192918192918, "grad_norm": 6.8317461013793945, "learning_rate": 1.343345841135037e-06, "loss": 0.46943965554237366, "step": 2390 }, { "epoch": 2.9206349206349205, "grad_norm": 3.705573081970215, "learning_rate": 1.3398031486644366e-06, "loss": 0.5753905177116394, "step": 2392 }, { "epoch": 2.9230769230769234, "grad_norm": 1.4765284061431885, "learning_rate": 1.3362662047681928e-06, "loss": 0.8073123097419739, "step": 2394 }, { "epoch": 2.925518925518926, "grad_norm": 8.290820121765137, "learning_rate": 1.3327350238644224e-06, "loss": 0.6432682871818542, "step": 2396 }, { "epoch": 2.927960927960928, "grad_norm": 33.93027877807617, "learning_rate": 1.3292096203477533e-06, "loss": 0.6455587148666382, "step": 2398 }, { "epoch": 2.9304029304029307, "grad_norm": 15.2852201461792, "learning_rate": 1.3256900085892584e-06, "loss": 0.5954673290252686, "step": 2400 }, { "epoch": 2.932844932844933, "grad_norm": 2.53251051902771, "learning_rate": 1.3221762029364043e-06, "loss": 0.656650960445404, "step": 2402 }, { "epoch": 2.9352869352869355, "grad_norm": 3.878647565841675, "learning_rate": 1.3186682177129862e-06, "loss": 0.3129318654537201, "step": 2404 }, { "epoch": 2.937728937728938, "grad_norm": 3.8784711360931396, "learning_rate": 1.3151660672190744e-06, "loss": 1.0069366693496704, "step": 2406 }, { "epoch": 2.9401709401709404, "grad_norm": 1.668820858001709, "learning_rate": 1.3116697657309547e-06, "loss": 0.7313091158866882, "step": 2408 }, { "epoch": 2.942612942612943, "grad_norm": 2.024500846862793, "learning_rate": 1.3081793275010699e-06, "loss": 0.6760754585266113, "step": 2410 }, { "epoch": 2.9450549450549453, "grad_norm": 1.469117522239685, "learning_rate": 1.3046947667579596e-06, "loss": 0.9695707559585571, "step": 2412 }, { "epoch": 2.9474969474969477, "grad_norm": 2.2661683559417725, "learning_rate": 1.301216097706206e-06, "loss": 1.0303492546081543, "step": 2414 }, { "epoch": 2.94993894993895, "grad_norm": 1.312300682067871, "learning_rate": 1.2977433345263752e-06, "loss": 0.9242293238639832, "step": 2416 }, { "epoch": 2.9523809523809526, "grad_norm": 2.9520680904388428, "learning_rate": 1.2942764913749544e-06, "loss": 0.6899678707122803, "step": 2418 }, { "epoch": 2.954822954822955, "grad_norm": 1.741718053817749, "learning_rate": 1.2908155823843033e-06, "loss": 0.9872897267341614, "step": 2420 }, { "epoch": 2.9572649572649574, "grad_norm": 1.809373378753662, "learning_rate": 1.2873606216625879e-06, "loss": 0.8448399305343628, "step": 2422 }, { "epoch": 2.95970695970696, "grad_norm": 2.35263991355896, "learning_rate": 1.2839116232937271e-06, "loss": 0.5212328433990479, "step": 2424 }, { "epoch": 2.9621489621489623, "grad_norm": 2.598365068435669, "learning_rate": 1.280468601337335e-06, "loss": 1.1081678867340088, "step": 2426 }, { "epoch": 2.9645909645909647, "grad_norm": 11.081291198730469, "learning_rate": 1.2770315698286643e-06, "loss": 0.5913952589035034, "step": 2428 }, { "epoch": 2.967032967032967, "grad_norm": 2.6343994140625, "learning_rate": 1.273600542778546e-06, "loss": 0.9255035519599915, "step": 2430 }, { "epoch": 2.9694749694749696, "grad_norm": 2.8472864627838135, "learning_rate": 1.2701755341733363e-06, "loss": 0.8645012378692627, "step": 2432 }, { "epoch": 2.971916971916972, "grad_norm": 1.3869469165802002, "learning_rate": 1.2667565579748552e-06, "loss": 0.9598724246025085, "step": 2434 }, { "epoch": 2.9743589743589745, "grad_norm": 1.7193245887756348, "learning_rate": 1.2633436281203353e-06, "loss": 0.5284073948860168, "step": 2436 }, { "epoch": 2.976800976800977, "grad_norm": 4.513336181640625, "learning_rate": 1.2599367585223573e-06, "loss": 0.5111241340637207, "step": 2438 }, { "epoch": 2.9792429792429793, "grad_norm": 3.2821226119995117, "learning_rate": 1.2565359630688029e-06, "loss": 0.9840971231460571, "step": 2440 }, { "epoch": 2.9816849816849818, "grad_norm": 1.6997895240783691, "learning_rate": 1.2531412556227883e-06, "loss": 1.0207282304763794, "step": 2442 }, { "epoch": 2.984126984126984, "grad_norm": 2.6019296646118164, "learning_rate": 1.2497526500226163e-06, "loss": 0.940024197101593, "step": 2444 }, { "epoch": 2.9865689865689866, "grad_norm": 2.273214101791382, "learning_rate": 1.246370160081711e-06, "loss": 0.9067605137825012, "step": 2446 }, { "epoch": 2.989010989010989, "grad_norm": 1.8762654066085815, "learning_rate": 1.2429937995885713e-06, "loss": 0.93479323387146, "step": 2448 }, { "epoch": 2.9914529914529915, "grad_norm": 5.84881067276001, "learning_rate": 1.2396235823067076e-06, "loss": 0.6413801312446594, "step": 2450 }, { "epoch": 2.993894993894994, "grad_norm": 1.7108796834945679, "learning_rate": 1.2362595219745882e-06, "loss": 1.0565381050109863, "step": 2452 }, { "epoch": 2.9963369963369964, "grad_norm": 1.5569299459457397, "learning_rate": 1.2329016323055822e-06, "loss": 0.9824570417404175, "step": 2454 }, { "epoch": 2.998778998778999, "grad_norm": 2.560699939727783, "learning_rate": 1.2295499269879063e-06, "loss": 0.5337162613868713, "step": 2456 }, { "epoch": 3.001221001221001, "grad_norm": 3.085305690765381, "learning_rate": 1.2262044196845638e-06, "loss": 0.6332882046699524, "step": 2458 }, { "epoch": 3.0036630036630036, "grad_norm": 3.3725106716156006, "learning_rate": 1.2228651240332972e-06, "loss": 0.62852543592453, "step": 2460 }, { "epoch": 3.006105006105006, "grad_norm": 1.6869263648986816, "learning_rate": 1.2195320536465225e-06, "loss": 1.0432286262512207, "step": 2462 }, { "epoch": 3.0085470085470085, "grad_norm": 1.3357821702957153, "learning_rate": 1.2162052221112828e-06, "loss": 0.962488055229187, "step": 2464 }, { "epoch": 3.010989010989011, "grad_norm": 4.632596492767334, "learning_rate": 1.2128846429891852e-06, "loss": 0.5416973233222961, "step": 2466 }, { "epoch": 3.0134310134310134, "grad_norm": 1.600440502166748, "learning_rate": 1.2095703298163526e-06, "loss": 0.8857253789901733, "step": 2468 }, { "epoch": 3.015873015873016, "grad_norm": 0.576468288898468, "learning_rate": 1.2062622961033632e-06, "loss": 0.2631528675556183, "step": 2470 }, { "epoch": 3.0183150183150182, "grad_norm": 5.257506370544434, "learning_rate": 1.2029605553351988e-06, "loss": 0.3512267470359802, "step": 2472 }, { "epoch": 3.0207570207570207, "grad_norm": 2.005457639694214, "learning_rate": 1.199665120971188e-06, "loss": 0.9261833429336548, "step": 2474 }, { "epoch": 3.023199023199023, "grad_norm": 5.405751705169678, "learning_rate": 1.1963760064449495e-06, "loss": 0.5271846652030945, "step": 2476 }, { "epoch": 3.0256410256410255, "grad_norm": 1.659690499305725, "learning_rate": 1.1930932251643438e-06, "loss": 0.6160858869552612, "step": 2478 }, { "epoch": 3.028083028083028, "grad_norm": 1.8383840322494507, "learning_rate": 1.189816790511409e-06, "loss": 0.8536359667778015, "step": 2480 }, { "epoch": 3.0305250305250304, "grad_norm": 2.1919424533843994, "learning_rate": 1.1865467158423179e-06, "loss": 0.9045109152793884, "step": 2482 }, { "epoch": 3.032967032967033, "grad_norm": 1.5028966665267944, "learning_rate": 1.1832830144873122e-06, "loss": 0.6014432907104492, "step": 2484 }, { "epoch": 3.0354090354090353, "grad_norm": 2.3299906253814697, "learning_rate": 1.1800256997506557e-06, "loss": 0.8661763072013855, "step": 2486 }, { "epoch": 3.0378510378510377, "grad_norm": 9.991959571838379, "learning_rate": 1.176774784910576e-06, "loss": 0.6161713600158691, "step": 2488 }, { "epoch": 3.04029304029304, "grad_norm": 2.847564697265625, "learning_rate": 1.1735302832192135e-06, "loss": 0.8722133636474609, "step": 2490 }, { "epoch": 3.0427350427350426, "grad_norm": 2.7239389419555664, "learning_rate": 1.1702922079025647e-06, "loss": 0.3192221522331238, "step": 2492 }, { "epoch": 3.045177045177045, "grad_norm": 0.7756720185279846, "learning_rate": 1.1670605721604307e-06, "loss": 0.2883589565753937, "step": 2494 }, { "epoch": 3.0476190476190474, "grad_norm": 4.5272135734558105, "learning_rate": 1.1638353891663602e-06, "loss": 0.6891329288482666, "step": 2496 }, { "epoch": 3.05006105006105, "grad_norm": 4.521149635314941, "learning_rate": 1.1606166720675999e-06, "loss": 0.45780226588249207, "step": 2498 }, { "epoch": 3.0525030525030523, "grad_norm": 0.8906940221786499, "learning_rate": 1.157404433985035e-06, "loss": 0.5027573704719543, "step": 2500 }, { "epoch": 3.0549450549450547, "grad_norm": 9.020967483520508, "learning_rate": 1.1541986880131455e-06, "loss": 0.4361349642276764, "step": 2502 }, { "epoch": 3.057387057387057, "grad_norm": 2.3300914764404297, "learning_rate": 1.1509994472199407e-06, "loss": 0.8963256478309631, "step": 2504 }, { "epoch": 3.0598290598290596, "grad_norm": 2.031867027282715, "learning_rate": 1.1478067246469158e-06, "loss": 0.4999798536300659, "step": 2506 }, { "epoch": 3.062271062271062, "grad_norm": 1.6989192962646484, "learning_rate": 1.1446205333089922e-06, "loss": 0.7561573386192322, "step": 2508 }, { "epoch": 3.064713064713065, "grad_norm": 4.123907089233398, "learning_rate": 1.1414408861944695e-06, "loss": 0.8584511876106262, "step": 2510 }, { "epoch": 3.0671550671550674, "grad_norm": 0.06403572857379913, "learning_rate": 1.1382677962649687e-06, "loss": 0.3911321461200714, "step": 2512 }, { "epoch": 3.06959706959707, "grad_norm": 4.289177894592285, "learning_rate": 1.1351012764553828e-06, "loss": 0.8152522444725037, "step": 2514 }, { "epoch": 3.0720390720390722, "grad_norm": 2.2127068042755127, "learning_rate": 1.1319413396738188e-06, "loss": 0.5816116333007812, "step": 2516 }, { "epoch": 3.0744810744810747, "grad_norm": 0.09444202482700348, "learning_rate": 1.128787998801552e-06, "loss": 0.20017878711223602, "step": 2518 }, { "epoch": 3.076923076923077, "grad_norm": 2.8026583194732666, "learning_rate": 1.1256412666929655e-06, "loss": 0.514468789100647, "step": 2520 }, { "epoch": 3.0793650793650795, "grad_norm": 2.7216711044311523, "learning_rate": 1.1225011561755093e-06, "loss": 0.6835171580314636, "step": 2522 }, { "epoch": 3.081807081807082, "grad_norm": 2.2049448490142822, "learning_rate": 1.1193676800496326e-06, "loss": 0.8667712211608887, "step": 2524 }, { "epoch": 3.0842490842490844, "grad_norm": 11.64513111114502, "learning_rate": 1.1162408510887469e-06, "loss": 0.5643727779388428, "step": 2526 }, { "epoch": 3.086691086691087, "grad_norm": 2.371492862701416, "learning_rate": 1.1131206820391618e-06, "loss": 0.5264307856559753, "step": 2528 }, { "epoch": 3.0891330891330893, "grad_norm": 8.419890403747559, "learning_rate": 1.1100071856200413e-06, "loss": 0.11923594772815704, "step": 2530 }, { "epoch": 3.0915750915750917, "grad_norm": 4.913514137268066, "learning_rate": 1.106900374523348e-06, "loss": 0.32799002528190613, "step": 2532 }, { "epoch": 3.094017094017094, "grad_norm": 2.237429141998291, "learning_rate": 1.1038002614137922e-06, "loss": 0.8726149797439575, "step": 2534 }, { "epoch": 3.0964590964590966, "grad_norm": 5.674133777618408, "learning_rate": 1.1007068589287814e-06, "loss": 0.635856568813324, "step": 2536 }, { "epoch": 3.098901098901099, "grad_norm": 24.948986053466797, "learning_rate": 1.0976201796783642e-06, "loss": 0.6740862131118774, "step": 2538 }, { "epoch": 3.1013431013431014, "grad_norm": 2.047201633453369, "learning_rate": 1.0945402362451871e-06, "loss": 0.9215976595878601, "step": 2540 }, { "epoch": 3.103785103785104, "grad_norm": 6.655368328094482, "learning_rate": 1.0914670411844338e-06, "loss": 0.559134304523468, "step": 2542 }, { "epoch": 3.1062271062271063, "grad_norm": 1.7727597951889038, "learning_rate": 1.0884006070237834e-06, "loss": 0.5720962285995483, "step": 2544 }, { "epoch": 3.1086691086691087, "grad_norm": 3.6166863441467285, "learning_rate": 1.0853409462633507e-06, "loss": 0.16919654607772827, "step": 2546 }, { "epoch": 3.111111111111111, "grad_norm": 2.884989023208618, "learning_rate": 1.0822880713756422e-06, "loss": 0.639471173286438, "step": 2548 }, { "epoch": 3.1135531135531136, "grad_norm": 1.7115203142166138, "learning_rate": 1.0792419948054994e-06, "loss": 0.6552147269248962, "step": 2550 }, { "epoch": 3.115995115995116, "grad_norm": 3.130906343460083, "learning_rate": 1.0762027289700527e-06, "loss": 0.2590104043483734, "step": 2552 }, { "epoch": 3.1184371184371185, "grad_norm": 2.566354751586914, "learning_rate": 1.0731702862586686e-06, "loss": 0.8442977666854858, "step": 2554 }, { "epoch": 3.120879120879121, "grad_norm": 2.08247709274292, "learning_rate": 1.070144679032901e-06, "loss": 0.37470126152038574, "step": 2556 }, { "epoch": 3.1233211233211233, "grad_norm": 0.45577648282051086, "learning_rate": 1.0671259196264355e-06, "loss": 0.4773566722869873, "step": 2558 }, { "epoch": 3.1257631257631258, "grad_norm": 2.147977590560913, "learning_rate": 1.064114020345048e-06, "loss": 0.847014844417572, "step": 2560 }, { "epoch": 3.128205128205128, "grad_norm": 6.595324516296387, "learning_rate": 1.0611089934665438e-06, "loss": 1.0399620532989502, "step": 2562 }, { "epoch": 3.1306471306471306, "grad_norm": 3.0821518898010254, "learning_rate": 1.0581108512407206e-06, "loss": 0.8594496250152588, "step": 2564 }, { "epoch": 3.133089133089133, "grad_norm": 6.90889310836792, "learning_rate": 1.055119605889304e-06, "loss": 0.6531029939651489, "step": 2566 }, { "epoch": 3.1355311355311355, "grad_norm": 5.536701679229736, "learning_rate": 1.0521352696059106e-06, "loss": 0.8756755590438843, "step": 2568 }, { "epoch": 3.137973137973138, "grad_norm": 7.192801475524902, "learning_rate": 1.0491578545559882e-06, "loss": 0.5930169820785522, "step": 2570 }, { "epoch": 3.1404151404151404, "grad_norm": 5.717547416687012, "learning_rate": 1.0461873728767735e-06, "loss": 0.6029551029205322, "step": 2572 }, { "epoch": 3.142857142857143, "grad_norm": 4.090261936187744, "learning_rate": 1.043223836677239e-06, "loss": 0.8777113556861877, "step": 2574 }, { "epoch": 3.1452991452991452, "grad_norm": 14.0100736618042, "learning_rate": 1.040267258038045e-06, "loss": 0.9692713022232056, "step": 2576 }, { "epoch": 3.1477411477411477, "grad_norm": 3.899435520172119, "learning_rate": 1.0373176490114874e-06, "loss": 0.8949326276779175, "step": 2578 }, { "epoch": 3.15018315018315, "grad_norm": 5.538814067840576, "learning_rate": 1.0343750216214546e-06, "loss": 0.8762179017066956, "step": 2580 }, { "epoch": 3.1526251526251525, "grad_norm": 7.619011402130127, "learning_rate": 1.0314393878633705e-06, "loss": 0.7504989504814148, "step": 2582 }, { "epoch": 3.155067155067155, "grad_norm": 3.597076416015625, "learning_rate": 1.0285107597041552e-06, "loss": 0.31154295802116394, "step": 2584 }, { "epoch": 3.1575091575091574, "grad_norm": 3.5088987350463867, "learning_rate": 1.0255891490821657e-06, "loss": 0.6339558362960815, "step": 2586 }, { "epoch": 3.15995115995116, "grad_norm": 5.022501468658447, "learning_rate": 1.0226745679071555e-06, "loss": 0.328271746635437, "step": 2588 }, { "epoch": 3.1623931623931623, "grad_norm": 4.233664035797119, "learning_rate": 1.0197670280602234e-06, "loss": 0.35303497314453125, "step": 2590 }, { "epoch": 3.1648351648351647, "grad_norm": 2.7248518466949463, "learning_rate": 1.016866541393762e-06, "loss": 0.8729944825172424, "step": 2592 }, { "epoch": 3.167277167277167, "grad_norm": 2.3809876441955566, "learning_rate": 1.0139731197314144e-06, "loss": 0.7970367074012756, "step": 2594 }, { "epoch": 3.1697191697191696, "grad_norm": 5.26347017288208, "learning_rate": 1.0110867748680229e-06, "loss": 0.6249693632125854, "step": 2596 }, { "epoch": 3.172161172161172, "grad_norm": 2.0786538124084473, "learning_rate": 1.0082075185695821e-06, "loss": 0.8957004547119141, "step": 2598 }, { "epoch": 3.1746031746031744, "grad_norm": 2.350102424621582, "learning_rate": 1.0053353625731898e-06, "loss": 0.773188591003418, "step": 2600 }, { "epoch": 3.177045177045177, "grad_norm": 2.2141921520233154, "learning_rate": 1.0024703185870009e-06, "loss": 0.8564462065696716, "step": 2602 }, { "epoch": 3.1794871794871793, "grad_norm": 1.9684959650039673, "learning_rate": 9.99612398290176e-07, "loss": 0.8819740414619446, "step": 2604 }, { "epoch": 3.1819291819291817, "grad_norm": 9.041963577270508, "learning_rate": 9.967616133328415e-07, "loss": 0.6753929257392883, "step": 2606 }, { "epoch": 3.1843711843711846, "grad_norm": 3.164386510848999, "learning_rate": 9.939179753360317e-07, "loss": 0.9383725523948669, "step": 2608 }, { "epoch": 3.186813186813187, "grad_norm": 1.6950500011444092, "learning_rate": 9.910814958916509e-07, "loss": 0.8148356676101685, "step": 2610 }, { "epoch": 3.1892551892551895, "grad_norm": 1.7868210077285767, "learning_rate": 9.882521865624188e-07, "loss": 0.8345255255699158, "step": 2612 }, { "epoch": 3.191697191697192, "grad_norm": 2.3038735389709473, "learning_rate": 9.854300588818285e-07, "loss": 0.5892983078956604, "step": 2614 }, { "epoch": 3.1941391941391943, "grad_norm": 3.3323488235473633, "learning_rate": 9.826151243540976e-07, "loss": 0.5326892137527466, "step": 2616 }, { "epoch": 3.1965811965811968, "grad_norm": 5.719020366668701, "learning_rate": 9.798073944541209e-07, "loss": 0.5761935114860535, "step": 2618 }, { "epoch": 3.199023199023199, "grad_norm": 4.963831424713135, "learning_rate": 9.77006880627423e-07, "loss": 0.35491544008255005, "step": 2620 }, { "epoch": 3.2014652014652016, "grad_norm": 1.7607579231262207, "learning_rate": 9.742135942901152e-07, "loss": 0.5363562703132629, "step": 2622 }, { "epoch": 3.203907203907204, "grad_norm": 15.481447219848633, "learning_rate": 9.714275468288426e-07, "loss": 0.43480369448661804, "step": 2624 }, { "epoch": 3.2063492063492065, "grad_norm": 2.512791633605957, "learning_rate": 9.68648749600746e-07, "loss": 0.9965137839317322, "step": 2626 }, { "epoch": 3.208791208791209, "grad_norm": 38.066707611083984, "learning_rate": 9.658772139334074e-07, "loss": 0.227127343416214, "step": 2628 }, { "epoch": 3.2112332112332114, "grad_norm": 2.2914047241210938, "learning_rate": 9.631129511248099e-07, "loss": 0.9076048135757446, "step": 2630 }, { "epoch": 3.213675213675214, "grad_norm": 3.0350992679595947, "learning_rate": 9.603559724432874e-07, "loss": 0.5686833262443542, "step": 2632 }, { "epoch": 3.2161172161172162, "grad_norm": 2.278398036956787, "learning_rate": 9.576062891274816e-07, "loss": 0.6908602714538574, "step": 2634 }, { "epoch": 3.2185592185592187, "grad_norm": 7.601328372955322, "learning_rate": 9.548639123862952e-07, "loss": 0.81014084815979, "step": 2636 }, { "epoch": 3.221001221001221, "grad_norm": 4.974308490753174, "learning_rate": 9.52128853398847e-07, "loss": 0.6480343341827393, "step": 2638 }, { "epoch": 3.2234432234432235, "grad_norm": 5.386457443237305, "learning_rate": 9.494011233144227e-07, "loss": 0.6495685577392578, "step": 2640 }, { "epoch": 3.225885225885226, "grad_norm": 3.7221124172210693, "learning_rate": 9.466807332524343e-07, "loss": 0.885014533996582, "step": 2642 }, { "epoch": 3.2283272283272284, "grad_norm": 2.109729051589966, "learning_rate": 9.439676943023732e-07, "loss": 0.8729287385940552, "step": 2644 }, { "epoch": 3.230769230769231, "grad_norm": 1.1997343301773071, "learning_rate": 9.412620175237621e-07, "loss": 0.913487434387207, "step": 2646 }, { "epoch": 3.2332112332112333, "grad_norm": 5.304439544677734, "learning_rate": 9.385637139461151e-07, "loss": 0.9510135650634766, "step": 2648 }, { "epoch": 3.2356532356532357, "grad_norm": 8.256377220153809, "learning_rate": 9.358727945688877e-07, "loss": 0.2964293956756592, "step": 2650 }, { "epoch": 3.238095238095238, "grad_norm": 2.2458744049072266, "learning_rate": 9.331892703614359e-07, "loss": 0.8582343459129333, "step": 2652 }, { "epoch": 3.2405372405372406, "grad_norm": 2.5474815368652344, "learning_rate": 9.305131522629679e-07, "loss": 1.0781978368759155, "step": 2654 }, { "epoch": 3.242979242979243, "grad_norm": 1.53843092918396, "learning_rate": 9.27844451182503e-07, "loss": 0.4822746217250824, "step": 2656 }, { "epoch": 3.2454212454212454, "grad_norm": 5.753013610839844, "learning_rate": 9.251831779988252e-07, "loss": 0.3543876111507416, "step": 2658 }, { "epoch": 3.247863247863248, "grad_norm": 6.03514289855957, "learning_rate": 9.22529343560439e-07, "loss": 0.5339376330375671, "step": 2660 }, { "epoch": 3.2503052503052503, "grad_norm": 3.018615245819092, "learning_rate": 9.19882958685524e-07, "loss": 1.2899425029754639, "step": 2662 }, { "epoch": 3.2527472527472527, "grad_norm": 4.987201690673828, "learning_rate": 9.172440341618951e-07, "loss": 0.6661590337753296, "step": 2664 }, { "epoch": 3.255189255189255, "grad_norm": 3.2230803966522217, "learning_rate": 9.146125807469525e-07, "loss": 0.6229037642478943, "step": 2666 }, { "epoch": 3.2576312576312576, "grad_norm": 5.4705071449279785, "learning_rate": 9.119886091676436e-07, "loss": 0.9204983711242676, "step": 2668 }, { "epoch": 3.26007326007326, "grad_norm": 1.7358949184417725, "learning_rate": 9.093721301204143e-07, "loss": 0.8217456340789795, "step": 2670 }, { "epoch": 3.2625152625152625, "grad_norm": 2.2710583209991455, "learning_rate": 9.067631542711692e-07, "loss": 0.5310102701187134, "step": 2672 }, { "epoch": 3.264957264957265, "grad_norm": 23.32669448852539, "learning_rate": 9.041616922552254e-07, "loss": 0.1262706220149994, "step": 2674 }, { "epoch": 3.2673992673992673, "grad_norm": 2.316551923751831, "learning_rate": 9.015677546772717e-07, "loss": 0.9631689190864563, "step": 2676 }, { "epoch": 3.2698412698412698, "grad_norm": 5.279893398284912, "learning_rate": 8.989813521113232e-07, "loss": 0.7836791276931763, "step": 2678 }, { "epoch": 3.272283272283272, "grad_norm": 0.7904373407363892, "learning_rate": 8.964024951006798e-07, "loss": 0.49453315138816833, "step": 2680 }, { "epoch": 3.2747252747252746, "grad_norm": 4.579488277435303, "learning_rate": 8.938311941578806e-07, "loss": 0.48266905546188354, "step": 2682 }, { "epoch": 3.277167277167277, "grad_norm": 2.3565316200256348, "learning_rate": 8.912674597646653e-07, "loss": 0.6459278464317322, "step": 2684 }, { "epoch": 3.2796092796092795, "grad_norm": 3.3402786254882812, "learning_rate": 8.887113023719262e-07, "loss": 1.0020655393600464, "step": 2686 }, { "epoch": 3.282051282051282, "grad_norm": 0.46372556686401367, "learning_rate": 8.861627323996724e-07, "loss": 0.08561723679304123, "step": 2688 }, { "epoch": 3.2844932844932844, "grad_norm": 2.682774782180786, "learning_rate": 8.836217602369799e-07, "loss": 1.0556048154830933, "step": 2690 }, { "epoch": 3.286935286935287, "grad_norm": 2.9882302284240723, "learning_rate": 8.810883962419542e-07, "loss": 0.9429636001586914, "step": 2692 }, { "epoch": 3.2893772893772892, "grad_norm": 0.21104033291339874, "learning_rate": 8.785626507416855e-07, "loss": 0.11109757423400879, "step": 2694 }, { "epoch": 3.2918192918192917, "grad_norm": 0.5581515431404114, "learning_rate": 8.760445340322096e-07, "loss": 0.17564286291599274, "step": 2696 }, { "epoch": 3.294261294261294, "grad_norm": 2.4714837074279785, "learning_rate": 8.735340563784625e-07, "loss": 0.6768051385879517, "step": 2698 }, { "epoch": 3.2967032967032965, "grad_norm": 2.1521193981170654, "learning_rate": 8.710312280142416e-07, "loss": 0.9193722605705261, "step": 2700 }, { "epoch": 3.299145299145299, "grad_norm": 13.212026596069336, "learning_rate": 8.685360591421598e-07, "loss": 0.9638568758964539, "step": 2702 }, { "epoch": 3.3015873015873014, "grad_norm": 2.658658266067505, "learning_rate": 8.660485599336094e-07, "loss": 0.8721650838851929, "step": 2704 }, { "epoch": 3.304029304029304, "grad_norm": 2.8447883129119873, "learning_rate": 8.635687405287171e-07, "loss": 0.7735913991928101, "step": 2706 }, { "epoch": 3.3064713064713063, "grad_norm": 1.3805103302001953, "learning_rate": 8.610966110363014e-07, "loss": 0.5056965351104736, "step": 2708 }, { "epoch": 3.3089133089133087, "grad_norm": 1.813744068145752, "learning_rate": 8.586321815338361e-07, "loss": 0.57419753074646, "step": 2710 }, { "epoch": 3.311355311355311, "grad_norm": 2.599118232727051, "learning_rate": 8.56175462067405e-07, "loss": 0.8707802295684814, "step": 2712 }, { "epoch": 3.3137973137973136, "grad_norm": 2.0446720123291016, "learning_rate": 8.537264626516634e-07, "loss": 0.5774456262588501, "step": 2714 }, { "epoch": 3.316239316239316, "grad_norm": 3.4925484657287598, "learning_rate": 8.512851932697947e-07, "loss": 0.9497953653335571, "step": 2716 }, { "epoch": 3.3186813186813184, "grad_norm": 3.462381601333618, "learning_rate": 8.488516638734731e-07, "loss": 0.8057655692100525, "step": 2718 }, { "epoch": 3.3211233211233213, "grad_norm": 1.8965480327606201, "learning_rate": 8.464258843828202e-07, "loss": 0.8699415326118469, "step": 2720 }, { "epoch": 3.3235653235653237, "grad_norm": 2.465651750564575, "learning_rate": 8.440078646863664e-07, "loss": 0.641089141368866, "step": 2722 }, { "epoch": 3.326007326007326, "grad_norm": 0.7584552764892578, "learning_rate": 8.415976146410084e-07, "loss": 0.09330576658248901, "step": 2724 }, { "epoch": 3.3284493284493286, "grad_norm": 2.273700714111328, "learning_rate": 8.391951440719725e-07, "loss": 0.5427566766738892, "step": 2726 }, { "epoch": 3.330891330891331, "grad_norm": 2.9195804595947266, "learning_rate": 8.368004627727699e-07, "loss": 0.8910986185073853, "step": 2728 }, { "epoch": 3.3333333333333335, "grad_norm": 4.608819007873535, "learning_rate": 8.344135805051629e-07, "loss": 0.7685779929161072, "step": 2730 }, { "epoch": 3.335775335775336, "grad_norm": 0.7064034938812256, "learning_rate": 8.320345069991175e-07, "loss": 0.5918761491775513, "step": 2732 }, { "epoch": 3.3382173382173383, "grad_norm": 3.630662202835083, "learning_rate": 8.296632519527711e-07, "loss": 0.6658087372779846, "step": 2734 }, { "epoch": 3.340659340659341, "grad_norm": 2.0108442306518555, "learning_rate": 8.272998250323872e-07, "loss": 0.7752918004989624, "step": 2736 }, { "epoch": 3.343101343101343, "grad_norm": 1.0330065488815308, "learning_rate": 8.249442358723204e-07, "loss": 0.5759359002113342, "step": 2738 }, { "epoch": 3.3455433455433456, "grad_norm": 3.048520803451538, "learning_rate": 8.225964940749737e-07, "loss": 0.5758652687072754, "step": 2740 }, { "epoch": 3.347985347985348, "grad_norm": 4.6908392906188965, "learning_rate": 8.202566092107628e-07, "loss": 0.8692240118980408, "step": 2742 }, { "epoch": 3.3504273504273505, "grad_norm": 11.141556739807129, "learning_rate": 8.179245908180724e-07, "loss": 0.5387795567512512, "step": 2744 }, { "epoch": 3.352869352869353, "grad_norm": 3.472233533859253, "learning_rate": 8.156004484032226e-07, "loss": 0.7473067045211792, "step": 2746 }, { "epoch": 3.3553113553113554, "grad_norm": 7.765378475189209, "learning_rate": 8.132841914404253e-07, "loss": 0.4999602437019348, "step": 2748 }, { "epoch": 3.357753357753358, "grad_norm": 9.492226600646973, "learning_rate": 8.109758293717505e-07, "loss": 0.36286643147468567, "step": 2750 }, { "epoch": 3.3601953601953602, "grad_norm": 0.9999972581863403, "learning_rate": 8.086753716070828e-07, "loss": 0.402780145406723, "step": 2752 }, { "epoch": 3.3626373626373627, "grad_norm": 2.837510108947754, "learning_rate": 8.063828275240873e-07, "loss": 0.4516952335834503, "step": 2754 }, { "epoch": 3.365079365079365, "grad_norm": 4.009491443634033, "learning_rate": 8.040982064681671e-07, "loss": 0.8290095925331116, "step": 2756 }, { "epoch": 3.3675213675213675, "grad_norm": 2.420555830001831, "learning_rate": 8.018215177524302e-07, "loss": 0.8783026337623596, "step": 2758 }, { "epoch": 3.36996336996337, "grad_norm": 4.7987284660339355, "learning_rate": 7.995527706576474e-07, "loss": 1.161372423171997, "step": 2760 }, { "epoch": 3.3724053724053724, "grad_norm": 2.0077738761901855, "learning_rate": 7.972919744322172e-07, "loss": 0.5153079032897949, "step": 2762 }, { "epoch": 3.374847374847375, "grad_norm": 2.043386459350586, "learning_rate": 7.950391382921253e-07, "loss": 0.8760576248168945, "step": 2764 }, { "epoch": 3.3772893772893773, "grad_norm": 2.278296709060669, "learning_rate": 7.927942714209094e-07, "loss": 0.47707459330558777, "step": 2766 }, { "epoch": 3.3797313797313797, "grad_norm": 3.5936970710754395, "learning_rate": 7.905573829696222e-07, "loss": 0.3957478404045105, "step": 2768 }, { "epoch": 3.382173382173382, "grad_norm": 7.540212154388428, "learning_rate": 7.883284820567905e-07, "loss": 0.5244758725166321, "step": 2770 }, { "epoch": 3.3846153846153846, "grad_norm": 18.458484649658203, "learning_rate": 7.861075777683822e-07, "loss": 0.8487293720245361, "step": 2772 }, { "epoch": 3.387057387057387, "grad_norm": 9.975359916687012, "learning_rate": 7.838946791577669e-07, "loss": 0.42381957173347473, "step": 2774 }, { "epoch": 3.3894993894993894, "grad_norm": 1.6827895641326904, "learning_rate": 7.816897952456802e-07, "loss": 0.8452630043029785, "step": 2776 }, { "epoch": 3.391941391941392, "grad_norm": 2.2170772552490234, "learning_rate": 7.794929350201849e-07, "loss": 0.7993656396865845, "step": 2778 }, { "epoch": 3.3943833943833943, "grad_norm": 0.4966033399105072, "learning_rate": 7.773041074366375e-07, "loss": 0.38123244047164917, "step": 2780 }, { "epoch": 3.3968253968253967, "grad_norm": 1.8956717252731323, "learning_rate": 7.751233214176485e-07, "loss": 0.4719703495502472, "step": 2782 }, { "epoch": 3.399267399267399, "grad_norm": 3.7957403659820557, "learning_rate": 7.729505858530489e-07, "loss": 0.21603846549987793, "step": 2784 }, { "epoch": 3.4017094017094016, "grad_norm": 3.2089285850524902, "learning_rate": 7.70785909599851e-07, "loss": 0.4859767258167267, "step": 2786 }, { "epoch": 3.404151404151404, "grad_norm": 2.2830421924591064, "learning_rate": 7.686293014822149e-07, "loss": 0.8922374248504639, "step": 2788 }, { "epoch": 3.4065934065934065, "grad_norm": 3.5256621837615967, "learning_rate": 7.664807702914107e-07, "loss": 0.8285965919494629, "step": 2790 }, { "epoch": 3.409035409035409, "grad_norm": 2.4469144344329834, "learning_rate": 7.643403247857853e-07, "loss": 0.4633885622024536, "step": 2792 }, { "epoch": 3.4114774114774113, "grad_norm": 3.2874977588653564, "learning_rate": 7.622079736907219e-07, "loss": 0.730563223361969, "step": 2794 }, { "epoch": 3.413919413919414, "grad_norm": 1.843967318534851, "learning_rate": 7.600837256986104e-07, "loss": 0.9653308391571045, "step": 2796 }, { "epoch": 3.416361416361416, "grad_norm": 6.217641830444336, "learning_rate": 7.57967589468806e-07, "loss": 0.47932058572769165, "step": 2798 }, { "epoch": 3.4188034188034186, "grad_norm": 0.45403721928596497, "learning_rate": 7.558595736275995e-07, "loss": 0.05683291330933571, "step": 2800 }, { "epoch": 3.421245421245421, "grad_norm": 2.4444427490234375, "learning_rate": 7.537596867681773e-07, "loss": 1.0308482646942139, "step": 2802 }, { "epoch": 3.4236874236874235, "grad_norm": 0.5573722124099731, "learning_rate": 7.516679374505911e-07, "loss": 0.6440561413764954, "step": 2804 }, { "epoch": 3.426129426129426, "grad_norm": 3.718284845352173, "learning_rate": 7.495843342017173e-07, "loss": 0.6178560853004456, "step": 2806 }, { "epoch": 3.4285714285714284, "grad_norm": 3.11525297164917, "learning_rate": 7.475088855152279e-07, "loss": 0.923469066619873, "step": 2808 }, { "epoch": 3.4310134310134313, "grad_norm": 2.810075521469116, "learning_rate": 7.454415998515516e-07, "loss": 0.7372915744781494, "step": 2810 }, { "epoch": 3.4334554334554337, "grad_norm": 2.502122640609741, "learning_rate": 7.433824856378425e-07, "loss": 0.14429078996181488, "step": 2812 }, { "epoch": 3.435897435897436, "grad_norm": 10.276256561279297, "learning_rate": 7.413315512679436e-07, "loss": 0.5484145283699036, "step": 2814 }, { "epoch": 3.4383394383394386, "grad_norm": 2.402463912963867, "learning_rate": 7.392888051023542e-07, "loss": 0.8286385536193848, "step": 2816 }, { "epoch": 3.440781440781441, "grad_norm": 1.590881586074829, "learning_rate": 7.37254255468193e-07, "loss": 0.9624377489089966, "step": 2818 }, { "epoch": 3.4432234432234434, "grad_norm": 2.1349987983703613, "learning_rate": 7.352279106591676e-07, "loss": 0.8825662732124329, "step": 2820 }, { "epoch": 3.445665445665446, "grad_norm": 3.0658047199249268, "learning_rate": 7.332097789355388e-07, "loss": 0.9127561450004578, "step": 2822 }, { "epoch": 3.4481074481074483, "grad_norm": 1.7639163732528687, "learning_rate": 7.31199868524088e-07, "loss": 0.8078799247741699, "step": 2824 }, { "epoch": 3.4505494505494507, "grad_norm": 1.9734654426574707, "learning_rate": 7.291981876180815e-07, "loss": 0.6381809115409851, "step": 2826 }, { "epoch": 3.452991452991453, "grad_norm": 2.2318012714385986, "learning_rate": 7.272047443772395e-07, "loss": 0.760457456111908, "step": 2828 }, { "epoch": 3.4554334554334556, "grad_norm": 9.063981056213379, "learning_rate": 7.252195469277024e-07, "loss": 0.6253539323806763, "step": 2830 }, { "epoch": 3.457875457875458, "grad_norm": 3.0912418365478516, "learning_rate": 7.232426033619955e-07, "loss": 0.4733204245567322, "step": 2832 }, { "epoch": 3.4603174603174605, "grad_norm": 1.568339228630066, "learning_rate": 7.212739217389991e-07, "loss": 0.9539817571640015, "step": 2834 }, { "epoch": 3.462759462759463, "grad_norm": 9.57923412322998, "learning_rate": 7.193135100839142e-07, "loss": 0.5720884799957275, "step": 2836 }, { "epoch": 3.4652014652014653, "grad_norm": 14.26650333404541, "learning_rate": 7.173613763882297e-07, "loss": 0.5722582936286926, "step": 2838 }, { "epoch": 3.4676434676434678, "grad_norm": 3.157581329345703, "learning_rate": 7.154175286096886e-07, "loss": 0.954519510269165, "step": 2840 }, { "epoch": 3.47008547008547, "grad_norm": 2.162440061569214, "learning_rate": 7.134819746722588e-07, "loss": 0.8875312805175781, "step": 2842 }, { "epoch": 3.4725274725274726, "grad_norm": 1.576352834701538, "learning_rate": 7.115547224660981e-07, "loss": 0.8703738451004028, "step": 2844 }, { "epoch": 3.474969474969475, "grad_norm": 2.352095127105713, "learning_rate": 7.096357798475231e-07, "loss": 0.8873903155326843, "step": 2846 }, { "epoch": 3.4774114774114775, "grad_norm": 2.0396454334259033, "learning_rate": 7.077251546389761e-07, "loss": 0.8595806360244751, "step": 2848 }, { "epoch": 3.47985347985348, "grad_norm": 2.4909889698028564, "learning_rate": 7.058228546289952e-07, "loss": 0.6372047662734985, "step": 2850 }, { "epoch": 3.4822954822954824, "grad_norm": 2.2574751377105713, "learning_rate": 7.039288875721798e-07, "loss": 0.8206950426101685, "step": 2852 }, { "epoch": 3.484737484737485, "grad_norm": 0.5610913634300232, "learning_rate": 7.020432611891629e-07, "loss": 0.1707066297531128, "step": 2854 }, { "epoch": 3.4871794871794872, "grad_norm": 8.053951263427734, "learning_rate": 7.001659831665748e-07, "loss": 0.6180318593978882, "step": 2856 }, { "epoch": 3.4896214896214897, "grad_norm": 8.793201446533203, "learning_rate": 6.982970611570168e-07, "loss": 0.29429712891578674, "step": 2858 }, { "epoch": 3.492063492063492, "grad_norm": 1.830889344215393, "learning_rate": 6.964365027790243e-07, "loss": 0.8592406511306763, "step": 2860 }, { "epoch": 3.4945054945054945, "grad_norm": 2.1449406147003174, "learning_rate": 6.945843156170423e-07, "loss": 0.9528040885925293, "step": 2862 }, { "epoch": 3.496947496947497, "grad_norm": 2.4805285930633545, "learning_rate": 6.927405072213878e-07, "loss": 0.467544287443161, "step": 2864 }, { "epoch": 3.4993894993894994, "grad_norm": 4.722518444061279, "learning_rate": 6.909050851082258e-07, "loss": 0.38818594813346863, "step": 2866 }, { "epoch": 3.501831501831502, "grad_norm": 2.0547142028808594, "learning_rate": 6.89078056759532e-07, "loss": 0.8755742311477661, "step": 2868 }, { "epoch": 3.5042735042735043, "grad_norm": 7.294073581695557, "learning_rate": 6.872594296230677e-07, "loss": 0.5849094986915588, "step": 2870 }, { "epoch": 3.5067155067155067, "grad_norm": 4.594062328338623, "learning_rate": 6.854492111123455e-07, "loss": 0.5189932584762573, "step": 2872 }, { "epoch": 3.509157509157509, "grad_norm": 3.3439576625823975, "learning_rate": 6.836474086066024e-07, "loss": 0.9283484220504761, "step": 2874 }, { "epoch": 3.5115995115995116, "grad_norm": 6.525171279907227, "learning_rate": 6.81854029450767e-07, "loss": 0.32967475056648254, "step": 2876 }, { "epoch": 3.514041514041514, "grad_norm": 1.570821762084961, "learning_rate": 6.800690809554313e-07, "loss": 0.9133099913597107, "step": 2878 }, { "epoch": 3.5164835164835164, "grad_norm": 2.403273582458496, "learning_rate": 6.782925703968195e-07, "loss": 0.5854375958442688, "step": 2880 }, { "epoch": 3.518925518925519, "grad_norm": 3.7819712162017822, "learning_rate": 6.765245050167599e-07, "loss": 0.6390686631202698, "step": 2882 }, { "epoch": 3.5213675213675213, "grad_norm": 2.2759008407592773, "learning_rate": 6.74764892022654e-07, "loss": 0.9842717051506042, "step": 2884 }, { "epoch": 3.5238095238095237, "grad_norm": 1.5493816137313843, "learning_rate": 6.730137385874491e-07, "loss": 0.9478884339332581, "step": 2886 }, { "epoch": 3.526251526251526, "grad_norm": 2.1049609184265137, "learning_rate": 6.712710518496049e-07, "loss": 0.777178168296814, "step": 2888 }, { "epoch": 3.5286935286935286, "grad_norm": 2.9918575286865234, "learning_rate": 6.695368389130699e-07, "loss": 0.8717899918556213, "step": 2890 }, { "epoch": 3.531135531135531, "grad_norm": 3.209395170211792, "learning_rate": 6.678111068472487e-07, "loss": 0.7953534722328186, "step": 2892 }, { "epoch": 3.5335775335775335, "grad_norm": 14.544081687927246, "learning_rate": 6.660938626869734e-07, "loss": 0.4765959680080414, "step": 2894 }, { "epoch": 3.536019536019536, "grad_norm": 51.49199295043945, "learning_rate": 6.643851134324767e-07, "loss": 0.7235844731330872, "step": 2896 }, { "epoch": 3.5384615384615383, "grad_norm": 4.060218811035156, "learning_rate": 6.626848660493623e-07, "loss": 0.804652750492096, "step": 2898 }, { "epoch": 3.5409035409035408, "grad_norm": 12.073833465576172, "learning_rate": 6.60993127468577e-07, "loss": 0.867784321308136, "step": 2900 }, { "epoch": 3.543345543345543, "grad_norm": 3.4921793937683105, "learning_rate": 6.593099045863802e-07, "loss": 0.13817808032035828, "step": 2902 }, { "epoch": 3.5457875457875456, "grad_norm": 1.4257546663284302, "learning_rate": 6.576352042643192e-07, "loss": 0.8409507274627686, "step": 2904 }, { "epoch": 3.548229548229548, "grad_norm": 4.283762454986572, "learning_rate": 6.559690333292e-07, "loss": 0.8512478470802307, "step": 2906 }, { "epoch": 3.5506715506715505, "grad_norm": 2.5699775218963623, "learning_rate": 6.543113985730579e-07, "loss": 1.0054024457931519, "step": 2908 }, { "epoch": 3.553113553113553, "grad_norm": 5.507492542266846, "learning_rate": 6.526623067531313e-07, "loss": 0.6415849328041077, "step": 2910 }, { "epoch": 3.5555555555555554, "grad_norm": 4.384498119354248, "learning_rate": 6.510217645918349e-07, "loss": 0.46229088306427, "step": 2912 }, { "epoch": 3.557997557997558, "grad_norm": 2.0857934951782227, "learning_rate": 6.493897787767291e-07, "loss": 0.5283727645874023, "step": 2914 }, { "epoch": 3.5604395604395602, "grad_norm": 1.9115166664123535, "learning_rate": 6.477663559604979e-07, "loss": 0.6623761653900146, "step": 2916 }, { "epoch": 3.5628815628815627, "grad_norm": 6.1141533851623535, "learning_rate": 6.461515027609163e-07, "loss": 0.6332585215568542, "step": 2918 }, { "epoch": 3.565323565323565, "grad_norm": 8.153079986572266, "learning_rate": 6.44545225760827e-07, "loss": 0.5882151126861572, "step": 2920 }, { "epoch": 3.5677655677655675, "grad_norm": 2.2321126461029053, "learning_rate": 6.429475315081122e-07, "loss": 0.8858240246772766, "step": 2922 }, { "epoch": 3.57020757020757, "grad_norm": 19.70038414001465, "learning_rate": 6.413584265156671e-07, "loss": 0.6081412434577942, "step": 2924 }, { "epoch": 3.5726495726495724, "grad_norm": 3.0893778800964355, "learning_rate": 6.397779172613722e-07, "loss": 0.454592227935791, "step": 2926 }, { "epoch": 3.575091575091575, "grad_norm": 6.8976240158081055, "learning_rate": 6.382060101880711e-07, "loss": 0.8145590424537659, "step": 2928 }, { "epoch": 3.5775335775335773, "grad_norm": 1.8353841304779053, "learning_rate": 6.366427117035377e-07, "loss": 0.8217576146125793, "step": 2930 }, { "epoch": 3.57997557997558, "grad_norm": 4.694766044616699, "learning_rate": 6.350880281804557e-07, "loss": 0.7602511644363403, "step": 2932 }, { "epoch": 3.5824175824175826, "grad_norm": 2.5171759128570557, "learning_rate": 6.335419659563896e-07, "loss": 0.7700616717338562, "step": 2934 }, { "epoch": 3.584859584859585, "grad_norm": 5.43289041519165, "learning_rate": 6.320045313337597e-07, "loss": 0.518511950969696, "step": 2936 }, { "epoch": 3.5873015873015874, "grad_norm": 0.7759566903114319, "learning_rate": 6.304757305798172e-07, "loss": 0.432235449552536, "step": 2938 }, { "epoch": 3.58974358974359, "grad_norm": 2.7056305408477783, "learning_rate": 6.289555699266174e-07, "loss": 0.5823948383331299, "step": 2940 }, { "epoch": 3.5921855921855923, "grad_norm": 10.587597846984863, "learning_rate": 6.274440555709947e-07, "loss": 0.9206511378288269, "step": 2942 }, { "epoch": 3.5946275946275947, "grad_norm": 1.4514787197113037, "learning_rate": 6.259411936745376e-07, "loss": 0.9449152946472168, "step": 2944 }, { "epoch": 3.597069597069597, "grad_norm": 2.0257363319396973, "learning_rate": 6.244469903635632e-07, "loss": 0.9899218678474426, "step": 2946 }, { "epoch": 3.5995115995115996, "grad_norm": 3.9623706340789795, "learning_rate": 6.229614517290932e-07, "loss": 0.48770591616630554, "step": 2948 }, { "epoch": 3.601953601953602, "grad_norm": 2.4973347187042236, "learning_rate": 6.21484583826827e-07, "loss": 0.5998440980911255, "step": 2950 }, { "epoch": 3.6043956043956045, "grad_norm": 4.926875114440918, "learning_rate": 6.200163926771196e-07, "loss": 0.28131791949272156, "step": 2952 }, { "epoch": 3.606837606837607, "grad_norm": 4.383153915405273, "learning_rate": 6.185568842649552e-07, "loss": 0.5602369904518127, "step": 2954 }, { "epoch": 3.6092796092796093, "grad_norm": 2.369140625, "learning_rate": 6.171060645399233e-07, "loss": 0.7010159492492676, "step": 2956 }, { "epoch": 3.6117216117216118, "grad_norm": 3.2974140644073486, "learning_rate": 6.15663939416195e-07, "loss": 0.7715173363685608, "step": 2958 }, { "epoch": 3.614163614163614, "grad_norm": 1.597782850265503, "learning_rate": 6.142305147724979e-07, "loss": 0.8990174531936646, "step": 2960 }, { "epoch": 3.6166056166056166, "grad_norm": 3.7081425189971924, "learning_rate": 6.128057964520934e-07, "loss": 0.5858969688415527, "step": 2962 }, { "epoch": 3.619047619047619, "grad_norm": 1.764650821685791, "learning_rate": 6.113897902627508e-07, "loss": 0.8998643159866333, "step": 2964 }, { "epoch": 3.6214896214896215, "grad_norm": 2.027956247329712, "learning_rate": 6.099825019767264e-07, "loss": 0.8704400658607483, "step": 2966 }, { "epoch": 3.623931623931624, "grad_norm": 2.2779312133789062, "learning_rate": 6.085839373307382e-07, "loss": 0.9620934724807739, "step": 2968 }, { "epoch": 3.6263736263736264, "grad_norm": 2.847346544265747, "learning_rate": 6.071941020259423e-07, "loss": 0.4650316834449768, "step": 2970 }, { "epoch": 3.628815628815629, "grad_norm": 4.082263469696045, "learning_rate": 6.058130017279103e-07, "loss": 0.4654577672481537, "step": 2972 }, { "epoch": 3.6312576312576312, "grad_norm": 4.675213813781738, "learning_rate": 6.044406420666072e-07, "loss": 0.5305784940719604, "step": 2974 }, { "epoch": 3.6336996336996337, "grad_norm": 4.327298164367676, "learning_rate": 6.030770286363656e-07, "loss": 0.8460584282875061, "step": 2976 }, { "epoch": 3.636141636141636, "grad_norm": 6.675053596496582, "learning_rate": 6.017221669958662e-07, "loss": 0.4189061224460602, "step": 2978 }, { "epoch": 3.6385836385836385, "grad_norm": 1.6335099935531616, "learning_rate": 6.003760626681127e-07, "loss": 0.956732988357544, "step": 2980 }, { "epoch": 3.641025641025641, "grad_norm": 6.5811381340026855, "learning_rate": 5.99038721140411e-07, "loss": 1.057121992111206, "step": 2982 }, { "epoch": 3.6434676434676434, "grad_norm": 1.5813173055648804, "learning_rate": 5.97710147864345e-07, "loss": 0.9400102496147156, "step": 2984 }, { "epoch": 3.645909645909646, "grad_norm": 3.3870911598205566, "learning_rate": 5.963903482557566e-07, "loss": 0.9326266050338745, "step": 2986 }, { "epoch": 3.6483516483516483, "grad_norm": 2.8349955081939697, "learning_rate": 5.950793276947205e-07, "loss": 0.9676442742347717, "step": 2988 }, { "epoch": 3.6507936507936507, "grad_norm": 31.81429100036621, "learning_rate": 5.937770915255269e-07, "loss": 0.9522081017494202, "step": 2990 }, { "epoch": 3.653235653235653, "grad_norm": 3.3921871185302734, "learning_rate": 5.924836450566549e-07, "loss": 0.5230456590652466, "step": 2992 }, { "epoch": 3.6556776556776556, "grad_norm": 2.2261812686920166, "learning_rate": 5.911989935607538e-07, "loss": 0.419090211391449, "step": 2994 }, { "epoch": 3.658119658119658, "grad_norm": 2.2666232585906982, "learning_rate": 5.899231422746202e-07, "loss": 0.9825529456138611, "step": 2996 }, { "epoch": 3.6605616605616604, "grad_norm": 1.18002188205719, "learning_rate": 5.886560963991778e-07, "loss": 0.45276400446891785, "step": 2998 }, { "epoch": 3.663003663003663, "grad_norm": 4.351987361907959, "learning_rate": 5.873978610994557e-07, "loss": 0.38837531208992004, "step": 3000 }, { "epoch": 3.6654456654456653, "grad_norm": 3.792799234390259, "learning_rate": 5.861484415045672e-07, "loss": 0.4969119429588318, "step": 3002 }, { "epoch": 3.6678876678876677, "grad_norm": 4.516859531402588, "learning_rate": 5.849078427076883e-07, "loss": 0.2892443835735321, "step": 3004 }, { "epoch": 3.67032967032967, "grad_norm": 1.7598987817764282, "learning_rate": 5.836760697660382e-07, "loss": 0.9143301844596863, "step": 3006 }, { "epoch": 3.672771672771673, "grad_norm": 7.990298748016357, "learning_rate": 5.82453127700858e-07, "loss": 0.6147029399871826, "step": 3008 }, { "epoch": 3.6752136752136755, "grad_norm": 0.5319908857345581, "learning_rate": 5.812390214973905e-07, "loss": 0.5243109464645386, "step": 3010 }, { "epoch": 3.677655677655678, "grad_norm": 2.6800284385681152, "learning_rate": 5.800337561048592e-07, "loss": 0.9062631726264954, "step": 3012 }, { "epoch": 3.6800976800976803, "grad_norm": 1.4025696516036987, "learning_rate": 5.788373364364487e-07, "loss": 0.9003893733024597, "step": 3014 }, { "epoch": 3.682539682539683, "grad_norm": 9.346170425415039, "learning_rate": 5.776497673692857e-07, "loss": 0.7075907588005066, "step": 3016 }, { "epoch": 3.684981684981685, "grad_norm": 2.3770735263824463, "learning_rate": 5.764710537444159e-07, "loss": 0.5896199941635132, "step": 3018 }, { "epoch": 3.6874236874236876, "grad_norm": 3.0938150882720947, "learning_rate": 5.753012003667885e-07, "loss": 0.6084612011909485, "step": 3020 }, { "epoch": 3.68986568986569, "grad_norm": 1.9582159519195557, "learning_rate": 5.741402120052328e-07, "loss": 0.5125177502632141, "step": 3022 }, { "epoch": 3.6923076923076925, "grad_norm": 2.1140964031219482, "learning_rate": 5.729880933924421e-07, "loss": 1.003217101097107, "step": 3024 }, { "epoch": 3.694749694749695, "grad_norm": 0.36588796973228455, "learning_rate": 5.718448492249509e-07, "loss": 0.5080230236053467, "step": 3026 }, { "epoch": 3.6971916971916974, "grad_norm": 3.183983087539673, "learning_rate": 5.707104841631195e-07, "loss": 0.7072214484214783, "step": 3028 }, { "epoch": 3.6996336996337, "grad_norm": 1.5387071371078491, "learning_rate": 5.695850028311112e-07, "loss": 0.9744673371315002, "step": 3030 }, { "epoch": 3.7020757020757022, "grad_norm": 2.1208925247192383, "learning_rate": 5.68468409816877e-07, "loss": 0.8400413990020752, "step": 3032 }, { "epoch": 3.7045177045177047, "grad_norm": 4.847201824188232, "learning_rate": 5.673607096721346e-07, "loss": 0.500311017036438, "step": 3034 }, { "epoch": 3.706959706959707, "grad_norm": 2.9996325969696045, "learning_rate": 5.662619069123503e-07, "loss": 0.5278769135475159, "step": 3036 }, { "epoch": 3.7094017094017095, "grad_norm": 2.098602771759033, "learning_rate": 5.651720060167208e-07, "loss": 0.5000000596046448, "step": 3038 }, { "epoch": 3.711843711843712, "grad_norm": 1.7179620265960693, "learning_rate": 5.640910114281555e-07, "loss": 0.9520195722579956, "step": 3040 }, { "epoch": 3.7142857142857144, "grad_norm": 2.3502564430236816, "learning_rate": 5.630189275532574e-07, "loss": 0.8327752947807312, "step": 3042 }, { "epoch": 3.716727716727717, "grad_norm": 2.5049052238464355, "learning_rate": 5.619557587623057e-07, "loss": 0.6436217427253723, "step": 3044 }, { "epoch": 3.7191697191697193, "grad_norm": 2.9425840377807617, "learning_rate": 5.609015093892374e-07, "loss": 0.9164323806762695, "step": 3046 }, { "epoch": 3.7216117216117217, "grad_norm": 3.1850688457489014, "learning_rate": 5.59856183731631e-07, "loss": 0.5315079689025879, "step": 3048 }, { "epoch": 3.724053724053724, "grad_norm": 2.6305289268493652, "learning_rate": 5.588197860506867e-07, "loss": 0.7617026567459106, "step": 3050 }, { "epoch": 3.7264957264957266, "grad_norm": 3.4540348052978516, "learning_rate": 5.577923205712124e-07, "loss": 1.017609715461731, "step": 3052 }, { "epoch": 3.728937728937729, "grad_norm": 7.902237415313721, "learning_rate": 5.567737914816022e-07, "loss": 0.5209454298019409, "step": 3054 }, { "epoch": 3.7313797313797314, "grad_norm": 1.829217791557312, "learning_rate": 5.557642029338236e-07, "loss": 0.9426127672195435, "step": 3056 }, { "epoch": 3.733821733821734, "grad_norm": 3.1745777130126953, "learning_rate": 5.547635590433968e-07, "loss": 0.6483992338180542, "step": 3058 }, { "epoch": 3.7362637362637363, "grad_norm": 10.875771522521973, "learning_rate": 5.53771863889381e-07, "loss": 0.46888014674186707, "step": 3060 }, { "epoch": 3.7387057387057387, "grad_norm": 2.8701348304748535, "learning_rate": 5.527891215143559e-07, "loss": 0.5719221830368042, "step": 3062 }, { "epoch": 3.741147741147741, "grad_norm": 1.0279072523117065, "learning_rate": 5.518153359244063e-07, "loss": 0.3847256898880005, "step": 3064 }, { "epoch": 3.7435897435897436, "grad_norm": 2.5575125217437744, "learning_rate": 5.508505110891045e-07, "loss": 0.5125806331634521, "step": 3066 }, { "epoch": 3.746031746031746, "grad_norm": 1.723737120628357, "learning_rate": 5.498946509414949e-07, "loss": 0.8170480132102966, "step": 3068 }, { "epoch": 3.7484737484737485, "grad_norm": 1.8103982210159302, "learning_rate": 5.489477593780787e-07, "loss": 1.0591984987258911, "step": 3070 }, { "epoch": 3.750915750915751, "grad_norm": 6.911821365356445, "learning_rate": 5.480098402587973e-07, "loss": 0.645149290561676, "step": 3072 }, { "epoch": 3.7533577533577533, "grad_norm": 0.26767414808273315, "learning_rate": 5.470808974070152e-07, "loss": 0.4036714732646942, "step": 3074 }, { "epoch": 3.755799755799756, "grad_norm": 4.02056884765625, "learning_rate": 5.461609346095067e-07, "loss": 0.8655245304107666, "step": 3076 }, { "epoch": 3.758241758241758, "grad_norm": 4.357627868652344, "learning_rate": 5.452499556164402e-07, "loss": 0.8845657110214233, "step": 3078 }, { "epoch": 3.7606837606837606, "grad_norm": 10.457714080810547, "learning_rate": 5.443479641413607e-07, "loss": 0.6024913191795349, "step": 3080 }, { "epoch": 3.763125763125763, "grad_norm": 2.315418243408203, "learning_rate": 5.434549638611768e-07, "loss": 0.9414732456207275, "step": 3082 }, { "epoch": 3.7655677655677655, "grad_norm": 1.8591489791870117, "learning_rate": 5.425709584161457e-07, "loss": 0.9516326785087585, "step": 3084 }, { "epoch": 3.768009768009768, "grad_norm": 1.980412244796753, "learning_rate": 5.416959514098571e-07, "loss": 0.9030287265777588, "step": 3086 }, { "epoch": 3.7704517704517704, "grad_norm": 0.8802301287651062, "learning_rate": 5.40829946409219e-07, "loss": 0.2058449685573578, "step": 3088 }, { "epoch": 3.772893772893773, "grad_norm": 3.767972230911255, "learning_rate": 5.399729469444438e-07, "loss": 0.8536104559898376, "step": 3090 }, { "epoch": 3.7753357753357752, "grad_norm": 2.7339487075805664, "learning_rate": 5.39124956509033e-07, "loss": 0.8664818406105042, "step": 3092 }, { "epoch": 3.7777777777777777, "grad_norm": 1.868648648262024, "learning_rate": 5.382859785597643e-07, "loss": 0.9490870237350464, "step": 3094 }, { "epoch": 3.78021978021978, "grad_norm": 3.2051358222961426, "learning_rate": 5.374560165166752e-07, "loss": 0.8471544981002808, "step": 3096 }, { "epoch": 3.7826617826617825, "grad_norm": 3.188377857208252, "learning_rate": 5.366350737630515e-07, "loss": 0.6783183217048645, "step": 3098 }, { "epoch": 3.785103785103785, "grad_norm": 12.093615531921387, "learning_rate": 5.358231536454119e-07, "loss": 0.8494789004325867, "step": 3100 }, { "epoch": 3.7875457875457874, "grad_norm": 2.5433156490325928, "learning_rate": 5.350202594734954e-07, "loss": 0.8256645202636719, "step": 3102 }, { "epoch": 3.78998778998779, "grad_norm": 6.241081237792969, "learning_rate": 5.34226394520247e-07, "loss": 0.8711805939674377, "step": 3104 }, { "epoch": 3.7924297924297923, "grad_norm": 2.3150527477264404, "learning_rate": 5.33441562021805e-07, "loss": 1.0078837871551514, "step": 3106 }, { "epoch": 3.7948717948717947, "grad_norm": 2.26035737991333, "learning_rate": 5.326657651774867e-07, "loss": 0.5672973394393921, "step": 3108 }, { "epoch": 3.797313797313797, "grad_norm": 3.3058907985687256, "learning_rate": 5.318990071497772e-07, "loss": 0.6369197368621826, "step": 3110 }, { "epoch": 3.7997557997557996, "grad_norm": 2.8003060817718506, "learning_rate": 5.311412910643145e-07, "loss": 0.5773022174835205, "step": 3112 }, { "epoch": 3.802197802197802, "grad_norm": 3.470675468444824, "learning_rate": 5.303926200098789e-07, "loss": 0.5989543199539185, "step": 3114 }, { "epoch": 3.8046398046398044, "grad_norm": 3.9955947399139404, "learning_rate": 5.296529970383777e-07, "loss": 0.44651395082473755, "step": 3116 }, { "epoch": 3.807081807081807, "grad_norm": 4.266364097595215, "learning_rate": 5.289224251648359e-07, "loss": 0.6023522019386292, "step": 3118 }, { "epoch": 3.8095238095238093, "grad_norm": 2.1567165851593018, "learning_rate": 5.282009073673812e-07, "loss": 0.9219540953636169, "step": 3120 }, { "epoch": 3.8119658119658117, "grad_norm": 4.827529430389404, "learning_rate": 5.27488446587233e-07, "loss": 0.5145304203033447, "step": 3122 }, { "epoch": 3.814407814407814, "grad_norm": 3.446068048477173, "learning_rate": 5.267850457286907e-07, "loss": 0.6707845330238342, "step": 3124 }, { "epoch": 3.8168498168498166, "grad_norm": 6.150956630706787, "learning_rate": 5.26090707659122e-07, "loss": 0.881208062171936, "step": 3126 }, { "epoch": 3.819291819291819, "grad_norm": 7.88019323348999, "learning_rate": 5.254054352089493e-07, "loss": 0.48564082384109497, "step": 3128 }, { "epoch": 3.8217338217338215, "grad_norm": 2.4069323539733887, "learning_rate": 5.247292311716413e-07, "loss": 0.8890138864517212, "step": 3130 }, { "epoch": 3.824175824175824, "grad_norm": 1.5671998262405396, "learning_rate": 5.240620983036986e-07, "loss": 0.5058675408363342, "step": 3132 }, { "epoch": 3.8266178266178263, "grad_norm": 6.1965227127075195, "learning_rate": 5.234040393246448e-07, "loss": 1.1437023878097534, "step": 3134 }, { "epoch": 3.8290598290598292, "grad_norm": 3.5453076362609863, "learning_rate": 5.227550569170133e-07, "loss": 1.039106845855713, "step": 3136 }, { "epoch": 3.8315018315018317, "grad_norm": 4.471746444702148, "learning_rate": 5.221151537263382e-07, "loss": 0.6547291278839111, "step": 3138 }, { "epoch": 3.833943833943834, "grad_norm": 3.5945651531219482, "learning_rate": 5.214843323611432e-07, "loss": 0.3847421407699585, "step": 3140 }, { "epoch": 3.8363858363858365, "grad_norm": 3.4551937580108643, "learning_rate": 5.208625953929289e-07, "loss": 0.7860216498374939, "step": 3142 }, { "epoch": 3.838827838827839, "grad_norm": 5.9122633934021, "learning_rate": 5.202499453561658e-07, "loss": 0.26646631956100464, "step": 3144 }, { "epoch": 3.8412698412698414, "grad_norm": 4.5092620849609375, "learning_rate": 5.196463847482812e-07, "loss": 0.5625693202018738, "step": 3146 }, { "epoch": 3.843711843711844, "grad_norm": 0.4482984244823456, "learning_rate": 5.1905191602965e-07, "loss": 0.12481559067964554, "step": 3148 }, { "epoch": 3.8461538461538463, "grad_norm": 5.856686115264893, "learning_rate": 5.184665416235841e-07, "loss": 0.5362542271614075, "step": 3150 }, { "epoch": 3.8485958485958487, "grad_norm": 4.156497001647949, "learning_rate": 5.178902639163247e-07, "loss": 0.7409583330154419, "step": 3152 }, { "epoch": 3.851037851037851, "grad_norm": 1.6845171451568604, "learning_rate": 5.17323085257029e-07, "loss": 0.5385940074920654, "step": 3154 }, { "epoch": 3.8534798534798536, "grad_norm": 1.5355862379074097, "learning_rate": 5.167650079577636e-07, "loss": 0.8247669339179993, "step": 3156 }, { "epoch": 3.855921855921856, "grad_norm": 4.407171249389648, "learning_rate": 5.162160342934939e-07, "loss": 0.8968489170074463, "step": 3158 }, { "epoch": 3.8583638583638584, "grad_norm": 8.075994491577148, "learning_rate": 5.15676166502075e-07, "loss": 0.09241821616888046, "step": 3160 }, { "epoch": 3.860805860805861, "grad_norm": 2.5929574966430664, "learning_rate": 5.151454067842417e-07, "loss": 0.4451131224632263, "step": 3162 }, { "epoch": 3.8632478632478633, "grad_norm": 1.8862788677215576, "learning_rate": 5.146237573036012e-07, "loss": 0.9212697148323059, "step": 3164 }, { "epoch": 3.8656898656898657, "grad_norm": 2.396461248397827, "learning_rate": 5.141112201866231e-07, "loss": 0.9008550047874451, "step": 3166 }, { "epoch": 3.868131868131868, "grad_norm": 2.7560782432556152, "learning_rate": 5.136077975226314e-07, "loss": 0.7847106456756592, "step": 3168 }, { "epoch": 3.8705738705738706, "grad_norm": 5.181787014007568, "learning_rate": 5.131134913637951e-07, "loss": 0.5696348547935486, "step": 3170 }, { "epoch": 3.873015873015873, "grad_norm": 8.310593605041504, "learning_rate": 5.126283037251208e-07, "loss": 0.5494756102561951, "step": 3172 }, { "epoch": 3.8754578754578755, "grad_norm": 2.406679391860962, "learning_rate": 5.121522365844436e-07, "loss": 0.5918058156967163, "step": 3174 }, { "epoch": 3.877899877899878, "grad_norm": 2.115579128265381, "learning_rate": 5.116852918824199e-07, "loss": 0.9309298396110535, "step": 3176 }, { "epoch": 3.8803418803418803, "grad_norm": 1.9531852006912231, "learning_rate": 5.112274715225194e-07, "loss": 0.858812153339386, "step": 3178 }, { "epoch": 3.8827838827838828, "grad_norm": 3.3092024326324463, "learning_rate": 5.107787773710157e-07, "loss": 0.8395816087722778, "step": 3180 }, { "epoch": 3.885225885225885, "grad_norm": 4.282203197479248, "learning_rate": 5.103392112569815e-07, "loss": 0.8726351261138916, "step": 3182 }, { "epoch": 3.8876678876678876, "grad_norm": 5.603507995605469, "learning_rate": 5.099087749722788e-07, "loss": 0.3810088336467743, "step": 3184 }, { "epoch": 3.89010989010989, "grad_norm": 3.650843858718872, "learning_rate": 5.094874702715529e-07, "loss": 0.9510683417320251, "step": 3186 }, { "epoch": 3.8925518925518925, "grad_norm": 2.743922472000122, "learning_rate": 5.090752988722245e-07, "loss": 0.40368887782096863, "step": 3188 }, { "epoch": 3.894993894993895, "grad_norm": 0.3727673292160034, "learning_rate": 5.086722624544829e-07, "loss": 0.420103520154953, "step": 3190 }, { "epoch": 3.8974358974358974, "grad_norm": 2.02138090133667, "learning_rate": 5.082783626612797e-07, "loss": 0.8819708824157715, "step": 3192 }, { "epoch": 3.8998778998779, "grad_norm": 3.9244892597198486, "learning_rate": 5.078936010983213e-07, "loss": 1.0119850635528564, "step": 3194 }, { "epoch": 3.9023199023199022, "grad_norm": 6.452670574188232, "learning_rate": 5.075179793340628e-07, "loss": 0.5983652472496033, "step": 3196 }, { "epoch": 3.9047619047619047, "grad_norm": 5.412775993347168, "learning_rate": 5.071514988997016e-07, "loss": 0.1550082117319107, "step": 3198 }, { "epoch": 3.907203907203907, "grad_norm": 3.0789589881896973, "learning_rate": 5.067941612891708e-07, "loss": 0.9240917563438416, "step": 3200 }, { "epoch": 3.9096459096459095, "grad_norm": 1.7385785579681396, "learning_rate": 5.06445967959134e-07, "loss": 0.6053808331489563, "step": 3202 }, { "epoch": 3.912087912087912, "grad_norm": 2.076815605163574, "learning_rate": 5.061069203289777e-07, "loss": 0.9977898001670837, "step": 3204 }, { "epoch": 3.9145299145299144, "grad_norm": 1.4593520164489746, "learning_rate": 5.057770197808077e-07, "loss": 0.9548913240432739, "step": 3206 }, { "epoch": 3.916971916971917, "grad_norm": 2.623448371887207, "learning_rate": 5.054562676594414e-07, "loss": 1.132678508758545, "step": 3208 }, { "epoch": 3.9194139194139193, "grad_norm": 1.8026434183120728, "learning_rate": 5.051446652724042e-07, "loss": 0.6159650087356567, "step": 3210 }, { "epoch": 3.9218559218559217, "grad_norm": 2.9582080841064453, "learning_rate": 5.048422138899222e-07, "loss": 0.23612847924232483, "step": 3212 }, { "epoch": 3.9242979242979246, "grad_norm": 1.8346482515335083, "learning_rate": 5.045489147449187e-07, "loss": 0.9001370668411255, "step": 3214 }, { "epoch": 3.926739926739927, "grad_norm": 4.2038726806640625, "learning_rate": 5.042647690330078e-07, "loss": 0.921493411064148, "step": 3216 }, { "epoch": 3.9291819291819294, "grad_norm": 6.33651065826416, "learning_rate": 5.039897779124914e-07, "loss": 0.6150534749031067, "step": 3218 }, { "epoch": 3.931623931623932, "grad_norm": 2.513700246810913, "learning_rate": 5.037239425043525e-07, "loss": 0.6679733991622925, "step": 3220 }, { "epoch": 3.9340659340659343, "grad_norm": 11.929234504699707, "learning_rate": 5.034672638922512e-07, "loss": 0.530619740486145, "step": 3222 }, { "epoch": 3.9365079365079367, "grad_norm": 3.113684892654419, "learning_rate": 5.032197431225214e-07, "loss": 0.8231785297393799, "step": 3224 }, { "epoch": 3.938949938949939, "grad_norm": 4.810062885284424, "learning_rate": 5.029813812041649e-07, "loss": 0.5280576944351196, "step": 3226 }, { "epoch": 3.9413919413919416, "grad_norm": 2.087477922439575, "learning_rate": 5.027521791088482e-07, "loss": 0.9266934394836426, "step": 3228 }, { "epoch": 3.943833943833944, "grad_norm": 4.400597095489502, "learning_rate": 5.025321377708989e-07, "loss": 0.5227733850479126, "step": 3230 }, { "epoch": 3.9462759462759465, "grad_norm": 3.1473488807678223, "learning_rate": 5.023212580873009e-07, "loss": 0.952559769153595, "step": 3232 }, { "epoch": 3.948717948717949, "grad_norm": 14.350162506103516, "learning_rate": 5.02119540917691e-07, "loss": 0.5347244143486023, "step": 3234 }, { "epoch": 3.9511599511599513, "grad_norm": 2.0704898834228516, "learning_rate": 5.01926987084356e-07, "loss": 0.9426727294921875, "step": 3236 }, { "epoch": 3.9536019536019538, "grad_norm": 3.468090057373047, "learning_rate": 5.017435973722293e-07, "loss": 0.7870326042175293, "step": 3238 }, { "epoch": 3.956043956043956, "grad_norm": 2.9406344890594482, "learning_rate": 5.015693725288866e-07, "loss": 0.4789937436580658, "step": 3240 }, { "epoch": 3.9584859584859586, "grad_norm": 15.776670455932617, "learning_rate": 5.014043132645438e-07, "loss": 0.6635629534721375, "step": 3242 }, { "epoch": 3.960927960927961, "grad_norm": 4.5655083656311035, "learning_rate": 5.012484202520545e-07, "loss": 0.9738138914108276, "step": 3244 }, { "epoch": 3.9633699633699635, "grad_norm": 2.4571170806884766, "learning_rate": 5.01101694126906e-07, "loss": 0.5079742670059204, "step": 3246 }, { "epoch": 3.965811965811966, "grad_norm": 1.870768666267395, "learning_rate": 5.009641354872178e-07, "loss": 0.9230693578720093, "step": 3248 }, { "epoch": 3.9682539682539684, "grad_norm": 2.229893445968628, "learning_rate": 5.008357448937387e-07, "loss": 0.6680663228034973, "step": 3250 }, { "epoch": 3.970695970695971, "grad_norm": 25.037006378173828, "learning_rate": 5.007165228698442e-07, "loss": 0.4087255597114563, "step": 3252 }, { "epoch": 3.9731379731379732, "grad_norm": 2.2511398792266846, "learning_rate": 5.006064699015351e-07, "loss": 0.8908025622367859, "step": 3254 }, { "epoch": 3.9755799755799757, "grad_norm": 4.969597339630127, "learning_rate": 5.005055864374352e-07, "loss": 0.8304935693740845, "step": 3256 }, { "epoch": 3.978021978021978, "grad_norm": 2.5601906776428223, "learning_rate": 5.004138728887892e-07, "loss": 0.40299245715141296, "step": 3258 }, { "epoch": 3.9804639804639805, "grad_norm": 2.45926570892334, "learning_rate": 5.003313296294612e-07, "loss": 0.5143805146217346, "step": 3260 }, { "epoch": 3.982905982905983, "grad_norm": 1.9476388692855835, "learning_rate": 5.002579569959336e-07, "loss": 0.5361751914024353, "step": 3262 }, { "epoch": 3.9853479853479854, "grad_norm": 4.383269786834717, "learning_rate": 5.001937552873049e-07, "loss": 0.4276546835899353, "step": 3264 }, { "epoch": 3.987789987789988, "grad_norm": 4.4012627601623535, "learning_rate": 5.001387247652891e-07, "loss": 0.8529163002967834, "step": 3266 }, { "epoch": 3.9902319902319903, "grad_norm": 1.4377570152282715, "learning_rate": 5.000928656542145e-07, "loss": 0.9019818902015686, "step": 3268 }, { "epoch": 3.9926739926739927, "grad_norm": 2.091071605682373, "learning_rate": 5.000561781410232e-07, "loss": 0.6819381713867188, "step": 3270 }, { "epoch": 3.995115995115995, "grad_norm": 1.7002183198928833, "learning_rate": 5.000286623752688e-07, "loss": 0.9077348113059998, "step": 3272 }, { "epoch": 3.9975579975579976, "grad_norm": 1.8634474277496338, "learning_rate": 5.000103184691177e-07, "loss": 0.8196188807487488, "step": 3274 }, { "epoch": 4.0, "grad_norm": 8.092415809631348, "learning_rate": 5.000011464973476e-07, "loss": 0.480937659740448, "step": 3276 }, { "epoch": 4.0, "step": 3276, "total_flos": 3.438047841308639e+18, "train_loss": 0.8972434957087567, "train_runtime": 10632.4216, "train_samples_per_second": 4.93, "train_steps_per_second": 0.308 } ], "logging_steps": 2, "max_steps": 3276, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.438047841308639e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }