9b-118 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
997f748 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 3276,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002442002442002442,
"grad_norm": 2.5476107597351074,
"learning_rate": 3.0487804878048784e-08,
"loss": 1.9267934560775757,
"step": 2
},
{
"epoch": 0.004884004884004884,
"grad_norm": 2.350306749343872,
"learning_rate": 9.146341463414634e-08,
"loss": 2.0976288318634033,
"step": 4
},
{
"epoch": 0.007326007326007326,
"grad_norm": 22.22303009033203,
"learning_rate": 1.5243902439024392e-07,
"loss": 2.4150097370147705,
"step": 6
},
{
"epoch": 0.009768009768009768,
"grad_norm": 4.77632999420166,
"learning_rate": 2.134146341463415e-07,
"loss": 1.9595110416412354,
"step": 8
},
{
"epoch": 0.01221001221001221,
"grad_norm": 6.901440620422363,
"learning_rate": 2.7439024390243906e-07,
"loss": 2.162900447845459,
"step": 10
},
{
"epoch": 0.014652014652014652,
"grad_norm": 11.045926094055176,
"learning_rate": 3.3536585365853663e-07,
"loss": 1.9404582977294922,
"step": 12
},
{
"epoch": 0.017094017094017096,
"grad_norm": 2.2156028747558594,
"learning_rate": 3.963414634146342e-07,
"loss": 1.6843563318252563,
"step": 14
},
{
"epoch": 0.019536019536019536,
"grad_norm": 26.805221557617188,
"learning_rate": 4.573170731707317e-07,
"loss": 2.0623722076416016,
"step": 16
},
{
"epoch": 0.02197802197802198,
"grad_norm": 17.565683364868164,
"learning_rate": 5.182926829268293e-07,
"loss": 2.7407174110412598,
"step": 18
},
{
"epoch": 0.02442002442002442,
"grad_norm": 4.266391277313232,
"learning_rate": 5.79268292682927e-07,
"loss": 1.88368821144104,
"step": 20
},
{
"epoch": 0.026862026862026864,
"grad_norm": 8.910398483276367,
"learning_rate": 6.402439024390244e-07,
"loss": 2.084914445877075,
"step": 22
},
{
"epoch": 0.029304029304029304,
"grad_norm": 2.130563497543335,
"learning_rate": 7.012195121951221e-07,
"loss": 2.019660234451294,
"step": 24
},
{
"epoch": 0.031746031746031744,
"grad_norm": 6.1367716789245605,
"learning_rate": 7.621951219512196e-07,
"loss": 1.9338700771331787,
"step": 26
},
{
"epoch": 0.03418803418803419,
"grad_norm": 6.56151008605957,
"learning_rate": 8.231707317073172e-07,
"loss": 2.0060365200042725,
"step": 28
},
{
"epoch": 0.03663003663003663,
"grad_norm": 9.909710884094238,
"learning_rate": 8.841463414634147e-07,
"loss": 1.8657618761062622,
"step": 30
},
{
"epoch": 0.03907203907203907,
"grad_norm": 2.332340955734253,
"learning_rate": 9.451219512195123e-07,
"loss": 1.385891318321228,
"step": 32
},
{
"epoch": 0.04151404151404151,
"grad_norm": 2.9691996574401855,
"learning_rate": 1.0060975609756098e-06,
"loss": 1.872510552406311,
"step": 34
},
{
"epoch": 0.04395604395604396,
"grad_norm": 3.3227553367614746,
"learning_rate": 1.0670731707317073e-06,
"loss": 1.4552903175354004,
"step": 36
},
{
"epoch": 0.0463980463980464,
"grad_norm": 2.721830368041992,
"learning_rate": 1.128048780487805e-06,
"loss": 1.8594551086425781,
"step": 38
},
{
"epoch": 0.04884004884004884,
"grad_norm": 5.821812152862549,
"learning_rate": 1.1890243902439024e-06,
"loss": 1.697621464729309,
"step": 40
},
{
"epoch": 0.05128205128205128,
"grad_norm": 1.486113429069519,
"learning_rate": 1.25e-06,
"loss": 1.7849284410476685,
"step": 42
},
{
"epoch": 0.05372405372405373,
"grad_norm": 4.890027046203613,
"learning_rate": 1.3109756097560978e-06,
"loss": 1.7496923208236694,
"step": 44
},
{
"epoch": 0.05616605616605617,
"grad_norm": 1.7662242650985718,
"learning_rate": 1.3719512195121952e-06,
"loss": 1.6558294296264648,
"step": 46
},
{
"epoch": 0.05860805860805861,
"grad_norm": 19.32802963256836,
"learning_rate": 1.4329268292682927e-06,
"loss": 1.4527249336242676,
"step": 48
},
{
"epoch": 0.06105006105006105,
"grad_norm": 9.77262020111084,
"learning_rate": 1.4939024390243904e-06,
"loss": 1.3586843013763428,
"step": 50
},
{
"epoch": 0.06349206349206349,
"grad_norm": 6.361555576324463,
"learning_rate": 1.5548780487804878e-06,
"loss": 1.5780984163284302,
"step": 52
},
{
"epoch": 0.06593406593406594,
"grad_norm": 15.675647735595703,
"learning_rate": 1.6158536585365855e-06,
"loss": 1.285346508026123,
"step": 54
},
{
"epoch": 0.06837606837606838,
"grad_norm": 4.175439834594727,
"learning_rate": 1.6768292682926832e-06,
"loss": 1.5857115983963013,
"step": 56
},
{
"epoch": 0.07081807081807082,
"grad_norm": 1.736680030822754,
"learning_rate": 1.7378048780487804e-06,
"loss": 1.5757516622543335,
"step": 58
},
{
"epoch": 0.07326007326007326,
"grad_norm": 1.47886061668396,
"learning_rate": 1.7987804878048781e-06,
"loss": 1.6183691024780273,
"step": 60
},
{
"epoch": 0.0757020757020757,
"grad_norm": 2.919388771057129,
"learning_rate": 1.8597560975609758e-06,
"loss": 1.164100170135498,
"step": 62
},
{
"epoch": 0.07814407814407814,
"grad_norm": 1.240254282951355,
"learning_rate": 1.9207317073170733e-06,
"loss": 1.767830491065979,
"step": 64
},
{
"epoch": 0.08058608058608059,
"grad_norm": 8.434248924255371,
"learning_rate": 1.981707317073171e-06,
"loss": 1.3761873245239258,
"step": 66
},
{
"epoch": 0.08302808302808302,
"grad_norm": 1.5368638038635254,
"learning_rate": 2.042682926829268e-06,
"loss": 1.2026317119598389,
"step": 68
},
{
"epoch": 0.08547008547008547,
"grad_norm": 0.9749501347541809,
"learning_rate": 2.103658536585366e-06,
"loss": 1.2645400762557983,
"step": 70
},
{
"epoch": 0.08791208791208792,
"grad_norm": 1.7136712074279785,
"learning_rate": 2.1646341463414635e-06,
"loss": 1.5449546575546265,
"step": 72
},
{
"epoch": 0.09035409035409035,
"grad_norm": 3.341733455657959,
"learning_rate": 2.225609756097561e-06,
"loss": 1.5047639608383179,
"step": 74
},
{
"epoch": 0.0927960927960928,
"grad_norm": 5.009698390960693,
"learning_rate": 2.286585365853659e-06,
"loss": 1.4909131526947021,
"step": 76
},
{
"epoch": 0.09523809523809523,
"grad_norm": 1.631039023399353,
"learning_rate": 2.3475609756097563e-06,
"loss": 1.3529361486434937,
"step": 78
},
{
"epoch": 0.09768009768009768,
"grad_norm": 6.177618026733398,
"learning_rate": 2.408536585365854e-06,
"loss": 1.254205346107483,
"step": 80
},
{
"epoch": 0.10012210012210013,
"grad_norm": 5.524102210998535,
"learning_rate": 2.4695121951219513e-06,
"loss": 1.165070652961731,
"step": 82
},
{
"epoch": 0.10256410256410256,
"grad_norm": 2.264727830886841,
"learning_rate": 2.530487804878049e-06,
"loss": 1.1751306056976318,
"step": 84
},
{
"epoch": 0.10500610500610501,
"grad_norm": 1.5993300676345825,
"learning_rate": 2.5914634146341466e-06,
"loss": 1.352165699005127,
"step": 86
},
{
"epoch": 0.10744810744810745,
"grad_norm": 1.8832273483276367,
"learning_rate": 2.652439024390244e-06,
"loss": 1.5243136882781982,
"step": 88
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.7285981178283691,
"learning_rate": 2.713414634146342e-06,
"loss": 1.2205549478530884,
"step": 90
},
{
"epoch": 0.11233211233211234,
"grad_norm": 2.318856716156006,
"learning_rate": 2.7743902439024394e-06,
"loss": 1.6029253005981445,
"step": 92
},
{
"epoch": 0.11477411477411477,
"grad_norm": 7.182040691375732,
"learning_rate": 2.8353658536585365e-06,
"loss": 1.1754858493804932,
"step": 94
},
{
"epoch": 0.11721611721611722,
"grad_norm": 1.7051122188568115,
"learning_rate": 2.8963414634146343e-06,
"loss": 1.4834587574005127,
"step": 96
},
{
"epoch": 0.11965811965811966,
"grad_norm": 3.0200533866882324,
"learning_rate": 2.957317073170732e-06,
"loss": 1.4276564121246338,
"step": 98
},
{
"epoch": 0.1221001221001221,
"grad_norm": 1.8382214307785034,
"learning_rate": 3.0182926829268293e-06,
"loss": 1.1662065982818604,
"step": 100
},
{
"epoch": 0.12454212454212454,
"grad_norm": 2.296553611755371,
"learning_rate": 3.079268292682927e-06,
"loss": 1.3122981786727905,
"step": 102
},
{
"epoch": 0.12698412698412698,
"grad_norm": 3.1782186031341553,
"learning_rate": 3.1402439024390246e-06,
"loss": 1.0392099618911743,
"step": 104
},
{
"epoch": 0.12942612942612944,
"grad_norm": 1.1442056894302368,
"learning_rate": 3.201219512195122e-06,
"loss": 0.9646719694137573,
"step": 106
},
{
"epoch": 0.13186813186813187,
"grad_norm": 4.929725170135498,
"learning_rate": 3.26219512195122e-06,
"loss": 1.420979619026184,
"step": 108
},
{
"epoch": 0.1343101343101343,
"grad_norm": 2.712373971939087,
"learning_rate": 3.3231707317073174e-06,
"loss": 1.6603320837020874,
"step": 110
},
{
"epoch": 0.13675213675213677,
"grad_norm": 2.1611270904541016,
"learning_rate": 3.3841463414634153e-06,
"loss": 1.452590823173523,
"step": 112
},
{
"epoch": 0.1391941391941392,
"grad_norm": 1.7481805086135864,
"learning_rate": 3.4451219512195124e-06,
"loss": 1.2166002988815308,
"step": 114
},
{
"epoch": 0.14163614163614163,
"grad_norm": 1.9498414993286133,
"learning_rate": 3.50609756097561e-06,
"loss": 1.3791627883911133,
"step": 116
},
{
"epoch": 0.14407814407814407,
"grad_norm": 5.3667497634887695,
"learning_rate": 3.5670731707317073e-06,
"loss": 1.2551401853561401,
"step": 118
},
{
"epoch": 0.14652014652014653,
"grad_norm": 2.414433717727661,
"learning_rate": 3.628048780487805e-06,
"loss": 1.3578366041183472,
"step": 120
},
{
"epoch": 0.14896214896214896,
"grad_norm": 1.8076027631759644,
"learning_rate": 3.6890243902439026e-06,
"loss": 1.3795714378356934,
"step": 122
},
{
"epoch": 0.1514041514041514,
"grad_norm": 2.020355701446533,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3397819995880127,
"step": 124
},
{
"epoch": 0.15384615384615385,
"grad_norm": 1.3748884201049805,
"learning_rate": 3.810975609756098e-06,
"loss": 1.1987930536270142,
"step": 126
},
{
"epoch": 0.1562881562881563,
"grad_norm": 1.4875504970550537,
"learning_rate": 3.8719512195121954e-06,
"loss": 1.4347355365753174,
"step": 128
},
{
"epoch": 0.15873015873015872,
"grad_norm": 1.2580325603485107,
"learning_rate": 3.932926829268293e-06,
"loss": 1.2884924411773682,
"step": 130
},
{
"epoch": 0.16117216117216118,
"grad_norm": 4.422817707061768,
"learning_rate": 3.99390243902439e-06,
"loss": 1.0759848356246948,
"step": 132
},
{
"epoch": 0.16361416361416362,
"grad_norm": 2.910273790359497,
"learning_rate": 4.054878048780488e-06,
"loss": 1.1693415641784668,
"step": 134
},
{
"epoch": 0.16605616605616605,
"grad_norm": 2.8875091075897217,
"learning_rate": 4.115853658536585e-06,
"loss": 1.1773910522460938,
"step": 136
},
{
"epoch": 0.1684981684981685,
"grad_norm": 3.0497472286224365,
"learning_rate": 4.176829268292683e-06,
"loss": 1.1259866952896118,
"step": 138
},
{
"epoch": 0.17094017094017094,
"grad_norm": 2.7244997024536133,
"learning_rate": 4.237804878048781e-06,
"loss": 1.1096811294555664,
"step": 140
},
{
"epoch": 0.17338217338217338,
"grad_norm": 1.9807188510894775,
"learning_rate": 4.298780487804878e-06,
"loss": 1.374996304512024,
"step": 142
},
{
"epoch": 0.17582417582417584,
"grad_norm": 1.5548163652420044,
"learning_rate": 4.359756097560976e-06,
"loss": 1.116044521331787,
"step": 144
},
{
"epoch": 0.17826617826617827,
"grad_norm": 2.0115115642547607,
"learning_rate": 4.420731707317074e-06,
"loss": 0.9985978603363037,
"step": 146
},
{
"epoch": 0.1807081807081807,
"grad_norm": 1.5195460319519043,
"learning_rate": 4.481707317073171e-06,
"loss": 0.9752452373504639,
"step": 148
},
{
"epoch": 0.18315018315018314,
"grad_norm": 1.3534411191940308,
"learning_rate": 4.542682926829269e-06,
"loss": 1.3346309661865234,
"step": 150
},
{
"epoch": 0.1855921855921856,
"grad_norm": 2.0687193870544434,
"learning_rate": 4.603658536585367e-06,
"loss": 1.4687234163284302,
"step": 152
},
{
"epoch": 0.18803418803418803,
"grad_norm": 1.2396481037139893,
"learning_rate": 4.664634146341464e-06,
"loss": 1.3415579795837402,
"step": 154
},
{
"epoch": 0.19047619047619047,
"grad_norm": 2.4731335639953613,
"learning_rate": 4.725609756097561e-06,
"loss": 1.6359931230545044,
"step": 156
},
{
"epoch": 0.19291819291819293,
"grad_norm": 3.7982375621795654,
"learning_rate": 4.786585365853659e-06,
"loss": 1.0939006805419922,
"step": 158
},
{
"epoch": 0.19536019536019536,
"grad_norm": 1.8634134531021118,
"learning_rate": 4.8475609756097565e-06,
"loss": 0.9020692110061646,
"step": 160
},
{
"epoch": 0.1978021978021978,
"grad_norm": 1.9401910305023193,
"learning_rate": 4.908536585365854e-06,
"loss": 1.3406171798706055,
"step": 162
},
{
"epoch": 0.20024420024420025,
"grad_norm": 2.7686517238616943,
"learning_rate": 4.9695121951219515e-06,
"loss": 1.2336323261260986,
"step": 164
},
{
"epoch": 0.2026862026862027,
"grad_norm": 2.1715619564056396,
"learning_rate": 4.999998853502653e-06,
"loss": 1.2935197353363037,
"step": 166
},
{
"epoch": 0.20512820512820512,
"grad_norm": 1.8246636390686035,
"learning_rate": 4.999989681530883e-06,
"loss": 1.1559749841690063,
"step": 168
},
{
"epoch": 0.20757020757020758,
"grad_norm": 4.966519832611084,
"learning_rate": 4.999971337624732e-06,
"loss": 0.929039478302002,
"step": 170
},
{
"epoch": 0.21001221001221002,
"grad_norm": 1.679980993270874,
"learning_rate": 4.999943821858978e-06,
"loss": 1.0169018507003784,
"step": 172
},
{
"epoch": 0.21245421245421245,
"grad_norm": 2.0559370517730713,
"learning_rate": 4.999907134345786e-06,
"loss": 1.3057047128677368,
"step": 174
},
{
"epoch": 0.2148962148962149,
"grad_norm": 2.602260112762451,
"learning_rate": 4.9998612752347116e-06,
"loss": 1.2571014165878296,
"step": 176
},
{
"epoch": 0.21733821733821734,
"grad_norm": 3.518317222595215,
"learning_rate": 4.999806244712696e-06,
"loss": 1.3580776453018188,
"step": 178
},
{
"epoch": 0.21978021978021978,
"grad_norm": 1.0767415761947632,
"learning_rate": 4.9997420430040665e-06,
"loss": 0.9726645946502686,
"step": 180
},
{
"epoch": 0.2222222222222222,
"grad_norm": 4.324513912200928,
"learning_rate": 4.9996686703705395e-06,
"loss": 0.8844138383865356,
"step": 182
},
{
"epoch": 0.22466422466422467,
"grad_norm": 1.6926108598709106,
"learning_rate": 4.999586127111211e-06,
"loss": 1.2904834747314453,
"step": 184
},
{
"epoch": 0.2271062271062271,
"grad_norm": 3.4072794914245605,
"learning_rate": 4.9994944135625655e-06,
"loss": 1.288368582725525,
"step": 186
},
{
"epoch": 0.22954822954822954,
"grad_norm": 2.327322483062744,
"learning_rate": 4.999393530098465e-06,
"loss": 1.3512585163116455,
"step": 188
},
{
"epoch": 0.231990231990232,
"grad_norm": 1.8644685745239258,
"learning_rate": 4.999283477130157e-06,
"loss": 1.3694134950637817,
"step": 190
},
{
"epoch": 0.23443223443223443,
"grad_norm": 2.46710205078125,
"learning_rate": 4.999164255106262e-06,
"loss": 1.3137428760528564,
"step": 192
},
{
"epoch": 0.23687423687423687,
"grad_norm": 2.8349263668060303,
"learning_rate": 4.999035864512782e-06,
"loss": 1.308716058731079,
"step": 194
},
{
"epoch": 0.23931623931623933,
"grad_norm": 4.252539157867432,
"learning_rate": 4.998898305873094e-06,
"loss": 1.0035754442214966,
"step": 196
},
{
"epoch": 0.24175824175824176,
"grad_norm": 1.9181326627731323,
"learning_rate": 4.9987515797479455e-06,
"loss": 1.283682942390442,
"step": 198
},
{
"epoch": 0.2442002442002442,
"grad_norm": 2.797574996948242,
"learning_rate": 4.998595686735457e-06,
"loss": 1.3744878768920898,
"step": 200
},
{
"epoch": 0.24664224664224665,
"grad_norm": 2.4476912021636963,
"learning_rate": 4.998430627471114e-06,
"loss": 1.3049349784851074,
"step": 202
},
{
"epoch": 0.2490842490842491,
"grad_norm": 1.6749374866485596,
"learning_rate": 4.998256402627771e-06,
"loss": 0.9939874410629272,
"step": 204
},
{
"epoch": 0.2515262515262515,
"grad_norm": 1.9039818048477173,
"learning_rate": 4.998073012915644e-06,
"loss": 1.26462721824646,
"step": 206
},
{
"epoch": 0.25396825396825395,
"grad_norm": 1.5555559396743774,
"learning_rate": 4.99788045908231e-06,
"loss": 1.118224024772644,
"step": 208
},
{
"epoch": 0.2564102564102564,
"grad_norm": 12.163622856140137,
"learning_rate": 4.9976787419126995e-06,
"loss": 0.9382672905921936,
"step": 210
},
{
"epoch": 0.2588522588522589,
"grad_norm": 2.534181594848633,
"learning_rate": 4.997467862229102e-06,
"loss": 0.6328732967376709,
"step": 212
},
{
"epoch": 0.2612942612942613,
"grad_norm": 1.489608645439148,
"learning_rate": 4.997247820891152e-06,
"loss": 1.0992366075515747,
"step": 214
},
{
"epoch": 0.26373626373626374,
"grad_norm": 2.1970038414001465,
"learning_rate": 4.997018618795836e-06,
"loss": 1.2712618112564087,
"step": 216
},
{
"epoch": 0.2661782661782662,
"grad_norm": 1.4587446451187134,
"learning_rate": 4.996780256877479e-06,
"loss": 1.1741327047348022,
"step": 218
},
{
"epoch": 0.2686202686202686,
"grad_norm": 2.0022170543670654,
"learning_rate": 4.996532736107749e-06,
"loss": 1.3054232597351074,
"step": 220
},
{
"epoch": 0.27106227106227104,
"grad_norm": 1.731757402420044,
"learning_rate": 4.996276057495648e-06,
"loss": 0.934091329574585,
"step": 222
},
{
"epoch": 0.27350427350427353,
"grad_norm": 1.4423786401748657,
"learning_rate": 4.996010222087509e-06,
"loss": 0.9163894653320312,
"step": 224
},
{
"epoch": 0.27594627594627597,
"grad_norm": 1.7184131145477295,
"learning_rate": 4.9957352309669935e-06,
"loss": 1.3263689279556274,
"step": 226
},
{
"epoch": 0.2783882783882784,
"grad_norm": 2.437328338623047,
"learning_rate": 4.9954510852550825e-06,
"loss": 1.3698230981826782,
"step": 228
},
{
"epoch": 0.28083028083028083,
"grad_norm": 2.120469093322754,
"learning_rate": 4.995157786110078e-06,
"loss": 1.343611717224121,
"step": 230
},
{
"epoch": 0.28327228327228327,
"grad_norm": 6.02695369720459,
"learning_rate": 4.9948553347275964e-06,
"loss": 0.7583301663398743,
"step": 232
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.9317870140075684,
"learning_rate": 4.994543732340559e-06,
"loss": 1.0170681476593018,
"step": 234
},
{
"epoch": 0.28815628815628813,
"grad_norm": 1.3222551345825195,
"learning_rate": 4.994222980219193e-06,
"loss": 1.272110939025879,
"step": 236
},
{
"epoch": 0.2905982905982906,
"grad_norm": 1.7373944520950317,
"learning_rate": 4.993893079671023e-06,
"loss": 1.2445218563079834,
"step": 238
},
{
"epoch": 0.29304029304029305,
"grad_norm": 2.4315457344055176,
"learning_rate": 4.993554032040867e-06,
"loss": 1.1302506923675537,
"step": 240
},
{
"epoch": 0.2954822954822955,
"grad_norm": 3.029109477996826,
"learning_rate": 4.993205838710829e-06,
"loss": 0.9910866022109985,
"step": 242
},
{
"epoch": 0.2979242979242979,
"grad_norm": 1.9078646898269653,
"learning_rate": 4.992848501100299e-06,
"loss": 1.3285576105117798,
"step": 244
},
{
"epoch": 0.30036630036630035,
"grad_norm": 1.1271051168441772,
"learning_rate": 4.992482020665938e-06,
"loss": 0.7790983319282532,
"step": 246
},
{
"epoch": 0.3028083028083028,
"grad_norm": 2.9028432369232178,
"learning_rate": 4.992106398901679e-06,
"loss": 1.1949691772460938,
"step": 248
},
{
"epoch": 0.3052503052503053,
"grad_norm": 3.402926445007324,
"learning_rate": 4.9917216373387205e-06,
"loss": 0.9305516481399536,
"step": 250
},
{
"epoch": 0.3076923076923077,
"grad_norm": 4.722480773925781,
"learning_rate": 4.991327737545517e-06,
"loss": 1.0460638999938965,
"step": 252
},
{
"epoch": 0.31013431013431014,
"grad_norm": 2.7775771617889404,
"learning_rate": 4.990924701127776e-06,
"loss": 1.2800921201705933,
"step": 254
},
{
"epoch": 0.3125763125763126,
"grad_norm": 1.9031347036361694,
"learning_rate": 4.990512529728448e-06,
"loss": 1.2638157606124878,
"step": 256
},
{
"epoch": 0.315018315018315,
"grad_norm": 1.927398443222046,
"learning_rate": 4.990091225027721e-06,
"loss": 1.3112692832946777,
"step": 258
},
{
"epoch": 0.31746031746031744,
"grad_norm": 2.3837084770202637,
"learning_rate": 4.9896607887430185e-06,
"loss": 1.2674881219863892,
"step": 260
},
{
"epoch": 0.3199023199023199,
"grad_norm": 4.82175874710083,
"learning_rate": 4.989221222628985e-06,
"loss": 1.4771348237991333,
"step": 262
},
{
"epoch": 0.32234432234432236,
"grad_norm": 4.768642425537109,
"learning_rate": 4.988772528477482e-06,
"loss": 0.7117833495140076,
"step": 264
},
{
"epoch": 0.3247863247863248,
"grad_norm": 3.3639814853668213,
"learning_rate": 4.988314708117581e-06,
"loss": 1.0419560670852661,
"step": 266
},
{
"epoch": 0.32722832722832723,
"grad_norm": 4.912712574005127,
"learning_rate": 4.987847763415557e-06,
"loss": 1.3187146186828613,
"step": 268
},
{
"epoch": 0.32967032967032966,
"grad_norm": 2.553563117980957,
"learning_rate": 4.9873716962748805e-06,
"loss": 0.9921520352363586,
"step": 270
},
{
"epoch": 0.3321123321123321,
"grad_norm": 2.590106964111328,
"learning_rate": 4.986886508636206e-06,
"loss": 1.2800440788269043,
"step": 272
},
{
"epoch": 0.33455433455433453,
"grad_norm": 5.722552299499512,
"learning_rate": 4.986392202477369e-06,
"loss": 0.9619787335395813,
"step": 274
},
{
"epoch": 0.336996336996337,
"grad_norm": 2.612945556640625,
"learning_rate": 4.985888779813377e-06,
"loss": 1.0021531581878662,
"step": 276
},
{
"epoch": 0.33943833943833945,
"grad_norm": 3.078714370727539,
"learning_rate": 4.985376242696399e-06,
"loss": 1.3929091691970825,
"step": 278
},
{
"epoch": 0.3418803418803419,
"grad_norm": 1.928337812423706,
"learning_rate": 4.984854593215759e-06,
"loss": 1.2902088165283203,
"step": 280
},
{
"epoch": 0.3443223443223443,
"grad_norm": 1.6284582614898682,
"learning_rate": 4.984323833497925e-06,
"loss": 1.2728163003921509,
"step": 282
},
{
"epoch": 0.34676434676434675,
"grad_norm": 2.321744680404663,
"learning_rate": 4.983783965706507e-06,
"loss": 1.311239242553711,
"step": 284
},
{
"epoch": 0.3492063492063492,
"grad_norm": 1.7774631977081299,
"learning_rate": 4.983234992042237e-06,
"loss": 1.1027390956878662,
"step": 286
},
{
"epoch": 0.3516483516483517,
"grad_norm": 3.514158010482788,
"learning_rate": 4.982676914742971e-06,
"loss": 1.6526391506195068,
"step": 288
},
{
"epoch": 0.3540903540903541,
"grad_norm": 5.824040412902832,
"learning_rate": 4.982109736083676e-06,
"loss": 0.9344091415405273,
"step": 290
},
{
"epoch": 0.35653235653235654,
"grad_norm": 1.5543690919876099,
"learning_rate": 4.981533458376416e-06,
"loss": 1.292595386505127,
"step": 292
},
{
"epoch": 0.358974358974359,
"grad_norm": 2.081808090209961,
"learning_rate": 4.980948083970351e-06,
"loss": 1.0262247323989868,
"step": 294
},
{
"epoch": 0.3614163614163614,
"grad_norm": 2.1623454093933105,
"learning_rate": 4.980353615251719e-06,
"loss": 1.280896782875061,
"step": 296
},
{
"epoch": 0.36385836385836384,
"grad_norm": 9.417366027832031,
"learning_rate": 4.9797500546438344e-06,
"loss": 1.4011857509613037,
"step": 298
},
{
"epoch": 0.3663003663003663,
"grad_norm": 2.1483418941497803,
"learning_rate": 4.979137404607072e-06,
"loss": 1.243982195854187,
"step": 300
},
{
"epoch": 0.36874236874236876,
"grad_norm": 2.855179786682129,
"learning_rate": 4.978515667638858e-06,
"loss": 0.8995228409767151,
"step": 302
},
{
"epoch": 0.3711843711843712,
"grad_norm": 1.9090166091918945,
"learning_rate": 4.9778848462736625e-06,
"loss": 1.1892352104187012,
"step": 304
},
{
"epoch": 0.37362637362637363,
"grad_norm": 1.4595932960510254,
"learning_rate": 4.977244943082987e-06,
"loss": 1.3153109550476074,
"step": 306
},
{
"epoch": 0.37606837606837606,
"grad_norm": 2.5620715618133545,
"learning_rate": 4.976595960675356e-06,
"loss": 1.3017933368682861,
"step": 308
},
{
"epoch": 0.3785103785103785,
"grad_norm": 2.5225541591644287,
"learning_rate": 4.975937901696302e-06,
"loss": 1.3250616788864136,
"step": 310
},
{
"epoch": 0.38095238095238093,
"grad_norm": 4.8774895668029785,
"learning_rate": 4.975270768828359e-06,
"loss": 0.984774649143219,
"step": 312
},
{
"epoch": 0.3833943833943834,
"grad_norm": 1.8592923879623413,
"learning_rate": 4.974594564791051e-06,
"loss": 1.3683158159255981,
"step": 314
},
{
"epoch": 0.38583638583638585,
"grad_norm": 4.383054733276367,
"learning_rate": 4.9739092923408784e-06,
"loss": 0.6529649496078491,
"step": 316
},
{
"epoch": 0.3882783882783883,
"grad_norm": 3.972773790359497,
"learning_rate": 4.97321495427131e-06,
"loss": 0.9518109560012817,
"step": 318
},
{
"epoch": 0.3907203907203907,
"grad_norm": 5.475085735321045,
"learning_rate": 4.972511553412768e-06,
"loss": 1.334009051322937,
"step": 320
},
{
"epoch": 0.39316239316239315,
"grad_norm": 2.0150842666625977,
"learning_rate": 4.971799092632619e-06,
"loss": 1.344587802886963,
"step": 322
},
{
"epoch": 0.3956043956043956,
"grad_norm": 1.9884312152862549,
"learning_rate": 4.971077574835165e-06,
"loss": 1.3174562454223633,
"step": 324
},
{
"epoch": 0.398046398046398,
"grad_norm": 2.862060308456421,
"learning_rate": 4.970347002961623e-06,
"loss": 1.244167447090149,
"step": 326
},
{
"epoch": 0.4004884004884005,
"grad_norm": 1.4828734397888184,
"learning_rate": 4.969607379990123e-06,
"loss": 1.2446471452713013,
"step": 328
},
{
"epoch": 0.40293040293040294,
"grad_norm": 5.2736592292785645,
"learning_rate": 4.968858708935686e-06,
"loss": 0.8940474987030029,
"step": 330
},
{
"epoch": 0.4053724053724054,
"grad_norm": 3.302468776702881,
"learning_rate": 4.968100992850223e-06,
"loss": 0.6339259147644043,
"step": 332
},
{
"epoch": 0.4078144078144078,
"grad_norm": 2.196411371231079,
"learning_rate": 4.967334234822514e-06,
"loss": 1.0478650331497192,
"step": 334
},
{
"epoch": 0.41025641025641024,
"grad_norm": 1.72081458568573,
"learning_rate": 4.966558437978196e-06,
"loss": 1.349544882774353,
"step": 336
},
{
"epoch": 0.4126984126984127,
"grad_norm": 1.245092511177063,
"learning_rate": 4.965773605479754e-06,
"loss": 0.9362432956695557,
"step": 338
},
{
"epoch": 0.41514041514041516,
"grad_norm": 10.09897518157959,
"learning_rate": 4.964979740526505e-06,
"loss": 1.0755311250686646,
"step": 340
},
{
"epoch": 0.4175824175824176,
"grad_norm": 2.285883903503418,
"learning_rate": 4.964176846354588e-06,
"loss": 1.6347922086715698,
"step": 342
},
{
"epoch": 0.42002442002442003,
"grad_norm": 1.59197199344635,
"learning_rate": 4.963364926236949e-06,
"loss": 0.9156535863876343,
"step": 344
},
{
"epoch": 0.42246642246642246,
"grad_norm": 1.6245992183685303,
"learning_rate": 4.962543983483325e-06,
"loss": 1.11324143409729,
"step": 346
},
{
"epoch": 0.4249084249084249,
"grad_norm": 2.302391767501831,
"learning_rate": 4.961714021440236e-06,
"loss": 1.3008726835250854,
"step": 348
},
{
"epoch": 0.42735042735042733,
"grad_norm": 2.053579092025757,
"learning_rate": 4.960875043490967e-06,
"loss": 0.8544071316719055,
"step": 350
},
{
"epoch": 0.4297924297924298,
"grad_norm": 1.6109215021133423,
"learning_rate": 4.960027053055557e-06,
"loss": 1.0643997192382812,
"step": 352
},
{
"epoch": 0.43223443223443225,
"grad_norm": 1.7497365474700928,
"learning_rate": 4.959170053590781e-06,
"loss": 1.2529405355453491,
"step": 354
},
{
"epoch": 0.4346764346764347,
"grad_norm": 1.5827484130859375,
"learning_rate": 4.958304048590143e-06,
"loss": 0.8821004033088684,
"step": 356
},
{
"epoch": 0.4371184371184371,
"grad_norm": 3.8849446773529053,
"learning_rate": 4.957429041583855e-06,
"loss": 1.127004623413086,
"step": 358
},
{
"epoch": 0.43956043956043955,
"grad_norm": 1.9818916320800781,
"learning_rate": 4.956545036138824e-06,
"loss": 1.207819938659668,
"step": 360
},
{
"epoch": 0.442002442002442,
"grad_norm": 3.0806636810302734,
"learning_rate": 4.9556520358586394e-06,
"loss": 1.0458451509475708,
"step": 362
},
{
"epoch": 0.4444444444444444,
"grad_norm": 6.247749328613281,
"learning_rate": 4.95475004438356e-06,
"loss": 1.1090641021728516,
"step": 364
},
{
"epoch": 0.4468864468864469,
"grad_norm": 4.639119625091553,
"learning_rate": 4.953839065390494e-06,
"loss": 1.37210214138031,
"step": 366
},
{
"epoch": 0.44932844932844934,
"grad_norm": 3.0761399269104004,
"learning_rate": 4.952919102592985e-06,
"loss": 1.020755410194397,
"step": 368
},
{
"epoch": 0.4517704517704518,
"grad_norm": 0.9568601846694946,
"learning_rate": 4.9519901597412036e-06,
"loss": 1.0233187675476074,
"step": 370
},
{
"epoch": 0.4542124542124542,
"grad_norm": 1.6627336740493774,
"learning_rate": 4.9510522406219215e-06,
"loss": 1.2981936931610107,
"step": 372
},
{
"epoch": 0.45665445665445664,
"grad_norm": 1.6876623630523682,
"learning_rate": 4.9501053490585055e-06,
"loss": 0.8830539584159851,
"step": 374
},
{
"epoch": 0.4590964590964591,
"grad_norm": 2.635246515274048,
"learning_rate": 4.9491494889108956e-06,
"loss": 1.219455599784851,
"step": 376
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.5608506202697754,
"learning_rate": 4.948184664075594e-06,
"loss": 0.9302881956100464,
"step": 378
},
{
"epoch": 0.463980463980464,
"grad_norm": 2.0959465503692627,
"learning_rate": 4.947210878485644e-06,
"loss": 0.8517276048660278,
"step": 380
},
{
"epoch": 0.46642246642246643,
"grad_norm": 1.482036828994751,
"learning_rate": 4.94622813611062e-06,
"loss": 1.2661558389663696,
"step": 382
},
{
"epoch": 0.46886446886446886,
"grad_norm": 2.3324732780456543,
"learning_rate": 4.945236440956604e-06,
"loss": 1.0352469682693481,
"step": 384
},
{
"epoch": 0.4713064713064713,
"grad_norm": 5.657218933105469,
"learning_rate": 4.944235797066177e-06,
"loss": 1.2012758255004883,
"step": 386
},
{
"epoch": 0.47374847374847373,
"grad_norm": 3.256732225418091,
"learning_rate": 4.943226208518398e-06,
"loss": 1.5897492170333862,
"step": 388
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.6683677434921265,
"learning_rate": 4.942207679428788e-06,
"loss": 0.7892211079597473,
"step": 390
},
{
"epoch": 0.47863247863247865,
"grad_norm": 2.3011298179626465,
"learning_rate": 4.941180213949314e-06,
"loss": 0.8288873434066772,
"step": 392
},
{
"epoch": 0.4810744810744811,
"grad_norm": 1.1541416645050049,
"learning_rate": 4.94014381626837e-06,
"loss": 0.9236152172088623,
"step": 394
},
{
"epoch": 0.4835164835164835,
"grad_norm": 2.699540853500366,
"learning_rate": 4.939098490610763e-06,
"loss": 1.2205630540847778,
"step": 396
},
{
"epoch": 0.48595848595848595,
"grad_norm": 3.6751928329467773,
"learning_rate": 4.938044241237695e-06,
"loss": 1.2720117568969727,
"step": 398
},
{
"epoch": 0.4884004884004884,
"grad_norm": 2.8597030639648438,
"learning_rate": 4.936981072446743e-06,
"loss": 0.5283371210098267,
"step": 400
},
{
"epoch": 0.4908424908424908,
"grad_norm": 2.1727144718170166,
"learning_rate": 4.935908988571845e-06,
"loss": 1.2206032276153564,
"step": 402
},
{
"epoch": 0.4932844932844933,
"grad_norm": 1.26828932762146,
"learning_rate": 4.934827993983279e-06,
"loss": 1.3251525163650513,
"step": 404
},
{
"epoch": 0.49572649572649574,
"grad_norm": 2.455037832260132,
"learning_rate": 4.933738093087651e-06,
"loss": 0.6017684936523438,
"step": 406
},
{
"epoch": 0.4981684981684982,
"grad_norm": 2.95031476020813,
"learning_rate": 4.932639290327866e-06,
"loss": 0.8958187103271484,
"step": 408
},
{
"epoch": 0.5006105006105006,
"grad_norm": 1.4214322566986084,
"learning_rate": 4.931531590183123e-06,
"loss": 1.255342721939087,
"step": 410
},
{
"epoch": 0.503052503052503,
"grad_norm": 10.521769523620605,
"learning_rate": 4.930414997168889e-06,
"loss": 0.5480175614356995,
"step": 412
},
{
"epoch": 0.5054945054945055,
"grad_norm": 1.6570682525634766,
"learning_rate": 4.929289515836882e-06,
"loss": 1.3151097297668457,
"step": 414
},
{
"epoch": 0.5079365079365079,
"grad_norm": 2.6421968936920166,
"learning_rate": 4.928155150775049e-06,
"loss": 1.2698694467544556,
"step": 416
},
{
"epoch": 0.5103785103785103,
"grad_norm": 2.2661855220794678,
"learning_rate": 4.927011906607559e-06,
"loss": 1.1845803260803223,
"step": 418
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.9527170658111572,
"learning_rate": 4.925859787994767e-06,
"loss": 1.2397900819778442,
"step": 420
},
{
"epoch": 0.5152625152625152,
"grad_norm": 1.2722523212432861,
"learning_rate": 4.924698799633212e-06,
"loss": 1.2302662134170532,
"step": 422
},
{
"epoch": 0.5177045177045178,
"grad_norm": 1.8450767993927002,
"learning_rate": 4.923528946255584e-06,
"loss": 1.257878303527832,
"step": 424
},
{
"epoch": 0.5201465201465202,
"grad_norm": 2.1251304149627686,
"learning_rate": 4.922350232630715e-06,
"loss": 1.0593935251235962,
"step": 426
},
{
"epoch": 0.5225885225885226,
"grad_norm": 1.2513749599456787,
"learning_rate": 4.9211626635635515e-06,
"loss": 1.2507191896438599,
"step": 428
},
{
"epoch": 0.525030525030525,
"grad_norm": 8.465970039367676,
"learning_rate": 4.919966243895142e-06,
"loss": 0.8818293809890747,
"step": 430
},
{
"epoch": 0.5274725274725275,
"grad_norm": 6.935226917266846,
"learning_rate": 4.918760978502611e-06,
"loss": 0.5760735273361206,
"step": 432
},
{
"epoch": 0.5299145299145299,
"grad_norm": 11.597949028015137,
"learning_rate": 4.917546872299143e-06,
"loss": 1.2672209739685059,
"step": 434
},
{
"epoch": 0.5323565323565324,
"grad_norm": 1.8173397779464722,
"learning_rate": 4.916323930233962e-06,
"loss": 1.2190382480621338,
"step": 436
},
{
"epoch": 0.5347985347985348,
"grad_norm": 3.132521867752075,
"learning_rate": 4.915092157292313e-06,
"loss": 1.2443459033966064,
"step": 438
},
{
"epoch": 0.5372405372405372,
"grad_norm": 1.43805730342865,
"learning_rate": 4.913851558495433e-06,
"loss": 1.2091344594955444,
"step": 440
},
{
"epoch": 0.5396825396825397,
"grad_norm": 11.365583419799805,
"learning_rate": 4.912602138900545e-06,
"loss": 1.0195097923278809,
"step": 442
},
{
"epoch": 0.5421245421245421,
"grad_norm": 2.1645138263702393,
"learning_rate": 4.911343903600823e-06,
"loss": 0.8177242279052734,
"step": 444
},
{
"epoch": 0.5445665445665445,
"grad_norm": 1.9511176347732544,
"learning_rate": 4.91007685772538e-06,
"loss": 0.9824368357658386,
"step": 446
},
{
"epoch": 0.5470085470085471,
"grad_norm": 1.3720399141311646,
"learning_rate": 4.908801006439247e-06,
"loss": 1.08683443069458,
"step": 448
},
{
"epoch": 0.5494505494505495,
"grad_norm": 2.627140760421753,
"learning_rate": 4.9075163549433455e-06,
"loss": 0.979245126247406,
"step": 450
},
{
"epoch": 0.5518925518925519,
"grad_norm": 2.8625056743621826,
"learning_rate": 4.906222908474474e-06,
"loss": 0.7317221760749817,
"step": 452
},
{
"epoch": 0.5543345543345544,
"grad_norm": 3.2224857807159424,
"learning_rate": 4.90492067230528e-06,
"loss": 1.258061170578003,
"step": 454
},
{
"epoch": 0.5567765567765568,
"grad_norm": 2.8512461185455322,
"learning_rate": 4.903609651744244e-06,
"loss": 1.2263869047164917,
"step": 456
},
{
"epoch": 0.5592185592185592,
"grad_norm": 1.08021080493927,
"learning_rate": 4.902289852135655e-06,
"loss": 0.6804142594337463,
"step": 458
},
{
"epoch": 0.5616605616605617,
"grad_norm": 46.32835006713867,
"learning_rate": 4.90096127885959e-06,
"loss": 1.1406168937683105,
"step": 460
},
{
"epoch": 0.5641025641025641,
"grad_norm": 3.643751621246338,
"learning_rate": 4.899623937331887e-06,
"loss": 1.1659770011901855,
"step": 462
},
{
"epoch": 0.5665445665445665,
"grad_norm": 6.275250434875488,
"learning_rate": 4.898277833004135e-06,
"loss": 0.5430421829223633,
"step": 464
},
{
"epoch": 0.568986568986569,
"grad_norm": 1.6964603662490845,
"learning_rate": 4.896922971363635e-06,
"loss": 1.487717628479004,
"step": 466
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.5128223896026611,
"learning_rate": 4.895559357933394e-06,
"loss": 1.2990221977233887,
"step": 468
},
{
"epoch": 0.5738705738705738,
"grad_norm": 2.9639620780944824,
"learning_rate": 4.89418699827209e-06,
"loss": 1.001917839050293,
"step": 470
},
{
"epoch": 0.5763125763125763,
"grad_norm": 3.8676769733428955,
"learning_rate": 4.892805897974059e-06,
"loss": 1.2513344287872314,
"step": 472
},
{
"epoch": 0.5787545787545788,
"grad_norm": 3.1442391872406006,
"learning_rate": 4.891416062669262e-06,
"loss": 0.8551501631736755,
"step": 474
},
{
"epoch": 0.5811965811965812,
"grad_norm": 4.836908340454102,
"learning_rate": 4.890017498023274e-06,
"loss": 0.9901700615882874,
"step": 476
},
{
"epoch": 0.5836385836385837,
"grad_norm": 3.409428358078003,
"learning_rate": 4.888610209737249e-06,
"loss": 1.1505521535873413,
"step": 478
},
{
"epoch": 0.5860805860805861,
"grad_norm": 2.039818525314331,
"learning_rate": 4.887194203547907e-06,
"loss": 1.2868854999542236,
"step": 480
},
{
"epoch": 0.5885225885225885,
"grad_norm": 3.3642868995666504,
"learning_rate": 4.885769485227503e-06,
"loss": 0.5171108245849609,
"step": 482
},
{
"epoch": 0.590964590964591,
"grad_norm": 1.0177332162857056,
"learning_rate": 4.8843360605838055e-06,
"loss": 0.9433972239494324,
"step": 484
},
{
"epoch": 0.5934065934065934,
"grad_norm": 0.7848219275474548,
"learning_rate": 4.882893935460078e-06,
"loss": 1.0055443048477173,
"step": 486
},
{
"epoch": 0.5958485958485958,
"grad_norm": 1.8216912746429443,
"learning_rate": 4.881443115735045e-06,
"loss": 0.9295751452445984,
"step": 488
},
{
"epoch": 0.5982905982905983,
"grad_norm": 2.449176073074341,
"learning_rate": 4.879983607322881e-06,
"loss": 0.9871832132339478,
"step": 490
},
{
"epoch": 0.6007326007326007,
"grad_norm": 2.1226084232330322,
"learning_rate": 4.878515416173174e-06,
"loss": 0.7565707564353943,
"step": 492
},
{
"epoch": 0.6031746031746031,
"grad_norm": 1.5631353855133057,
"learning_rate": 4.877038548270907e-06,
"loss": 0.9493947625160217,
"step": 494
},
{
"epoch": 0.6056166056166056,
"grad_norm": 2.126840829849243,
"learning_rate": 4.875553009636437e-06,
"loss": 1.216259479522705,
"step": 496
},
{
"epoch": 0.608058608058608,
"grad_norm": 6.31650447845459,
"learning_rate": 4.874058806325463e-06,
"loss": 0.5695387125015259,
"step": 498
},
{
"epoch": 0.6105006105006106,
"grad_norm": 1.5655598640441895,
"learning_rate": 4.872555944429006e-06,
"loss": 0.8497368097305298,
"step": 500
},
{
"epoch": 0.612942612942613,
"grad_norm": 2.7936275005340576,
"learning_rate": 4.871044430073383e-06,
"loss": 1.2087408304214478,
"step": 502
},
{
"epoch": 0.6153846153846154,
"grad_norm": 5.054646015167236,
"learning_rate": 4.869524269420183e-06,
"loss": 1.2262006998062134,
"step": 504
},
{
"epoch": 0.6178266178266179,
"grad_norm": 2.619194507598877,
"learning_rate": 4.8679954686662404e-06,
"loss": 1.2392239570617676,
"step": 506
},
{
"epoch": 0.6202686202686203,
"grad_norm": 2.1149373054504395,
"learning_rate": 4.866458034043611e-06,
"loss": 1.2161999940872192,
"step": 508
},
{
"epoch": 0.6227106227106227,
"grad_norm": 3.2427122592926025,
"learning_rate": 4.864911971819545e-06,
"loss": 1.2096397876739502,
"step": 510
},
{
"epoch": 0.6251526251526252,
"grad_norm": 1.8958977460861206,
"learning_rate": 4.863357288296463e-06,
"loss": 1.0511081218719482,
"step": 512
},
{
"epoch": 0.6275946275946276,
"grad_norm": 2.3320584297180176,
"learning_rate": 4.861793989811929e-06,
"loss": 1.039177417755127,
"step": 514
},
{
"epoch": 0.63003663003663,
"grad_norm": 2.3830647468566895,
"learning_rate": 4.860222082738628e-06,
"loss": 0.9379343390464783,
"step": 516
},
{
"epoch": 0.6324786324786325,
"grad_norm": 1.4862414598464966,
"learning_rate": 4.858641573484334e-06,
"loss": 1.2305572032928467,
"step": 518
},
{
"epoch": 0.6349206349206349,
"grad_norm": 1.5390454530715942,
"learning_rate": 4.8570524684918885e-06,
"loss": 0.7816034555435181,
"step": 520
},
{
"epoch": 0.6373626373626373,
"grad_norm": 3.6071054935455322,
"learning_rate": 4.855454774239174e-06,
"loss": 0.9562470316886902,
"step": 522
},
{
"epoch": 0.6398046398046398,
"grad_norm": 19.057411193847656,
"learning_rate": 4.8538484972390844e-06,
"loss": 0.9935526847839355,
"step": 524
},
{
"epoch": 0.6422466422466423,
"grad_norm": 5.404201030731201,
"learning_rate": 4.852233644039503e-06,
"loss": 1.2573553323745728,
"step": 526
},
{
"epoch": 0.6446886446886447,
"grad_norm": 5.5111002922058105,
"learning_rate": 4.8506102212232714e-06,
"loss": 1.309897780418396,
"step": 528
},
{
"epoch": 0.6471306471306472,
"grad_norm": 12.94403076171875,
"learning_rate": 4.848978235408165e-06,
"loss": 1.0515775680541992,
"step": 530
},
{
"epoch": 0.6495726495726496,
"grad_norm": 2.2977471351623535,
"learning_rate": 4.847337693246869e-06,
"loss": 1.0648335218429565,
"step": 532
},
{
"epoch": 0.652014652014652,
"grad_norm": 4.799200057983398,
"learning_rate": 4.845688601426942e-06,
"loss": 1.5188199281692505,
"step": 534
},
{
"epoch": 0.6544566544566545,
"grad_norm": 1.9204214811325073,
"learning_rate": 4.8440309666708006e-06,
"loss": 0.8509761691093445,
"step": 536
},
{
"epoch": 0.6568986568986569,
"grad_norm": 3.197026014328003,
"learning_rate": 4.842364795735681e-06,
"loss": 1.237154483795166,
"step": 538
},
{
"epoch": 0.6593406593406593,
"grad_norm": 2.667442798614502,
"learning_rate": 4.840690095413621e-06,
"loss": 1.256026268005371,
"step": 540
},
{
"epoch": 0.6617826617826618,
"grad_norm": 1.8451228141784668,
"learning_rate": 4.8390068725314235e-06,
"loss": 0.9330289959907532,
"step": 542
},
{
"epoch": 0.6642246642246642,
"grad_norm": 2.3354480266571045,
"learning_rate": 4.837315133950639e-06,
"loss": 1.2343664169311523,
"step": 544
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.377455472946167,
"learning_rate": 4.835614886567523e-06,
"loss": 1.1341302394866943,
"step": 546
},
{
"epoch": 0.6691086691086691,
"grad_norm": 2.2880988121032715,
"learning_rate": 4.833906137313027e-06,
"loss": 1.2215226888656616,
"step": 548
},
{
"epoch": 0.6715506715506715,
"grad_norm": 1.8074506521224976,
"learning_rate": 4.8321888931527526e-06,
"loss": 1.1459529399871826,
"step": 550
},
{
"epoch": 0.673992673992674,
"grad_norm": 2.449573516845703,
"learning_rate": 4.83046316108693e-06,
"loss": 0.9938110709190369,
"step": 552
},
{
"epoch": 0.6764346764346765,
"grad_norm": 1.4324339628219604,
"learning_rate": 4.828728948150395e-06,
"loss": 1.018953800201416,
"step": 554
},
{
"epoch": 0.6788766788766789,
"grad_norm": 1.9855033159255981,
"learning_rate": 4.826986261412551e-06,
"loss": 0.7265840768814087,
"step": 556
},
{
"epoch": 0.6813186813186813,
"grad_norm": 5.827937126159668,
"learning_rate": 4.825235107977347e-06,
"loss": 1.159310221672058,
"step": 558
},
{
"epoch": 0.6837606837606838,
"grad_norm": 1.3800535202026367,
"learning_rate": 4.82347549498324e-06,
"loss": 1.1552388668060303,
"step": 560
},
{
"epoch": 0.6862026862026862,
"grad_norm": 6.40132999420166,
"learning_rate": 4.821707429603181e-06,
"loss": 0.9877975583076477,
"step": 562
},
{
"epoch": 0.6886446886446886,
"grad_norm": 5.481659889221191,
"learning_rate": 4.8199309190445694e-06,
"loss": 1.294710636138916,
"step": 564
},
{
"epoch": 0.6910866910866911,
"grad_norm": 1.7129062414169312,
"learning_rate": 4.818145970549233e-06,
"loss": 1.0880193710327148,
"step": 566
},
{
"epoch": 0.6935286935286935,
"grad_norm": 2.8719332218170166,
"learning_rate": 4.816352591393398e-06,
"loss": 1.286997675895691,
"step": 568
},
{
"epoch": 0.6959706959706959,
"grad_norm": 3.2201790809631348,
"learning_rate": 4.814550788887655e-06,
"loss": 0.9447314143180847,
"step": 570
},
{
"epoch": 0.6984126984126984,
"grad_norm": 9.533697128295898,
"learning_rate": 4.812740570376933e-06,
"loss": 0.9670330286026001,
"step": 572
},
{
"epoch": 0.7008547008547008,
"grad_norm": 4.622603416442871,
"learning_rate": 4.810921943240469e-06,
"loss": 1.4118473529815674,
"step": 574
},
{
"epoch": 0.7032967032967034,
"grad_norm": 1.7296289205551147,
"learning_rate": 4.809094914891775e-06,
"loss": 1.2039122581481934,
"step": 576
},
{
"epoch": 0.7057387057387058,
"grad_norm": 1.5504480600357056,
"learning_rate": 4.807259492778613e-06,
"loss": 1.1822270154953003,
"step": 578
},
{
"epoch": 0.7081807081807082,
"grad_norm": 1.588944435119629,
"learning_rate": 4.805415684382959e-06,
"loss": 1.142565131187439,
"step": 580
},
{
"epoch": 0.7106227106227107,
"grad_norm": 1.771332859992981,
"learning_rate": 4.803563497220976e-06,
"loss": 1.1912704706192017,
"step": 582
},
{
"epoch": 0.7130647130647131,
"grad_norm": 5.767767429351807,
"learning_rate": 4.8017029388429845e-06,
"loss": 1.0446151494979858,
"step": 584
},
{
"epoch": 0.7155067155067155,
"grad_norm": 1.6248974800109863,
"learning_rate": 4.799834016833425e-06,
"loss": 1.287752628326416,
"step": 586
},
{
"epoch": 0.717948717948718,
"grad_norm": 2.37131404876709,
"learning_rate": 4.7979567388108376e-06,
"loss": 1.061058759689331,
"step": 588
},
{
"epoch": 0.7203907203907204,
"grad_norm": 1.6095200777053833,
"learning_rate": 4.796071112427821e-06,
"loss": 0.9337313771247864,
"step": 590
},
{
"epoch": 0.7228327228327228,
"grad_norm": 1.546781063079834,
"learning_rate": 4.794177145371006e-06,
"loss": 0.8499547243118286,
"step": 592
},
{
"epoch": 0.7252747252747253,
"grad_norm": 1.973464846611023,
"learning_rate": 4.792274845361025e-06,
"loss": 1.199100375175476,
"step": 594
},
{
"epoch": 0.7277167277167277,
"grad_norm": 2.8694047927856445,
"learning_rate": 4.790364220152477e-06,
"loss": 0.9500537514686584,
"step": 596
},
{
"epoch": 0.7301587301587301,
"grad_norm": 3.2333526611328125,
"learning_rate": 4.788445277533902e-06,
"loss": 0.9067592024803162,
"step": 598
},
{
"epoch": 0.7326007326007326,
"grad_norm": 1.7088539600372314,
"learning_rate": 4.786518025327742e-06,
"loss": 1.1730542182922363,
"step": 600
},
{
"epoch": 0.7350427350427351,
"grad_norm": 3.451296091079712,
"learning_rate": 4.7845824713903115e-06,
"loss": 1.319393515586853,
"step": 602
},
{
"epoch": 0.7374847374847375,
"grad_norm": 2.3576159477233887,
"learning_rate": 4.782638623611771e-06,
"loss": 1.1339298486709595,
"step": 604
},
{
"epoch": 0.73992673992674,
"grad_norm": 4.726226329803467,
"learning_rate": 4.780686489916086e-06,
"loss": 1.4150636196136475,
"step": 606
},
{
"epoch": 0.7423687423687424,
"grad_norm": 1.4049737453460693,
"learning_rate": 4.778726078261001e-06,
"loss": 1.265529751777649,
"step": 608
},
{
"epoch": 0.7448107448107448,
"grad_norm": 2.383326768875122,
"learning_rate": 4.776757396638005e-06,
"loss": 0.8798419237136841,
"step": 610
},
{
"epoch": 0.7472527472527473,
"grad_norm": 1.5629072189331055,
"learning_rate": 4.774780453072298e-06,
"loss": 1.2364379167556763,
"step": 612
},
{
"epoch": 0.7496947496947497,
"grad_norm": 1.7620823383331299,
"learning_rate": 4.772795255622761e-06,
"loss": 1.2224982976913452,
"step": 614
},
{
"epoch": 0.7521367521367521,
"grad_norm": 1.2436916828155518,
"learning_rate": 4.770801812381919e-06,
"loss": 0.8438993096351624,
"step": 616
},
{
"epoch": 0.7545787545787546,
"grad_norm": 1.9763593673706055,
"learning_rate": 4.768800131475913e-06,
"loss": 1.5313855409622192,
"step": 618
},
{
"epoch": 0.757020757020757,
"grad_norm": 3.1992554664611816,
"learning_rate": 4.7667902210644616e-06,
"loss": 1.008560061454773,
"step": 620
},
{
"epoch": 0.7594627594627594,
"grad_norm": 7.259356498718262,
"learning_rate": 4.764772089340833e-06,
"loss": 0.9306063652038574,
"step": 622
},
{
"epoch": 0.7619047619047619,
"grad_norm": 1.7845251560211182,
"learning_rate": 4.762745744531808e-06,
"loss": 1.2115577459335327,
"step": 624
},
{
"epoch": 0.7643467643467643,
"grad_norm": 1.6257309913635254,
"learning_rate": 4.760711194897646e-06,
"loss": 1.2677242755889893,
"step": 626
},
{
"epoch": 0.7667887667887668,
"grad_norm": 3.1209113597869873,
"learning_rate": 4.758668448732057e-06,
"loss": 1.0252844095230103,
"step": 628
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.4179201126098633,
"learning_rate": 4.7566175143621575e-06,
"loss": 1.2540860176086426,
"step": 630
},
{
"epoch": 0.7716727716727717,
"grad_norm": 1.1921404600143433,
"learning_rate": 4.754558400148449e-06,
"loss": 0.8723937273025513,
"step": 632
},
{
"epoch": 0.7741147741147741,
"grad_norm": 2.1802115440368652,
"learning_rate": 4.752491114484773e-06,
"loss": 0.8961063623428345,
"step": 634
},
{
"epoch": 0.7765567765567766,
"grad_norm": 1.9183303117752075,
"learning_rate": 4.7504156657982835e-06,
"loss": 1.226144790649414,
"step": 636
},
{
"epoch": 0.778998778998779,
"grad_norm": 1.9927711486816406,
"learning_rate": 4.74833206254941e-06,
"loss": 1.2360224723815918,
"step": 638
},
{
"epoch": 0.7814407814407814,
"grad_norm": 4.131889343261719,
"learning_rate": 4.746240313231823e-06,
"loss": 0.9043057560920715,
"step": 640
},
{
"epoch": 0.7838827838827839,
"grad_norm": 3.734161138534546,
"learning_rate": 4.744140426372401e-06,
"loss": 1.0058786869049072,
"step": 642
},
{
"epoch": 0.7863247863247863,
"grad_norm": 1.5284754037857056,
"learning_rate": 4.742032410531195e-06,
"loss": 1.124707818031311,
"step": 644
},
{
"epoch": 0.7887667887667887,
"grad_norm": 3.012943744659424,
"learning_rate": 4.73991627430139e-06,
"loss": 0.9144378304481506,
"step": 646
},
{
"epoch": 0.7912087912087912,
"grad_norm": 2.2246816158294678,
"learning_rate": 4.737792026309278e-06,
"loss": 1.19635009765625,
"step": 648
},
{
"epoch": 0.7936507936507936,
"grad_norm": 1.9574096202850342,
"learning_rate": 4.735659675214215e-06,
"loss": 0.5257167220115662,
"step": 650
},
{
"epoch": 0.796092796092796,
"grad_norm": 2.1935298442840576,
"learning_rate": 4.7335192297085895e-06,
"loss": 0.7748251557350159,
"step": 652
},
{
"epoch": 0.7985347985347986,
"grad_norm": 2.698744535446167,
"learning_rate": 4.731370698517786e-06,
"loss": 1.1536623239517212,
"step": 654
},
{
"epoch": 0.800976800976801,
"grad_norm": 1.8622996807098389,
"learning_rate": 4.729214090400149e-06,
"loss": 1.200728178024292,
"step": 656
},
{
"epoch": 0.8034188034188035,
"grad_norm": 9.128825187683105,
"learning_rate": 4.727049414146952e-06,
"loss": 0.4623393714427948,
"step": 658
},
{
"epoch": 0.8058608058608059,
"grad_norm": 2.4344100952148438,
"learning_rate": 4.724876678582352e-06,
"loss": 1.0503042936325073,
"step": 660
},
{
"epoch": 0.8083028083028083,
"grad_norm": 1.744651198387146,
"learning_rate": 4.722695892563363e-06,
"loss": 1.1860074996948242,
"step": 662
},
{
"epoch": 0.8107448107448108,
"grad_norm": 1.6168715953826904,
"learning_rate": 4.720507064979816e-06,
"loss": 1.2846834659576416,
"step": 664
},
{
"epoch": 0.8131868131868132,
"grad_norm": 2.017427444458008,
"learning_rate": 4.7183102047543205e-06,
"loss": 0.8671167492866516,
"step": 666
},
{
"epoch": 0.8156288156288156,
"grad_norm": 2.0237629413604736,
"learning_rate": 4.716105320842234e-06,
"loss": 1.0235426425933838,
"step": 668
},
{
"epoch": 0.818070818070818,
"grad_norm": 2.6489999294281006,
"learning_rate": 4.713892422231619e-06,
"loss": 1.3756883144378662,
"step": 670
},
{
"epoch": 0.8205128205128205,
"grad_norm": 1.48908269405365,
"learning_rate": 4.71167151794321e-06,
"loss": 1.2407268285751343,
"step": 672
},
{
"epoch": 0.8229548229548229,
"grad_norm": 1.3047491312026978,
"learning_rate": 4.709442617030379e-06,
"loss": 0.9855388402938843,
"step": 674
},
{
"epoch": 0.8253968253968254,
"grad_norm": 1.8140162229537964,
"learning_rate": 4.707205728579091e-06,
"loss": 0.882321298122406,
"step": 676
},
{
"epoch": 0.8278388278388278,
"grad_norm": 13.20935344696045,
"learning_rate": 4.704960861707875e-06,
"loss": 1.2027504444122314,
"step": 678
},
{
"epoch": 0.8302808302808303,
"grad_norm": 1.8309438228607178,
"learning_rate": 4.702708025567784e-06,
"loss": 1.2264920473098755,
"step": 680
},
{
"epoch": 0.8327228327228328,
"grad_norm": 5.225240707397461,
"learning_rate": 4.700447229342353e-06,
"loss": 1.1251945495605469,
"step": 682
},
{
"epoch": 0.8351648351648352,
"grad_norm": 12.371641159057617,
"learning_rate": 4.698178482247571e-06,
"loss": 0.6810005307197571,
"step": 684
},
{
"epoch": 0.8376068376068376,
"grad_norm": 1.8889416456222534,
"learning_rate": 4.695901793531834e-06,
"loss": 1.325577974319458,
"step": 686
},
{
"epoch": 0.8400488400488401,
"grad_norm": 2.4011776447296143,
"learning_rate": 4.693617172475914e-06,
"loss": 1.2832276821136475,
"step": 688
},
{
"epoch": 0.8424908424908425,
"grad_norm": 6.96901273727417,
"learning_rate": 4.691324628392918e-06,
"loss": 0.9534074664115906,
"step": 690
},
{
"epoch": 0.8449328449328449,
"grad_norm": 3.2731733322143555,
"learning_rate": 4.68902417062825e-06,
"loss": 1.2452714443206787,
"step": 692
},
{
"epoch": 0.8473748473748474,
"grad_norm": 1.0289312601089478,
"learning_rate": 4.686715808559575e-06,
"loss": 0.9713150858879089,
"step": 694
},
{
"epoch": 0.8498168498168498,
"grad_norm": 4.545189380645752,
"learning_rate": 4.684399551596778e-06,
"loss": 1.130218744277954,
"step": 696
},
{
"epoch": 0.8522588522588522,
"grad_norm": 87.15223693847656,
"learning_rate": 4.682075409181928e-06,
"loss": 0.9914512634277344,
"step": 698
},
{
"epoch": 0.8547008547008547,
"grad_norm": 1.9779729843139648,
"learning_rate": 4.6797433907892385e-06,
"loss": 1.0588513612747192,
"step": 700
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.3967251777648926,
"learning_rate": 4.677403505925027e-06,
"loss": 1.0360347032546997,
"step": 702
},
{
"epoch": 0.8595848595848596,
"grad_norm": 2.3286092281341553,
"learning_rate": 4.6750557641276805e-06,
"loss": 0.9465680122375488,
"step": 704
},
{
"epoch": 0.8620268620268621,
"grad_norm": 1.6770824193954468,
"learning_rate": 4.672700174967613e-06,
"loss": 0.4001966118812561,
"step": 706
},
{
"epoch": 0.8644688644688645,
"grad_norm": 1.5843091011047363,
"learning_rate": 4.6703367480472304e-06,
"loss": 1.1794531345367432,
"step": 708
},
{
"epoch": 0.8669108669108669,
"grad_norm": 7.7685017585754395,
"learning_rate": 4.667965493000883e-06,
"loss": 0.8692483901977539,
"step": 710
},
{
"epoch": 0.8693528693528694,
"grad_norm": 7.2869977951049805,
"learning_rate": 4.665586419494837e-06,
"loss": 1.3455827236175537,
"step": 712
},
{
"epoch": 0.8717948717948718,
"grad_norm": 2.4024009704589844,
"learning_rate": 4.66319953722723e-06,
"loss": 1.53640615940094,
"step": 714
},
{
"epoch": 0.8742368742368742,
"grad_norm": 1.8035765886306763,
"learning_rate": 4.660804855928029e-06,
"loss": 1.400252103805542,
"step": 716
},
{
"epoch": 0.8766788766788767,
"grad_norm": 2.3839004039764404,
"learning_rate": 4.658402385358992e-06,
"loss": 0.88499915599823,
"step": 718
},
{
"epoch": 0.8791208791208791,
"grad_norm": 2.779142141342163,
"learning_rate": 4.655992135313634e-06,
"loss": 1.1850217580795288,
"step": 720
},
{
"epoch": 0.8815628815628815,
"grad_norm": 5.73760986328125,
"learning_rate": 4.6535741156171796e-06,
"loss": 0.8437918424606323,
"step": 722
},
{
"epoch": 0.884004884004884,
"grad_norm": 2.219005584716797,
"learning_rate": 4.651148336126527e-06,
"loss": 1.2156010866165161,
"step": 724
},
{
"epoch": 0.8864468864468864,
"grad_norm": 2.8996713161468506,
"learning_rate": 4.6487148067302065e-06,
"loss": 1.1615610122680664,
"step": 726
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.5215322971343994,
"learning_rate": 4.646273537348337e-06,
"loss": 1.15150785446167,
"step": 728
},
{
"epoch": 0.8913308913308914,
"grad_norm": 1.62069833278656,
"learning_rate": 4.643824537932595e-06,
"loss": 1.3772497177124023,
"step": 730
},
{
"epoch": 0.8937728937728938,
"grad_norm": 5.082972526550293,
"learning_rate": 4.641367818466164e-06,
"loss": 1.1609463691711426,
"step": 732
},
{
"epoch": 0.8962148962148963,
"grad_norm": 2.289997100830078,
"learning_rate": 4.6389033889637e-06,
"loss": 0.9794567227363586,
"step": 734
},
{
"epoch": 0.8986568986568987,
"grad_norm": 1.610153317451477,
"learning_rate": 4.636431259471284e-06,
"loss": 0.7476667165756226,
"step": 736
},
{
"epoch": 0.9010989010989011,
"grad_norm": 3.150763511657715,
"learning_rate": 4.633951440066391e-06,
"loss": 0.8690844774246216,
"step": 738
},
{
"epoch": 0.9035409035409036,
"grad_norm": 4.614825248718262,
"learning_rate": 4.631463940857841e-06,
"loss": 1.0103671550750732,
"step": 740
},
{
"epoch": 0.905982905982906,
"grad_norm": 1.2555079460144043,
"learning_rate": 4.6289687719857595e-06,
"loss": 0.873469352722168,
"step": 742
},
{
"epoch": 0.9084249084249084,
"grad_norm": 1.3857243061065674,
"learning_rate": 4.626465943621538e-06,
"loss": 0.8869038224220276,
"step": 744
},
{
"epoch": 0.9108669108669109,
"grad_norm": 8.226287841796875,
"learning_rate": 4.623955465967791e-06,
"loss": 1.1199119091033936,
"step": 746
},
{
"epoch": 0.9133089133089133,
"grad_norm": 2.548466920852661,
"learning_rate": 4.621437349258316e-06,
"loss": 0.8345762491226196,
"step": 748
},
{
"epoch": 0.9157509157509157,
"grad_norm": 1.7079286575317383,
"learning_rate": 4.618911603758047e-06,
"loss": 1.3368088006973267,
"step": 750
},
{
"epoch": 0.9181929181929182,
"grad_norm": 1.5761820077896118,
"learning_rate": 4.616378239763021e-06,
"loss": 1.2190864086151123,
"step": 752
},
{
"epoch": 0.9206349206349206,
"grad_norm": 1.6565887928009033,
"learning_rate": 4.613837267600328e-06,
"loss": 1.2295691967010498,
"step": 754
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.4082704782485962,
"learning_rate": 4.611288697628074e-06,
"loss": 1.2072789669036865,
"step": 756
},
{
"epoch": 0.9255189255189256,
"grad_norm": 2.3025150299072266,
"learning_rate": 4.608732540235336e-06,
"loss": 1.2459933757781982,
"step": 758
},
{
"epoch": 0.927960927960928,
"grad_norm": 2.0221993923187256,
"learning_rate": 4.60616880584212e-06,
"loss": 0.7390088438987732,
"step": 760
},
{
"epoch": 0.9304029304029304,
"grad_norm": 3.3428783416748047,
"learning_rate": 4.603597504899322e-06,
"loss": 1.0999096632003784,
"step": 762
},
{
"epoch": 0.9328449328449329,
"grad_norm": 1.6523158550262451,
"learning_rate": 4.601018647888677e-06,
"loss": 0.8729748129844666,
"step": 764
},
{
"epoch": 0.9352869352869353,
"grad_norm": 62.29137420654297,
"learning_rate": 4.598432245322729e-06,
"loss": 0.5466877818107605,
"step": 766
},
{
"epoch": 0.9377289377289377,
"grad_norm": 1.815721035003662,
"learning_rate": 4.595838307744775e-06,
"loss": 0.8582451939582825,
"step": 768
},
{
"epoch": 0.9401709401709402,
"grad_norm": 1.5688364505767822,
"learning_rate": 4.593236845728832e-06,
"loss": 0.7072654962539673,
"step": 770
},
{
"epoch": 0.9426129426129426,
"grad_norm": 1.4338949918746948,
"learning_rate": 4.590627869879586e-06,
"loss": 1.053293228149414,
"step": 772
},
{
"epoch": 0.945054945054945,
"grad_norm": 2.2348110675811768,
"learning_rate": 4.588011390832357e-06,
"loss": 1.2199137210845947,
"step": 774
},
{
"epoch": 0.9474969474969475,
"grad_norm": 2.0182247161865234,
"learning_rate": 4.585387419253048e-06,
"loss": 1.1575353145599365,
"step": 776
},
{
"epoch": 0.9499389499389499,
"grad_norm": 2.0367462635040283,
"learning_rate": 4.582755965838105e-06,
"loss": 0.842775821685791,
"step": 778
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.4695345163345337,
"learning_rate": 4.580117041314476e-06,
"loss": 0.9546113610267639,
"step": 780
},
{
"epoch": 0.9548229548229549,
"grad_norm": 4.101283550262451,
"learning_rate": 4.577470656439562e-06,
"loss": 1.0789297819137573,
"step": 782
},
{
"epoch": 0.9572649572649573,
"grad_norm": 1.6646188497543335,
"learning_rate": 4.574816822001175e-06,
"loss": 1.2004691362380981,
"step": 784
},
{
"epoch": 0.9597069597069597,
"grad_norm": 1.4552080631256104,
"learning_rate": 4.572155548817498e-06,
"loss": 1.2365154027938843,
"step": 786
},
{
"epoch": 0.9621489621489622,
"grad_norm": 2.4475789070129395,
"learning_rate": 4.5694868477370325e-06,
"loss": 1.330816388130188,
"step": 788
},
{
"epoch": 0.9645909645909646,
"grad_norm": 2.0710349082946777,
"learning_rate": 4.566810729638565e-06,
"loss": 1.2555092573165894,
"step": 790
},
{
"epoch": 0.967032967032967,
"grad_norm": 1.5041587352752686,
"learning_rate": 4.564127205431112e-06,
"loss": 1.0104345083236694,
"step": 792
},
{
"epoch": 0.9694749694749695,
"grad_norm": 1.5308382511138916,
"learning_rate": 4.5614362860538855e-06,
"loss": 1.2593212127685547,
"step": 794
},
{
"epoch": 0.9719169719169719,
"grad_norm": 1.3870609998703003,
"learning_rate": 4.558737982476238e-06,
"loss": 1.1537375450134277,
"step": 796
},
{
"epoch": 0.9743589743589743,
"grad_norm": 1.61140775680542,
"learning_rate": 4.556032305697628e-06,
"loss": 1.154402732849121,
"step": 798
},
{
"epoch": 0.9768009768009768,
"grad_norm": 2.04144024848938,
"learning_rate": 4.553319266747566e-06,
"loss": 1.2140703201293945,
"step": 800
},
{
"epoch": 0.9792429792429792,
"grad_norm": 2.3697726726531982,
"learning_rate": 4.550598876685578e-06,
"loss": 0.895045280456543,
"step": 802
},
{
"epoch": 0.9816849816849816,
"grad_norm": 1.659780502319336,
"learning_rate": 4.547871146601154e-06,
"loss": 1.1046396493911743,
"step": 804
},
{
"epoch": 0.9841269841269841,
"grad_norm": 2.151529312133789,
"learning_rate": 4.545136087613705e-06,
"loss": 1.5022149085998535,
"step": 806
},
{
"epoch": 0.9865689865689866,
"grad_norm": 2.602121353149414,
"learning_rate": 4.5423937108725195e-06,
"loss": 0.8793852925300598,
"step": 808
},
{
"epoch": 0.989010989010989,
"grad_norm": 3.102078914642334,
"learning_rate": 4.5396440275567135e-06,
"loss": 1.256363034248352,
"step": 810
},
{
"epoch": 0.9914529914529915,
"grad_norm": 2.6925265789031982,
"learning_rate": 4.536887048875191e-06,
"loss": 1.2054369449615479,
"step": 812
},
{
"epoch": 0.9938949938949939,
"grad_norm": 2.5501744747161865,
"learning_rate": 4.5341227860665935e-06,
"loss": 1.278929591178894,
"step": 814
},
{
"epoch": 0.9963369963369964,
"grad_norm": 1.176234483718872,
"learning_rate": 4.531351250399254e-06,
"loss": 1.1633700132369995,
"step": 816
},
{
"epoch": 0.9987789987789988,
"grad_norm": 1.4937130212783813,
"learning_rate": 4.5285724531711575e-06,
"loss": 1.0776214599609375,
"step": 818
},
{
"epoch": 1.0012210012210012,
"grad_norm": 3.934837818145752,
"learning_rate": 4.525786405709885e-06,
"loss": 0.9735159873962402,
"step": 820
},
{
"epoch": 1.0036630036630036,
"grad_norm": 1.3186302185058594,
"learning_rate": 4.5229931193725775e-06,
"loss": 1.1400266885757446,
"step": 822
},
{
"epoch": 1.006105006105006,
"grad_norm": 3.5487184524536133,
"learning_rate": 4.520192605545879e-06,
"loss": 0.522385835647583,
"step": 824
},
{
"epoch": 1.0085470085470085,
"grad_norm": 1.5596842765808105,
"learning_rate": 4.517384875645903e-06,
"loss": 1.0808534622192383,
"step": 826
},
{
"epoch": 1.010989010989011,
"grad_norm": 12.584797859191895,
"learning_rate": 4.514569941118172e-06,
"loss": 0.8573816418647766,
"step": 828
},
{
"epoch": 1.0134310134310134,
"grad_norm": 2.6060709953308105,
"learning_rate": 4.511747813437582e-06,
"loss": 0.8253161907196045,
"step": 830
},
{
"epoch": 1.0158730158730158,
"grad_norm": 6.2511420249938965,
"learning_rate": 4.50891850410835e-06,
"loss": 0.8468748331069946,
"step": 832
},
{
"epoch": 1.0183150183150182,
"grad_norm": 1.8716773986816406,
"learning_rate": 4.506082024663969e-06,
"loss": 0.833984375,
"step": 834
},
{
"epoch": 1.0207570207570207,
"grad_norm": 2.3107144832611084,
"learning_rate": 4.503238386667159e-06,
"loss": 1.1121944189071655,
"step": 836
},
{
"epoch": 1.0231990231990231,
"grad_norm": 1.7429516315460205,
"learning_rate": 4.500387601709824e-06,
"loss": 1.1830196380615234,
"step": 838
},
{
"epoch": 1.0256410256410255,
"grad_norm": 2.6718502044677734,
"learning_rate": 4.497529681413001e-06,
"loss": 0.8847705125808716,
"step": 840
},
{
"epoch": 1.028083028083028,
"grad_norm": 3.5513010025024414,
"learning_rate": 4.4946646374268105e-06,
"loss": 0.9079216122627258,
"step": 842
},
{
"epoch": 1.0305250305250304,
"grad_norm": 1.6211357116699219,
"learning_rate": 4.491792481430419e-06,
"loss": 1.1324057579040527,
"step": 844
},
{
"epoch": 1.032967032967033,
"grad_norm": 4.688745975494385,
"learning_rate": 4.488913225131977e-06,
"loss": 0.8587746620178223,
"step": 846
},
{
"epoch": 1.0354090354090355,
"grad_norm": 1.1522576808929443,
"learning_rate": 4.4860268802685865e-06,
"loss": 1.1451654434204102,
"step": 848
},
{
"epoch": 1.037851037851038,
"grad_norm": 8.89565658569336,
"learning_rate": 4.483133458606239e-06,
"loss": 0.93172687292099,
"step": 850
},
{
"epoch": 1.0402930402930404,
"grad_norm": 2.5761866569519043,
"learning_rate": 4.480232971939777e-06,
"loss": 0.6277377605438232,
"step": 852
},
{
"epoch": 1.0427350427350428,
"grad_norm": 1.6280884742736816,
"learning_rate": 4.477325432092845e-06,
"loss": 1.103888750076294,
"step": 854
},
{
"epoch": 1.0451770451770452,
"grad_norm": 6.281750202178955,
"learning_rate": 4.474410850917835e-06,
"loss": 0.8592535257339478,
"step": 856
},
{
"epoch": 1.0476190476190477,
"grad_norm": 2.389967203140259,
"learning_rate": 4.471489240295845e-06,
"loss": 1.1699893474578857,
"step": 858
},
{
"epoch": 1.05006105006105,
"grad_norm": 1.8374966382980347,
"learning_rate": 4.4685606121366295e-06,
"loss": 0.7583469748497009,
"step": 860
},
{
"epoch": 1.0525030525030525,
"grad_norm": 1.7573800086975098,
"learning_rate": 4.4656249783785465e-06,
"loss": 1.051936149597168,
"step": 862
},
{
"epoch": 1.054945054945055,
"grad_norm": 3.5590884685516357,
"learning_rate": 4.462682350988513e-06,
"loss": 1.1263583898544312,
"step": 864
},
{
"epoch": 1.0573870573870574,
"grad_norm": 2.979887008666992,
"learning_rate": 4.459732741961957e-06,
"loss": 1.4335747957229614,
"step": 866
},
{
"epoch": 1.0598290598290598,
"grad_norm": 2.712446689605713,
"learning_rate": 4.456776163322761e-06,
"loss": 0.39710327982902527,
"step": 868
},
{
"epoch": 1.0622710622710623,
"grad_norm": 1.8534517288208008,
"learning_rate": 4.453812627123227e-06,
"loss": 0.9377206563949585,
"step": 870
},
{
"epoch": 1.0647130647130647,
"grad_norm": 1.9186785221099854,
"learning_rate": 4.450842145444012e-06,
"loss": 1.0142245292663574,
"step": 872
},
{
"epoch": 1.0671550671550671,
"grad_norm": 1.7083848714828491,
"learning_rate": 4.4478647303940905e-06,
"loss": 0.7915772199630737,
"step": 874
},
{
"epoch": 1.0695970695970696,
"grad_norm": 2.7380499839782715,
"learning_rate": 4.4448803941106964e-06,
"loss": 1.10654878616333,
"step": 876
},
{
"epoch": 1.072039072039072,
"grad_norm": 2.324699878692627,
"learning_rate": 4.44188914875928e-06,
"loss": 1.0548545122146606,
"step": 878
},
{
"epoch": 1.0744810744810744,
"grad_norm": 1.84871506690979,
"learning_rate": 4.438891006533456e-06,
"loss": 0.747241735458374,
"step": 880
},
{
"epoch": 1.0769230769230769,
"grad_norm": 1.8665426969528198,
"learning_rate": 4.435885979654953e-06,
"loss": 1.0984582901000977,
"step": 882
},
{
"epoch": 1.0793650793650793,
"grad_norm": 3.5970113277435303,
"learning_rate": 4.432874080373565e-06,
"loss": 0.7559424638748169,
"step": 884
},
{
"epoch": 1.0818070818070817,
"grad_norm": 8.55659294128418,
"learning_rate": 4.4298553209671e-06,
"loss": 0.6807610988616943,
"step": 886
},
{
"epoch": 1.0842490842490842,
"grad_norm": 2.020759344100952,
"learning_rate": 4.426829713741332e-06,
"loss": 1.144335389137268,
"step": 888
},
{
"epoch": 1.0866910866910866,
"grad_norm": 1.3028897047042847,
"learning_rate": 4.4237972710299475e-06,
"loss": 0.8821287751197815,
"step": 890
},
{
"epoch": 1.089133089133089,
"grad_norm": 4.502945899963379,
"learning_rate": 4.420758005194502e-06,
"loss": 1.0961552858352661,
"step": 892
},
{
"epoch": 1.0915750915750915,
"grad_norm": 2.57550048828125,
"learning_rate": 4.417711928624358e-06,
"loss": 1.079803705215454,
"step": 894
},
{
"epoch": 1.0940170940170941,
"grad_norm": 1.6765140295028687,
"learning_rate": 4.41465905373665e-06,
"loss": 1.0386958122253418,
"step": 896
},
{
"epoch": 1.0964590964590966,
"grad_norm": 1.710028052330017,
"learning_rate": 4.411599392976217e-06,
"loss": 0.8400865793228149,
"step": 898
},
{
"epoch": 1.098901098901099,
"grad_norm": 1.6493473052978516,
"learning_rate": 4.408532958815566e-06,
"loss": 0.7645131945610046,
"step": 900
},
{
"epoch": 1.1013431013431014,
"grad_norm": 5.944808006286621,
"learning_rate": 4.405459763754814e-06,
"loss": 0.5732899904251099,
"step": 902
},
{
"epoch": 1.1037851037851039,
"grad_norm": 2.6218488216400146,
"learning_rate": 4.402379820321636e-06,
"loss": 0.6524146199226379,
"step": 904
},
{
"epoch": 1.1062271062271063,
"grad_norm": 6.440639972686768,
"learning_rate": 4.399293141071219e-06,
"loss": 1.0054997205734253,
"step": 906
},
{
"epoch": 1.1086691086691087,
"grad_norm": 2.3345463275909424,
"learning_rate": 4.396199738586208e-06,
"loss": 1.0879311561584473,
"step": 908
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.7419912815093994,
"learning_rate": 4.393099625476652e-06,
"loss": 0.7915565371513367,
"step": 910
},
{
"epoch": 1.1135531135531136,
"grad_norm": 7.168088912963867,
"learning_rate": 4.389992814379959e-06,
"loss": 1.140365719795227,
"step": 912
},
{
"epoch": 1.115995115995116,
"grad_norm": 4.943615913391113,
"learning_rate": 4.386879317960839e-06,
"loss": 0.7865337133407593,
"step": 914
},
{
"epoch": 1.1184371184371185,
"grad_norm": 2.3716390132904053,
"learning_rate": 4.383759148911254e-06,
"loss": 0.8161624670028687,
"step": 916
},
{
"epoch": 1.120879120879121,
"grad_norm": 1.9857978820800781,
"learning_rate": 4.380632319950368e-06,
"loss": 1.118779182434082,
"step": 918
},
{
"epoch": 1.1233211233211233,
"grad_norm": 1.886316180229187,
"learning_rate": 4.377498843824491e-06,
"loss": 1.064637541770935,
"step": 920
},
{
"epoch": 1.1257631257631258,
"grad_norm": 1.9790380001068115,
"learning_rate": 4.374358733307035e-06,
"loss": 0.8831157684326172,
"step": 922
},
{
"epoch": 1.1282051282051282,
"grad_norm": 2.9242825508117676,
"learning_rate": 4.37121200119845e-06,
"loss": 0.7275701761245728,
"step": 924
},
{
"epoch": 1.1306471306471306,
"grad_norm": 3.6830453872680664,
"learning_rate": 4.368058660326182e-06,
"loss": 0.6424413323402405,
"step": 926
},
{
"epoch": 1.133089133089133,
"grad_norm": 23.97179412841797,
"learning_rate": 4.364898723544618e-06,
"loss": 0.5658762454986572,
"step": 928
},
{
"epoch": 1.1355311355311355,
"grad_norm": 1.438925862312317,
"learning_rate": 4.361732203735032e-06,
"loss": 1.0492331981658936,
"step": 930
},
{
"epoch": 1.137973137973138,
"grad_norm": 2.813020706176758,
"learning_rate": 4.358559113805531e-06,
"loss": 1.0911149978637695,
"step": 932
},
{
"epoch": 1.1404151404151404,
"grad_norm": 3.2957863807678223,
"learning_rate": 4.355379466691008e-06,
"loss": 0.9345967769622803,
"step": 934
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.7585479021072388,
"learning_rate": 4.3521932753530856e-06,
"loss": 0.9201436042785645,
"step": 936
},
{
"epoch": 1.1452991452991452,
"grad_norm": 3.857144594192505,
"learning_rate": 4.34900055278006e-06,
"loss": 1.0130703449249268,
"step": 938
},
{
"epoch": 1.1477411477411477,
"grad_norm": 1.5104448795318604,
"learning_rate": 4.345801311986855e-06,
"loss": 1.0904299020767212,
"step": 940
},
{
"epoch": 1.15018315018315,
"grad_norm": 1.7477518320083618,
"learning_rate": 4.342595566014965e-06,
"loss": 1.0437507629394531,
"step": 942
},
{
"epoch": 1.1526251526251525,
"grad_norm": 2.9423069953918457,
"learning_rate": 4.339383327932402e-06,
"loss": 0.820652425289154,
"step": 944
},
{
"epoch": 1.155067155067155,
"grad_norm": 1.767242670059204,
"learning_rate": 4.33616461083364e-06,
"loss": 0.788296103477478,
"step": 946
},
{
"epoch": 1.1575091575091574,
"grad_norm": 10.629752159118652,
"learning_rate": 4.33293942783957e-06,
"loss": 0.6116584539413452,
"step": 948
},
{
"epoch": 1.1599511599511598,
"grad_norm": 3.033034086227417,
"learning_rate": 4.329707792097436e-06,
"loss": 1.0888707637786865,
"step": 950
},
{
"epoch": 1.1623931623931625,
"grad_norm": 1.710669755935669,
"learning_rate": 4.326469716780787e-06,
"loss": 1.091694712638855,
"step": 952
},
{
"epoch": 1.164835164835165,
"grad_norm": 3.053687572479248,
"learning_rate": 4.323225215089425e-06,
"loss": 0.9434468746185303,
"step": 954
},
{
"epoch": 1.1672771672771673,
"grad_norm": 1.6665173768997192,
"learning_rate": 4.319974300249346e-06,
"loss": 0.8895185589790344,
"step": 956
},
{
"epoch": 1.1697191697191698,
"grad_norm": 5.074321269989014,
"learning_rate": 4.3167169855126885e-06,
"loss": 0.7732067108154297,
"step": 958
},
{
"epoch": 1.1721611721611722,
"grad_norm": 3.9608309268951416,
"learning_rate": 4.313453284157683e-06,
"loss": 1.162553071975708,
"step": 960
},
{
"epoch": 1.1746031746031746,
"grad_norm": 2.7939271926879883,
"learning_rate": 4.310183209488592e-06,
"loss": 0.9017969369888306,
"step": 962
},
{
"epoch": 1.177045177045177,
"grad_norm": 2.613326072692871,
"learning_rate": 4.306906774835658e-06,
"loss": 0.7510517835617065,
"step": 964
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.82303386926651,
"learning_rate": 4.303623993555051e-06,
"loss": 1.073706030845642,
"step": 966
},
{
"epoch": 1.181929181929182,
"grad_norm": 4.074831962585449,
"learning_rate": 4.300334879028813e-06,
"loss": 1.135977029800415,
"step": 968
},
{
"epoch": 1.1843711843711844,
"grad_norm": 3.945430040359497,
"learning_rate": 4.2970394446648015e-06,
"loss": 0.7781526446342468,
"step": 970
},
{
"epoch": 1.1868131868131868,
"grad_norm": 1.8157840967178345,
"learning_rate": 4.293737703896636e-06,
"loss": 1.197265625,
"step": 972
},
{
"epoch": 1.1892551892551892,
"grad_norm": 2.2199547290802,
"learning_rate": 4.290429670183648e-06,
"loss": 0.8672367334365845,
"step": 974
},
{
"epoch": 1.1916971916971917,
"grad_norm": 11.338154792785645,
"learning_rate": 4.287115357010816e-06,
"loss": 0.7362724542617798,
"step": 976
},
{
"epoch": 1.1941391941391941,
"grad_norm": 1.7665647268295288,
"learning_rate": 4.283794777888718e-06,
"loss": 0.8837488293647766,
"step": 978
},
{
"epoch": 1.1965811965811965,
"grad_norm": 1.333461880683899,
"learning_rate": 4.280467946353478e-06,
"loss": 1.094375491142273,
"step": 980
},
{
"epoch": 1.199023199023199,
"grad_norm": 1.3403749465942383,
"learning_rate": 4.277134875966703e-06,
"loss": 1.0798702239990234,
"step": 982
},
{
"epoch": 1.2014652014652014,
"grad_norm": 1.700862169265747,
"learning_rate": 4.273795580315437e-06,
"loss": 1.195528507232666,
"step": 984
},
{
"epoch": 1.2039072039072038,
"grad_norm": 8.78385066986084,
"learning_rate": 4.270450073012095e-06,
"loss": 0.7649343013763428,
"step": 986
},
{
"epoch": 1.2063492063492063,
"grad_norm": 67.73321533203125,
"learning_rate": 4.267098367694419e-06,
"loss": 0.7331146001815796,
"step": 988
},
{
"epoch": 1.2087912087912087,
"grad_norm": 3.189149856567383,
"learning_rate": 4.263740478025412e-06,
"loss": 0.8756888508796692,
"step": 990
},
{
"epoch": 1.2112332112332111,
"grad_norm": 4.3046770095825195,
"learning_rate": 4.2603764176932925e-06,
"loss": 1.1108595132827759,
"step": 992
},
{
"epoch": 1.2136752136752136,
"grad_norm": 2.0171821117401123,
"learning_rate": 4.257006200411429e-06,
"loss": 1.0103721618652344,
"step": 994
},
{
"epoch": 1.2161172161172162,
"grad_norm": 2.713459014892578,
"learning_rate": 4.25362983991829e-06,
"loss": 0.9784596562385559,
"step": 996
},
{
"epoch": 1.2185592185592187,
"grad_norm": 1.9199753999710083,
"learning_rate": 4.250247349977385e-06,
"loss": 1.062201738357544,
"step": 998
},
{
"epoch": 1.221001221001221,
"grad_norm": 11.171542167663574,
"learning_rate": 4.246858744377212e-06,
"loss": 0.744211733341217,
"step": 1000
},
{
"epoch": 1.2234432234432235,
"grad_norm": 2.0410537719726562,
"learning_rate": 4.243464036931198e-06,
"loss": 1.0498521327972412,
"step": 1002
},
{
"epoch": 1.225885225885226,
"grad_norm": 1.5847947597503662,
"learning_rate": 4.240063241477643e-06,
"loss": 1.1089041233062744,
"step": 1004
},
{
"epoch": 1.2283272283272284,
"grad_norm": 3.658682346343994,
"learning_rate": 4.2366563718796664e-06,
"loss": 0.8046331405639648,
"step": 1006
},
{
"epoch": 1.2307692307692308,
"grad_norm": 1.940625548362732,
"learning_rate": 4.233243442025145e-06,
"loss": 0.7440409064292908,
"step": 1008
},
{
"epoch": 1.2332112332112333,
"grad_norm": 1.9868489503860474,
"learning_rate": 4.229824465826665e-06,
"loss": 1.144100308418274,
"step": 1010
},
{
"epoch": 1.2356532356532357,
"grad_norm": 1.2745945453643799,
"learning_rate": 4.226399457221454e-06,
"loss": 0.6603936553001404,
"step": 1012
},
{
"epoch": 1.2380952380952381,
"grad_norm": 1.5920745134353638,
"learning_rate": 4.222968430171336e-06,
"loss": 1.1303434371948242,
"step": 1014
},
{
"epoch": 1.2405372405372406,
"grad_norm": 4.021664619445801,
"learning_rate": 4.219531398662665e-06,
"loss": 1.0450407266616821,
"step": 1016
},
{
"epoch": 1.242979242979243,
"grad_norm": 1.6237807273864746,
"learning_rate": 4.216088376706274e-06,
"loss": 1.0899841785430908,
"step": 1018
},
{
"epoch": 1.2454212454212454,
"grad_norm": 2.2023823261260986,
"learning_rate": 4.212639378337413e-06,
"loss": 0.7024634480476379,
"step": 1020
},
{
"epoch": 1.2478632478632479,
"grad_norm": 8.069097518920898,
"learning_rate": 4.209184417615697e-06,
"loss": 0.9512033462524414,
"step": 1022
},
{
"epoch": 1.2503052503052503,
"grad_norm": 1.6683331727981567,
"learning_rate": 4.2057235086250455e-06,
"loss": 1.052414059638977,
"step": 1024
},
{
"epoch": 1.2527472527472527,
"grad_norm": 3.130899667739868,
"learning_rate": 4.2022566654736255e-06,
"loss": 1.0695925951004028,
"step": 1026
},
{
"epoch": 1.2551892551892552,
"grad_norm": 1.8631014823913574,
"learning_rate": 4.198783902293794e-06,
"loss": 0.9780709147453308,
"step": 1028
},
{
"epoch": 1.2576312576312576,
"grad_norm": 2.728553295135498,
"learning_rate": 4.1953052332420415e-06,
"loss": 0.9186390042304993,
"step": 1030
},
{
"epoch": 1.26007326007326,
"grad_norm": 1.7069987058639526,
"learning_rate": 4.191820672498931e-06,
"loss": 1.138177514076233,
"step": 1032
},
{
"epoch": 1.2625152625152625,
"grad_norm": 3.96309494972229,
"learning_rate": 4.188330234269046e-06,
"loss": 1.230303406715393,
"step": 1034
},
{
"epoch": 1.264957264957265,
"grad_norm": 0.6388441920280457,
"learning_rate": 4.184833932780927e-06,
"loss": 0.7601897716522217,
"step": 1036
},
{
"epoch": 1.2673992673992673,
"grad_norm": 2.074471950531006,
"learning_rate": 4.181331782287015e-06,
"loss": 0.6320565938949585,
"step": 1038
},
{
"epoch": 1.2698412698412698,
"grad_norm": 1.3992935419082642,
"learning_rate": 4.177823797063597e-06,
"loss": 0.7402109503746033,
"step": 1040
},
{
"epoch": 1.2722832722832722,
"grad_norm": 1.8529661893844604,
"learning_rate": 4.174309991410742e-06,
"loss": 1.1013227701187134,
"step": 1042
},
{
"epoch": 1.2747252747252746,
"grad_norm": 1.740545392036438,
"learning_rate": 4.1707903796522474e-06,
"loss": 0.9940573573112488,
"step": 1044
},
{
"epoch": 1.277167277167277,
"grad_norm": 3.5190329551696777,
"learning_rate": 4.1672649761355785e-06,
"loss": 1.0399502515792847,
"step": 1046
},
{
"epoch": 1.2796092796092795,
"grad_norm": 3.40808367729187,
"learning_rate": 4.163733795231808e-06,
"loss": 0.8423551321029663,
"step": 1048
},
{
"epoch": 1.282051282051282,
"grad_norm": 8.643896102905273,
"learning_rate": 4.160196851335564e-06,
"loss": 0.3857470452785492,
"step": 1050
},
{
"epoch": 1.2844932844932844,
"grad_norm": 2.840670347213745,
"learning_rate": 4.156654158864964e-06,
"loss": 1.0681036710739136,
"step": 1052
},
{
"epoch": 1.2869352869352868,
"grad_norm": 3.3994009494781494,
"learning_rate": 4.15310573226156e-06,
"loss": 0.86181640625,
"step": 1054
},
{
"epoch": 1.2893772893772895,
"grad_norm": 5.254836559295654,
"learning_rate": 4.149551585990277e-06,
"loss": 0.7644107937812805,
"step": 1056
},
{
"epoch": 1.291819291819292,
"grad_norm": 2.2039105892181396,
"learning_rate": 4.1459917345393614e-06,
"loss": 1.2520135641098022,
"step": 1058
},
{
"epoch": 1.2942612942612943,
"grad_norm": 1.7039287090301514,
"learning_rate": 4.142426192420308e-06,
"loss": 1.0944513082504272,
"step": 1060
},
{
"epoch": 1.2967032967032968,
"grad_norm": 4.587660789489746,
"learning_rate": 4.138854974167818e-06,
"loss": 0.6725199222564697,
"step": 1062
},
{
"epoch": 1.2991452991452992,
"grad_norm": 1.5989353656768799,
"learning_rate": 4.135278094339725e-06,
"loss": 1.1340867280960083,
"step": 1064
},
{
"epoch": 1.3015873015873016,
"grad_norm": 2.533905029296875,
"learning_rate": 4.131695567516943e-06,
"loss": 0.7726882100105286,
"step": 1066
},
{
"epoch": 1.304029304029304,
"grad_norm": 1.1167593002319336,
"learning_rate": 4.1281074083034065e-06,
"loss": 1.0258402824401855,
"step": 1068
},
{
"epoch": 1.3064713064713065,
"grad_norm": 8.628117561340332,
"learning_rate": 4.12451363132601e-06,
"loss": 0.8221207857131958,
"step": 1070
},
{
"epoch": 1.308913308913309,
"grad_norm": 3.239126443862915,
"learning_rate": 4.120914251234548e-06,
"loss": 1.0316239595413208,
"step": 1072
},
{
"epoch": 1.3113553113553114,
"grad_norm": 3.97194504737854,
"learning_rate": 4.117309282701655e-06,
"loss": 0.7956058382987976,
"step": 1074
},
{
"epoch": 1.3137973137973138,
"grad_norm": 2.8797948360443115,
"learning_rate": 4.1136987404227476e-06,
"loss": 0.7710628509521484,
"step": 1076
},
{
"epoch": 1.3162393162393162,
"grad_norm": 6.195582389831543,
"learning_rate": 4.110082639115963e-06,
"loss": 1.073829174041748,
"step": 1078
},
{
"epoch": 1.3186813186813187,
"grad_norm": 1.3067351579666138,
"learning_rate": 4.106460993522101e-06,
"loss": 0.9566723108291626,
"step": 1080
},
{
"epoch": 1.321123321123321,
"grad_norm": 2.2232918739318848,
"learning_rate": 4.102833818404557e-06,
"loss": 0.9678391218185425,
"step": 1082
},
{
"epoch": 1.3235653235653235,
"grad_norm": 2.109621047973633,
"learning_rate": 4.099201128549275e-06,
"loss": 1.1640703678131104,
"step": 1084
},
{
"epoch": 1.326007326007326,
"grad_norm": 4.480690956115723,
"learning_rate": 4.095562938764672e-06,
"loss": 1.0956099033355713,
"step": 1086
},
{
"epoch": 1.3284493284493284,
"grad_norm": 2.0747313499450684,
"learning_rate": 4.091919263881592e-06,
"loss": 1.097609281539917,
"step": 1088
},
{
"epoch": 1.3308913308913308,
"grad_norm": 2.344632387161255,
"learning_rate": 4.088270118753232e-06,
"loss": 0.7443391680717468,
"step": 1090
},
{
"epoch": 1.3333333333333333,
"grad_norm": 32.47975540161133,
"learning_rate": 4.084615518255092e-06,
"loss": 1.0534281730651855,
"step": 1092
},
{
"epoch": 1.3357753357753357,
"grad_norm": 1.4418542385101318,
"learning_rate": 4.08095547728491e-06,
"loss": 1.1028659343719482,
"step": 1094
},
{
"epoch": 1.3382173382173383,
"grad_norm": 6.136029243469238,
"learning_rate": 4.077290010762602e-06,
"loss": 0.47979384660720825,
"step": 1096
},
{
"epoch": 1.3406593406593408,
"grad_norm": 2.139401435852051,
"learning_rate": 4.0736191336301986e-06,
"loss": 1.1901733875274658,
"step": 1098
},
{
"epoch": 1.3431013431013432,
"grad_norm": 1.571408987045288,
"learning_rate": 4.06994286085179e-06,
"loss": 1.075485348701477,
"step": 1100
},
{
"epoch": 1.3455433455433456,
"grad_norm": 1.0710482597351074,
"learning_rate": 4.066261207413458e-06,
"loss": 1.0476422309875488,
"step": 1102
},
{
"epoch": 1.347985347985348,
"grad_norm": 2.6131324768066406,
"learning_rate": 4.06257418832322e-06,
"loss": 0.8847273588180542,
"step": 1104
},
{
"epoch": 1.3504273504273505,
"grad_norm": 1.8128620386123657,
"learning_rate": 4.058881818610966e-06,
"loss": 1.1783521175384521,
"step": 1106
},
{
"epoch": 1.352869352869353,
"grad_norm": 34.26594924926758,
"learning_rate": 4.055184113328397e-06,
"loss": 0.9166494011878967,
"step": 1108
},
{
"epoch": 1.3553113553113554,
"grad_norm": 1.9319859743118286,
"learning_rate": 4.051481087548966e-06,
"loss": 1.1042914390563965,
"step": 1110
},
{
"epoch": 1.3577533577533578,
"grad_norm": 2.550018072128296,
"learning_rate": 4.047772756367811e-06,
"loss": 1.0983607769012451,
"step": 1112
},
{
"epoch": 1.3601953601953602,
"grad_norm": 3.659637212753296,
"learning_rate": 4.044059134901701e-06,
"loss": 1.0594271421432495,
"step": 1114
},
{
"epoch": 1.3626373626373627,
"grad_norm": 4.164947986602783,
"learning_rate": 4.0403402382889676e-06,
"loss": 0.4707038700580597,
"step": 1116
},
{
"epoch": 1.3650793650793651,
"grad_norm": 1.7244220972061157,
"learning_rate": 4.036616081689447e-06,
"loss": 1.137607216835022,
"step": 1118
},
{
"epoch": 1.3675213675213675,
"grad_norm": 1.9371610879898071,
"learning_rate": 4.032886680284419e-06,
"loss": 1.1212375164031982,
"step": 1120
},
{
"epoch": 1.36996336996337,
"grad_norm": 2.010833263397217,
"learning_rate": 4.029152049276541e-06,
"loss": 1.0424951314926147,
"step": 1122
},
{
"epoch": 1.3724053724053724,
"grad_norm": 1.6150962114334106,
"learning_rate": 4.025412203889791e-06,
"loss": 0.9809345602989197,
"step": 1124
},
{
"epoch": 1.3748473748473748,
"grad_norm": 2.5580382347106934,
"learning_rate": 4.0216671593694e-06,
"loss": 1.2934308052062988,
"step": 1126
},
{
"epoch": 1.3772893772893773,
"grad_norm": 2.092132806777954,
"learning_rate": 4.017916930981797e-06,
"loss": 1.0607208013534546,
"step": 1128
},
{
"epoch": 1.3797313797313797,
"grad_norm": 2.038407802581787,
"learning_rate": 4.014161534014538e-06,
"loss": 0.8067485094070435,
"step": 1130
},
{
"epoch": 1.3821733821733821,
"grad_norm": 1.479718804359436,
"learning_rate": 4.010400983776253e-06,
"loss": 0.7700361609458923,
"step": 1132
},
{
"epoch": 1.3846153846153846,
"grad_norm": 3.232928514480591,
"learning_rate": 4.006635295596575e-06,
"loss": 0.4854944348335266,
"step": 1134
},
{
"epoch": 1.387057387057387,
"grad_norm": 2.037388563156128,
"learning_rate": 4.002864484826083e-06,
"loss": 0.9804095029830933,
"step": 1136
},
{
"epoch": 1.3894993894993894,
"grad_norm": 1.7072653770446777,
"learning_rate": 3.99908856683624e-06,
"loss": 1.1063387393951416,
"step": 1138
},
{
"epoch": 1.3919413919413919,
"grad_norm": 4.661365509033203,
"learning_rate": 3.995307557019326e-06,
"loss": 0.8346843719482422,
"step": 1140
},
{
"epoch": 1.3943833943833943,
"grad_norm": 2.608985662460327,
"learning_rate": 3.991521470788377e-06,
"loss": 0.9450017213821411,
"step": 1142
},
{
"epoch": 1.3968253968253967,
"grad_norm": 2.2186226844787598,
"learning_rate": 3.987730323577123e-06,
"loss": 0.6135491728782654,
"step": 1144
},
{
"epoch": 1.3992673992673992,
"grad_norm": 1.9363148212432861,
"learning_rate": 3.983934130839927e-06,
"loss": 1.068377137184143,
"step": 1146
},
{
"epoch": 1.4017094017094016,
"grad_norm": 6.124155521392822,
"learning_rate": 3.980132908051717e-06,
"loss": 0.8843311667442322,
"step": 1148
},
{
"epoch": 1.404151404151404,
"grad_norm": 1.894343376159668,
"learning_rate": 3.976326670707927e-06,
"loss": 0.7890317440032959,
"step": 1150
},
{
"epoch": 1.4065934065934065,
"grad_norm": 1.4660074710845947,
"learning_rate": 3.972515434324432e-06,
"loss": 0.8038425445556641,
"step": 1152
},
{
"epoch": 1.409035409035409,
"grad_norm": 1.7170904874801636,
"learning_rate": 3.9686992144374854e-06,
"loss": 0.9780741930007935,
"step": 1154
},
{
"epoch": 1.4114774114774113,
"grad_norm": 6.812156677246094,
"learning_rate": 3.964878026603656e-06,
"loss": 0.7489140629768372,
"step": 1156
},
{
"epoch": 1.4139194139194138,
"grad_norm": 3.0899953842163086,
"learning_rate": 3.961051886399763e-06,
"loss": 1.009106159210205,
"step": 1158
},
{
"epoch": 1.4163614163614164,
"grad_norm": 1.569420576095581,
"learning_rate": 3.9572208094228155e-06,
"loss": 1.0201953649520874,
"step": 1160
},
{
"epoch": 1.4188034188034189,
"grad_norm": 2.1486785411834717,
"learning_rate": 3.9533848112899455e-06,
"loss": 0.7411532402038574,
"step": 1162
},
{
"epoch": 1.4212454212454213,
"grad_norm": 13.017099380493164,
"learning_rate": 3.949543907638345e-06,
"loss": 0.7296299934387207,
"step": 1164
},
{
"epoch": 1.4236874236874237,
"grad_norm": 1.9764689207077026,
"learning_rate": 3.945698114125207e-06,
"loss": 1.1636407375335693,
"step": 1166
},
{
"epoch": 1.4261294261294262,
"grad_norm": 0.6818609833717346,
"learning_rate": 3.941847446427651e-06,
"loss": 0.9746972322463989,
"step": 1168
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.446106433868408,
"learning_rate": 3.937991920242671e-06,
"loss": 0.8085231184959412,
"step": 1170
},
{
"epoch": 1.431013431013431,
"grad_norm": 2.190028190612793,
"learning_rate": 3.934131551287067e-06,
"loss": 1.1608608961105347,
"step": 1172
},
{
"epoch": 1.4334554334554335,
"grad_norm": 1.8594470024108887,
"learning_rate": 3.930266355297375e-06,
"loss": 1.1073782444000244,
"step": 1174
},
{
"epoch": 1.435897435897436,
"grad_norm": 3.316195487976074,
"learning_rate": 3.926396348029814e-06,
"loss": 1.1658706665039062,
"step": 1176
},
{
"epoch": 1.4383394383394383,
"grad_norm": 2.6010489463806152,
"learning_rate": 3.922521545260211e-06,
"loss": 0.9183681011199951,
"step": 1178
},
{
"epoch": 1.4407814407814408,
"grad_norm": 5.369879245758057,
"learning_rate": 3.918641962783945e-06,
"loss": 1.037269949913025,
"step": 1180
},
{
"epoch": 1.4432234432234432,
"grad_norm": 3.0808987617492676,
"learning_rate": 3.914757616415877e-06,
"loss": 0.8047484755516052,
"step": 1182
},
{
"epoch": 1.4456654456654456,
"grad_norm": 1.6899147033691406,
"learning_rate": 3.910868521990289e-06,
"loss": 1.117107629776001,
"step": 1184
},
{
"epoch": 1.448107448107448,
"grad_norm": 1.6038181781768799,
"learning_rate": 3.906974695360818e-06,
"loss": 1.0371313095092773,
"step": 1186
},
{
"epoch": 1.4505494505494505,
"grad_norm": 2.300448179244995,
"learning_rate": 3.90307615240039e-06,
"loss": 0.8785613179206848,
"step": 1188
},
{
"epoch": 1.452991452991453,
"grad_norm": 1.9171602725982666,
"learning_rate": 3.8991729090011585e-06,
"loss": 1.0834622383117676,
"step": 1190
},
{
"epoch": 1.4554334554334554,
"grad_norm": 2.6901988983154297,
"learning_rate": 3.895264981074438e-06,
"loss": 0.8501840829849243,
"step": 1192
},
{
"epoch": 1.4578754578754578,
"grad_norm": 1.8914860486984253,
"learning_rate": 3.891352384550639e-06,
"loss": 0.8218003511428833,
"step": 1194
},
{
"epoch": 1.4603174603174602,
"grad_norm": 2.6401541233062744,
"learning_rate": 3.887435135379202e-06,
"loss": 0.7749768495559692,
"step": 1196
},
{
"epoch": 1.462759462759463,
"grad_norm": 3.5819826126098633,
"learning_rate": 3.8835132495285344e-06,
"loss": 0.9986313581466675,
"step": 1198
},
{
"epoch": 1.4652014652014653,
"grad_norm": 2.515784502029419,
"learning_rate": 3.879586742985945e-06,
"loss": 1.154970645904541,
"step": 1200
},
{
"epoch": 1.4676434676434678,
"grad_norm": 2.7575578689575195,
"learning_rate": 3.875655631757579e-06,
"loss": 1.0889326333999634,
"step": 1202
},
{
"epoch": 1.4700854700854702,
"grad_norm": 1.673169493675232,
"learning_rate": 3.871719931868352e-06,
"loss": 1.109386920928955,
"step": 1204
},
{
"epoch": 1.4725274725274726,
"grad_norm": 3.21140193939209,
"learning_rate": 3.867779659361885e-06,
"loss": 0.9718731641769409,
"step": 1206
},
{
"epoch": 1.474969474969475,
"grad_norm": 2.298818588256836,
"learning_rate": 3.863834830300437e-06,
"loss": 0.8030334115028381,
"step": 1208
},
{
"epoch": 1.4774114774114775,
"grad_norm": 3.9100306034088135,
"learning_rate": 3.859885460764845e-06,
"loss": 0.9156997203826904,
"step": 1210
},
{
"epoch": 1.47985347985348,
"grad_norm": 1.3137868642807007,
"learning_rate": 3.855931566854451e-06,
"loss": 1.0059466361999512,
"step": 1212
},
{
"epoch": 1.4822954822954824,
"grad_norm": 1.9000264406204224,
"learning_rate": 3.851973164687046e-06,
"loss": 1.1118829250335693,
"step": 1214
},
{
"epoch": 1.4847374847374848,
"grad_norm": 1.584736943244934,
"learning_rate": 3.848010270398792e-06,
"loss": 1.0681581497192383,
"step": 1216
},
{
"epoch": 1.4871794871794872,
"grad_norm": 1.8261507749557495,
"learning_rate": 3.844042900144167e-06,
"loss": 0.2508808970451355,
"step": 1218
},
{
"epoch": 1.4896214896214897,
"grad_norm": 1.896042823791504,
"learning_rate": 3.8400710700958945e-06,
"loss": 0.6199178695678711,
"step": 1220
},
{
"epoch": 1.492063492063492,
"grad_norm": 2.0678446292877197,
"learning_rate": 3.836094796444875e-06,
"loss": 1.0399789810180664,
"step": 1222
},
{
"epoch": 1.4945054945054945,
"grad_norm": 6.400730133056641,
"learning_rate": 3.832114095400129e-06,
"loss": 0.8569754362106323,
"step": 1224
},
{
"epoch": 1.496947496947497,
"grad_norm": 2.1547770500183105,
"learning_rate": 3.8281289831887185e-06,
"loss": 1.1074395179748535,
"step": 1226
},
{
"epoch": 1.4993894993894994,
"grad_norm": 1.7979967594146729,
"learning_rate": 3.824139476055692e-06,
"loss": 0.36593061685562134,
"step": 1228
},
{
"epoch": 1.5018315018315018,
"grad_norm": 35.071903228759766,
"learning_rate": 3.820145590264012e-06,
"loss": 0.8221673965454102,
"step": 1230
},
{
"epoch": 1.5042735042735043,
"grad_norm": 0.9250247478485107,
"learning_rate": 3.81614734209449e-06,
"loss": 0.6617715954780579,
"step": 1232
},
{
"epoch": 1.5067155067155067,
"grad_norm": 4.005329132080078,
"learning_rate": 3.812144747845719e-06,
"loss": 1.1474699974060059,
"step": 1234
},
{
"epoch": 1.5091575091575091,
"grad_norm": 2.440639019012451,
"learning_rate": 3.808137823834012e-06,
"loss": 0.8988032937049866,
"step": 1236
},
{
"epoch": 1.5115995115995116,
"grad_norm": 1.8108290433883667,
"learning_rate": 3.80412658639333e-06,
"loss": 0.8774833679199219,
"step": 1238
},
{
"epoch": 1.514041514041514,
"grad_norm": 1.4303427934646606,
"learning_rate": 3.800111051875217e-06,
"loss": 1.0372514724731445,
"step": 1240
},
{
"epoch": 1.5164835164835164,
"grad_norm": 1.5728963613510132,
"learning_rate": 3.7960912366487353e-06,
"loss": 1.0711747407913208,
"step": 1242
},
{
"epoch": 1.5189255189255189,
"grad_norm": 9.220934867858887,
"learning_rate": 3.7920671571003953e-06,
"loss": 0.686614453792572,
"step": 1244
},
{
"epoch": 1.5213675213675213,
"grad_norm": 1.5577303171157837,
"learning_rate": 3.7880388296340924e-06,
"loss": 0.7836710810661316,
"step": 1246
},
{
"epoch": 1.5238095238095237,
"grad_norm": 1.9703376293182373,
"learning_rate": 3.7840062706710362e-06,
"loss": 0.8961681127548218,
"step": 1248
},
{
"epoch": 1.5262515262515262,
"grad_norm": 2.641063690185547,
"learning_rate": 3.7799694966496888e-06,
"loss": 1.1727888584136963,
"step": 1250
},
{
"epoch": 1.5286935286935286,
"grad_norm": 5.275555610656738,
"learning_rate": 3.775928524025691e-06,
"loss": 0.875237226486206,
"step": 1252
},
{
"epoch": 1.531135531135531,
"grad_norm": 1.5248931646347046,
"learning_rate": 3.771883369271803e-06,
"loss": 1.040828824043274,
"step": 1254
},
{
"epoch": 1.5335775335775335,
"grad_norm": 2.20690655708313,
"learning_rate": 3.7678340488778302e-06,
"loss": 1.1615933179855347,
"step": 1256
},
{
"epoch": 1.536019536019536,
"grad_norm": 1.435325026512146,
"learning_rate": 3.763780579350559e-06,
"loss": 0.40704652667045593,
"step": 1258
},
{
"epoch": 1.5384615384615383,
"grad_norm": 14.3430814743042,
"learning_rate": 3.759722977213691e-06,
"loss": 0.8075951337814331,
"step": 1260
},
{
"epoch": 1.5409035409035408,
"grad_norm": 16.239559173583984,
"learning_rate": 3.755661259007774e-06,
"loss": 0.6135749816894531,
"step": 1262
},
{
"epoch": 1.5433455433455432,
"grad_norm": 2.538618803024292,
"learning_rate": 3.751595441290133e-06,
"loss": 0.8490422964096069,
"step": 1264
},
{
"epoch": 1.5457875457875456,
"grad_norm": 2.3163981437683105,
"learning_rate": 3.7475255406348067e-06,
"loss": 0.8143582940101624,
"step": 1266
},
{
"epoch": 1.5482295482295483,
"grad_norm": 1.8422861099243164,
"learning_rate": 3.7434515736324746e-06,
"loss": 1.0519959926605225,
"step": 1268
},
{
"epoch": 1.5506715506715507,
"grad_norm": 9.199726104736328,
"learning_rate": 3.7393735568903955e-06,
"loss": 0.4911290109157562,
"step": 1270
},
{
"epoch": 1.5531135531135531,
"grad_norm": 2.1301679611206055,
"learning_rate": 3.7352915070323366e-06,
"loss": 1.189732313156128,
"step": 1272
},
{
"epoch": 1.5555555555555556,
"grad_norm": 1.937249779701233,
"learning_rate": 3.731205440698501e-06,
"loss": 0.9045177102088928,
"step": 1274
},
{
"epoch": 1.557997557997558,
"grad_norm": 2.8137459754943848,
"learning_rate": 3.7271153745454726e-06,
"loss": 1.390211582183838,
"step": 1276
},
{
"epoch": 1.5604395604395604,
"grad_norm": 2.1598775386810303,
"learning_rate": 3.723021325246132e-06,
"loss": 0.737874448299408,
"step": 1278
},
{
"epoch": 1.5628815628815629,
"grad_norm": 2.4186580181121826,
"learning_rate": 3.7189233094896044e-06,
"loss": 1.0836533308029175,
"step": 1280
},
{
"epoch": 1.5653235653235653,
"grad_norm": 2.439676284790039,
"learning_rate": 3.714821343981179e-06,
"loss": 0.7069857120513916,
"step": 1282
},
{
"epoch": 1.5677655677655677,
"grad_norm": 1.5403668880462646,
"learning_rate": 3.7107154454422456e-06,
"loss": 1.0703009366989136,
"step": 1284
},
{
"epoch": 1.5702075702075702,
"grad_norm": 3.893155097961426,
"learning_rate": 3.706605630610231e-06,
"loss": 1.1834505796432495,
"step": 1286
},
{
"epoch": 1.5726495726495726,
"grad_norm": 5.153315544128418,
"learning_rate": 3.7024919162385232e-06,
"loss": 0.5492372512817383,
"step": 1288
},
{
"epoch": 1.575091575091575,
"grad_norm": 1.3920317888259888,
"learning_rate": 3.6983743190964077e-06,
"loss": 0.8411808013916016,
"step": 1290
},
{
"epoch": 1.5775335775335775,
"grad_norm": 9.354891777038574,
"learning_rate": 3.6942528559689965e-06,
"loss": 0.36394214630126953,
"step": 1292
},
{
"epoch": 1.5799755799755801,
"grad_norm": 2.3740155696868896,
"learning_rate": 3.690127543657162e-06,
"loss": 0.7142713069915771,
"step": 1294
},
{
"epoch": 1.5824175824175826,
"grad_norm": 37.80674362182617,
"learning_rate": 3.685998398977468e-06,
"loss": 1.0909113883972168,
"step": 1296
},
{
"epoch": 1.584859584859585,
"grad_norm": 1.855957269668579,
"learning_rate": 3.6818654387620993e-06,
"loss": 1.1598751544952393,
"step": 1298
},
{
"epoch": 1.5873015873015874,
"grad_norm": 2.314946174621582,
"learning_rate": 3.677728679858797e-06,
"loss": 0.9421340823173523,
"step": 1300
},
{
"epoch": 1.5897435897435899,
"grad_norm": 5.468100070953369,
"learning_rate": 3.673588139130784e-06,
"loss": 1.2048614025115967,
"step": 1302
},
{
"epoch": 1.5921855921855923,
"grad_norm": 3.331906795501709,
"learning_rate": 3.6694438334567024e-06,
"loss": 1.1039568185806274,
"step": 1304
},
{
"epoch": 1.5946275946275947,
"grad_norm": 1.5079933404922485,
"learning_rate": 3.6652957797305387e-06,
"loss": 0.6897386908531189,
"step": 1306
},
{
"epoch": 1.5970695970695972,
"grad_norm": 2.3638577461242676,
"learning_rate": 3.661143994861563e-06,
"loss": 1.1327297687530518,
"step": 1308
},
{
"epoch": 1.5995115995115996,
"grad_norm": 2.4536283016204834,
"learning_rate": 3.6569884957742497e-06,
"loss": 1.0871834754943848,
"step": 1310
},
{
"epoch": 1.601953601953602,
"grad_norm": 1.548901915550232,
"learning_rate": 3.652829299408217e-06,
"loss": 1.0074199438095093,
"step": 1312
},
{
"epoch": 1.6043956043956045,
"grad_norm": 1.3679847717285156,
"learning_rate": 3.648666422718155e-06,
"loss": 1.1029393672943115,
"step": 1314
},
{
"epoch": 1.606837606837607,
"grad_norm": 2.071131706237793,
"learning_rate": 3.644499882673756e-06,
"loss": 1.1430408954620361,
"step": 1316
},
{
"epoch": 1.6092796092796093,
"grad_norm": 2.4289538860321045,
"learning_rate": 3.6403296962596442e-06,
"loss": 1.0161014795303345,
"step": 1318
},
{
"epoch": 1.6117216117216118,
"grad_norm": 1.8402098417282104,
"learning_rate": 3.6361558804753088e-06,
"loss": 1.2254347801208496,
"step": 1320
},
{
"epoch": 1.6141636141636142,
"grad_norm": 1.484537124633789,
"learning_rate": 3.631978452335036e-06,
"loss": 1.116368293762207,
"step": 1322
},
{
"epoch": 1.6166056166056166,
"grad_norm": 1.7078075408935547,
"learning_rate": 3.6277974288678354e-06,
"loss": 1.0890535116195679,
"step": 1324
},
{
"epoch": 1.619047619047619,
"grad_norm": 4.279214382171631,
"learning_rate": 3.6236128271173716e-06,
"loss": 0.8000863790512085,
"step": 1326
},
{
"epoch": 1.6214896214896215,
"grad_norm": 1.6943376064300537,
"learning_rate": 3.6194246641418993e-06,
"loss": 1.1035950183868408,
"step": 1328
},
{
"epoch": 1.623931623931624,
"grad_norm": 3.024909257888794,
"learning_rate": 3.6152329570141863e-06,
"loss": 1.078392744064331,
"step": 1330
},
{
"epoch": 1.6263736263736264,
"grad_norm": 4.725790977478027,
"learning_rate": 3.611037722821452e-06,
"loss": 0.8447167277336121,
"step": 1332
},
{
"epoch": 1.6288156288156288,
"grad_norm": 1.9349464178085327,
"learning_rate": 3.6068389786652915e-06,
"loss": 1.1011463403701782,
"step": 1334
},
{
"epoch": 1.6312576312576312,
"grad_norm": 1.9638590812683105,
"learning_rate": 3.6026367416616054e-06,
"loss": 0.7226951718330383,
"step": 1336
},
{
"epoch": 1.6336996336996337,
"grad_norm": 3.807051420211792,
"learning_rate": 3.598431028940539e-06,
"loss": 1.0683143138885498,
"step": 1338
},
{
"epoch": 1.636141636141636,
"grad_norm": 2.799273729324341,
"learning_rate": 3.594221857646399e-06,
"loss": 0.5557500720024109,
"step": 1340
},
{
"epoch": 1.6385836385836385,
"grad_norm": 1.5128666162490845,
"learning_rate": 3.5900092449375977e-06,
"loss": 0.391013503074646,
"step": 1342
},
{
"epoch": 1.641025641025641,
"grad_norm": 2.4419357776641846,
"learning_rate": 3.5857932079865703e-06,
"loss": 1.2627594470977783,
"step": 1344
},
{
"epoch": 1.6434676434676434,
"grad_norm": 1.5012274980545044,
"learning_rate": 3.5815737639797143e-06,
"loss": 1.1198487281799316,
"step": 1346
},
{
"epoch": 1.6459096459096458,
"grad_norm": 1.7359366416931152,
"learning_rate": 3.5773509301173136e-06,
"loss": 0.7089607119560242,
"step": 1348
},
{
"epoch": 1.6483516483516483,
"grad_norm": 1.7854307889938354,
"learning_rate": 3.573124723613473e-06,
"loss": 0.7905706763267517,
"step": 1350
},
{
"epoch": 1.6507936507936507,
"grad_norm": 2.4434316158294678,
"learning_rate": 3.568895161696042e-06,
"loss": 1.0632576942443848,
"step": 1352
},
{
"epoch": 1.6532356532356531,
"grad_norm": 1.7432414293289185,
"learning_rate": 3.5646622616065537e-06,
"loss": 1.170975685119629,
"step": 1354
},
{
"epoch": 1.6556776556776556,
"grad_norm": 1.8956907987594604,
"learning_rate": 3.560426040600143e-06,
"loss": 1.0797570943832397,
"step": 1356
},
{
"epoch": 1.658119658119658,
"grad_norm": 1.6335842609405518,
"learning_rate": 3.556186515945486e-06,
"loss": 0.5945901870727539,
"step": 1358
},
{
"epoch": 1.6605616605616604,
"grad_norm": 2.311692714691162,
"learning_rate": 3.5519437049247257e-06,
"loss": 0.8245255947113037,
"step": 1360
},
{
"epoch": 1.6630036630036629,
"grad_norm": 2.2353930473327637,
"learning_rate": 3.547697624833401e-06,
"loss": 1.1110084056854248,
"step": 1362
},
{
"epoch": 1.6654456654456653,
"grad_norm": 1.7413452863693237,
"learning_rate": 3.543448292980376e-06,
"loss": 1.1027268171310425,
"step": 1364
},
{
"epoch": 1.6678876678876677,
"grad_norm": 1.9247740507125854,
"learning_rate": 3.5391957266877724e-06,
"loss": 1.0763671398162842,
"step": 1366
},
{
"epoch": 1.6703296703296702,
"grad_norm": 1.415798544883728,
"learning_rate": 3.534939943290896e-06,
"loss": 1.0487414598464966,
"step": 1368
},
{
"epoch": 1.6727716727716728,
"grad_norm": 2.411515235900879,
"learning_rate": 3.530680960138166e-06,
"loss": 1.142496109008789,
"step": 1370
},
{
"epoch": 1.6752136752136753,
"grad_norm": 1.571021556854248,
"learning_rate": 3.5264187945910465e-06,
"loss": 0.6615177392959595,
"step": 1372
},
{
"epoch": 1.6776556776556777,
"grad_norm": 1.4412907361984253,
"learning_rate": 3.5221534640239745e-06,
"loss": 0.29376649856567383,
"step": 1374
},
{
"epoch": 1.6800976800976801,
"grad_norm": 6.718142509460449,
"learning_rate": 3.5178849858242874e-06,
"loss": 1.1929081678390503,
"step": 1376
},
{
"epoch": 1.6825396825396826,
"grad_norm": 4.863142013549805,
"learning_rate": 3.5136133773921553e-06,
"loss": 1.202161192893982,
"step": 1378
},
{
"epoch": 1.684981684981685,
"grad_norm": 0.7358537912368774,
"learning_rate": 3.509338656140508e-06,
"loss": 0.9144766330718994,
"step": 1380
},
{
"epoch": 1.6874236874236874,
"grad_norm": 4.494753837585449,
"learning_rate": 3.505060839494964e-06,
"loss": 0.978439211845398,
"step": 1382
},
{
"epoch": 1.6898656898656899,
"grad_norm": 1.7089729309082031,
"learning_rate": 3.5007799448937617e-06,
"loss": 1.1718627214431763,
"step": 1384
},
{
"epoch": 1.6923076923076923,
"grad_norm": 1.796030879020691,
"learning_rate": 3.496495989787683e-06,
"loss": 1.0744086503982544,
"step": 1386
},
{
"epoch": 1.6947496947496947,
"grad_norm": 1.5995069742202759,
"learning_rate": 3.49220899163999e-06,
"loss": 1.1244831085205078,
"step": 1388
},
{
"epoch": 1.6971916971916972,
"grad_norm": 3.2209115028381348,
"learning_rate": 3.4879189679263474e-06,
"loss": 0.3722049295902252,
"step": 1390
},
{
"epoch": 1.6996336996336996,
"grad_norm": 1.2462571859359741,
"learning_rate": 3.4836259361347524e-06,
"loss": 1.0250697135925293,
"step": 1392
},
{
"epoch": 1.702075702075702,
"grad_norm": 2.99985408782959,
"learning_rate": 3.479329913765467e-06,
"loss": 0.946092426776886,
"step": 1394
},
{
"epoch": 1.7045177045177047,
"grad_norm": 3.67580246925354,
"learning_rate": 3.475030918330942e-06,
"loss": 1.217712163925171,
"step": 1396
},
{
"epoch": 1.7069597069597071,
"grad_norm": 3.188765525817871,
"learning_rate": 3.4707289673557486e-06,
"loss": 0.9007408022880554,
"step": 1398
},
{
"epoch": 1.7094017094017095,
"grad_norm": 5.768331050872803,
"learning_rate": 3.4664240783765064e-06,
"loss": 0.4004557728767395,
"step": 1400
},
{
"epoch": 1.711843711843712,
"grad_norm": 5.148880958557129,
"learning_rate": 3.4621162689418104e-06,
"loss": 0.9390780329704285,
"step": 1402
},
{
"epoch": 1.7142857142857144,
"grad_norm": 1.9988371133804321,
"learning_rate": 3.4578055566121617e-06,
"loss": 1.065889596939087,
"step": 1404
},
{
"epoch": 1.7167277167277168,
"grad_norm": 4.718473434448242,
"learning_rate": 3.453491958959894e-06,
"loss": 0.5322512984275818,
"step": 1406
},
{
"epoch": 1.7191697191697193,
"grad_norm": 3.3976686000823975,
"learning_rate": 3.449175493569103e-06,
"loss": 1.1359853744506836,
"step": 1408
},
{
"epoch": 1.7216117216117217,
"grad_norm": 6.322020530700684,
"learning_rate": 3.4448561780355766e-06,
"loss": 0.7464244961738586,
"step": 1410
},
{
"epoch": 1.7240537240537241,
"grad_norm": 11.572935104370117,
"learning_rate": 3.4405340299667183e-06,
"loss": 0.8479959964752197,
"step": 1412
},
{
"epoch": 1.7264957264957266,
"grad_norm": 1.7882882356643677,
"learning_rate": 3.436209066981479e-06,
"loss": 1.0817737579345703,
"step": 1414
},
{
"epoch": 1.728937728937729,
"grad_norm": 5.552520275115967,
"learning_rate": 3.4318813067102853e-06,
"loss": 0.9852099418640137,
"step": 1416
},
{
"epoch": 1.7313797313797314,
"grad_norm": 1.9042245149612427,
"learning_rate": 3.4275507667949658e-06,
"loss": 1.1091506481170654,
"step": 1418
},
{
"epoch": 1.7338217338217339,
"grad_norm": 2.391268491744995,
"learning_rate": 3.423217464888681e-06,
"loss": 0.8407750725746155,
"step": 1420
},
{
"epoch": 1.7362637362637363,
"grad_norm": 2.475590944290161,
"learning_rate": 3.41888141865585e-06,
"loss": 0.9131081104278564,
"step": 1422
},
{
"epoch": 1.7387057387057387,
"grad_norm": 5.156746864318848,
"learning_rate": 3.4145426457720787e-06,
"loss": 0.7782116532325745,
"step": 1424
},
{
"epoch": 1.7411477411477412,
"grad_norm": 7.184075355529785,
"learning_rate": 3.4102011639240884e-06,
"loss": 0.7344411611557007,
"step": 1426
},
{
"epoch": 1.7435897435897436,
"grad_norm": 2.159703016281128,
"learning_rate": 3.4058569908096436e-06,
"loss": 1.132224202156067,
"step": 1428
},
{
"epoch": 1.746031746031746,
"grad_norm": 1.8462954759597778,
"learning_rate": 3.4015101441374776e-06,
"loss": 1.173128366470337,
"step": 1430
},
{
"epoch": 1.7484737484737485,
"grad_norm": 2.538024425506592,
"learning_rate": 3.397160641627226e-06,
"loss": 0.7561154961585999,
"step": 1432
},
{
"epoch": 1.750915750915751,
"grad_norm": 1.7686879634857178,
"learning_rate": 3.392808501009347e-06,
"loss": 0.6580084562301636,
"step": 1434
},
{
"epoch": 1.7533577533577533,
"grad_norm": 10.234268188476562,
"learning_rate": 3.3884537400250554e-06,
"loss": 0.6667467951774597,
"step": 1436
},
{
"epoch": 1.7557997557997558,
"grad_norm": 1.436072826385498,
"learning_rate": 3.384096376426247e-06,
"loss": 0.5105250477790833,
"step": 1438
},
{
"epoch": 1.7582417582417582,
"grad_norm": 1.6276432275772095,
"learning_rate": 3.379736427975425e-06,
"loss": 1.0976946353912354,
"step": 1440
},
{
"epoch": 1.7606837606837606,
"grad_norm": 3.592867136001587,
"learning_rate": 3.3753739124456343e-06,
"loss": 0.8957812786102295,
"step": 1442
},
{
"epoch": 1.763125763125763,
"grad_norm": 4.000123023986816,
"learning_rate": 3.371008847620379e-06,
"loss": 0.7372997403144836,
"step": 1444
},
{
"epoch": 1.7655677655677655,
"grad_norm": 3.1201529502868652,
"learning_rate": 3.366641251293559e-06,
"loss": 1.102899193763733,
"step": 1446
},
{
"epoch": 1.768009768009768,
"grad_norm": 1.768283486366272,
"learning_rate": 3.3622711412693914e-06,
"loss": 1.124794602394104,
"step": 1448
},
{
"epoch": 1.7704517704517704,
"grad_norm": 2.403294801712036,
"learning_rate": 3.3578985353623416e-06,
"loss": 0.9902628660202026,
"step": 1450
},
{
"epoch": 1.7728937728937728,
"grad_norm": 3.0186891555786133,
"learning_rate": 3.3535234513970494e-06,
"loss": 0.399064302444458,
"step": 1452
},
{
"epoch": 1.7753357753357752,
"grad_norm": 1.5962026119232178,
"learning_rate": 3.349145907208255e-06,
"loss": 0.7983530163764954,
"step": 1454
},
{
"epoch": 1.7777777777777777,
"grad_norm": 3.296353816986084,
"learning_rate": 3.3447659206407285e-06,
"loss": 0.5403007864952087,
"step": 1456
},
{
"epoch": 1.7802197802197801,
"grad_norm": 1.5648705959320068,
"learning_rate": 3.3403835095491967e-06,
"loss": 1.0592517852783203,
"step": 1458
},
{
"epoch": 1.7826617826617825,
"grad_norm": 3.352639675140381,
"learning_rate": 3.3359986917982675e-06,
"loss": 1.0402568578720093,
"step": 1460
},
{
"epoch": 1.785103785103785,
"grad_norm": 3.2459142208099365,
"learning_rate": 3.3316114852623617e-06,
"loss": 0.9993575811386108,
"step": 1462
},
{
"epoch": 1.7875457875457874,
"grad_norm": 2.1725311279296875,
"learning_rate": 3.327221907825638e-06,
"loss": 0.8232885599136353,
"step": 1464
},
{
"epoch": 1.7899877899877898,
"grad_norm": 2.444363594055176,
"learning_rate": 3.3228299773819165e-06,
"loss": 0.8555684685707092,
"step": 1466
},
{
"epoch": 1.7924297924297923,
"grad_norm": 4.547183990478516,
"learning_rate": 3.318435711834615e-06,
"loss": 0.8133440017700195,
"step": 1468
},
{
"epoch": 1.7948717948717947,
"grad_norm": 3.024049758911133,
"learning_rate": 3.3140391290966646e-06,
"loss": 1.0311592817306519,
"step": 1470
},
{
"epoch": 1.7973137973137974,
"grad_norm": 4.397846221923828,
"learning_rate": 3.309640247090445e-06,
"loss": 1.0561209917068481,
"step": 1472
},
{
"epoch": 1.7997557997557998,
"grad_norm": 2.594501256942749,
"learning_rate": 3.3052390837477087e-06,
"loss": 0.6757609248161316,
"step": 1474
},
{
"epoch": 1.8021978021978022,
"grad_norm": 2.992253541946411,
"learning_rate": 3.300835657009507e-06,
"loss": 0.7614642977714539,
"step": 1476
},
{
"epoch": 1.8046398046398047,
"grad_norm": 5.074526786804199,
"learning_rate": 3.2964299848261187e-06,
"loss": 0.8146823048591614,
"step": 1478
},
{
"epoch": 1.807081807081807,
"grad_norm": 3.6561779975891113,
"learning_rate": 3.2920220851569746e-06,
"loss": 0.4933128356933594,
"step": 1480
},
{
"epoch": 1.8095238095238095,
"grad_norm": 5.129440784454346,
"learning_rate": 3.2876119759705884e-06,
"loss": 0.8365576267242432,
"step": 1482
},
{
"epoch": 1.811965811965812,
"grad_norm": 1.3081094026565552,
"learning_rate": 3.2831996752444774e-06,
"loss": 1.174236536026001,
"step": 1484
},
{
"epoch": 1.8144078144078144,
"grad_norm": 1.9769134521484375,
"learning_rate": 3.2787852009650945e-06,
"loss": 1.0928758382797241,
"step": 1486
},
{
"epoch": 1.8168498168498168,
"grad_norm": 3.200984001159668,
"learning_rate": 3.2743685711277533e-06,
"loss": 0.7248603701591492,
"step": 1488
},
{
"epoch": 1.8192918192918193,
"grad_norm": 1.9529130458831787,
"learning_rate": 3.269949803736554e-06,
"loss": 0.8898839950561523,
"step": 1490
},
{
"epoch": 1.8217338217338217,
"grad_norm": 2.331352949142456,
"learning_rate": 3.265528916804308e-06,
"loss": 1.097998857498169,
"step": 1492
},
{
"epoch": 1.8241758241758241,
"grad_norm": 7.352150917053223,
"learning_rate": 3.261105928352472e-06,
"loss": 0.7203211784362793,
"step": 1494
},
{
"epoch": 1.8266178266178266,
"grad_norm": 8.535738945007324,
"learning_rate": 3.2566808564110635e-06,
"loss": 0.8137180209159851,
"step": 1496
},
{
"epoch": 1.8290598290598292,
"grad_norm": 1.5943210124969482,
"learning_rate": 3.252253719018599e-06,
"loss": 1.4954842329025269,
"step": 1498
},
{
"epoch": 1.8315018315018317,
"grad_norm": 2.4209067821502686,
"learning_rate": 3.2478245342220094e-06,
"loss": 1.2031804323196411,
"step": 1500
},
{
"epoch": 1.833943833943834,
"grad_norm": 3.7259180545806885,
"learning_rate": 3.243393320076575e-06,
"loss": 0.8611478805541992,
"step": 1502
},
{
"epoch": 1.8363858363858365,
"grad_norm": 0.5359264612197876,
"learning_rate": 3.238960094645848e-06,
"loss": 0.9046647548675537,
"step": 1504
},
{
"epoch": 1.838827838827839,
"grad_norm": 2.4440624713897705,
"learning_rate": 3.2345248760015777e-06,
"loss": 0.7731856107711792,
"step": 1506
},
{
"epoch": 1.8412698412698414,
"grad_norm": 1.7057727575302124,
"learning_rate": 3.2300876822236427e-06,
"loss": 0.8238407373428345,
"step": 1508
},
{
"epoch": 1.8437118437118438,
"grad_norm": 2.0124754905700684,
"learning_rate": 3.225648531399968e-06,
"loss": 1.0737024545669556,
"step": 1510
},
{
"epoch": 1.8461538461538463,
"grad_norm": 1.904160499572754,
"learning_rate": 3.22120744162646e-06,
"loss": 1.0689663887023926,
"step": 1512
},
{
"epoch": 1.8485958485958487,
"grad_norm": 1.249457836151123,
"learning_rate": 3.2167644310069276e-06,
"loss": 1.0780993700027466,
"step": 1514
},
{
"epoch": 1.8510378510378511,
"grad_norm": 3.9271388053894043,
"learning_rate": 3.2123195176530104e-06,
"loss": 0.833716094493866,
"step": 1516
},
{
"epoch": 1.8534798534798536,
"grad_norm": 1.8167206048965454,
"learning_rate": 3.207872719684104e-06,
"loss": 1.1510157585144043,
"step": 1518
},
{
"epoch": 1.855921855921856,
"grad_norm": 3.824442148208618,
"learning_rate": 3.203424055227287e-06,
"loss": 0.9223167896270752,
"step": 1520
},
{
"epoch": 1.8583638583638584,
"grad_norm": 4.120997905731201,
"learning_rate": 3.1989735424172456e-06,
"loss": 0.9817994832992554,
"step": 1522
},
{
"epoch": 1.8608058608058609,
"grad_norm": 2.414776563644409,
"learning_rate": 3.1945211993962035e-06,
"loss": 0.9063418507575989,
"step": 1524
},
{
"epoch": 1.8632478632478633,
"grad_norm": 4.998463153839111,
"learning_rate": 3.190067044313841e-06,
"loss": 0.9489470720291138,
"step": 1526
},
{
"epoch": 1.8656898656898657,
"grad_norm": 1.9804654121398926,
"learning_rate": 3.185611095327227e-06,
"loss": 0.7647035121917725,
"step": 1528
},
{
"epoch": 1.8681318681318682,
"grad_norm": 1.3335086107254028,
"learning_rate": 3.181153370600745e-06,
"loss": 0.9383209943771362,
"step": 1530
},
{
"epoch": 1.8705738705738706,
"grad_norm": 4.721079349517822,
"learning_rate": 3.176693888306014e-06,
"loss": 0.77753746509552,
"step": 1532
},
{
"epoch": 1.873015873015873,
"grad_norm": 2.030644655227661,
"learning_rate": 3.1722326666218213e-06,
"loss": 0.8778524994850159,
"step": 1534
},
{
"epoch": 1.8754578754578755,
"grad_norm": 1.5334826707839966,
"learning_rate": 3.16776972373404e-06,
"loss": 1.1086459159851074,
"step": 1536
},
{
"epoch": 1.877899877899878,
"grad_norm": 1.6864469051361084,
"learning_rate": 3.1633050778355624e-06,
"loss": 1.0293059349060059,
"step": 1538
},
{
"epoch": 1.8803418803418803,
"grad_norm": 2.2873408794403076,
"learning_rate": 3.158838747126224e-06,
"loss": 1.0864299535751343,
"step": 1540
},
{
"epoch": 1.8827838827838828,
"grad_norm": 1.5731513500213623,
"learning_rate": 3.1543707498127267e-06,
"loss": 1.0680838823318481,
"step": 1542
},
{
"epoch": 1.8852258852258852,
"grad_norm": 2.0635628700256348,
"learning_rate": 3.1499011041085662e-06,
"loss": 0.9070185422897339,
"step": 1544
},
{
"epoch": 1.8876678876678876,
"grad_norm": 2.2307991981506348,
"learning_rate": 3.145429828233959e-06,
"loss": 1.060643196105957,
"step": 1546
},
{
"epoch": 1.89010989010989,
"grad_norm": 3.084059476852417,
"learning_rate": 3.1409569404157646e-06,
"loss": 1.0800150632858276,
"step": 1548
},
{
"epoch": 1.8925518925518925,
"grad_norm": 2.034463882446289,
"learning_rate": 3.136482458887416e-06,
"loss": 0.6771202087402344,
"step": 1550
},
{
"epoch": 1.894993894993895,
"grad_norm": 2.416832447052002,
"learning_rate": 3.132006401888841e-06,
"loss": 1.1564983129501343,
"step": 1552
},
{
"epoch": 1.8974358974358974,
"grad_norm": 2.9857418537139893,
"learning_rate": 3.1275287876663905e-06,
"loss": 0.8453341126441956,
"step": 1554
},
{
"epoch": 1.8998778998778998,
"grad_norm": 1.9065909385681152,
"learning_rate": 3.123049634472764e-06,
"loss": 1.206203818321228,
"step": 1556
},
{
"epoch": 1.9023199023199022,
"grad_norm": 1.7331615686416626,
"learning_rate": 3.118568960566933e-06,
"loss": 0.9110676050186157,
"step": 1558
},
{
"epoch": 1.9047619047619047,
"grad_norm": 2.49706768989563,
"learning_rate": 3.114086784214069e-06,
"loss": 0.6509535908699036,
"step": 1560
},
{
"epoch": 1.907203907203907,
"grad_norm": 1.9002443552017212,
"learning_rate": 3.109603123685468e-06,
"loss": 1.080418586730957,
"step": 1562
},
{
"epoch": 1.9096459096459095,
"grad_norm": 3.7310116291046143,
"learning_rate": 3.1051179972584756e-06,
"loss": 0.7549952268600464,
"step": 1564
},
{
"epoch": 1.912087912087912,
"grad_norm": 1.4353991746902466,
"learning_rate": 3.1006314232164146e-06,
"loss": 1.083061695098877,
"step": 1566
},
{
"epoch": 1.9145299145299144,
"grad_norm": 2.5150792598724365,
"learning_rate": 3.0961434198485067e-06,
"loss": 0.9303537607192993,
"step": 1568
},
{
"epoch": 1.9169719169719168,
"grad_norm": 1.2595463991165161,
"learning_rate": 3.0916540054498028e-06,
"loss": 0.7716434001922607,
"step": 1570
},
{
"epoch": 1.9194139194139193,
"grad_norm": 1.386602759361267,
"learning_rate": 3.087163198321103e-06,
"loss": 1.1206477880477905,
"step": 1572
},
{
"epoch": 1.9218559218559217,
"grad_norm": 2.0977489948272705,
"learning_rate": 3.0826710167688866e-06,
"loss": 0.7714415788650513,
"step": 1574
},
{
"epoch": 1.9242979242979243,
"grad_norm": 3.282386302947998,
"learning_rate": 3.0781774791052347e-06,
"loss": 1.0669711828231812,
"step": 1576
},
{
"epoch": 1.9267399267399268,
"grad_norm": 2.187236785888672,
"learning_rate": 3.073682603647758e-06,
"loss": 0.7885124683380127,
"step": 1578
},
{
"epoch": 1.9291819291819292,
"grad_norm": 2.4865806102752686,
"learning_rate": 3.0691864087195172e-06,
"loss": 1.084753394126892,
"step": 1580
},
{
"epoch": 1.9316239316239316,
"grad_norm": 3.804330348968506,
"learning_rate": 3.064688912648957e-06,
"loss": 0.3611922860145569,
"step": 1582
},
{
"epoch": 1.934065934065934,
"grad_norm": 18.454357147216797,
"learning_rate": 3.0601901337698213e-06,
"loss": 0.5478751063346863,
"step": 1584
},
{
"epoch": 1.9365079365079365,
"grad_norm": 9.308585166931152,
"learning_rate": 3.055690090421085e-06,
"loss": 0.6894688606262207,
"step": 1586
},
{
"epoch": 1.938949938949939,
"grad_norm": 4.380536079406738,
"learning_rate": 3.0511888009468792e-06,
"loss": 1.172979474067688,
"step": 1588
},
{
"epoch": 1.9413919413919414,
"grad_norm": 1.1702888011932373,
"learning_rate": 3.0466862836964117e-06,
"loss": 1.1025750637054443,
"step": 1590
},
{
"epoch": 1.9438339438339438,
"grad_norm": 2.2686538696289062,
"learning_rate": 3.0421825570238978e-06,
"loss": 1.0041526556015015,
"step": 1592
},
{
"epoch": 1.9462759462759462,
"grad_norm": 1.5547155141830444,
"learning_rate": 3.037677639288481e-06,
"loss": 0.7530244588851929,
"step": 1594
},
{
"epoch": 1.9487179487179487,
"grad_norm": 1.6241923570632935,
"learning_rate": 3.0331715488541626e-06,
"loss": 0.6593371629714966,
"step": 1596
},
{
"epoch": 1.9511599511599511,
"grad_norm": 1.3635199069976807,
"learning_rate": 3.0286643040897203e-06,
"loss": 0.7976773381233215,
"step": 1598
},
{
"epoch": 1.9536019536019538,
"grad_norm": 1.5380146503448486,
"learning_rate": 3.0241559233686424e-06,
"loss": 0.8483846187591553,
"step": 1600
},
{
"epoch": 1.9560439560439562,
"grad_norm": 1.5258017778396606,
"learning_rate": 3.0196464250690434e-06,
"loss": 1.0973600149154663,
"step": 1602
},
{
"epoch": 1.9584859584859586,
"grad_norm": 5.223465442657471,
"learning_rate": 3.0151358275735965e-06,
"loss": 1.2270939350128174,
"step": 1604
},
{
"epoch": 1.960927960927961,
"grad_norm": 4.014069080352783,
"learning_rate": 3.0106241492694533e-06,
"loss": 1.3512402772903442,
"step": 1606
},
{
"epoch": 1.9633699633699635,
"grad_norm": 1.4490033388137817,
"learning_rate": 3.0061114085481745e-06,
"loss": 1.1516140699386597,
"step": 1608
},
{
"epoch": 1.965811965811966,
"grad_norm": 1.19436776638031,
"learning_rate": 3.0015976238056475e-06,
"loss": 1.0787304639816284,
"step": 1610
},
{
"epoch": 1.9682539682539684,
"grad_norm": 6.923144817352295,
"learning_rate": 2.9970828134420198e-06,
"loss": 0.9626544713973999,
"step": 1612
},
{
"epoch": 1.9706959706959708,
"grad_norm": 1.686660885810852,
"learning_rate": 2.992566995861616e-06,
"loss": 1.1870635747909546,
"step": 1614
},
{
"epoch": 1.9731379731379732,
"grad_norm": 2.969782829284668,
"learning_rate": 2.988050189472869e-06,
"loss": 0.9546635150909424,
"step": 1616
},
{
"epoch": 1.9755799755799757,
"grad_norm": 2.04162335395813,
"learning_rate": 2.983532412688242e-06,
"loss": 1.0080379247665405,
"step": 1618
},
{
"epoch": 1.978021978021978,
"grad_norm": 1.1154638528823853,
"learning_rate": 2.979013683924154e-06,
"loss": 1.1551849842071533,
"step": 1620
},
{
"epoch": 1.9804639804639805,
"grad_norm": 1.3147307634353638,
"learning_rate": 2.9744940216009037e-06,
"loss": 0.8124474287033081,
"step": 1622
},
{
"epoch": 1.982905982905983,
"grad_norm": 3.960902690887451,
"learning_rate": 2.969973444142597e-06,
"loss": 0.5901971459388733,
"step": 1624
},
{
"epoch": 1.9853479853479854,
"grad_norm": 2.4836363792419434,
"learning_rate": 2.965451969977069e-06,
"loss": 0.8430943489074707,
"step": 1626
},
{
"epoch": 1.9877899877899878,
"grad_norm": 5.949784278869629,
"learning_rate": 2.9609296175358102e-06,
"loss": 0.9984661340713501,
"step": 1628
},
{
"epoch": 1.9902319902319903,
"grad_norm": 2.4892053604125977,
"learning_rate": 2.9564064052538926e-06,
"loss": 1.1695860624313354,
"step": 1630
},
{
"epoch": 1.9926739926739927,
"grad_norm": 1.6151142120361328,
"learning_rate": 2.951882351569892e-06,
"loss": 1.063124179840088,
"step": 1632
},
{
"epoch": 1.9951159951159951,
"grad_norm": 2.0610530376434326,
"learning_rate": 2.9473574749258143e-06,
"loss": 0.8075814247131348,
"step": 1634
},
{
"epoch": 1.9975579975579976,
"grad_norm": 2.036194086074829,
"learning_rate": 2.94283179376702e-06,
"loss": 1.161010980606079,
"step": 1636
},
{
"epoch": 2.0,
"grad_norm": 1.6809015274047852,
"learning_rate": 2.9383053265421514e-06,
"loss": 1.0740622282028198,
"step": 1638
},
{
"epoch": 2.0024420024420024,
"grad_norm": 7.186413288116455,
"learning_rate": 2.9337780917030513e-06,
"loss": 0.9597793221473694,
"step": 1640
},
{
"epoch": 2.004884004884005,
"grad_norm": 2.5799577236175537,
"learning_rate": 2.929250107704694e-06,
"loss": 0.7062101364135742,
"step": 1642
},
{
"epoch": 2.0073260073260073,
"grad_norm": 0.9430143237113953,
"learning_rate": 2.924721393005109e-06,
"loss": 0.9560756087303162,
"step": 1644
},
{
"epoch": 2.0097680097680097,
"grad_norm": 2.4356815814971924,
"learning_rate": 2.9201919660653e-06,
"loss": 0.7125204801559448,
"step": 1646
},
{
"epoch": 2.012210012210012,
"grad_norm": 2.3169310092926025,
"learning_rate": 2.9156618453491786e-06,
"loss": 0.8216168880462646,
"step": 1648
},
{
"epoch": 2.0146520146520146,
"grad_norm": 11.127049446105957,
"learning_rate": 2.911131049323483e-06,
"loss": 0.8026351928710938,
"step": 1650
},
{
"epoch": 2.017094017094017,
"grad_norm": 2.923428535461426,
"learning_rate": 2.9065995964577028e-06,
"loss": 0.7188471555709839,
"step": 1652
},
{
"epoch": 2.0195360195360195,
"grad_norm": 4.269984722137451,
"learning_rate": 2.902067505224008e-06,
"loss": 1.2672061920166016,
"step": 1654
},
{
"epoch": 2.021978021978022,
"grad_norm": 1.2916280031204224,
"learning_rate": 2.897534794097167e-06,
"loss": 0.5318281054496765,
"step": 1656
},
{
"epoch": 2.0244200244200243,
"grad_norm": 2.5028984546661377,
"learning_rate": 2.89300148155448e-06,
"loss": 0.9953727126121521,
"step": 1658
},
{
"epoch": 2.0268620268620268,
"grad_norm": 2.887450695037842,
"learning_rate": 2.8884675860756946e-06,
"loss": 0.9623196125030518,
"step": 1660
},
{
"epoch": 2.029304029304029,
"grad_norm": 2.6880152225494385,
"learning_rate": 2.883933126142937e-06,
"loss": 1.0482466220855713,
"step": 1662
},
{
"epoch": 2.0317460317460316,
"grad_norm": 1.8128950595855713,
"learning_rate": 2.8793981202406335e-06,
"loss": 0.4340633749961853,
"step": 1664
},
{
"epoch": 2.034188034188034,
"grad_norm": 3.808696985244751,
"learning_rate": 2.874862586855437e-06,
"loss": 0.7226059436798096,
"step": 1666
},
{
"epoch": 2.0366300366300365,
"grad_norm": 1.5693755149841309,
"learning_rate": 2.870326544476148e-06,
"loss": 1.0041981935501099,
"step": 1668
},
{
"epoch": 2.039072039072039,
"grad_norm": 3.0417141914367676,
"learning_rate": 2.8657900115936465e-06,
"loss": 0.7336680889129639,
"step": 1670
},
{
"epoch": 2.0415140415140414,
"grad_norm": 3.467229127883911,
"learning_rate": 2.8612530067008067e-06,
"loss": 0.9192556142807007,
"step": 1672
},
{
"epoch": 2.043956043956044,
"grad_norm": 3.149291515350342,
"learning_rate": 2.8567155482924315e-06,
"loss": 0.9109829068183899,
"step": 1674
},
{
"epoch": 2.0463980463980462,
"grad_norm": 1.5668519735336304,
"learning_rate": 2.8521776548651692e-06,
"loss": 0.6515228748321533,
"step": 1676
},
{
"epoch": 2.0488400488400487,
"grad_norm": 3.5928568840026855,
"learning_rate": 2.8476393449174426e-06,
"loss": 1.0088976621627808,
"step": 1678
},
{
"epoch": 2.051282051282051,
"grad_norm": 2.0251355171203613,
"learning_rate": 2.843100636949374e-06,
"loss": 1.004931092262268,
"step": 1680
},
{
"epoch": 2.0537240537240535,
"grad_norm": 3.476871967315674,
"learning_rate": 2.838561549462705e-06,
"loss": 0.7845253348350525,
"step": 1682
},
{
"epoch": 2.056166056166056,
"grad_norm": 8.491005897521973,
"learning_rate": 2.8340221009607272e-06,
"loss": 0.7041101455688477,
"step": 1684
},
{
"epoch": 2.0586080586080584,
"grad_norm": 7.643034934997559,
"learning_rate": 2.829482309948203e-06,
"loss": 0.8947182297706604,
"step": 1686
},
{
"epoch": 2.061050061050061,
"grad_norm": 2.488511323928833,
"learning_rate": 2.824942194931289e-06,
"loss": 0.9186074137687683,
"step": 1688
},
{
"epoch": 2.0634920634920633,
"grad_norm": 10.357978820800781,
"learning_rate": 2.820401774417466e-06,
"loss": 0.7940126061439514,
"step": 1690
},
{
"epoch": 2.065934065934066,
"grad_norm": 1.5219630002975464,
"learning_rate": 2.815861066915458e-06,
"loss": 0.7649714350700378,
"step": 1692
},
{
"epoch": 2.0683760683760686,
"grad_norm": 1.8372576236724854,
"learning_rate": 2.811320090935159e-06,
"loss": 0.7867807149887085,
"step": 1694
},
{
"epoch": 2.070818070818071,
"grad_norm": 2.9736199378967285,
"learning_rate": 2.806778864987558e-06,
"loss": 1.0023208856582642,
"step": 1696
},
{
"epoch": 2.0732600732600734,
"grad_norm": 4.48581075668335,
"learning_rate": 2.802237407584663e-06,
"loss": 0.9700354337692261,
"step": 1698
},
{
"epoch": 2.075702075702076,
"grad_norm": 2.3087658882141113,
"learning_rate": 2.797695737239425e-06,
"loss": 0.9603742361068726,
"step": 1700
},
{
"epoch": 2.0781440781440783,
"grad_norm": 3.8156135082244873,
"learning_rate": 2.7931538724656625e-06,
"loss": 0.4553748667240143,
"step": 1702
},
{
"epoch": 2.0805860805860807,
"grad_norm": 1.0407174825668335,
"learning_rate": 2.788611831777989e-06,
"loss": 0.5665370225906372,
"step": 1704
},
{
"epoch": 2.083028083028083,
"grad_norm": 5.208148956298828,
"learning_rate": 2.784069633691732e-06,
"loss": 0.41125673055648804,
"step": 1706
},
{
"epoch": 2.0854700854700856,
"grad_norm": 4.539152145385742,
"learning_rate": 2.779527296722863e-06,
"loss": 0.9381171464920044,
"step": 1708
},
{
"epoch": 2.087912087912088,
"grad_norm": 2.998134136199951,
"learning_rate": 2.774984839387918e-06,
"loss": 0.7079961895942688,
"step": 1710
},
{
"epoch": 2.0903540903540905,
"grad_norm": 3.755718231201172,
"learning_rate": 2.7704422802039255e-06,
"loss": 0.7328172922134399,
"step": 1712
},
{
"epoch": 2.092796092796093,
"grad_norm": 1.5156927108764648,
"learning_rate": 2.765899637688327e-06,
"loss": 0.6182104349136353,
"step": 1714
},
{
"epoch": 2.0952380952380953,
"grad_norm": 2.6270971298217773,
"learning_rate": 2.7613569303589054e-06,
"loss": 0.7295227646827698,
"step": 1716
},
{
"epoch": 2.0976800976800978,
"grad_norm": 2.0563015937805176,
"learning_rate": 2.756814176733707e-06,
"loss": 0.9640318155288696,
"step": 1718
},
{
"epoch": 2.1001221001221,
"grad_norm": 2.9778478145599365,
"learning_rate": 2.752271395330967e-06,
"loss": 0.9460858106613159,
"step": 1720
},
{
"epoch": 2.1025641025641026,
"grad_norm": 2.579092025756836,
"learning_rate": 2.7477286046690336e-06,
"loss": 0.9912809133529663,
"step": 1722
},
{
"epoch": 2.105006105006105,
"grad_norm": 2.132593870162964,
"learning_rate": 2.743185823266294e-06,
"loss": 0.657219648361206,
"step": 1724
},
{
"epoch": 2.1074481074481075,
"grad_norm": 3.7171902656555176,
"learning_rate": 2.7386430696410953e-06,
"loss": 0.6395490765571594,
"step": 1726
},
{
"epoch": 2.10989010989011,
"grad_norm": 1.617601752281189,
"learning_rate": 2.7341003623116743e-06,
"loss": 0.5296671986579895,
"step": 1728
},
{
"epoch": 2.1123321123321124,
"grad_norm": 2.1062819957733154,
"learning_rate": 2.729557719796076e-06,
"loss": 0.8426005840301514,
"step": 1730
},
{
"epoch": 2.114774114774115,
"grad_norm": 3.001302480697632,
"learning_rate": 2.7250151606120826e-06,
"loss": 0.565944254398346,
"step": 1732
},
{
"epoch": 2.1172161172161172,
"grad_norm": 0.658115565776825,
"learning_rate": 2.7204727032771376e-06,
"loss": 0.3656719923019409,
"step": 1734
},
{
"epoch": 2.1196581196581197,
"grad_norm": 2.848242998123169,
"learning_rate": 2.7159303663082687e-06,
"loss": 0.9933385252952576,
"step": 1736
},
{
"epoch": 2.122100122100122,
"grad_norm": 4.3910417556762695,
"learning_rate": 2.7113881682220123e-06,
"loss": 0.9253290891647339,
"step": 1738
},
{
"epoch": 2.1245421245421245,
"grad_norm": 1.5515904426574707,
"learning_rate": 2.7068461275343382e-06,
"loss": 0.8804880976676941,
"step": 1740
},
{
"epoch": 2.126984126984127,
"grad_norm": 5.038269996643066,
"learning_rate": 2.7023042627605754e-06,
"loss": 1.0033385753631592,
"step": 1742
},
{
"epoch": 2.1294261294261294,
"grad_norm": 2.507053852081299,
"learning_rate": 2.6977625924153376e-06,
"loss": 0.671730637550354,
"step": 1744
},
{
"epoch": 2.131868131868132,
"grad_norm": 2.777392625808716,
"learning_rate": 2.6932211350124425e-06,
"loss": 1.001034140586853,
"step": 1746
},
{
"epoch": 2.1343101343101343,
"grad_norm": 5.460464954376221,
"learning_rate": 2.6886799090648417e-06,
"loss": 0.38881126046180725,
"step": 1748
},
{
"epoch": 2.1367521367521367,
"grad_norm": 6.266025543212891,
"learning_rate": 2.684138933084543e-06,
"loss": 1.1089563369750977,
"step": 1750
},
{
"epoch": 2.139194139194139,
"grad_norm": 2.3073322772979736,
"learning_rate": 2.6795982255825354e-06,
"loss": 0.9409431219100952,
"step": 1752
},
{
"epoch": 2.1416361416361416,
"grad_norm": 3.6696202754974365,
"learning_rate": 2.6750578050687115e-06,
"loss": 0.8869442939758301,
"step": 1754
},
{
"epoch": 2.144078144078144,
"grad_norm": 4.61408805847168,
"learning_rate": 2.6705176900517983e-06,
"loss": 1.01822030544281,
"step": 1756
},
{
"epoch": 2.1465201465201464,
"grad_norm": 2.669914484024048,
"learning_rate": 2.665977899039274e-06,
"loss": 0.48161619901657104,
"step": 1758
},
{
"epoch": 2.148962148962149,
"grad_norm": 2.6554932594299316,
"learning_rate": 2.661438450537296e-06,
"loss": 0.8899593353271484,
"step": 1760
},
{
"epoch": 2.1514041514041513,
"grad_norm": 1.00688636302948,
"learning_rate": 2.656899363050628e-06,
"loss": 0.6889787912368774,
"step": 1762
},
{
"epoch": 2.1538461538461537,
"grad_norm": 5.138449192047119,
"learning_rate": 2.6523606550825577e-06,
"loss": 0.6849108934402466,
"step": 1764
},
{
"epoch": 2.156288156288156,
"grad_norm": 1.4361852407455444,
"learning_rate": 2.647822345134832e-06,
"loss": 0.5109698176383972,
"step": 1766
},
{
"epoch": 2.1587301587301586,
"grad_norm": 4.641076564788818,
"learning_rate": 2.6432844517075696e-06,
"loss": 0.7529181838035583,
"step": 1768
},
{
"epoch": 2.161172161172161,
"grad_norm": 2.5101401805877686,
"learning_rate": 2.638746993299194e-06,
"loss": 0.6117711067199707,
"step": 1770
},
{
"epoch": 2.1636141636141635,
"grad_norm": 1.5911965370178223,
"learning_rate": 2.6342099884063542e-06,
"loss": 0.9727715849876404,
"step": 1772
},
{
"epoch": 2.166056166056166,
"grad_norm": 4.569766044616699,
"learning_rate": 2.6296734555238517e-06,
"loss": 0.8418964147567749,
"step": 1774
},
{
"epoch": 2.1684981684981683,
"grad_norm": 2.1551764011383057,
"learning_rate": 2.625137413144564e-06,
"loss": 1.0541213750839233,
"step": 1776
},
{
"epoch": 2.1709401709401708,
"grad_norm": 6.51698112487793,
"learning_rate": 2.6206018797593672e-06,
"loss": 0.6803760528564453,
"step": 1778
},
{
"epoch": 2.173382173382173,
"grad_norm": 2.092607021331787,
"learning_rate": 2.6160668738570638e-06,
"loss": 0.9858105182647705,
"step": 1780
},
{
"epoch": 2.1758241758241756,
"grad_norm": 8.542030334472656,
"learning_rate": 2.6115324139243065e-06,
"loss": 0.7755582332611084,
"step": 1782
},
{
"epoch": 2.178266178266178,
"grad_norm": 2.45867919921875,
"learning_rate": 2.606998518445521e-06,
"loss": 0.9509971141815186,
"step": 1784
},
{
"epoch": 2.1807081807081805,
"grad_norm": 5.667660236358643,
"learning_rate": 2.6024652059028337e-06,
"loss": 0.8328191041946411,
"step": 1786
},
{
"epoch": 2.183150183150183,
"grad_norm": 15.772777557373047,
"learning_rate": 2.5979324947759936e-06,
"loss": 0.9569545388221741,
"step": 1788
},
{
"epoch": 2.185592185592186,
"grad_norm": 4.427873134613037,
"learning_rate": 2.5934004035422983e-06,
"loss": 0.897070050239563,
"step": 1790
},
{
"epoch": 2.1880341880341883,
"grad_norm": 4.582241535186768,
"learning_rate": 2.5888689506765186e-06,
"loss": 0.9291706681251526,
"step": 1792
},
{
"epoch": 2.1904761904761907,
"grad_norm": 2.202183485031128,
"learning_rate": 2.5843381546508217e-06,
"loss": 0.545952320098877,
"step": 1794
},
{
"epoch": 2.192918192918193,
"grad_norm": 1.6277117729187012,
"learning_rate": 2.579808033934701e-06,
"loss": 0.6887462735176086,
"step": 1796
},
{
"epoch": 2.1953601953601956,
"grad_norm": 4.229698657989502,
"learning_rate": 2.5752786069948925e-06,
"loss": 0.8135143518447876,
"step": 1798
},
{
"epoch": 2.197802197802198,
"grad_norm": 2.0216007232666016,
"learning_rate": 2.5707498922953065e-06,
"loss": 0.9676254391670227,
"step": 1800
},
{
"epoch": 2.2002442002442004,
"grad_norm": 3.828848361968994,
"learning_rate": 2.5662219082969502e-06,
"loss": 0.9208850264549255,
"step": 1802
},
{
"epoch": 2.202686202686203,
"grad_norm": 2.4354822635650635,
"learning_rate": 2.561694673457849e-06,
"loss": 0.6844379305839539,
"step": 1804
},
{
"epoch": 2.2051282051282053,
"grad_norm": 1.510022521018982,
"learning_rate": 2.55716820623298e-06,
"loss": 0.938340961933136,
"step": 1806
},
{
"epoch": 2.2075702075702077,
"grad_norm": 1.947124719619751,
"learning_rate": 2.5526425250741864e-06,
"loss": 0.9929482936859131,
"step": 1808
},
{
"epoch": 2.21001221001221,
"grad_norm": 2.3316943645477295,
"learning_rate": 2.548117648430109e-06,
"loss": 0.7233268618583679,
"step": 1810
},
{
"epoch": 2.2124542124542126,
"grad_norm": 2.0162341594696045,
"learning_rate": 2.543593594746108e-06,
"loss": 0.6767272353172302,
"step": 1812
},
{
"epoch": 2.214896214896215,
"grad_norm": 0.9409213662147522,
"learning_rate": 2.539070382464191e-06,
"loss": 0.435127854347229,
"step": 1814
},
{
"epoch": 2.2173382173382175,
"grad_norm": 1.5501841306686401,
"learning_rate": 2.5345480300229313e-06,
"loss": 0.9680942893028259,
"step": 1816
},
{
"epoch": 2.21978021978022,
"grad_norm": 2.132582426071167,
"learning_rate": 2.5300265558574034e-06,
"loss": 0.890035092830658,
"step": 1818
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.4618556499481201,
"learning_rate": 2.525505978399097e-06,
"loss": 0.8848022818565369,
"step": 1820
},
{
"epoch": 2.2246642246642248,
"grad_norm": 1.61579167842865,
"learning_rate": 2.5209863160758467e-06,
"loss": 0.5495251417160034,
"step": 1822
},
{
"epoch": 2.227106227106227,
"grad_norm": 2.571030378341675,
"learning_rate": 2.5164675873117588e-06,
"loss": 0.79774409532547,
"step": 1824
},
{
"epoch": 2.2295482295482296,
"grad_norm": 3.0071425437927246,
"learning_rate": 2.511949810527131e-06,
"loss": 0.7262362241744995,
"step": 1826
},
{
"epoch": 2.231990231990232,
"grad_norm": 5.134605884552002,
"learning_rate": 2.507433004138385e-06,
"loss": 0.6448302865028381,
"step": 1828
},
{
"epoch": 2.2344322344322345,
"grad_norm": 7.841651916503906,
"learning_rate": 2.5029171865579813e-06,
"loss": 0.8010722398757935,
"step": 1830
},
{
"epoch": 2.236874236874237,
"grad_norm": 2.221493721008301,
"learning_rate": 2.4984023761943532e-06,
"loss": 0.9125744104385376,
"step": 1832
},
{
"epoch": 2.2393162393162394,
"grad_norm": 2.1063764095306396,
"learning_rate": 2.493888591451826e-06,
"loss": 0.9146173000335693,
"step": 1834
},
{
"epoch": 2.241758241758242,
"grad_norm": 2.964050054550171,
"learning_rate": 2.4893758507305465e-06,
"loss": 1.0444574356079102,
"step": 1836
},
{
"epoch": 2.244200244200244,
"grad_norm": 3.5007026195526123,
"learning_rate": 2.4848641724264046e-06,
"loss": 1.0267515182495117,
"step": 1838
},
{
"epoch": 2.2466422466422467,
"grad_norm": 3.2701382637023926,
"learning_rate": 2.4803535749309578e-06,
"loss": 0.44911229610443115,
"step": 1840
},
{
"epoch": 2.249084249084249,
"grad_norm": 3.629748821258545,
"learning_rate": 2.4758440766313583e-06,
"loss": 1.014188528060913,
"step": 1842
},
{
"epoch": 2.2515262515262515,
"grad_norm": 2.0354208946228027,
"learning_rate": 2.4713356959102804e-06,
"loss": 1.1367512941360474,
"step": 1844
},
{
"epoch": 2.253968253968254,
"grad_norm": 7.688552379608154,
"learning_rate": 2.4668284511458385e-06,
"loss": 0.641595721244812,
"step": 1846
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.6762326955795288,
"learning_rate": 2.4623223607115195e-06,
"loss": 0.6372429132461548,
"step": 1848
},
{
"epoch": 2.258852258852259,
"grad_norm": 8.27773666381836,
"learning_rate": 2.457817442976103e-06,
"loss": 0.8019550442695618,
"step": 1850
},
{
"epoch": 2.2612942612942613,
"grad_norm": 4.783394813537598,
"learning_rate": 2.453313716303589e-06,
"loss": 0.896358072757721,
"step": 1852
},
{
"epoch": 2.2637362637362637,
"grad_norm": 1.9131306409835815,
"learning_rate": 2.4488111990531223e-06,
"loss": 0.9868752360343933,
"step": 1854
},
{
"epoch": 2.266178266178266,
"grad_norm": 1.9729207754135132,
"learning_rate": 2.4443099095789147e-06,
"loss": 1.0031776428222656,
"step": 1856
},
{
"epoch": 2.2686202686202686,
"grad_norm": 1.891891360282898,
"learning_rate": 2.4398098662301794e-06,
"loss": 0.8341459631919861,
"step": 1858
},
{
"epoch": 2.271062271062271,
"grad_norm": 4.215878486633301,
"learning_rate": 2.435311087351044e-06,
"loss": 1.061068058013916,
"step": 1860
},
{
"epoch": 2.2735042735042734,
"grad_norm": 3.9982106685638428,
"learning_rate": 2.430813591280483e-06,
"loss": 0.6541799902915955,
"step": 1862
},
{
"epoch": 2.275946275946276,
"grad_norm": 1.4903755187988281,
"learning_rate": 2.426317396352243e-06,
"loss": 0.6281458139419556,
"step": 1864
},
{
"epoch": 2.2783882783882783,
"grad_norm": 2.3184974193573,
"learning_rate": 2.421822520894766e-06,
"loss": 0.9676836729049683,
"step": 1866
},
{
"epoch": 2.2808302808302807,
"grad_norm": 6.018508434295654,
"learning_rate": 2.4173289832311137e-06,
"loss": 0.9747646450996399,
"step": 1868
},
{
"epoch": 2.283272283272283,
"grad_norm": 3.8310694694519043,
"learning_rate": 2.4128368016788973e-06,
"loss": 0.9547437429428101,
"step": 1870
},
{
"epoch": 2.2857142857142856,
"grad_norm": 2.1508607864379883,
"learning_rate": 2.408345994550198e-06,
"loss": 0.9991099834442139,
"step": 1872
},
{
"epoch": 2.288156288156288,
"grad_norm": 11.147876739501953,
"learning_rate": 2.403856580151494e-06,
"loss": 0.2010164111852646,
"step": 1874
},
{
"epoch": 2.2905982905982905,
"grad_norm": 1.8129445314407349,
"learning_rate": 2.3993685767835866e-06,
"loss": 0.9613729119300842,
"step": 1876
},
{
"epoch": 2.293040293040293,
"grad_norm": 6.837078094482422,
"learning_rate": 2.3948820027415247e-06,
"loss": 0.6076623201370239,
"step": 1878
},
{
"epoch": 2.2954822954822953,
"grad_norm": 10.136993408203125,
"learning_rate": 2.390396876314533e-06,
"loss": 0.819178581237793,
"step": 1880
},
{
"epoch": 2.2979242979242978,
"grad_norm": 4.178940773010254,
"learning_rate": 2.3859132157859323e-06,
"loss": 0.963537335395813,
"step": 1882
},
{
"epoch": 2.3003663003663,
"grad_norm": 3.2244484424591064,
"learning_rate": 2.3814310394330683e-06,
"loss": 0.6918718218803406,
"step": 1884
},
{
"epoch": 2.3028083028083026,
"grad_norm": 2.9510533809661865,
"learning_rate": 2.3769503655272375e-06,
"loss": 1.1837718486785889,
"step": 1886
},
{
"epoch": 2.305250305250305,
"grad_norm": 2.189448833465576,
"learning_rate": 2.3724712123336098e-06,
"loss": 0.953423798084259,
"step": 1888
},
{
"epoch": 2.3076923076923075,
"grad_norm": 1.599765419960022,
"learning_rate": 2.3679935981111594e-06,
"loss": 0.9839805960655212,
"step": 1890
},
{
"epoch": 2.31013431013431,
"grad_norm": 3.1475296020507812,
"learning_rate": 2.363517541112585e-06,
"loss": 0.9580415487289429,
"step": 1892
},
{
"epoch": 2.3125763125763124,
"grad_norm": 4.1370744705200195,
"learning_rate": 2.359043059584236e-06,
"loss": 1.0927242040634155,
"step": 1894
},
{
"epoch": 2.315018315018315,
"grad_norm": 2.406658411026001,
"learning_rate": 2.354570171766042e-06,
"loss": 0.9985021948814392,
"step": 1896
},
{
"epoch": 2.317460317460317,
"grad_norm": 2.142878770828247,
"learning_rate": 2.350098895891434e-06,
"loss": 1.1066349744796753,
"step": 1898
},
{
"epoch": 2.3199023199023197,
"grad_norm": 1.68159818649292,
"learning_rate": 2.345629250187274e-06,
"loss": 0.9075395464897156,
"step": 1900
},
{
"epoch": 2.3223443223443225,
"grad_norm": 2.389622688293457,
"learning_rate": 2.3411612528737765e-06,
"loss": 1.001306414604187,
"step": 1902
},
{
"epoch": 2.324786324786325,
"grad_norm": 17.086624145507812,
"learning_rate": 2.3366949221644387e-06,
"loss": 0.5735141038894653,
"step": 1904
},
{
"epoch": 2.3272283272283274,
"grad_norm": 2.313629388809204,
"learning_rate": 2.3322302762659616e-06,
"loss": 0.45153266191482544,
"step": 1906
},
{
"epoch": 2.32967032967033,
"grad_norm": 2.6861674785614014,
"learning_rate": 2.3277673333781803e-06,
"loss": 0.6503361463546753,
"step": 1908
},
{
"epoch": 2.3321123321123323,
"grad_norm": 2.286579132080078,
"learning_rate": 2.323306111693986e-06,
"loss": 0.4222344160079956,
"step": 1910
},
{
"epoch": 2.3345543345543347,
"grad_norm": 6.489071846008301,
"learning_rate": 2.3188466293992555e-06,
"loss": 0.816202700138092,
"step": 1912
},
{
"epoch": 2.336996336996337,
"grad_norm": 1.8680096864700317,
"learning_rate": 2.3143889046727735e-06,
"loss": 0.865801990032196,
"step": 1914
},
{
"epoch": 2.3394383394383396,
"grad_norm": 3.2620620727539062,
"learning_rate": 2.3099329556861605e-06,
"loss": 0.9299424290657043,
"step": 1916
},
{
"epoch": 2.341880341880342,
"grad_norm": 2.1896071434020996,
"learning_rate": 2.305478800603798e-06,
"loss": 0.7136389017105103,
"step": 1918
},
{
"epoch": 2.3443223443223444,
"grad_norm": 3.63539457321167,
"learning_rate": 2.301026457582754e-06,
"loss": 0.32393085956573486,
"step": 1920
},
{
"epoch": 2.346764346764347,
"grad_norm": 1.8691846132278442,
"learning_rate": 2.2965759447727136e-06,
"loss": 0.7247822284698486,
"step": 1922
},
{
"epoch": 2.3492063492063493,
"grad_norm": 3.239156723022461,
"learning_rate": 2.2921272803158966e-06,
"loss": 0.5720818638801575,
"step": 1924
},
{
"epoch": 2.3516483516483517,
"grad_norm": 2.6364552974700928,
"learning_rate": 2.2876804823469907e-06,
"loss": 0.977821946144104,
"step": 1926
},
{
"epoch": 2.354090354090354,
"grad_norm": 2.6305339336395264,
"learning_rate": 2.2832355689930736e-06,
"loss": 0.4369853138923645,
"step": 1928
},
{
"epoch": 2.3565323565323566,
"grad_norm": 1.0626336336135864,
"learning_rate": 2.2787925583735403e-06,
"loss": 0.513285219669342,
"step": 1930
},
{
"epoch": 2.358974358974359,
"grad_norm": 2.138998031616211,
"learning_rate": 2.274351468600033e-06,
"loss": 0.7080082297325134,
"step": 1932
},
{
"epoch": 2.3614163614163615,
"grad_norm": 1.5141674280166626,
"learning_rate": 2.2699123177763584e-06,
"loss": 0.9225776195526123,
"step": 1934
},
{
"epoch": 2.363858363858364,
"grad_norm": 1.8718771934509277,
"learning_rate": 2.265475123998423e-06,
"loss": 0.5893734693527222,
"step": 1936
},
{
"epoch": 2.3663003663003663,
"grad_norm": 1.9005098342895508,
"learning_rate": 2.2610399053541536e-06,
"loss": 0.9091716408729553,
"step": 1938
},
{
"epoch": 2.3687423687423688,
"grad_norm": 2.963907241821289,
"learning_rate": 2.2566066799234255e-06,
"loss": 0.7350085377693176,
"step": 1940
},
{
"epoch": 2.371184371184371,
"grad_norm": 0.5085861682891846,
"learning_rate": 2.252175465777991e-06,
"loss": 0.7246252298355103,
"step": 1942
},
{
"epoch": 2.3736263736263736,
"grad_norm": 3.5920464992523193,
"learning_rate": 2.2477462809814023e-06,
"loss": 0.8181778788566589,
"step": 1944
},
{
"epoch": 2.376068376068376,
"grad_norm": 4.16288948059082,
"learning_rate": 2.2433191435889368e-06,
"loss": 0.2666274309158325,
"step": 1946
},
{
"epoch": 2.3785103785103785,
"grad_norm": 1.2004927396774292,
"learning_rate": 2.2388940716475292e-06,
"loss": 0.6288062334060669,
"step": 1948
},
{
"epoch": 2.380952380952381,
"grad_norm": 5.698200225830078,
"learning_rate": 2.234471083195692e-06,
"loss": 0.5255064964294434,
"step": 1950
},
{
"epoch": 2.3833943833943834,
"grad_norm": 2.2315514087677,
"learning_rate": 2.2300501962634474e-06,
"loss": 0.5431297421455383,
"step": 1952
},
{
"epoch": 2.385836385836386,
"grad_norm": 6.294608116149902,
"learning_rate": 2.2256314288722474e-06,
"loss": 0.7784007787704468,
"step": 1954
},
{
"epoch": 2.3882783882783882,
"grad_norm": 2.0798985958099365,
"learning_rate": 2.2212147990349062e-06,
"loss": 1.0333225727081299,
"step": 1956
},
{
"epoch": 2.3907203907203907,
"grad_norm": 1.6888413429260254,
"learning_rate": 2.2168003247555238e-06,
"loss": 0.7074629068374634,
"step": 1958
},
{
"epoch": 2.393162393162393,
"grad_norm": 1.4450297355651855,
"learning_rate": 2.2123880240294127e-06,
"loss": 1.10811448097229,
"step": 1960
},
{
"epoch": 2.3956043956043955,
"grad_norm": 2.086488723754883,
"learning_rate": 2.2079779148430265e-06,
"loss": 0.6509331464767456,
"step": 1962
},
{
"epoch": 2.398046398046398,
"grad_norm": 1.7849246263504028,
"learning_rate": 2.203570015173882e-06,
"loss": 0.966160774230957,
"step": 1964
},
{
"epoch": 2.4004884004884004,
"grad_norm": 3.1780035495758057,
"learning_rate": 2.199164342990494e-06,
"loss": 0.5994513034820557,
"step": 1966
},
{
"epoch": 2.402930402930403,
"grad_norm": 2.7655558586120605,
"learning_rate": 2.1947609162522924e-06,
"loss": 0.6144997477531433,
"step": 1968
},
{
"epoch": 2.4053724053724053,
"grad_norm": 9.948949813842773,
"learning_rate": 2.190359752909556e-06,
"loss": 0.4493882656097412,
"step": 1970
},
{
"epoch": 2.4078144078144077,
"grad_norm": 3.8871443271636963,
"learning_rate": 2.1859608709033357e-06,
"loss": 0.22239239513874054,
"step": 1972
},
{
"epoch": 2.41025641025641,
"grad_norm": 1.7906450033187866,
"learning_rate": 2.1815642881653858e-06,
"loss": 0.23173484206199646,
"step": 1974
},
{
"epoch": 2.4126984126984126,
"grad_norm": 7.670548915863037,
"learning_rate": 2.177170022618084e-06,
"loss": 0.38976311683654785,
"step": 1976
},
{
"epoch": 2.415140415140415,
"grad_norm": 2.966620922088623,
"learning_rate": 2.1727780921743633e-06,
"loss": 0.9863390922546387,
"step": 1978
},
{
"epoch": 2.4175824175824174,
"grad_norm": 1.4998663663864136,
"learning_rate": 2.1683885147376394e-06,
"loss": 0.47463205456733704,
"step": 1980
},
{
"epoch": 2.42002442002442,
"grad_norm": 2.3728787899017334,
"learning_rate": 2.1640013082017332e-06,
"loss": 1.125450849533081,
"step": 1982
},
{
"epoch": 2.4224664224664223,
"grad_norm": 1.6976672410964966,
"learning_rate": 2.1596164904508044e-06,
"loss": 0.5219910740852356,
"step": 1984
},
{
"epoch": 2.4249084249084247,
"grad_norm": 2.192134380340576,
"learning_rate": 2.1552340793592718e-06,
"loss": 1.040833830833435,
"step": 1986
},
{
"epoch": 2.427350427350427,
"grad_norm": 2.2941408157348633,
"learning_rate": 2.1508540927917458e-06,
"loss": 0.9751767516136169,
"step": 1988
},
{
"epoch": 2.42979242979243,
"grad_norm": 2.0631191730499268,
"learning_rate": 2.1464765486029517e-06,
"loss": 1.166698932647705,
"step": 1990
},
{
"epoch": 2.4322344322344325,
"grad_norm": 9.183150291442871,
"learning_rate": 2.1421014646376583e-06,
"loss": 1.0005483627319336,
"step": 1992
},
{
"epoch": 2.434676434676435,
"grad_norm": 2.409327268600464,
"learning_rate": 2.137728858730609e-06,
"loss": 0.9616595506668091,
"step": 1994
},
{
"epoch": 2.4371184371184373,
"grad_norm": 1.7600177526474,
"learning_rate": 2.133358748706442e-06,
"loss": 0.7983999848365784,
"step": 1996
},
{
"epoch": 2.4395604395604398,
"grad_norm": 1.8082541227340698,
"learning_rate": 2.128991152379622e-06,
"loss": 0.9734374284744263,
"step": 1998
},
{
"epoch": 2.442002442002442,
"grad_norm": 1.7054754495620728,
"learning_rate": 2.1246260875543672e-06,
"loss": 0.6818905472755432,
"step": 2000
},
{
"epoch": 2.4444444444444446,
"grad_norm": 3.318437099456787,
"learning_rate": 2.1202635720245744e-06,
"loss": 1.0553401708602905,
"step": 2002
},
{
"epoch": 2.446886446886447,
"grad_norm": 1.892685055732727,
"learning_rate": 2.115903623573754e-06,
"loss": 0.637603759765625,
"step": 2004
},
{
"epoch": 2.4493284493284495,
"grad_norm": 3.667452573776245,
"learning_rate": 2.1115462599749453e-06,
"loss": 0.6911687254905701,
"step": 2006
},
{
"epoch": 2.451770451770452,
"grad_norm": 1.999451994895935,
"learning_rate": 2.107191498990654e-06,
"loss": 1.1354289054870605,
"step": 2008
},
{
"epoch": 2.4542124542124544,
"grad_norm": 2.8429207801818848,
"learning_rate": 2.1028393583727752e-06,
"loss": 0.6011534929275513,
"step": 2010
},
{
"epoch": 2.456654456654457,
"grad_norm": 3.9235146045684814,
"learning_rate": 2.0984898558625227e-06,
"loss": 0.6388018131256104,
"step": 2012
},
{
"epoch": 2.4590964590964592,
"grad_norm": 2.5842745304107666,
"learning_rate": 2.0941430091903576e-06,
"loss": 1.0912564992904663,
"step": 2014
},
{
"epoch": 2.4615384615384617,
"grad_norm": 2.1695728302001953,
"learning_rate": 2.0897988360759127e-06,
"loss": 0.90839684009552,
"step": 2016
},
{
"epoch": 2.463980463980464,
"grad_norm": 1.3284540176391602,
"learning_rate": 2.0854573542279216e-06,
"loss": 1.0240721702575684,
"step": 2018
},
{
"epoch": 2.4664224664224665,
"grad_norm": 5.811964511871338,
"learning_rate": 2.081118581344151e-06,
"loss": 0.7707440257072449,
"step": 2020
},
{
"epoch": 2.468864468864469,
"grad_norm": 1.8133083581924438,
"learning_rate": 2.0767825351113192e-06,
"loss": 0.6514004468917847,
"step": 2022
},
{
"epoch": 2.4713064713064714,
"grad_norm": 4.037316799163818,
"learning_rate": 2.072449233205035e-06,
"loss": 0.7341061234474182,
"step": 2024
},
{
"epoch": 2.473748473748474,
"grad_norm": 1.9994157552719116,
"learning_rate": 2.068118693289715e-06,
"loss": 0.9125716090202332,
"step": 2026
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.7720515727996826,
"learning_rate": 2.0637909330185217e-06,
"loss": 0.6419773101806641,
"step": 2028
},
{
"epoch": 2.4786324786324787,
"grad_norm": 1.6481854915618896,
"learning_rate": 2.0594659700332833e-06,
"loss": 0.9903475046157837,
"step": 2030
},
{
"epoch": 2.481074481074481,
"grad_norm": 2.603499174118042,
"learning_rate": 2.055143821964424e-06,
"loss": 1.1345065832138062,
"step": 2032
},
{
"epoch": 2.4835164835164836,
"grad_norm": 2.5555107593536377,
"learning_rate": 2.0508245064308968e-06,
"loss": 0.5736313462257385,
"step": 2034
},
{
"epoch": 2.485958485958486,
"grad_norm": 2.2995779514312744,
"learning_rate": 2.046508041040107e-06,
"loss": 1.004111409187317,
"step": 2036
},
{
"epoch": 2.4884004884004884,
"grad_norm": 1.261184573173523,
"learning_rate": 2.04219444338784e-06,
"loss": 0.6451095342636108,
"step": 2038
},
{
"epoch": 2.490842490842491,
"grad_norm": 9.273902893066406,
"learning_rate": 2.0378837310581907e-06,
"loss": 0.769629955291748,
"step": 2040
},
{
"epoch": 2.4932844932844933,
"grad_norm": 5.710522174835205,
"learning_rate": 2.0335759216234947e-06,
"loss": 0.9529898166656494,
"step": 2042
},
{
"epoch": 2.4957264957264957,
"grad_norm": 1.7338002920150757,
"learning_rate": 2.0292710326442517e-06,
"loss": 0.7281374931335449,
"step": 2044
},
{
"epoch": 2.498168498168498,
"grad_norm": 7.91054630279541,
"learning_rate": 2.0249690816690583e-06,
"loss": 0.5946838855743408,
"step": 2046
},
{
"epoch": 2.5006105006105006,
"grad_norm": 2.516921281814575,
"learning_rate": 2.0206700862345334e-06,
"loss": 0.719270646572113,
"step": 2048
},
{
"epoch": 2.503052503052503,
"grad_norm": 9.922062873840332,
"learning_rate": 2.016374063865248e-06,
"loss": 0.8115828037261963,
"step": 2050
},
{
"epoch": 2.5054945054945055,
"grad_norm": 2.6232059001922607,
"learning_rate": 2.0120810320736537e-06,
"loss": 1.1120948791503906,
"step": 2052
},
{
"epoch": 2.507936507936508,
"grad_norm": 1.4087735414505005,
"learning_rate": 2.00779100836001e-06,
"loss": 0.7034242153167725,
"step": 2054
},
{
"epoch": 2.5103785103785103,
"grad_norm": 1.9080172777175903,
"learning_rate": 2.003504010212317e-06,
"loss": 1.0267211198806763,
"step": 2056
},
{
"epoch": 2.5128205128205128,
"grad_norm": 8.740047454833984,
"learning_rate": 1.99922005510624e-06,
"loss": 0.31465768814086914,
"step": 2058
},
{
"epoch": 2.515262515262515,
"grad_norm": 3.2588388919830322,
"learning_rate": 1.9949391605050365e-06,
"loss": 0.2918320596218109,
"step": 2060
},
{
"epoch": 2.5177045177045176,
"grad_norm": 6.833197116851807,
"learning_rate": 1.990661343859493e-06,
"loss": 0.7108557224273682,
"step": 2062
},
{
"epoch": 2.52014652014652,
"grad_norm": 6.0173821449279785,
"learning_rate": 1.986386622607845e-06,
"loss": 0.8981122374534607,
"step": 2064
},
{
"epoch": 2.5225885225885225,
"grad_norm": 1.5162910223007202,
"learning_rate": 1.9821150141757133e-06,
"loss": 0.6950556039810181,
"step": 2066
},
{
"epoch": 2.525030525030525,
"grad_norm": 3.4228689670562744,
"learning_rate": 1.977846535976026e-06,
"loss": 0.7832509875297546,
"step": 2068
},
{
"epoch": 2.5274725274725274,
"grad_norm": 2.2627415657043457,
"learning_rate": 1.9735812054089542e-06,
"loss": 1.0561403036117554,
"step": 2070
},
{
"epoch": 2.52991452991453,
"grad_norm": 14.761664390563965,
"learning_rate": 1.969319039861835e-06,
"loss": 0.6642997860908508,
"step": 2072
},
{
"epoch": 2.5323565323565322,
"grad_norm": 1.6336398124694824,
"learning_rate": 1.965060056709105e-06,
"loss": 1.0829975605010986,
"step": 2074
},
{
"epoch": 2.5347985347985347,
"grad_norm": 2.2617239952087402,
"learning_rate": 1.960804273312228e-06,
"loss": 0.8906936645507812,
"step": 2076
},
{
"epoch": 2.537240537240537,
"grad_norm": 7.154871463775635,
"learning_rate": 1.9565517070196248e-06,
"loss": 1.0117489099502563,
"step": 2078
},
{
"epoch": 2.5396825396825395,
"grad_norm": 7.616284370422363,
"learning_rate": 1.9523023751665997e-06,
"loss": 0.6691079139709473,
"step": 2080
},
{
"epoch": 2.542124542124542,
"grad_norm": 2.257676839828491,
"learning_rate": 1.9480562950752745e-06,
"loss": 0.9914268255233765,
"step": 2082
},
{
"epoch": 2.5445665445665444,
"grad_norm": 1.5886762142181396,
"learning_rate": 1.9438134840545147e-06,
"loss": 1.0071735382080078,
"step": 2084
},
{
"epoch": 2.547008547008547,
"grad_norm": 1.6075587272644043,
"learning_rate": 1.939573959399858e-06,
"loss": 0.9144080281257629,
"step": 2086
},
{
"epoch": 2.5494505494505493,
"grad_norm": 1.8543524742126465,
"learning_rate": 1.9353377383934475e-06,
"loss": 0.912468433380127,
"step": 2088
},
{
"epoch": 2.5518925518925517,
"grad_norm": 2.2457010746002197,
"learning_rate": 1.931104838303958e-06,
"loss": 0.7604387998580933,
"step": 2090
},
{
"epoch": 2.554334554334554,
"grad_norm": 1.9843976497650146,
"learning_rate": 1.9268752763865285e-06,
"loss": 0.691798210144043,
"step": 2092
},
{
"epoch": 2.5567765567765566,
"grad_norm": 1.6185815334320068,
"learning_rate": 1.9226490698826876e-06,
"loss": 0.7290869951248169,
"step": 2094
},
{
"epoch": 2.559218559218559,
"grad_norm": 1.5648012161254883,
"learning_rate": 1.918426236020286e-06,
"loss": 0.8694143295288086,
"step": 2096
},
{
"epoch": 2.5616605616605614,
"grad_norm": 0.736847460269928,
"learning_rate": 1.91420679201343e-06,
"loss": 0.023200487717986107,
"step": 2098
},
{
"epoch": 2.564102564102564,
"grad_norm": 2.0540359020233154,
"learning_rate": 1.9099907550624034e-06,
"loss": 0.6316545009613037,
"step": 2100
},
{
"epoch": 2.5665445665445663,
"grad_norm": 4.105884075164795,
"learning_rate": 1.9057781423536015e-06,
"loss": 0.9644788503646851,
"step": 2102
},
{
"epoch": 2.5689865689865687,
"grad_norm": 6.087828159332275,
"learning_rate": 1.9015689710594627e-06,
"loss": 0.6429115533828735,
"step": 2104
},
{
"epoch": 2.571428571428571,
"grad_norm": 7.242172718048096,
"learning_rate": 1.897363258338395e-06,
"loss": 0.8835878968238831,
"step": 2106
},
{
"epoch": 2.5738705738705736,
"grad_norm": 3.802177906036377,
"learning_rate": 1.8931610213347096e-06,
"loss": 0.6208938360214233,
"step": 2108
},
{
"epoch": 2.576312576312576,
"grad_norm": 4.764630317687988,
"learning_rate": 1.888962277178548e-06,
"loss": 0.5702378749847412,
"step": 2110
},
{
"epoch": 2.578754578754579,
"grad_norm": 1.3646184206008911,
"learning_rate": 1.884767042985814e-06,
"loss": 1.0015933513641357,
"step": 2112
},
{
"epoch": 2.5811965811965814,
"grad_norm": 7.507686614990234,
"learning_rate": 1.880575335858102e-06,
"loss": 0.297787070274353,
"step": 2114
},
{
"epoch": 2.583638583638584,
"grad_norm": 1.5217941999435425,
"learning_rate": 1.8763871728826282e-06,
"loss": 0.7149800658226013,
"step": 2116
},
{
"epoch": 2.586080586080586,
"grad_norm": 1.9740521907806396,
"learning_rate": 1.8722025711321657e-06,
"loss": 0.998376190662384,
"step": 2118
},
{
"epoch": 2.5885225885225887,
"grad_norm": 1.2570465803146362,
"learning_rate": 1.8680215476649643e-06,
"loss": 0.665241539478302,
"step": 2120
},
{
"epoch": 2.590964590964591,
"grad_norm": 4.761254787445068,
"learning_rate": 1.8638441195246915e-06,
"loss": 0.9342296719551086,
"step": 2122
},
{
"epoch": 2.5934065934065935,
"grad_norm": 2.0453293323516846,
"learning_rate": 1.8596703037403573e-06,
"loss": 0.8592435121536255,
"step": 2124
},
{
"epoch": 2.595848595848596,
"grad_norm": 1.4386849403381348,
"learning_rate": 1.8555001173262449e-06,
"loss": 0.4735715985298157,
"step": 2126
},
{
"epoch": 2.5982905982905984,
"grad_norm": 1.8832699060440063,
"learning_rate": 1.8513335772818452e-06,
"loss": 0.9801812171936035,
"step": 2128
},
{
"epoch": 2.600732600732601,
"grad_norm": 2.282724380493164,
"learning_rate": 1.8471707005917833e-06,
"loss": 0.6964608430862427,
"step": 2130
},
{
"epoch": 2.6031746031746033,
"grad_norm": 1.7532885074615479,
"learning_rate": 1.8430115042257518e-06,
"loss": 0.5790331959724426,
"step": 2132
},
{
"epoch": 2.6056166056166057,
"grad_norm": 2.6640255451202393,
"learning_rate": 1.838856005138438e-06,
"loss": 0.9573779106140137,
"step": 2134
},
{
"epoch": 2.608058608058608,
"grad_norm": 3.7161571979522705,
"learning_rate": 1.8347042202694616e-06,
"loss": 0.6422839760780334,
"step": 2136
},
{
"epoch": 2.6105006105006106,
"grad_norm": 3.1850647926330566,
"learning_rate": 1.8305561665432987e-06,
"loss": 0.7944685816764832,
"step": 2138
},
{
"epoch": 2.612942612942613,
"grad_norm": 1.816838264465332,
"learning_rate": 1.8264118608692166e-06,
"loss": 0.6552348136901855,
"step": 2140
},
{
"epoch": 2.6153846153846154,
"grad_norm": 2.1410024166107178,
"learning_rate": 1.8222713201412034e-06,
"loss": 0.9763152599334717,
"step": 2142
},
{
"epoch": 2.617826617826618,
"grad_norm": 1.5893546342849731,
"learning_rate": 1.818134561237901e-06,
"loss": 0.9420812726020813,
"step": 2144
},
{
"epoch": 2.6202686202686203,
"grad_norm": 8.81820297241211,
"learning_rate": 1.814001601022533e-06,
"loss": 0.9948893785476685,
"step": 2146
},
{
"epoch": 2.6227106227106227,
"grad_norm": 1.6226983070373535,
"learning_rate": 1.8098724563428383e-06,
"loss": 0.6544241309165955,
"step": 2148
},
{
"epoch": 2.625152625152625,
"grad_norm": 1.8730573654174805,
"learning_rate": 1.8057471440310048e-06,
"loss": 1.034470796585083,
"step": 2150
},
{
"epoch": 2.6275946275946276,
"grad_norm": 1.7522205114364624,
"learning_rate": 1.8016256809035932e-06,
"loss": 1.0132882595062256,
"step": 2152
},
{
"epoch": 2.63003663003663,
"grad_norm": 2.8230953216552734,
"learning_rate": 1.7975080837614777e-06,
"loss": 0.989703357219696,
"step": 2154
},
{
"epoch": 2.6324786324786325,
"grad_norm": 2.739607334136963,
"learning_rate": 1.79339436938977e-06,
"loss": 0.9275919198989868,
"step": 2156
},
{
"epoch": 2.634920634920635,
"grad_norm": 2.475738048553467,
"learning_rate": 1.7892845545577547e-06,
"loss": 0.7446354627609253,
"step": 2158
},
{
"epoch": 2.6373626373626373,
"grad_norm": 3.7959513664245605,
"learning_rate": 1.7851786560188223e-06,
"loss": 0.5423752069473267,
"step": 2160
},
{
"epoch": 2.6398046398046398,
"grad_norm": 2.074728488922119,
"learning_rate": 1.7810766905103972e-06,
"loss": 0.7950323820114136,
"step": 2162
},
{
"epoch": 2.642246642246642,
"grad_norm": 1.902423620223999,
"learning_rate": 1.776978674753868e-06,
"loss": 0.48773831129074097,
"step": 2164
},
{
"epoch": 2.6446886446886446,
"grad_norm": 1.4304314851760864,
"learning_rate": 1.7728846254545285e-06,
"loss": 0.9862061738967896,
"step": 2166
},
{
"epoch": 2.647130647130647,
"grad_norm": 1.1472234725952148,
"learning_rate": 1.7687945593014988e-06,
"loss": 0.735059916973114,
"step": 2168
},
{
"epoch": 2.6495726495726495,
"grad_norm": 1.9486029148101807,
"learning_rate": 1.764708492967665e-06,
"loss": 1.0259606838226318,
"step": 2170
},
{
"epoch": 2.652014652014652,
"grad_norm": 1.9149264097213745,
"learning_rate": 1.7606264431096048e-06,
"loss": 1.0802158117294312,
"step": 2172
},
{
"epoch": 2.6544566544566544,
"grad_norm": 6.957102298736572,
"learning_rate": 1.7565484263675258e-06,
"loss": 0.9875915050506592,
"step": 2174
},
{
"epoch": 2.656898656898657,
"grad_norm": 2.418081283569336,
"learning_rate": 1.7524744593651948e-06,
"loss": 0.7961604595184326,
"step": 2176
},
{
"epoch": 2.659340659340659,
"grad_norm": 2.5019962787628174,
"learning_rate": 1.7484045587098681e-06,
"loss": 1.029079556465149,
"step": 2178
},
{
"epoch": 2.6617826617826617,
"grad_norm": 39.45793151855469,
"learning_rate": 1.7443387409922266e-06,
"loss": 1.0245277881622314,
"step": 2180
},
{
"epoch": 2.664224664224664,
"grad_norm": 1.2770639657974243,
"learning_rate": 1.740277022786309e-06,
"loss": 1.0204907655715942,
"step": 2182
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.364436149597168,
"learning_rate": 1.7362194206494421e-06,
"loss": 0.6930133700370789,
"step": 2184
},
{
"epoch": 2.669108669108669,
"grad_norm": 2.3246958255767822,
"learning_rate": 1.732165951122171e-06,
"loss": 1.0231374502182007,
"step": 2186
},
{
"epoch": 2.6715506715506714,
"grad_norm": 1.607748031616211,
"learning_rate": 1.7281166307281972e-06,
"loss": 1.094809651374817,
"step": 2188
},
{
"epoch": 2.6739926739926743,
"grad_norm": 2.158128261566162,
"learning_rate": 1.7240714759743084e-06,
"loss": 1.021047830581665,
"step": 2190
},
{
"epoch": 2.6764346764346767,
"grad_norm": 8.213944435119629,
"learning_rate": 1.7200305033503123e-06,
"loss": 0.9594013094902039,
"step": 2192
},
{
"epoch": 2.678876678876679,
"grad_norm": 4.63560676574707,
"learning_rate": 1.7159937293289639e-06,
"loss": 0.3299452066421509,
"step": 2194
},
{
"epoch": 2.6813186813186816,
"grad_norm": 2.0286736488342285,
"learning_rate": 1.711961170365909e-06,
"loss": 1.214423418045044,
"step": 2196
},
{
"epoch": 2.683760683760684,
"grad_norm": 1.665736198425293,
"learning_rate": 1.707932842899605e-06,
"loss": 0.9360992908477783,
"step": 2198
},
{
"epoch": 2.6862026862026864,
"grad_norm": 1.7861151695251465,
"learning_rate": 1.7039087633512652e-06,
"loss": 0.9141231179237366,
"step": 2200
},
{
"epoch": 2.688644688644689,
"grad_norm": 4.079377174377441,
"learning_rate": 1.6998889481247827e-06,
"loss": 0.6146577596664429,
"step": 2202
},
{
"epoch": 2.6910866910866913,
"grad_norm": 3.120830535888672,
"learning_rate": 1.6958734136066708e-06,
"loss": 0.7842304110527039,
"step": 2204
},
{
"epoch": 2.6935286935286937,
"grad_norm": 9.338458061218262,
"learning_rate": 1.6918621761659885e-06,
"loss": 0.4128279983997345,
"step": 2206
},
{
"epoch": 2.695970695970696,
"grad_norm": 5.846481800079346,
"learning_rate": 1.6878552521542825e-06,
"loss": 0.909477710723877,
"step": 2208
},
{
"epoch": 2.6984126984126986,
"grad_norm": 2.9776010513305664,
"learning_rate": 1.6838526579055108e-06,
"loss": 0.6446021795272827,
"step": 2210
},
{
"epoch": 2.700854700854701,
"grad_norm": 2.117492914199829,
"learning_rate": 1.679854409735989e-06,
"loss": 0.9117352962493896,
"step": 2212
},
{
"epoch": 2.7032967032967035,
"grad_norm": 2.0278072357177734,
"learning_rate": 1.6758605239443083e-06,
"loss": 0.6256328225135803,
"step": 2214
},
{
"epoch": 2.705738705738706,
"grad_norm": 5.5183563232421875,
"learning_rate": 1.6718710168112824e-06,
"loss": 0.5338436365127563,
"step": 2216
},
{
"epoch": 2.7081807081807083,
"grad_norm": 3.22253155708313,
"learning_rate": 1.6678859045998724e-06,
"loss": 0.6465069651603699,
"step": 2218
},
{
"epoch": 2.7106227106227108,
"grad_norm": 8.712440490722656,
"learning_rate": 1.663905203555125e-06,
"loss": 0.3656350374221802,
"step": 2220
},
{
"epoch": 2.713064713064713,
"grad_norm": 2.39136004447937,
"learning_rate": 1.6599289299041067e-06,
"loss": 0.5852014422416687,
"step": 2222
},
{
"epoch": 2.7155067155067156,
"grad_norm": 3.29854416847229,
"learning_rate": 1.6559570998558339e-06,
"loss": 0.7199364900588989,
"step": 2224
},
{
"epoch": 2.717948717948718,
"grad_norm": 1.553189754486084,
"learning_rate": 1.6519897296012089e-06,
"loss": 0.7559410333633423,
"step": 2226
},
{
"epoch": 2.7203907203907205,
"grad_norm": 5.676231384277344,
"learning_rate": 1.648026835312954e-06,
"loss": 0.7857324481010437,
"step": 2228
},
{
"epoch": 2.722832722832723,
"grad_norm": 2.2479665279388428,
"learning_rate": 1.644068433145548e-06,
"loss": 0.9991781711578369,
"step": 2230
},
{
"epoch": 2.7252747252747254,
"grad_norm": 19.45795249938965,
"learning_rate": 1.640114539235156e-06,
"loss": 0.6020703911781311,
"step": 2232
},
{
"epoch": 2.727716727716728,
"grad_norm": 1.4817429780960083,
"learning_rate": 1.6361651696995633e-06,
"loss": 1.0305383205413818,
"step": 2234
},
{
"epoch": 2.7301587301587302,
"grad_norm": 3.4105312824249268,
"learning_rate": 1.6322203406381158e-06,
"loss": 1.0053908824920654,
"step": 2236
},
{
"epoch": 2.7326007326007327,
"grad_norm": 1.9684903621673584,
"learning_rate": 1.6282800681316485e-06,
"loss": 0.9223586320877075,
"step": 2238
},
{
"epoch": 2.735042735042735,
"grad_norm": 3.927523374557495,
"learning_rate": 1.6243443682424211e-06,
"loss": 0.6888905167579651,
"step": 2240
},
{
"epoch": 2.7374847374847375,
"grad_norm": 9.635194778442383,
"learning_rate": 1.6204132570140551e-06,
"loss": 0.9834311008453369,
"step": 2242
},
{
"epoch": 2.73992673992674,
"grad_norm": 2.742316722869873,
"learning_rate": 1.616486750471466e-06,
"loss": 0.5603131055831909,
"step": 2244
},
{
"epoch": 2.7423687423687424,
"grad_norm": 2.2433788776397705,
"learning_rate": 1.6125648646207992e-06,
"loss": 0.7219388484954834,
"step": 2246
},
{
"epoch": 2.744810744810745,
"grad_norm": 3.132955312728882,
"learning_rate": 1.608647615449362e-06,
"loss": 0.8298469185829163,
"step": 2248
},
{
"epoch": 2.7472527472527473,
"grad_norm": 2.522810697555542,
"learning_rate": 1.604735018925563e-06,
"loss": 0.9102773070335388,
"step": 2250
},
{
"epoch": 2.7496947496947497,
"grad_norm": 2.429370164871216,
"learning_rate": 1.6008270909988414e-06,
"loss": 0.9825899600982666,
"step": 2252
},
{
"epoch": 2.752136752136752,
"grad_norm": 1.3979560136795044,
"learning_rate": 1.596923847599611e-06,
"loss": 0.694176197052002,
"step": 2254
},
{
"epoch": 2.7545787545787546,
"grad_norm": 3.7129030227661133,
"learning_rate": 1.593025304639183e-06,
"loss": 0.7678108811378479,
"step": 2256
},
{
"epoch": 2.757020757020757,
"grad_norm": 58.61724090576172,
"learning_rate": 1.5891314780097123e-06,
"loss": 0.9679561853408813,
"step": 2258
},
{
"epoch": 2.7594627594627594,
"grad_norm": 3.2823469638824463,
"learning_rate": 1.585242383584124e-06,
"loss": 1.0787243843078613,
"step": 2260
},
{
"epoch": 2.761904761904762,
"grad_norm": 4.105648517608643,
"learning_rate": 1.5813580372160558e-06,
"loss": 1.0055099725723267,
"step": 2262
},
{
"epoch": 2.7643467643467643,
"grad_norm": 1.8101457357406616,
"learning_rate": 1.5774784547397898e-06,
"loss": 0.9336439967155457,
"step": 2264
},
{
"epoch": 2.7667887667887667,
"grad_norm": 3.130258798599243,
"learning_rate": 1.5736036519701876e-06,
"loss": 0.912263035774231,
"step": 2266
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.9224950075149536,
"learning_rate": 1.5697336447026257e-06,
"loss": 0.7292864918708801,
"step": 2268
},
{
"epoch": 2.7716727716727716,
"grad_norm": 1.59903085231781,
"learning_rate": 1.565868448712935e-06,
"loss": 0.593657374382019,
"step": 2270
},
{
"epoch": 2.774114774114774,
"grad_norm": 1.2354272603988647,
"learning_rate": 1.562008079757329e-06,
"loss": 0.2578456699848175,
"step": 2272
},
{
"epoch": 2.7765567765567765,
"grad_norm": 1.8744750022888184,
"learning_rate": 1.5581525535723502e-06,
"loss": 1.0456628799438477,
"step": 2274
},
{
"epoch": 2.778998778998779,
"grad_norm": 1.6361182928085327,
"learning_rate": 1.5543018858747943e-06,
"loss": 0.9015727043151855,
"step": 2276
},
{
"epoch": 2.7814407814407813,
"grad_norm": 2.5508196353912354,
"learning_rate": 1.550456092361655e-06,
"loss": 0.5647008419036865,
"step": 2278
},
{
"epoch": 2.7838827838827838,
"grad_norm": 5.802591323852539,
"learning_rate": 1.546615188710055e-06,
"loss": 0.341159850358963,
"step": 2280
},
{
"epoch": 2.786324786324786,
"grad_norm": 2.65018630027771,
"learning_rate": 1.5427791905771843e-06,
"loss": 1.0216097831726074,
"step": 2282
},
{
"epoch": 2.7887667887667886,
"grad_norm": 4.260854721069336,
"learning_rate": 1.538948113600237e-06,
"loss": 0.8784246444702148,
"step": 2284
},
{
"epoch": 2.791208791208791,
"grad_norm": 3.1193864345550537,
"learning_rate": 1.5351219733963453e-06,
"loss": 0.9552139043807983,
"step": 2286
},
{
"epoch": 2.7936507936507935,
"grad_norm": 1.618998646736145,
"learning_rate": 1.5313007855625153e-06,
"loss": 0.9732692241668701,
"step": 2288
},
{
"epoch": 2.796092796092796,
"grad_norm": 6.491037368774414,
"learning_rate": 1.5274845656755687e-06,
"loss": 0.3624776303768158,
"step": 2290
},
{
"epoch": 2.7985347985347984,
"grad_norm": 2.8097808361053467,
"learning_rate": 1.5236733292920735e-06,
"loss": 0.8098872303962708,
"step": 2292
},
{
"epoch": 2.800976800976801,
"grad_norm": 2.353226900100708,
"learning_rate": 1.5198670919482839e-06,
"loss": 0.7608856558799744,
"step": 2294
},
{
"epoch": 2.8034188034188032,
"grad_norm": 1.7207751274108887,
"learning_rate": 1.5160658691600737e-06,
"loss": 0.8960850834846497,
"step": 2296
},
{
"epoch": 2.8058608058608057,
"grad_norm": 2.5294456481933594,
"learning_rate": 1.5122696764228772e-06,
"loss": 0.40981659293174744,
"step": 2298
},
{
"epoch": 2.808302808302808,
"grad_norm": 2.1142632961273193,
"learning_rate": 1.5084785292116244e-06,
"loss": 0.6546359658241272,
"step": 2300
},
{
"epoch": 2.8107448107448105,
"grad_norm": 2.852811574935913,
"learning_rate": 1.5046924429806747e-06,
"loss": 1.049178123474121,
"step": 2302
},
{
"epoch": 2.813186813186813,
"grad_norm": 0.45758649706840515,
"learning_rate": 1.50091143316376e-06,
"loss": 0.5754284262657166,
"step": 2304
},
{
"epoch": 2.8156288156288154,
"grad_norm": 1.5217318534851074,
"learning_rate": 1.497135515173917e-06,
"loss": 0.7435483336448669,
"step": 2306
},
{
"epoch": 2.818070818070818,
"grad_norm": 2.425044298171997,
"learning_rate": 1.4933647044034264e-06,
"loss": 0.6329599618911743,
"step": 2308
},
{
"epoch": 2.8205128205128203,
"grad_norm": 2.1778621673583984,
"learning_rate": 1.489599016223748e-06,
"loss": 1.040429949760437,
"step": 2310
},
{
"epoch": 2.8229548229548227,
"grad_norm": 1.7928675413131714,
"learning_rate": 1.485838465985463e-06,
"loss": 0.6599953770637512,
"step": 2312
},
{
"epoch": 2.825396825396825,
"grad_norm": 6.531580924987793,
"learning_rate": 1.482083069018203e-06,
"loss": 0.7039975523948669,
"step": 2314
},
{
"epoch": 2.8278388278388276,
"grad_norm": 2.4885547161102295,
"learning_rate": 1.4783328406306002e-06,
"loss": 0.7224160432815552,
"step": 2316
},
{
"epoch": 2.8302808302808304,
"grad_norm": 2.634704351425171,
"learning_rate": 1.4745877961102096e-06,
"loss": 1.0425044298171997,
"step": 2318
},
{
"epoch": 2.832722832722833,
"grad_norm": 1.6846414804458618,
"learning_rate": 1.4708479507234596e-06,
"loss": 0.6160850524902344,
"step": 2320
},
{
"epoch": 2.8351648351648353,
"grad_norm": 6.159206390380859,
"learning_rate": 1.4671133197155817e-06,
"loss": 0.6913861036300659,
"step": 2322
},
{
"epoch": 2.8376068376068377,
"grad_norm": 0.6180989146232605,
"learning_rate": 1.4633839183105531e-06,
"loss": 0.19272488355636597,
"step": 2324
},
{
"epoch": 2.84004884004884,
"grad_norm": 1.4327154159545898,
"learning_rate": 1.4596597617110327e-06,
"loss": 0.8577545285224915,
"step": 2326
},
{
"epoch": 2.8424908424908426,
"grad_norm": 2.3003664016723633,
"learning_rate": 1.4559408650982999e-06,
"loss": 0.8021556735038757,
"step": 2328
},
{
"epoch": 2.844932844932845,
"grad_norm": 1.9625604152679443,
"learning_rate": 1.4522272436321893e-06,
"loss": 0.8357652425765991,
"step": 2330
},
{
"epoch": 2.8473748473748475,
"grad_norm": 1.8401294946670532,
"learning_rate": 1.4485189124510355e-06,
"loss": 1.0011165142059326,
"step": 2332
},
{
"epoch": 2.84981684981685,
"grad_norm": 2.622108221054077,
"learning_rate": 1.4448158866716028e-06,
"loss": 0.15081661939620972,
"step": 2334
},
{
"epoch": 2.8522588522588523,
"grad_norm": 2.198842763900757,
"learning_rate": 1.441118181389035e-06,
"loss": 1.0237456560134888,
"step": 2336
},
{
"epoch": 2.8547008547008548,
"grad_norm": 1.9740854501724243,
"learning_rate": 1.437425811676781e-06,
"loss": 0.6290860176086426,
"step": 2338
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.7903653383255005,
"learning_rate": 1.4337387925865435e-06,
"loss": 1.0167012214660645,
"step": 2340
},
{
"epoch": 2.8595848595848596,
"grad_norm": 1.402300477027893,
"learning_rate": 1.430057139148211e-06,
"loss": 1.0619688034057617,
"step": 2342
},
{
"epoch": 2.862026862026862,
"grad_norm": 5.845098972320557,
"learning_rate": 1.4263808663698015e-06,
"loss": 0.3327184319496155,
"step": 2344
},
{
"epoch": 2.8644688644688645,
"grad_norm": 3.894296169281006,
"learning_rate": 1.4227099892373986e-06,
"loss": 0.9415085911750793,
"step": 2346
},
{
"epoch": 2.866910866910867,
"grad_norm": 0.5116417407989502,
"learning_rate": 1.4190445227150907e-06,
"loss": 0.5154658555984497,
"step": 2348
},
{
"epoch": 2.8693528693528694,
"grad_norm": 1.3756489753723145,
"learning_rate": 1.4153844817449087e-06,
"loss": 0.6424716114997864,
"step": 2350
},
{
"epoch": 2.871794871794872,
"grad_norm": 1.572013020515442,
"learning_rate": 1.4117298812467687e-06,
"loss": 0.7699521780014038,
"step": 2352
},
{
"epoch": 2.8742368742368742,
"grad_norm": 51.821266174316406,
"learning_rate": 1.4080807361184088e-06,
"loss": 0.5482099652290344,
"step": 2354
},
{
"epoch": 2.8766788766788767,
"grad_norm": 1.6499661207199097,
"learning_rate": 1.4044370612353281e-06,
"loss": 0.906887412071228,
"step": 2356
},
{
"epoch": 2.879120879120879,
"grad_norm": 4.545080661773682,
"learning_rate": 1.400798871450726e-06,
"loss": 0.784338653087616,
"step": 2358
},
{
"epoch": 2.8815628815628815,
"grad_norm": 2.03543758392334,
"learning_rate": 1.397166181595443e-06,
"loss": 0.5577901005744934,
"step": 2360
},
{
"epoch": 2.884004884004884,
"grad_norm": 5.208932399749756,
"learning_rate": 1.3935390064779008e-06,
"loss": 0.7476451992988586,
"step": 2362
},
{
"epoch": 2.8864468864468864,
"grad_norm": 2.228670835494995,
"learning_rate": 1.3899173608840378e-06,
"loss": 1.051893949508667,
"step": 2364
},
{
"epoch": 2.888888888888889,
"grad_norm": 1.407289743423462,
"learning_rate": 1.3863012595772531e-06,
"loss": 1.076530933380127,
"step": 2366
},
{
"epoch": 2.8913308913308913,
"grad_norm": 22.543790817260742,
"learning_rate": 1.3826907172983456e-06,
"loss": 0.846904993057251,
"step": 2368
},
{
"epoch": 2.8937728937728937,
"grad_norm": 1.6555728912353516,
"learning_rate": 1.3790857487654535e-06,
"loss": 1.1604909896850586,
"step": 2370
},
{
"epoch": 2.896214896214896,
"grad_norm": 2.013773202896118,
"learning_rate": 1.3754863686739906e-06,
"loss": 0.915320634841919,
"step": 2372
},
{
"epoch": 2.8986568986568986,
"grad_norm": 2.000242233276367,
"learning_rate": 1.3718925916965945e-06,
"loss": 0.7186045050621033,
"step": 2374
},
{
"epoch": 2.901098901098901,
"grad_norm": 1.6389049291610718,
"learning_rate": 1.3683044324830573e-06,
"loss": 0.9410088658332825,
"step": 2376
},
{
"epoch": 2.9035409035409034,
"grad_norm": 2.346830129623413,
"learning_rate": 1.3647219056602757e-06,
"loss": 1.0101977586746216,
"step": 2378
},
{
"epoch": 2.905982905982906,
"grad_norm": 1.5173600912094116,
"learning_rate": 1.361145025832182e-06,
"loss": 0.8229511976242065,
"step": 2380
},
{
"epoch": 2.9084249084249083,
"grad_norm": 2.14473557472229,
"learning_rate": 1.3575738075796923e-06,
"loss": 0.9482402801513672,
"step": 2382
},
{
"epoch": 2.9108669108669107,
"grad_norm": 1.9663830995559692,
"learning_rate": 1.35400826546064e-06,
"loss": 0.9494956135749817,
"step": 2384
},
{
"epoch": 2.913308913308913,
"grad_norm": 5.7204909324646,
"learning_rate": 1.350448414009723e-06,
"loss": 1.0107911825180054,
"step": 2386
},
{
"epoch": 2.9157509157509156,
"grad_norm": 2.0800321102142334,
"learning_rate": 1.3468942677384408e-06,
"loss": 0.8393886089324951,
"step": 2388
},
{
"epoch": 2.918192918192918,
"grad_norm": 6.8317461013793945,
"learning_rate": 1.343345841135037e-06,
"loss": 0.46943965554237366,
"step": 2390
},
{
"epoch": 2.9206349206349205,
"grad_norm": 3.705573081970215,
"learning_rate": 1.3398031486644366e-06,
"loss": 0.5753905177116394,
"step": 2392
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.4765284061431885,
"learning_rate": 1.3362662047681928e-06,
"loss": 0.8073123097419739,
"step": 2394
},
{
"epoch": 2.925518925518926,
"grad_norm": 8.290820121765137,
"learning_rate": 1.3327350238644224e-06,
"loss": 0.6432682871818542,
"step": 2396
},
{
"epoch": 2.927960927960928,
"grad_norm": 33.93027877807617,
"learning_rate": 1.3292096203477533e-06,
"loss": 0.6455587148666382,
"step": 2398
},
{
"epoch": 2.9304029304029307,
"grad_norm": 15.2852201461792,
"learning_rate": 1.3256900085892584e-06,
"loss": 0.5954673290252686,
"step": 2400
},
{
"epoch": 2.932844932844933,
"grad_norm": 2.53251051902771,
"learning_rate": 1.3221762029364043e-06,
"loss": 0.656650960445404,
"step": 2402
},
{
"epoch": 2.9352869352869355,
"grad_norm": 3.878647565841675,
"learning_rate": 1.3186682177129862e-06,
"loss": 0.3129318654537201,
"step": 2404
},
{
"epoch": 2.937728937728938,
"grad_norm": 3.8784711360931396,
"learning_rate": 1.3151660672190744e-06,
"loss": 1.0069366693496704,
"step": 2406
},
{
"epoch": 2.9401709401709404,
"grad_norm": 1.668820858001709,
"learning_rate": 1.3116697657309547e-06,
"loss": 0.7313091158866882,
"step": 2408
},
{
"epoch": 2.942612942612943,
"grad_norm": 2.024500846862793,
"learning_rate": 1.3081793275010699e-06,
"loss": 0.6760754585266113,
"step": 2410
},
{
"epoch": 2.9450549450549453,
"grad_norm": 1.469117522239685,
"learning_rate": 1.3046947667579596e-06,
"loss": 0.9695707559585571,
"step": 2412
},
{
"epoch": 2.9474969474969477,
"grad_norm": 2.2661683559417725,
"learning_rate": 1.301216097706206e-06,
"loss": 1.0303492546081543,
"step": 2414
},
{
"epoch": 2.94993894993895,
"grad_norm": 1.312300682067871,
"learning_rate": 1.2977433345263752e-06,
"loss": 0.9242293238639832,
"step": 2416
},
{
"epoch": 2.9523809523809526,
"grad_norm": 2.9520680904388428,
"learning_rate": 1.2942764913749544e-06,
"loss": 0.6899678707122803,
"step": 2418
},
{
"epoch": 2.954822954822955,
"grad_norm": 1.741718053817749,
"learning_rate": 1.2908155823843033e-06,
"loss": 0.9872897267341614,
"step": 2420
},
{
"epoch": 2.9572649572649574,
"grad_norm": 1.809373378753662,
"learning_rate": 1.2873606216625879e-06,
"loss": 0.8448399305343628,
"step": 2422
},
{
"epoch": 2.95970695970696,
"grad_norm": 2.35263991355896,
"learning_rate": 1.2839116232937271e-06,
"loss": 0.5212328433990479,
"step": 2424
},
{
"epoch": 2.9621489621489623,
"grad_norm": 2.598365068435669,
"learning_rate": 1.280468601337335e-06,
"loss": 1.1081678867340088,
"step": 2426
},
{
"epoch": 2.9645909645909647,
"grad_norm": 11.081291198730469,
"learning_rate": 1.2770315698286643e-06,
"loss": 0.5913952589035034,
"step": 2428
},
{
"epoch": 2.967032967032967,
"grad_norm": 2.6343994140625,
"learning_rate": 1.273600542778546e-06,
"loss": 0.9255035519599915,
"step": 2430
},
{
"epoch": 2.9694749694749696,
"grad_norm": 2.8472864627838135,
"learning_rate": 1.2701755341733363e-06,
"loss": 0.8645012378692627,
"step": 2432
},
{
"epoch": 2.971916971916972,
"grad_norm": 1.3869469165802002,
"learning_rate": 1.2667565579748552e-06,
"loss": 0.9598724246025085,
"step": 2434
},
{
"epoch": 2.9743589743589745,
"grad_norm": 1.7193245887756348,
"learning_rate": 1.2633436281203353e-06,
"loss": 0.5284073948860168,
"step": 2436
},
{
"epoch": 2.976800976800977,
"grad_norm": 4.513336181640625,
"learning_rate": 1.2599367585223573e-06,
"loss": 0.5111241340637207,
"step": 2438
},
{
"epoch": 2.9792429792429793,
"grad_norm": 3.2821226119995117,
"learning_rate": 1.2565359630688029e-06,
"loss": 0.9840971231460571,
"step": 2440
},
{
"epoch": 2.9816849816849818,
"grad_norm": 1.6997895240783691,
"learning_rate": 1.2531412556227883e-06,
"loss": 1.0207282304763794,
"step": 2442
},
{
"epoch": 2.984126984126984,
"grad_norm": 2.6019296646118164,
"learning_rate": 1.2497526500226163e-06,
"loss": 0.940024197101593,
"step": 2444
},
{
"epoch": 2.9865689865689866,
"grad_norm": 2.273214101791382,
"learning_rate": 1.246370160081711e-06,
"loss": 0.9067605137825012,
"step": 2446
},
{
"epoch": 2.989010989010989,
"grad_norm": 1.8762654066085815,
"learning_rate": 1.2429937995885713e-06,
"loss": 0.93479323387146,
"step": 2448
},
{
"epoch": 2.9914529914529915,
"grad_norm": 5.84881067276001,
"learning_rate": 1.2396235823067076e-06,
"loss": 0.6413801312446594,
"step": 2450
},
{
"epoch": 2.993894993894994,
"grad_norm": 1.7108796834945679,
"learning_rate": 1.2362595219745882e-06,
"loss": 1.0565381050109863,
"step": 2452
},
{
"epoch": 2.9963369963369964,
"grad_norm": 1.5569299459457397,
"learning_rate": 1.2329016323055822e-06,
"loss": 0.9824570417404175,
"step": 2454
},
{
"epoch": 2.998778998778999,
"grad_norm": 2.560699939727783,
"learning_rate": 1.2295499269879063e-06,
"loss": 0.5337162613868713,
"step": 2456
},
{
"epoch": 3.001221001221001,
"grad_norm": 3.085305690765381,
"learning_rate": 1.2262044196845638e-06,
"loss": 0.6332882046699524,
"step": 2458
},
{
"epoch": 3.0036630036630036,
"grad_norm": 3.3725106716156006,
"learning_rate": 1.2228651240332972e-06,
"loss": 0.62852543592453,
"step": 2460
},
{
"epoch": 3.006105006105006,
"grad_norm": 1.6869263648986816,
"learning_rate": 1.2195320536465225e-06,
"loss": 1.0432286262512207,
"step": 2462
},
{
"epoch": 3.0085470085470085,
"grad_norm": 1.3357821702957153,
"learning_rate": 1.2162052221112828e-06,
"loss": 0.962488055229187,
"step": 2464
},
{
"epoch": 3.010989010989011,
"grad_norm": 4.632596492767334,
"learning_rate": 1.2128846429891852e-06,
"loss": 0.5416973233222961,
"step": 2466
},
{
"epoch": 3.0134310134310134,
"grad_norm": 1.600440502166748,
"learning_rate": 1.2095703298163526e-06,
"loss": 0.8857253789901733,
"step": 2468
},
{
"epoch": 3.015873015873016,
"grad_norm": 0.576468288898468,
"learning_rate": 1.2062622961033632e-06,
"loss": 0.2631528675556183,
"step": 2470
},
{
"epoch": 3.0183150183150182,
"grad_norm": 5.257506370544434,
"learning_rate": 1.2029605553351988e-06,
"loss": 0.3512267470359802,
"step": 2472
},
{
"epoch": 3.0207570207570207,
"grad_norm": 2.005457639694214,
"learning_rate": 1.199665120971188e-06,
"loss": 0.9261833429336548,
"step": 2474
},
{
"epoch": 3.023199023199023,
"grad_norm": 5.405751705169678,
"learning_rate": 1.1963760064449495e-06,
"loss": 0.5271846652030945,
"step": 2476
},
{
"epoch": 3.0256410256410255,
"grad_norm": 1.659690499305725,
"learning_rate": 1.1930932251643438e-06,
"loss": 0.6160858869552612,
"step": 2478
},
{
"epoch": 3.028083028083028,
"grad_norm": 1.8383840322494507,
"learning_rate": 1.189816790511409e-06,
"loss": 0.8536359667778015,
"step": 2480
},
{
"epoch": 3.0305250305250304,
"grad_norm": 2.1919424533843994,
"learning_rate": 1.1865467158423179e-06,
"loss": 0.9045109152793884,
"step": 2482
},
{
"epoch": 3.032967032967033,
"grad_norm": 1.5028966665267944,
"learning_rate": 1.1832830144873122e-06,
"loss": 0.6014432907104492,
"step": 2484
},
{
"epoch": 3.0354090354090353,
"grad_norm": 2.3299906253814697,
"learning_rate": 1.1800256997506557e-06,
"loss": 0.8661763072013855,
"step": 2486
},
{
"epoch": 3.0378510378510377,
"grad_norm": 9.991959571838379,
"learning_rate": 1.176774784910576e-06,
"loss": 0.6161713600158691,
"step": 2488
},
{
"epoch": 3.04029304029304,
"grad_norm": 2.847564697265625,
"learning_rate": 1.1735302832192135e-06,
"loss": 0.8722133636474609,
"step": 2490
},
{
"epoch": 3.0427350427350426,
"grad_norm": 2.7239389419555664,
"learning_rate": 1.1702922079025647e-06,
"loss": 0.3192221522331238,
"step": 2492
},
{
"epoch": 3.045177045177045,
"grad_norm": 0.7756720185279846,
"learning_rate": 1.1670605721604307e-06,
"loss": 0.2883589565753937,
"step": 2494
},
{
"epoch": 3.0476190476190474,
"grad_norm": 4.5272135734558105,
"learning_rate": 1.1638353891663602e-06,
"loss": 0.6891329288482666,
"step": 2496
},
{
"epoch": 3.05006105006105,
"grad_norm": 4.521149635314941,
"learning_rate": 1.1606166720675999e-06,
"loss": 0.45780226588249207,
"step": 2498
},
{
"epoch": 3.0525030525030523,
"grad_norm": 0.8906940221786499,
"learning_rate": 1.157404433985035e-06,
"loss": 0.5027573704719543,
"step": 2500
},
{
"epoch": 3.0549450549450547,
"grad_norm": 9.020967483520508,
"learning_rate": 1.1541986880131455e-06,
"loss": 0.4361349642276764,
"step": 2502
},
{
"epoch": 3.057387057387057,
"grad_norm": 2.3300914764404297,
"learning_rate": 1.1509994472199407e-06,
"loss": 0.8963256478309631,
"step": 2504
},
{
"epoch": 3.0598290598290596,
"grad_norm": 2.031867027282715,
"learning_rate": 1.1478067246469158e-06,
"loss": 0.4999798536300659,
"step": 2506
},
{
"epoch": 3.062271062271062,
"grad_norm": 1.6989192962646484,
"learning_rate": 1.1446205333089922e-06,
"loss": 0.7561573386192322,
"step": 2508
},
{
"epoch": 3.064713064713065,
"grad_norm": 4.123907089233398,
"learning_rate": 1.1414408861944695e-06,
"loss": 0.8584511876106262,
"step": 2510
},
{
"epoch": 3.0671550671550674,
"grad_norm": 0.06403572857379913,
"learning_rate": 1.1382677962649687e-06,
"loss": 0.3911321461200714,
"step": 2512
},
{
"epoch": 3.06959706959707,
"grad_norm": 4.289177894592285,
"learning_rate": 1.1351012764553828e-06,
"loss": 0.8152522444725037,
"step": 2514
},
{
"epoch": 3.0720390720390722,
"grad_norm": 2.2127068042755127,
"learning_rate": 1.1319413396738188e-06,
"loss": 0.5816116333007812,
"step": 2516
},
{
"epoch": 3.0744810744810747,
"grad_norm": 0.09444202482700348,
"learning_rate": 1.128787998801552e-06,
"loss": 0.20017878711223602,
"step": 2518
},
{
"epoch": 3.076923076923077,
"grad_norm": 2.8026583194732666,
"learning_rate": 1.1256412666929655e-06,
"loss": 0.514468789100647,
"step": 2520
},
{
"epoch": 3.0793650793650795,
"grad_norm": 2.7216711044311523,
"learning_rate": 1.1225011561755093e-06,
"loss": 0.6835171580314636,
"step": 2522
},
{
"epoch": 3.081807081807082,
"grad_norm": 2.2049448490142822,
"learning_rate": 1.1193676800496326e-06,
"loss": 0.8667712211608887,
"step": 2524
},
{
"epoch": 3.0842490842490844,
"grad_norm": 11.64513111114502,
"learning_rate": 1.1162408510887469e-06,
"loss": 0.5643727779388428,
"step": 2526
},
{
"epoch": 3.086691086691087,
"grad_norm": 2.371492862701416,
"learning_rate": 1.1131206820391618e-06,
"loss": 0.5264307856559753,
"step": 2528
},
{
"epoch": 3.0891330891330893,
"grad_norm": 8.419890403747559,
"learning_rate": 1.1100071856200413e-06,
"loss": 0.11923594772815704,
"step": 2530
},
{
"epoch": 3.0915750915750917,
"grad_norm": 4.913514137268066,
"learning_rate": 1.106900374523348e-06,
"loss": 0.32799002528190613,
"step": 2532
},
{
"epoch": 3.094017094017094,
"grad_norm": 2.237429141998291,
"learning_rate": 1.1038002614137922e-06,
"loss": 0.8726149797439575,
"step": 2534
},
{
"epoch": 3.0964590964590966,
"grad_norm": 5.674133777618408,
"learning_rate": 1.1007068589287814e-06,
"loss": 0.635856568813324,
"step": 2536
},
{
"epoch": 3.098901098901099,
"grad_norm": 24.948986053466797,
"learning_rate": 1.0976201796783642e-06,
"loss": 0.6740862131118774,
"step": 2538
},
{
"epoch": 3.1013431013431014,
"grad_norm": 2.047201633453369,
"learning_rate": 1.0945402362451871e-06,
"loss": 0.9215976595878601,
"step": 2540
},
{
"epoch": 3.103785103785104,
"grad_norm": 6.655368328094482,
"learning_rate": 1.0914670411844338e-06,
"loss": 0.559134304523468,
"step": 2542
},
{
"epoch": 3.1062271062271063,
"grad_norm": 1.7727597951889038,
"learning_rate": 1.0884006070237834e-06,
"loss": 0.5720962285995483,
"step": 2544
},
{
"epoch": 3.1086691086691087,
"grad_norm": 3.6166863441467285,
"learning_rate": 1.0853409462633507e-06,
"loss": 0.16919654607772827,
"step": 2546
},
{
"epoch": 3.111111111111111,
"grad_norm": 2.884989023208618,
"learning_rate": 1.0822880713756422e-06,
"loss": 0.639471173286438,
"step": 2548
},
{
"epoch": 3.1135531135531136,
"grad_norm": 1.7115203142166138,
"learning_rate": 1.0792419948054994e-06,
"loss": 0.6552147269248962,
"step": 2550
},
{
"epoch": 3.115995115995116,
"grad_norm": 3.130906343460083,
"learning_rate": 1.0762027289700527e-06,
"loss": 0.2590104043483734,
"step": 2552
},
{
"epoch": 3.1184371184371185,
"grad_norm": 2.566354751586914,
"learning_rate": 1.0731702862586686e-06,
"loss": 0.8442977666854858,
"step": 2554
},
{
"epoch": 3.120879120879121,
"grad_norm": 2.08247709274292,
"learning_rate": 1.070144679032901e-06,
"loss": 0.37470126152038574,
"step": 2556
},
{
"epoch": 3.1233211233211233,
"grad_norm": 0.45577648282051086,
"learning_rate": 1.0671259196264355e-06,
"loss": 0.4773566722869873,
"step": 2558
},
{
"epoch": 3.1257631257631258,
"grad_norm": 2.147977590560913,
"learning_rate": 1.064114020345048e-06,
"loss": 0.847014844417572,
"step": 2560
},
{
"epoch": 3.128205128205128,
"grad_norm": 6.595324516296387,
"learning_rate": 1.0611089934665438e-06,
"loss": 1.0399620532989502,
"step": 2562
},
{
"epoch": 3.1306471306471306,
"grad_norm": 3.0821518898010254,
"learning_rate": 1.0581108512407206e-06,
"loss": 0.8594496250152588,
"step": 2564
},
{
"epoch": 3.133089133089133,
"grad_norm": 6.90889310836792,
"learning_rate": 1.055119605889304e-06,
"loss": 0.6531029939651489,
"step": 2566
},
{
"epoch": 3.1355311355311355,
"grad_norm": 5.536701679229736,
"learning_rate": 1.0521352696059106e-06,
"loss": 0.8756755590438843,
"step": 2568
},
{
"epoch": 3.137973137973138,
"grad_norm": 7.192801475524902,
"learning_rate": 1.0491578545559882e-06,
"loss": 0.5930169820785522,
"step": 2570
},
{
"epoch": 3.1404151404151404,
"grad_norm": 5.717547416687012,
"learning_rate": 1.0461873728767735e-06,
"loss": 0.6029551029205322,
"step": 2572
},
{
"epoch": 3.142857142857143,
"grad_norm": 4.090261936187744,
"learning_rate": 1.043223836677239e-06,
"loss": 0.8777113556861877,
"step": 2574
},
{
"epoch": 3.1452991452991452,
"grad_norm": 14.0100736618042,
"learning_rate": 1.040267258038045e-06,
"loss": 0.9692713022232056,
"step": 2576
},
{
"epoch": 3.1477411477411477,
"grad_norm": 3.899435520172119,
"learning_rate": 1.0373176490114874e-06,
"loss": 0.8949326276779175,
"step": 2578
},
{
"epoch": 3.15018315018315,
"grad_norm": 5.538814067840576,
"learning_rate": 1.0343750216214546e-06,
"loss": 0.8762179017066956,
"step": 2580
},
{
"epoch": 3.1526251526251525,
"grad_norm": 7.619011402130127,
"learning_rate": 1.0314393878633705e-06,
"loss": 0.7504989504814148,
"step": 2582
},
{
"epoch": 3.155067155067155,
"grad_norm": 3.597076416015625,
"learning_rate": 1.0285107597041552e-06,
"loss": 0.31154295802116394,
"step": 2584
},
{
"epoch": 3.1575091575091574,
"grad_norm": 3.5088987350463867,
"learning_rate": 1.0255891490821657e-06,
"loss": 0.6339558362960815,
"step": 2586
},
{
"epoch": 3.15995115995116,
"grad_norm": 5.022501468658447,
"learning_rate": 1.0226745679071555e-06,
"loss": 0.328271746635437,
"step": 2588
},
{
"epoch": 3.1623931623931623,
"grad_norm": 4.233664035797119,
"learning_rate": 1.0197670280602234e-06,
"loss": 0.35303497314453125,
"step": 2590
},
{
"epoch": 3.1648351648351647,
"grad_norm": 2.7248518466949463,
"learning_rate": 1.016866541393762e-06,
"loss": 0.8729944825172424,
"step": 2592
},
{
"epoch": 3.167277167277167,
"grad_norm": 2.3809876441955566,
"learning_rate": 1.0139731197314144e-06,
"loss": 0.7970367074012756,
"step": 2594
},
{
"epoch": 3.1697191697191696,
"grad_norm": 5.26347017288208,
"learning_rate": 1.0110867748680229e-06,
"loss": 0.6249693632125854,
"step": 2596
},
{
"epoch": 3.172161172161172,
"grad_norm": 2.0786538124084473,
"learning_rate": 1.0082075185695821e-06,
"loss": 0.8957004547119141,
"step": 2598
},
{
"epoch": 3.1746031746031744,
"grad_norm": 2.350102424621582,
"learning_rate": 1.0053353625731898e-06,
"loss": 0.773188591003418,
"step": 2600
},
{
"epoch": 3.177045177045177,
"grad_norm": 2.2141921520233154,
"learning_rate": 1.0024703185870009e-06,
"loss": 0.8564462065696716,
"step": 2602
},
{
"epoch": 3.1794871794871793,
"grad_norm": 1.9684959650039673,
"learning_rate": 9.99612398290176e-07,
"loss": 0.8819740414619446,
"step": 2604
},
{
"epoch": 3.1819291819291817,
"grad_norm": 9.041963577270508,
"learning_rate": 9.967616133328415e-07,
"loss": 0.6753929257392883,
"step": 2606
},
{
"epoch": 3.1843711843711846,
"grad_norm": 3.164386510848999,
"learning_rate": 9.939179753360317e-07,
"loss": 0.9383725523948669,
"step": 2608
},
{
"epoch": 3.186813186813187,
"grad_norm": 1.6950500011444092,
"learning_rate": 9.910814958916509e-07,
"loss": 0.8148356676101685,
"step": 2610
},
{
"epoch": 3.1892551892551895,
"grad_norm": 1.7868210077285767,
"learning_rate": 9.882521865624188e-07,
"loss": 0.8345255255699158,
"step": 2612
},
{
"epoch": 3.191697191697192,
"grad_norm": 2.3038735389709473,
"learning_rate": 9.854300588818285e-07,
"loss": 0.5892983078956604,
"step": 2614
},
{
"epoch": 3.1941391941391943,
"grad_norm": 3.3323488235473633,
"learning_rate": 9.826151243540976e-07,
"loss": 0.5326892137527466,
"step": 2616
},
{
"epoch": 3.1965811965811968,
"grad_norm": 5.719020366668701,
"learning_rate": 9.798073944541209e-07,
"loss": 0.5761935114860535,
"step": 2618
},
{
"epoch": 3.199023199023199,
"grad_norm": 4.963831424713135,
"learning_rate": 9.77006880627423e-07,
"loss": 0.35491544008255005,
"step": 2620
},
{
"epoch": 3.2014652014652016,
"grad_norm": 1.7607579231262207,
"learning_rate": 9.742135942901152e-07,
"loss": 0.5363562703132629,
"step": 2622
},
{
"epoch": 3.203907203907204,
"grad_norm": 15.481447219848633,
"learning_rate": 9.714275468288426e-07,
"loss": 0.43480369448661804,
"step": 2624
},
{
"epoch": 3.2063492063492065,
"grad_norm": 2.512791633605957,
"learning_rate": 9.68648749600746e-07,
"loss": 0.9965137839317322,
"step": 2626
},
{
"epoch": 3.208791208791209,
"grad_norm": 38.066707611083984,
"learning_rate": 9.658772139334074e-07,
"loss": 0.227127343416214,
"step": 2628
},
{
"epoch": 3.2112332112332114,
"grad_norm": 2.2914047241210938,
"learning_rate": 9.631129511248099e-07,
"loss": 0.9076048135757446,
"step": 2630
},
{
"epoch": 3.213675213675214,
"grad_norm": 3.0350992679595947,
"learning_rate": 9.603559724432874e-07,
"loss": 0.5686833262443542,
"step": 2632
},
{
"epoch": 3.2161172161172162,
"grad_norm": 2.278398036956787,
"learning_rate": 9.576062891274816e-07,
"loss": 0.6908602714538574,
"step": 2634
},
{
"epoch": 3.2185592185592187,
"grad_norm": 7.601328372955322,
"learning_rate": 9.548639123862952e-07,
"loss": 0.81014084815979,
"step": 2636
},
{
"epoch": 3.221001221001221,
"grad_norm": 4.974308490753174,
"learning_rate": 9.52128853398847e-07,
"loss": 0.6480343341827393,
"step": 2638
},
{
"epoch": 3.2234432234432235,
"grad_norm": 5.386457443237305,
"learning_rate": 9.494011233144227e-07,
"loss": 0.6495685577392578,
"step": 2640
},
{
"epoch": 3.225885225885226,
"grad_norm": 3.7221124172210693,
"learning_rate": 9.466807332524343e-07,
"loss": 0.885014533996582,
"step": 2642
},
{
"epoch": 3.2283272283272284,
"grad_norm": 2.109729051589966,
"learning_rate": 9.439676943023732e-07,
"loss": 0.8729287385940552,
"step": 2644
},
{
"epoch": 3.230769230769231,
"grad_norm": 1.1997343301773071,
"learning_rate": 9.412620175237621e-07,
"loss": 0.913487434387207,
"step": 2646
},
{
"epoch": 3.2332112332112333,
"grad_norm": 5.304439544677734,
"learning_rate": 9.385637139461151e-07,
"loss": 0.9510135650634766,
"step": 2648
},
{
"epoch": 3.2356532356532357,
"grad_norm": 8.256377220153809,
"learning_rate": 9.358727945688877e-07,
"loss": 0.2964293956756592,
"step": 2650
},
{
"epoch": 3.238095238095238,
"grad_norm": 2.2458744049072266,
"learning_rate": 9.331892703614359e-07,
"loss": 0.8582343459129333,
"step": 2652
},
{
"epoch": 3.2405372405372406,
"grad_norm": 2.5474815368652344,
"learning_rate": 9.305131522629679e-07,
"loss": 1.0781978368759155,
"step": 2654
},
{
"epoch": 3.242979242979243,
"grad_norm": 1.53843092918396,
"learning_rate": 9.27844451182503e-07,
"loss": 0.4822746217250824,
"step": 2656
},
{
"epoch": 3.2454212454212454,
"grad_norm": 5.753013610839844,
"learning_rate": 9.251831779988252e-07,
"loss": 0.3543876111507416,
"step": 2658
},
{
"epoch": 3.247863247863248,
"grad_norm": 6.03514289855957,
"learning_rate": 9.22529343560439e-07,
"loss": 0.5339376330375671,
"step": 2660
},
{
"epoch": 3.2503052503052503,
"grad_norm": 3.018615245819092,
"learning_rate": 9.19882958685524e-07,
"loss": 1.2899425029754639,
"step": 2662
},
{
"epoch": 3.2527472527472527,
"grad_norm": 4.987201690673828,
"learning_rate": 9.172440341618951e-07,
"loss": 0.6661590337753296,
"step": 2664
},
{
"epoch": 3.255189255189255,
"grad_norm": 3.2230803966522217,
"learning_rate": 9.146125807469525e-07,
"loss": 0.6229037642478943,
"step": 2666
},
{
"epoch": 3.2576312576312576,
"grad_norm": 5.4705071449279785,
"learning_rate": 9.119886091676436e-07,
"loss": 0.9204983711242676,
"step": 2668
},
{
"epoch": 3.26007326007326,
"grad_norm": 1.7358949184417725,
"learning_rate": 9.093721301204143e-07,
"loss": 0.8217456340789795,
"step": 2670
},
{
"epoch": 3.2625152625152625,
"grad_norm": 2.2710583209991455,
"learning_rate": 9.067631542711692e-07,
"loss": 0.5310102701187134,
"step": 2672
},
{
"epoch": 3.264957264957265,
"grad_norm": 23.32669448852539,
"learning_rate": 9.041616922552254e-07,
"loss": 0.1262706220149994,
"step": 2674
},
{
"epoch": 3.2673992673992673,
"grad_norm": 2.316551923751831,
"learning_rate": 9.015677546772717e-07,
"loss": 0.9631689190864563,
"step": 2676
},
{
"epoch": 3.2698412698412698,
"grad_norm": 5.279893398284912,
"learning_rate": 8.989813521113232e-07,
"loss": 0.7836791276931763,
"step": 2678
},
{
"epoch": 3.272283272283272,
"grad_norm": 0.7904373407363892,
"learning_rate": 8.964024951006798e-07,
"loss": 0.49453315138816833,
"step": 2680
},
{
"epoch": 3.2747252747252746,
"grad_norm": 4.579488277435303,
"learning_rate": 8.938311941578806e-07,
"loss": 0.48266905546188354,
"step": 2682
},
{
"epoch": 3.277167277167277,
"grad_norm": 2.3565316200256348,
"learning_rate": 8.912674597646653e-07,
"loss": 0.6459278464317322,
"step": 2684
},
{
"epoch": 3.2796092796092795,
"grad_norm": 3.3402786254882812,
"learning_rate": 8.887113023719262e-07,
"loss": 1.0020655393600464,
"step": 2686
},
{
"epoch": 3.282051282051282,
"grad_norm": 0.46372556686401367,
"learning_rate": 8.861627323996724e-07,
"loss": 0.08561723679304123,
"step": 2688
},
{
"epoch": 3.2844932844932844,
"grad_norm": 2.682774782180786,
"learning_rate": 8.836217602369799e-07,
"loss": 1.0556048154830933,
"step": 2690
},
{
"epoch": 3.286935286935287,
"grad_norm": 2.9882302284240723,
"learning_rate": 8.810883962419542e-07,
"loss": 0.9429636001586914,
"step": 2692
},
{
"epoch": 3.2893772893772892,
"grad_norm": 0.21104033291339874,
"learning_rate": 8.785626507416855e-07,
"loss": 0.11109757423400879,
"step": 2694
},
{
"epoch": 3.2918192918192917,
"grad_norm": 0.5581515431404114,
"learning_rate": 8.760445340322096e-07,
"loss": 0.17564286291599274,
"step": 2696
},
{
"epoch": 3.294261294261294,
"grad_norm": 2.4714837074279785,
"learning_rate": 8.735340563784625e-07,
"loss": 0.6768051385879517,
"step": 2698
},
{
"epoch": 3.2967032967032965,
"grad_norm": 2.1521193981170654,
"learning_rate": 8.710312280142416e-07,
"loss": 0.9193722605705261,
"step": 2700
},
{
"epoch": 3.299145299145299,
"grad_norm": 13.212026596069336,
"learning_rate": 8.685360591421598e-07,
"loss": 0.9638568758964539,
"step": 2702
},
{
"epoch": 3.3015873015873014,
"grad_norm": 2.658658266067505,
"learning_rate": 8.660485599336094e-07,
"loss": 0.8721650838851929,
"step": 2704
},
{
"epoch": 3.304029304029304,
"grad_norm": 2.8447883129119873,
"learning_rate": 8.635687405287171e-07,
"loss": 0.7735913991928101,
"step": 2706
},
{
"epoch": 3.3064713064713063,
"grad_norm": 1.3805103302001953,
"learning_rate": 8.610966110363014e-07,
"loss": 0.5056965351104736,
"step": 2708
},
{
"epoch": 3.3089133089133087,
"grad_norm": 1.813744068145752,
"learning_rate": 8.586321815338361e-07,
"loss": 0.57419753074646,
"step": 2710
},
{
"epoch": 3.311355311355311,
"grad_norm": 2.599118232727051,
"learning_rate": 8.56175462067405e-07,
"loss": 0.8707802295684814,
"step": 2712
},
{
"epoch": 3.3137973137973136,
"grad_norm": 2.0446720123291016,
"learning_rate": 8.537264626516634e-07,
"loss": 0.5774456262588501,
"step": 2714
},
{
"epoch": 3.316239316239316,
"grad_norm": 3.4925484657287598,
"learning_rate": 8.512851932697947e-07,
"loss": 0.9497953653335571,
"step": 2716
},
{
"epoch": 3.3186813186813184,
"grad_norm": 3.462381601333618,
"learning_rate": 8.488516638734731e-07,
"loss": 0.8057655692100525,
"step": 2718
},
{
"epoch": 3.3211233211233213,
"grad_norm": 1.8965480327606201,
"learning_rate": 8.464258843828202e-07,
"loss": 0.8699415326118469,
"step": 2720
},
{
"epoch": 3.3235653235653237,
"grad_norm": 2.465651750564575,
"learning_rate": 8.440078646863664e-07,
"loss": 0.641089141368866,
"step": 2722
},
{
"epoch": 3.326007326007326,
"grad_norm": 0.7584552764892578,
"learning_rate": 8.415976146410084e-07,
"loss": 0.09330576658248901,
"step": 2724
},
{
"epoch": 3.3284493284493286,
"grad_norm": 2.273700714111328,
"learning_rate": 8.391951440719725e-07,
"loss": 0.5427566766738892,
"step": 2726
},
{
"epoch": 3.330891330891331,
"grad_norm": 2.9195804595947266,
"learning_rate": 8.368004627727699e-07,
"loss": 0.8910986185073853,
"step": 2728
},
{
"epoch": 3.3333333333333335,
"grad_norm": 4.608819007873535,
"learning_rate": 8.344135805051629e-07,
"loss": 0.7685779929161072,
"step": 2730
},
{
"epoch": 3.335775335775336,
"grad_norm": 0.7064034938812256,
"learning_rate": 8.320345069991175e-07,
"loss": 0.5918761491775513,
"step": 2732
},
{
"epoch": 3.3382173382173383,
"grad_norm": 3.630662202835083,
"learning_rate": 8.296632519527711e-07,
"loss": 0.6658087372779846,
"step": 2734
},
{
"epoch": 3.340659340659341,
"grad_norm": 2.0108442306518555,
"learning_rate": 8.272998250323872e-07,
"loss": 0.7752918004989624,
"step": 2736
},
{
"epoch": 3.343101343101343,
"grad_norm": 1.0330065488815308,
"learning_rate": 8.249442358723204e-07,
"loss": 0.5759359002113342,
"step": 2738
},
{
"epoch": 3.3455433455433456,
"grad_norm": 3.048520803451538,
"learning_rate": 8.225964940749737e-07,
"loss": 0.5758652687072754,
"step": 2740
},
{
"epoch": 3.347985347985348,
"grad_norm": 4.6908392906188965,
"learning_rate": 8.202566092107628e-07,
"loss": 0.8692240118980408,
"step": 2742
},
{
"epoch": 3.3504273504273505,
"grad_norm": 11.141556739807129,
"learning_rate": 8.179245908180724e-07,
"loss": 0.5387795567512512,
"step": 2744
},
{
"epoch": 3.352869352869353,
"grad_norm": 3.472233533859253,
"learning_rate": 8.156004484032226e-07,
"loss": 0.7473067045211792,
"step": 2746
},
{
"epoch": 3.3553113553113554,
"grad_norm": 7.765378475189209,
"learning_rate": 8.132841914404253e-07,
"loss": 0.4999602437019348,
"step": 2748
},
{
"epoch": 3.357753357753358,
"grad_norm": 9.492226600646973,
"learning_rate": 8.109758293717505e-07,
"loss": 0.36286643147468567,
"step": 2750
},
{
"epoch": 3.3601953601953602,
"grad_norm": 0.9999972581863403,
"learning_rate": 8.086753716070828e-07,
"loss": 0.402780145406723,
"step": 2752
},
{
"epoch": 3.3626373626373627,
"grad_norm": 2.837510108947754,
"learning_rate": 8.063828275240873e-07,
"loss": 0.4516952335834503,
"step": 2754
},
{
"epoch": 3.365079365079365,
"grad_norm": 4.009491443634033,
"learning_rate": 8.040982064681671e-07,
"loss": 0.8290095925331116,
"step": 2756
},
{
"epoch": 3.3675213675213675,
"grad_norm": 2.420555830001831,
"learning_rate": 8.018215177524302e-07,
"loss": 0.8783026337623596,
"step": 2758
},
{
"epoch": 3.36996336996337,
"grad_norm": 4.7987284660339355,
"learning_rate": 7.995527706576474e-07,
"loss": 1.161372423171997,
"step": 2760
},
{
"epoch": 3.3724053724053724,
"grad_norm": 2.0077738761901855,
"learning_rate": 7.972919744322172e-07,
"loss": 0.5153079032897949,
"step": 2762
},
{
"epoch": 3.374847374847375,
"grad_norm": 2.043386459350586,
"learning_rate": 7.950391382921253e-07,
"loss": 0.8760576248168945,
"step": 2764
},
{
"epoch": 3.3772893772893773,
"grad_norm": 2.278296709060669,
"learning_rate": 7.927942714209094e-07,
"loss": 0.47707459330558777,
"step": 2766
},
{
"epoch": 3.3797313797313797,
"grad_norm": 3.5936970710754395,
"learning_rate": 7.905573829696222e-07,
"loss": 0.3957478404045105,
"step": 2768
},
{
"epoch": 3.382173382173382,
"grad_norm": 7.540212154388428,
"learning_rate": 7.883284820567905e-07,
"loss": 0.5244758725166321,
"step": 2770
},
{
"epoch": 3.3846153846153846,
"grad_norm": 18.458484649658203,
"learning_rate": 7.861075777683822e-07,
"loss": 0.8487293720245361,
"step": 2772
},
{
"epoch": 3.387057387057387,
"grad_norm": 9.975359916687012,
"learning_rate": 7.838946791577669e-07,
"loss": 0.42381957173347473,
"step": 2774
},
{
"epoch": 3.3894993894993894,
"grad_norm": 1.6827895641326904,
"learning_rate": 7.816897952456802e-07,
"loss": 0.8452630043029785,
"step": 2776
},
{
"epoch": 3.391941391941392,
"grad_norm": 2.2170772552490234,
"learning_rate": 7.794929350201849e-07,
"loss": 0.7993656396865845,
"step": 2778
},
{
"epoch": 3.3943833943833943,
"grad_norm": 0.4966033399105072,
"learning_rate": 7.773041074366375e-07,
"loss": 0.38123244047164917,
"step": 2780
},
{
"epoch": 3.3968253968253967,
"grad_norm": 1.8956717252731323,
"learning_rate": 7.751233214176485e-07,
"loss": 0.4719703495502472,
"step": 2782
},
{
"epoch": 3.399267399267399,
"grad_norm": 3.7957403659820557,
"learning_rate": 7.729505858530489e-07,
"loss": 0.21603846549987793,
"step": 2784
},
{
"epoch": 3.4017094017094016,
"grad_norm": 3.2089285850524902,
"learning_rate": 7.70785909599851e-07,
"loss": 0.4859767258167267,
"step": 2786
},
{
"epoch": 3.404151404151404,
"grad_norm": 2.2830421924591064,
"learning_rate": 7.686293014822149e-07,
"loss": 0.8922374248504639,
"step": 2788
},
{
"epoch": 3.4065934065934065,
"grad_norm": 3.5256621837615967,
"learning_rate": 7.664807702914107e-07,
"loss": 0.8285965919494629,
"step": 2790
},
{
"epoch": 3.409035409035409,
"grad_norm": 2.4469144344329834,
"learning_rate": 7.643403247857853e-07,
"loss": 0.4633885622024536,
"step": 2792
},
{
"epoch": 3.4114774114774113,
"grad_norm": 3.2874977588653564,
"learning_rate": 7.622079736907219e-07,
"loss": 0.730563223361969,
"step": 2794
},
{
"epoch": 3.413919413919414,
"grad_norm": 1.843967318534851,
"learning_rate": 7.600837256986104e-07,
"loss": 0.9653308391571045,
"step": 2796
},
{
"epoch": 3.416361416361416,
"grad_norm": 6.217641830444336,
"learning_rate": 7.57967589468806e-07,
"loss": 0.47932058572769165,
"step": 2798
},
{
"epoch": 3.4188034188034186,
"grad_norm": 0.45403721928596497,
"learning_rate": 7.558595736275995e-07,
"loss": 0.05683291330933571,
"step": 2800
},
{
"epoch": 3.421245421245421,
"grad_norm": 2.4444427490234375,
"learning_rate": 7.537596867681773e-07,
"loss": 1.0308482646942139,
"step": 2802
},
{
"epoch": 3.4236874236874235,
"grad_norm": 0.5573722124099731,
"learning_rate": 7.516679374505911e-07,
"loss": 0.6440561413764954,
"step": 2804
},
{
"epoch": 3.426129426129426,
"grad_norm": 3.718284845352173,
"learning_rate": 7.495843342017173e-07,
"loss": 0.6178560853004456,
"step": 2806
},
{
"epoch": 3.4285714285714284,
"grad_norm": 3.11525297164917,
"learning_rate": 7.475088855152279e-07,
"loss": 0.923469066619873,
"step": 2808
},
{
"epoch": 3.4310134310134313,
"grad_norm": 2.810075521469116,
"learning_rate": 7.454415998515516e-07,
"loss": 0.7372915744781494,
"step": 2810
},
{
"epoch": 3.4334554334554337,
"grad_norm": 2.502122640609741,
"learning_rate": 7.433824856378425e-07,
"loss": 0.14429078996181488,
"step": 2812
},
{
"epoch": 3.435897435897436,
"grad_norm": 10.276256561279297,
"learning_rate": 7.413315512679436e-07,
"loss": 0.5484145283699036,
"step": 2814
},
{
"epoch": 3.4383394383394386,
"grad_norm": 2.402463912963867,
"learning_rate": 7.392888051023542e-07,
"loss": 0.8286385536193848,
"step": 2816
},
{
"epoch": 3.440781440781441,
"grad_norm": 1.590881586074829,
"learning_rate": 7.37254255468193e-07,
"loss": 0.9624377489089966,
"step": 2818
},
{
"epoch": 3.4432234432234434,
"grad_norm": 2.1349987983703613,
"learning_rate": 7.352279106591676e-07,
"loss": 0.8825662732124329,
"step": 2820
},
{
"epoch": 3.445665445665446,
"grad_norm": 3.0658047199249268,
"learning_rate": 7.332097789355388e-07,
"loss": 0.9127561450004578,
"step": 2822
},
{
"epoch": 3.4481074481074483,
"grad_norm": 1.7639163732528687,
"learning_rate": 7.31199868524088e-07,
"loss": 0.8078799247741699,
"step": 2824
},
{
"epoch": 3.4505494505494507,
"grad_norm": 1.9734654426574707,
"learning_rate": 7.291981876180815e-07,
"loss": 0.6381809115409851,
"step": 2826
},
{
"epoch": 3.452991452991453,
"grad_norm": 2.2318012714385986,
"learning_rate": 7.272047443772395e-07,
"loss": 0.760457456111908,
"step": 2828
},
{
"epoch": 3.4554334554334556,
"grad_norm": 9.063981056213379,
"learning_rate": 7.252195469277024e-07,
"loss": 0.6253539323806763,
"step": 2830
},
{
"epoch": 3.457875457875458,
"grad_norm": 3.0912418365478516,
"learning_rate": 7.232426033619955e-07,
"loss": 0.4733204245567322,
"step": 2832
},
{
"epoch": 3.4603174603174605,
"grad_norm": 1.568339228630066,
"learning_rate": 7.212739217389991e-07,
"loss": 0.9539817571640015,
"step": 2834
},
{
"epoch": 3.462759462759463,
"grad_norm": 9.57923412322998,
"learning_rate": 7.193135100839142e-07,
"loss": 0.5720884799957275,
"step": 2836
},
{
"epoch": 3.4652014652014653,
"grad_norm": 14.26650333404541,
"learning_rate": 7.173613763882297e-07,
"loss": 0.5722582936286926,
"step": 2838
},
{
"epoch": 3.4676434676434678,
"grad_norm": 3.157581329345703,
"learning_rate": 7.154175286096886e-07,
"loss": 0.954519510269165,
"step": 2840
},
{
"epoch": 3.47008547008547,
"grad_norm": 2.162440061569214,
"learning_rate": 7.134819746722588e-07,
"loss": 0.8875312805175781,
"step": 2842
},
{
"epoch": 3.4725274725274726,
"grad_norm": 1.576352834701538,
"learning_rate": 7.115547224660981e-07,
"loss": 0.8703738451004028,
"step": 2844
},
{
"epoch": 3.474969474969475,
"grad_norm": 2.352095127105713,
"learning_rate": 7.096357798475231e-07,
"loss": 0.8873903155326843,
"step": 2846
},
{
"epoch": 3.4774114774114775,
"grad_norm": 2.0396454334259033,
"learning_rate": 7.077251546389761e-07,
"loss": 0.8595806360244751,
"step": 2848
},
{
"epoch": 3.47985347985348,
"grad_norm": 2.4909889698028564,
"learning_rate": 7.058228546289952e-07,
"loss": 0.6372047662734985,
"step": 2850
},
{
"epoch": 3.4822954822954824,
"grad_norm": 2.2574751377105713,
"learning_rate": 7.039288875721798e-07,
"loss": 0.8206950426101685,
"step": 2852
},
{
"epoch": 3.484737484737485,
"grad_norm": 0.5610913634300232,
"learning_rate": 7.020432611891629e-07,
"loss": 0.1707066297531128,
"step": 2854
},
{
"epoch": 3.4871794871794872,
"grad_norm": 8.053951263427734,
"learning_rate": 7.001659831665748e-07,
"loss": 0.6180318593978882,
"step": 2856
},
{
"epoch": 3.4896214896214897,
"grad_norm": 8.793201446533203,
"learning_rate": 6.982970611570168e-07,
"loss": 0.29429712891578674,
"step": 2858
},
{
"epoch": 3.492063492063492,
"grad_norm": 1.830889344215393,
"learning_rate": 6.964365027790243e-07,
"loss": 0.8592406511306763,
"step": 2860
},
{
"epoch": 3.4945054945054945,
"grad_norm": 2.1449406147003174,
"learning_rate": 6.945843156170423e-07,
"loss": 0.9528040885925293,
"step": 2862
},
{
"epoch": 3.496947496947497,
"grad_norm": 2.4805285930633545,
"learning_rate": 6.927405072213878e-07,
"loss": 0.467544287443161,
"step": 2864
},
{
"epoch": 3.4993894993894994,
"grad_norm": 4.722518444061279,
"learning_rate": 6.909050851082258e-07,
"loss": 0.38818594813346863,
"step": 2866
},
{
"epoch": 3.501831501831502,
"grad_norm": 2.0547142028808594,
"learning_rate": 6.89078056759532e-07,
"loss": 0.8755742311477661,
"step": 2868
},
{
"epoch": 3.5042735042735043,
"grad_norm": 7.294073581695557,
"learning_rate": 6.872594296230677e-07,
"loss": 0.5849094986915588,
"step": 2870
},
{
"epoch": 3.5067155067155067,
"grad_norm": 4.594062328338623,
"learning_rate": 6.854492111123455e-07,
"loss": 0.5189932584762573,
"step": 2872
},
{
"epoch": 3.509157509157509,
"grad_norm": 3.3439576625823975,
"learning_rate": 6.836474086066024e-07,
"loss": 0.9283484220504761,
"step": 2874
},
{
"epoch": 3.5115995115995116,
"grad_norm": 6.525171279907227,
"learning_rate": 6.81854029450767e-07,
"loss": 0.32967475056648254,
"step": 2876
},
{
"epoch": 3.514041514041514,
"grad_norm": 1.570821762084961,
"learning_rate": 6.800690809554313e-07,
"loss": 0.9133099913597107,
"step": 2878
},
{
"epoch": 3.5164835164835164,
"grad_norm": 2.403273582458496,
"learning_rate": 6.782925703968195e-07,
"loss": 0.5854375958442688,
"step": 2880
},
{
"epoch": 3.518925518925519,
"grad_norm": 3.7819712162017822,
"learning_rate": 6.765245050167599e-07,
"loss": 0.6390686631202698,
"step": 2882
},
{
"epoch": 3.5213675213675213,
"grad_norm": 2.2759008407592773,
"learning_rate": 6.74764892022654e-07,
"loss": 0.9842717051506042,
"step": 2884
},
{
"epoch": 3.5238095238095237,
"grad_norm": 1.5493816137313843,
"learning_rate": 6.730137385874491e-07,
"loss": 0.9478884339332581,
"step": 2886
},
{
"epoch": 3.526251526251526,
"grad_norm": 2.1049609184265137,
"learning_rate": 6.712710518496049e-07,
"loss": 0.777178168296814,
"step": 2888
},
{
"epoch": 3.5286935286935286,
"grad_norm": 2.9918575286865234,
"learning_rate": 6.695368389130699e-07,
"loss": 0.8717899918556213,
"step": 2890
},
{
"epoch": 3.531135531135531,
"grad_norm": 3.209395170211792,
"learning_rate": 6.678111068472487e-07,
"loss": 0.7953534722328186,
"step": 2892
},
{
"epoch": 3.5335775335775335,
"grad_norm": 14.544081687927246,
"learning_rate": 6.660938626869734e-07,
"loss": 0.4765959680080414,
"step": 2894
},
{
"epoch": 3.536019536019536,
"grad_norm": 51.49199295043945,
"learning_rate": 6.643851134324767e-07,
"loss": 0.7235844731330872,
"step": 2896
},
{
"epoch": 3.5384615384615383,
"grad_norm": 4.060218811035156,
"learning_rate": 6.626848660493623e-07,
"loss": 0.804652750492096,
"step": 2898
},
{
"epoch": 3.5409035409035408,
"grad_norm": 12.073833465576172,
"learning_rate": 6.60993127468577e-07,
"loss": 0.867784321308136,
"step": 2900
},
{
"epoch": 3.543345543345543,
"grad_norm": 3.4921793937683105,
"learning_rate": 6.593099045863802e-07,
"loss": 0.13817808032035828,
"step": 2902
},
{
"epoch": 3.5457875457875456,
"grad_norm": 1.4257546663284302,
"learning_rate": 6.576352042643192e-07,
"loss": 0.8409507274627686,
"step": 2904
},
{
"epoch": 3.548229548229548,
"grad_norm": 4.283762454986572,
"learning_rate": 6.559690333292e-07,
"loss": 0.8512478470802307,
"step": 2906
},
{
"epoch": 3.5506715506715505,
"grad_norm": 2.5699775218963623,
"learning_rate": 6.543113985730579e-07,
"loss": 1.0054024457931519,
"step": 2908
},
{
"epoch": 3.553113553113553,
"grad_norm": 5.507492542266846,
"learning_rate": 6.526623067531313e-07,
"loss": 0.6415849328041077,
"step": 2910
},
{
"epoch": 3.5555555555555554,
"grad_norm": 4.384498119354248,
"learning_rate": 6.510217645918349e-07,
"loss": 0.46229088306427,
"step": 2912
},
{
"epoch": 3.557997557997558,
"grad_norm": 2.0857934951782227,
"learning_rate": 6.493897787767291e-07,
"loss": 0.5283727645874023,
"step": 2914
},
{
"epoch": 3.5604395604395602,
"grad_norm": 1.9115166664123535,
"learning_rate": 6.477663559604979e-07,
"loss": 0.6623761653900146,
"step": 2916
},
{
"epoch": 3.5628815628815627,
"grad_norm": 6.1141533851623535,
"learning_rate": 6.461515027609163e-07,
"loss": 0.6332585215568542,
"step": 2918
},
{
"epoch": 3.565323565323565,
"grad_norm": 8.153079986572266,
"learning_rate": 6.44545225760827e-07,
"loss": 0.5882151126861572,
"step": 2920
},
{
"epoch": 3.5677655677655675,
"grad_norm": 2.2321126461029053,
"learning_rate": 6.429475315081122e-07,
"loss": 0.8858240246772766,
"step": 2922
},
{
"epoch": 3.57020757020757,
"grad_norm": 19.70038414001465,
"learning_rate": 6.413584265156671e-07,
"loss": 0.6081412434577942,
"step": 2924
},
{
"epoch": 3.5726495726495724,
"grad_norm": 3.0893778800964355,
"learning_rate": 6.397779172613722e-07,
"loss": 0.454592227935791,
"step": 2926
},
{
"epoch": 3.575091575091575,
"grad_norm": 6.8976240158081055,
"learning_rate": 6.382060101880711e-07,
"loss": 0.8145590424537659,
"step": 2928
},
{
"epoch": 3.5775335775335773,
"grad_norm": 1.8353841304779053,
"learning_rate": 6.366427117035377e-07,
"loss": 0.8217576146125793,
"step": 2930
},
{
"epoch": 3.57997557997558,
"grad_norm": 4.694766044616699,
"learning_rate": 6.350880281804557e-07,
"loss": 0.7602511644363403,
"step": 2932
},
{
"epoch": 3.5824175824175826,
"grad_norm": 2.5171759128570557,
"learning_rate": 6.335419659563896e-07,
"loss": 0.7700616717338562,
"step": 2934
},
{
"epoch": 3.584859584859585,
"grad_norm": 5.43289041519165,
"learning_rate": 6.320045313337597e-07,
"loss": 0.518511950969696,
"step": 2936
},
{
"epoch": 3.5873015873015874,
"grad_norm": 0.7759566903114319,
"learning_rate": 6.304757305798172e-07,
"loss": 0.432235449552536,
"step": 2938
},
{
"epoch": 3.58974358974359,
"grad_norm": 2.7056305408477783,
"learning_rate": 6.289555699266174e-07,
"loss": 0.5823948383331299,
"step": 2940
},
{
"epoch": 3.5921855921855923,
"grad_norm": 10.587597846984863,
"learning_rate": 6.274440555709947e-07,
"loss": 0.9206511378288269,
"step": 2942
},
{
"epoch": 3.5946275946275947,
"grad_norm": 1.4514787197113037,
"learning_rate": 6.259411936745376e-07,
"loss": 0.9449152946472168,
"step": 2944
},
{
"epoch": 3.597069597069597,
"grad_norm": 2.0257363319396973,
"learning_rate": 6.244469903635632e-07,
"loss": 0.9899218678474426,
"step": 2946
},
{
"epoch": 3.5995115995115996,
"grad_norm": 3.9623706340789795,
"learning_rate": 6.229614517290932e-07,
"loss": 0.48770591616630554,
"step": 2948
},
{
"epoch": 3.601953601953602,
"grad_norm": 2.4973347187042236,
"learning_rate": 6.21484583826827e-07,
"loss": 0.5998440980911255,
"step": 2950
},
{
"epoch": 3.6043956043956045,
"grad_norm": 4.926875114440918,
"learning_rate": 6.200163926771196e-07,
"loss": 0.28131791949272156,
"step": 2952
},
{
"epoch": 3.606837606837607,
"grad_norm": 4.383153915405273,
"learning_rate": 6.185568842649552e-07,
"loss": 0.5602369904518127,
"step": 2954
},
{
"epoch": 3.6092796092796093,
"grad_norm": 2.369140625,
"learning_rate": 6.171060645399233e-07,
"loss": 0.7010159492492676,
"step": 2956
},
{
"epoch": 3.6117216117216118,
"grad_norm": 3.2974140644073486,
"learning_rate": 6.15663939416195e-07,
"loss": 0.7715173363685608,
"step": 2958
},
{
"epoch": 3.614163614163614,
"grad_norm": 1.597782850265503,
"learning_rate": 6.142305147724979e-07,
"loss": 0.8990174531936646,
"step": 2960
},
{
"epoch": 3.6166056166056166,
"grad_norm": 3.7081425189971924,
"learning_rate": 6.128057964520934e-07,
"loss": 0.5858969688415527,
"step": 2962
},
{
"epoch": 3.619047619047619,
"grad_norm": 1.764650821685791,
"learning_rate": 6.113897902627508e-07,
"loss": 0.8998643159866333,
"step": 2964
},
{
"epoch": 3.6214896214896215,
"grad_norm": 2.027956247329712,
"learning_rate": 6.099825019767264e-07,
"loss": 0.8704400658607483,
"step": 2966
},
{
"epoch": 3.623931623931624,
"grad_norm": 2.2779312133789062,
"learning_rate": 6.085839373307382e-07,
"loss": 0.9620934724807739,
"step": 2968
},
{
"epoch": 3.6263736263736264,
"grad_norm": 2.847346544265747,
"learning_rate": 6.071941020259423e-07,
"loss": 0.4650316834449768,
"step": 2970
},
{
"epoch": 3.628815628815629,
"grad_norm": 4.082263469696045,
"learning_rate": 6.058130017279103e-07,
"loss": 0.4654577672481537,
"step": 2972
},
{
"epoch": 3.6312576312576312,
"grad_norm": 4.675213813781738,
"learning_rate": 6.044406420666072e-07,
"loss": 0.5305784940719604,
"step": 2974
},
{
"epoch": 3.6336996336996337,
"grad_norm": 4.327298164367676,
"learning_rate": 6.030770286363656e-07,
"loss": 0.8460584282875061,
"step": 2976
},
{
"epoch": 3.636141636141636,
"grad_norm": 6.675053596496582,
"learning_rate": 6.017221669958662e-07,
"loss": 0.4189061224460602,
"step": 2978
},
{
"epoch": 3.6385836385836385,
"grad_norm": 1.6335099935531616,
"learning_rate": 6.003760626681127e-07,
"loss": 0.956732988357544,
"step": 2980
},
{
"epoch": 3.641025641025641,
"grad_norm": 6.5811381340026855,
"learning_rate": 5.99038721140411e-07,
"loss": 1.057121992111206,
"step": 2982
},
{
"epoch": 3.6434676434676434,
"grad_norm": 1.5813173055648804,
"learning_rate": 5.97710147864345e-07,
"loss": 0.9400102496147156,
"step": 2984
},
{
"epoch": 3.645909645909646,
"grad_norm": 3.3870911598205566,
"learning_rate": 5.963903482557566e-07,
"loss": 0.9326266050338745,
"step": 2986
},
{
"epoch": 3.6483516483516483,
"grad_norm": 2.8349955081939697,
"learning_rate": 5.950793276947205e-07,
"loss": 0.9676442742347717,
"step": 2988
},
{
"epoch": 3.6507936507936507,
"grad_norm": 31.81429100036621,
"learning_rate": 5.937770915255269e-07,
"loss": 0.9522081017494202,
"step": 2990
},
{
"epoch": 3.653235653235653,
"grad_norm": 3.3921871185302734,
"learning_rate": 5.924836450566549e-07,
"loss": 0.5230456590652466,
"step": 2992
},
{
"epoch": 3.6556776556776556,
"grad_norm": 2.2261812686920166,
"learning_rate": 5.911989935607538e-07,
"loss": 0.419090211391449,
"step": 2994
},
{
"epoch": 3.658119658119658,
"grad_norm": 2.2666232585906982,
"learning_rate": 5.899231422746202e-07,
"loss": 0.9825529456138611,
"step": 2996
},
{
"epoch": 3.6605616605616604,
"grad_norm": 1.18002188205719,
"learning_rate": 5.886560963991778e-07,
"loss": 0.45276400446891785,
"step": 2998
},
{
"epoch": 3.663003663003663,
"grad_norm": 4.351987361907959,
"learning_rate": 5.873978610994557e-07,
"loss": 0.38837531208992004,
"step": 3000
},
{
"epoch": 3.6654456654456653,
"grad_norm": 3.792799234390259,
"learning_rate": 5.861484415045672e-07,
"loss": 0.4969119429588318,
"step": 3002
},
{
"epoch": 3.6678876678876677,
"grad_norm": 4.516859531402588,
"learning_rate": 5.849078427076883e-07,
"loss": 0.2892443835735321,
"step": 3004
},
{
"epoch": 3.67032967032967,
"grad_norm": 1.7598987817764282,
"learning_rate": 5.836760697660382e-07,
"loss": 0.9143301844596863,
"step": 3006
},
{
"epoch": 3.672771672771673,
"grad_norm": 7.990298748016357,
"learning_rate": 5.82453127700858e-07,
"loss": 0.6147029399871826,
"step": 3008
},
{
"epoch": 3.6752136752136755,
"grad_norm": 0.5319908857345581,
"learning_rate": 5.812390214973905e-07,
"loss": 0.5243109464645386,
"step": 3010
},
{
"epoch": 3.677655677655678,
"grad_norm": 2.6800284385681152,
"learning_rate": 5.800337561048592e-07,
"loss": 0.9062631726264954,
"step": 3012
},
{
"epoch": 3.6800976800976803,
"grad_norm": 1.4025696516036987,
"learning_rate": 5.788373364364487e-07,
"loss": 0.9003893733024597,
"step": 3014
},
{
"epoch": 3.682539682539683,
"grad_norm": 9.346170425415039,
"learning_rate": 5.776497673692857e-07,
"loss": 0.7075907588005066,
"step": 3016
},
{
"epoch": 3.684981684981685,
"grad_norm": 2.3770735263824463,
"learning_rate": 5.764710537444159e-07,
"loss": 0.5896199941635132,
"step": 3018
},
{
"epoch": 3.6874236874236876,
"grad_norm": 3.0938150882720947,
"learning_rate": 5.753012003667885e-07,
"loss": 0.6084612011909485,
"step": 3020
},
{
"epoch": 3.68986568986569,
"grad_norm": 1.9582159519195557,
"learning_rate": 5.741402120052328e-07,
"loss": 0.5125177502632141,
"step": 3022
},
{
"epoch": 3.6923076923076925,
"grad_norm": 2.1140964031219482,
"learning_rate": 5.729880933924421e-07,
"loss": 1.003217101097107,
"step": 3024
},
{
"epoch": 3.694749694749695,
"grad_norm": 0.36588796973228455,
"learning_rate": 5.718448492249509e-07,
"loss": 0.5080230236053467,
"step": 3026
},
{
"epoch": 3.6971916971916974,
"grad_norm": 3.183983087539673,
"learning_rate": 5.707104841631195e-07,
"loss": 0.7072214484214783,
"step": 3028
},
{
"epoch": 3.6996336996337,
"grad_norm": 1.5387071371078491,
"learning_rate": 5.695850028311112e-07,
"loss": 0.9744673371315002,
"step": 3030
},
{
"epoch": 3.7020757020757022,
"grad_norm": 2.1208925247192383,
"learning_rate": 5.68468409816877e-07,
"loss": 0.8400413990020752,
"step": 3032
},
{
"epoch": 3.7045177045177047,
"grad_norm": 4.847201824188232,
"learning_rate": 5.673607096721346e-07,
"loss": 0.500311017036438,
"step": 3034
},
{
"epoch": 3.706959706959707,
"grad_norm": 2.9996325969696045,
"learning_rate": 5.662619069123503e-07,
"loss": 0.5278769135475159,
"step": 3036
},
{
"epoch": 3.7094017094017095,
"grad_norm": 2.098602771759033,
"learning_rate": 5.651720060167208e-07,
"loss": 0.5000000596046448,
"step": 3038
},
{
"epoch": 3.711843711843712,
"grad_norm": 1.7179620265960693,
"learning_rate": 5.640910114281555e-07,
"loss": 0.9520195722579956,
"step": 3040
},
{
"epoch": 3.7142857142857144,
"grad_norm": 2.3502564430236816,
"learning_rate": 5.630189275532574e-07,
"loss": 0.8327752947807312,
"step": 3042
},
{
"epoch": 3.716727716727717,
"grad_norm": 2.5049052238464355,
"learning_rate": 5.619557587623057e-07,
"loss": 0.6436217427253723,
"step": 3044
},
{
"epoch": 3.7191697191697193,
"grad_norm": 2.9425840377807617,
"learning_rate": 5.609015093892374e-07,
"loss": 0.9164323806762695,
"step": 3046
},
{
"epoch": 3.7216117216117217,
"grad_norm": 3.1850688457489014,
"learning_rate": 5.59856183731631e-07,
"loss": 0.5315079689025879,
"step": 3048
},
{
"epoch": 3.724053724053724,
"grad_norm": 2.6305289268493652,
"learning_rate": 5.588197860506867e-07,
"loss": 0.7617026567459106,
"step": 3050
},
{
"epoch": 3.7264957264957266,
"grad_norm": 3.4540348052978516,
"learning_rate": 5.577923205712124e-07,
"loss": 1.017609715461731,
"step": 3052
},
{
"epoch": 3.728937728937729,
"grad_norm": 7.902237415313721,
"learning_rate": 5.567737914816022e-07,
"loss": 0.5209454298019409,
"step": 3054
},
{
"epoch": 3.7313797313797314,
"grad_norm": 1.829217791557312,
"learning_rate": 5.557642029338236e-07,
"loss": 0.9426127672195435,
"step": 3056
},
{
"epoch": 3.733821733821734,
"grad_norm": 3.1745777130126953,
"learning_rate": 5.547635590433968e-07,
"loss": 0.6483992338180542,
"step": 3058
},
{
"epoch": 3.7362637362637363,
"grad_norm": 10.875771522521973,
"learning_rate": 5.53771863889381e-07,
"loss": 0.46888014674186707,
"step": 3060
},
{
"epoch": 3.7387057387057387,
"grad_norm": 2.8701348304748535,
"learning_rate": 5.527891215143559e-07,
"loss": 0.5719221830368042,
"step": 3062
},
{
"epoch": 3.741147741147741,
"grad_norm": 1.0279072523117065,
"learning_rate": 5.518153359244063e-07,
"loss": 0.3847256898880005,
"step": 3064
},
{
"epoch": 3.7435897435897436,
"grad_norm": 2.5575125217437744,
"learning_rate": 5.508505110891045e-07,
"loss": 0.5125806331634521,
"step": 3066
},
{
"epoch": 3.746031746031746,
"grad_norm": 1.723737120628357,
"learning_rate": 5.498946509414949e-07,
"loss": 0.8170480132102966,
"step": 3068
},
{
"epoch": 3.7484737484737485,
"grad_norm": 1.8103982210159302,
"learning_rate": 5.489477593780787e-07,
"loss": 1.0591984987258911,
"step": 3070
},
{
"epoch": 3.750915750915751,
"grad_norm": 6.911821365356445,
"learning_rate": 5.480098402587973e-07,
"loss": 0.645149290561676,
"step": 3072
},
{
"epoch": 3.7533577533577533,
"grad_norm": 0.26767414808273315,
"learning_rate": 5.470808974070152e-07,
"loss": 0.4036714732646942,
"step": 3074
},
{
"epoch": 3.755799755799756,
"grad_norm": 4.02056884765625,
"learning_rate": 5.461609346095067e-07,
"loss": 0.8655245304107666,
"step": 3076
},
{
"epoch": 3.758241758241758,
"grad_norm": 4.357627868652344,
"learning_rate": 5.452499556164402e-07,
"loss": 0.8845657110214233,
"step": 3078
},
{
"epoch": 3.7606837606837606,
"grad_norm": 10.457714080810547,
"learning_rate": 5.443479641413607e-07,
"loss": 0.6024913191795349,
"step": 3080
},
{
"epoch": 3.763125763125763,
"grad_norm": 2.315418243408203,
"learning_rate": 5.434549638611768e-07,
"loss": 0.9414732456207275,
"step": 3082
},
{
"epoch": 3.7655677655677655,
"grad_norm": 1.8591489791870117,
"learning_rate": 5.425709584161457e-07,
"loss": 0.9516326785087585,
"step": 3084
},
{
"epoch": 3.768009768009768,
"grad_norm": 1.980412244796753,
"learning_rate": 5.416959514098571e-07,
"loss": 0.9030287265777588,
"step": 3086
},
{
"epoch": 3.7704517704517704,
"grad_norm": 0.8802301287651062,
"learning_rate": 5.40829946409219e-07,
"loss": 0.2058449685573578,
"step": 3088
},
{
"epoch": 3.772893772893773,
"grad_norm": 3.767972230911255,
"learning_rate": 5.399729469444438e-07,
"loss": 0.8536104559898376,
"step": 3090
},
{
"epoch": 3.7753357753357752,
"grad_norm": 2.7339487075805664,
"learning_rate": 5.39124956509033e-07,
"loss": 0.8664818406105042,
"step": 3092
},
{
"epoch": 3.7777777777777777,
"grad_norm": 1.868648648262024,
"learning_rate": 5.382859785597643e-07,
"loss": 0.9490870237350464,
"step": 3094
},
{
"epoch": 3.78021978021978,
"grad_norm": 3.2051358222961426,
"learning_rate": 5.374560165166752e-07,
"loss": 0.8471544981002808,
"step": 3096
},
{
"epoch": 3.7826617826617825,
"grad_norm": 3.188377857208252,
"learning_rate": 5.366350737630515e-07,
"loss": 0.6783183217048645,
"step": 3098
},
{
"epoch": 3.785103785103785,
"grad_norm": 12.093615531921387,
"learning_rate": 5.358231536454119e-07,
"loss": 0.8494789004325867,
"step": 3100
},
{
"epoch": 3.7875457875457874,
"grad_norm": 2.5433156490325928,
"learning_rate": 5.350202594734954e-07,
"loss": 0.8256645202636719,
"step": 3102
},
{
"epoch": 3.78998778998779,
"grad_norm": 6.241081237792969,
"learning_rate": 5.34226394520247e-07,
"loss": 0.8711805939674377,
"step": 3104
},
{
"epoch": 3.7924297924297923,
"grad_norm": 2.3150527477264404,
"learning_rate": 5.33441562021805e-07,
"loss": 1.0078837871551514,
"step": 3106
},
{
"epoch": 3.7948717948717947,
"grad_norm": 2.26035737991333,
"learning_rate": 5.326657651774867e-07,
"loss": 0.5672973394393921,
"step": 3108
},
{
"epoch": 3.797313797313797,
"grad_norm": 3.3058907985687256,
"learning_rate": 5.318990071497772e-07,
"loss": 0.6369197368621826,
"step": 3110
},
{
"epoch": 3.7997557997557996,
"grad_norm": 2.8003060817718506,
"learning_rate": 5.311412910643145e-07,
"loss": 0.5773022174835205,
"step": 3112
},
{
"epoch": 3.802197802197802,
"grad_norm": 3.470675468444824,
"learning_rate": 5.303926200098789e-07,
"loss": 0.5989543199539185,
"step": 3114
},
{
"epoch": 3.8046398046398044,
"grad_norm": 3.9955947399139404,
"learning_rate": 5.296529970383777e-07,
"loss": 0.44651395082473755,
"step": 3116
},
{
"epoch": 3.807081807081807,
"grad_norm": 4.266364097595215,
"learning_rate": 5.289224251648359e-07,
"loss": 0.6023522019386292,
"step": 3118
},
{
"epoch": 3.8095238095238093,
"grad_norm": 2.1567165851593018,
"learning_rate": 5.282009073673812e-07,
"loss": 0.9219540953636169,
"step": 3120
},
{
"epoch": 3.8119658119658117,
"grad_norm": 4.827529430389404,
"learning_rate": 5.27488446587233e-07,
"loss": 0.5145304203033447,
"step": 3122
},
{
"epoch": 3.814407814407814,
"grad_norm": 3.446068048477173,
"learning_rate": 5.267850457286907e-07,
"loss": 0.6707845330238342,
"step": 3124
},
{
"epoch": 3.8168498168498166,
"grad_norm": 6.150956630706787,
"learning_rate": 5.26090707659122e-07,
"loss": 0.881208062171936,
"step": 3126
},
{
"epoch": 3.819291819291819,
"grad_norm": 7.88019323348999,
"learning_rate": 5.254054352089493e-07,
"loss": 0.48564082384109497,
"step": 3128
},
{
"epoch": 3.8217338217338215,
"grad_norm": 2.4069323539733887,
"learning_rate": 5.247292311716413e-07,
"loss": 0.8890138864517212,
"step": 3130
},
{
"epoch": 3.824175824175824,
"grad_norm": 1.5671998262405396,
"learning_rate": 5.240620983036986e-07,
"loss": 0.5058675408363342,
"step": 3132
},
{
"epoch": 3.8266178266178263,
"grad_norm": 6.1965227127075195,
"learning_rate": 5.234040393246448e-07,
"loss": 1.1437023878097534,
"step": 3134
},
{
"epoch": 3.8290598290598292,
"grad_norm": 3.5453076362609863,
"learning_rate": 5.227550569170133e-07,
"loss": 1.039106845855713,
"step": 3136
},
{
"epoch": 3.8315018315018317,
"grad_norm": 4.471746444702148,
"learning_rate": 5.221151537263382e-07,
"loss": 0.6547291278839111,
"step": 3138
},
{
"epoch": 3.833943833943834,
"grad_norm": 3.5945651531219482,
"learning_rate": 5.214843323611432e-07,
"loss": 0.3847421407699585,
"step": 3140
},
{
"epoch": 3.8363858363858365,
"grad_norm": 3.4551937580108643,
"learning_rate": 5.208625953929289e-07,
"loss": 0.7860216498374939,
"step": 3142
},
{
"epoch": 3.838827838827839,
"grad_norm": 5.9122633934021,
"learning_rate": 5.202499453561658e-07,
"loss": 0.26646631956100464,
"step": 3144
},
{
"epoch": 3.8412698412698414,
"grad_norm": 4.5092620849609375,
"learning_rate": 5.196463847482812e-07,
"loss": 0.5625693202018738,
"step": 3146
},
{
"epoch": 3.843711843711844,
"grad_norm": 0.4482984244823456,
"learning_rate": 5.1905191602965e-07,
"loss": 0.12481559067964554,
"step": 3148
},
{
"epoch": 3.8461538461538463,
"grad_norm": 5.856686115264893,
"learning_rate": 5.184665416235841e-07,
"loss": 0.5362542271614075,
"step": 3150
},
{
"epoch": 3.8485958485958487,
"grad_norm": 4.156497001647949,
"learning_rate": 5.178902639163247e-07,
"loss": 0.7409583330154419,
"step": 3152
},
{
"epoch": 3.851037851037851,
"grad_norm": 1.6845171451568604,
"learning_rate": 5.17323085257029e-07,
"loss": 0.5385940074920654,
"step": 3154
},
{
"epoch": 3.8534798534798536,
"grad_norm": 1.5355862379074097,
"learning_rate": 5.167650079577636e-07,
"loss": 0.8247669339179993,
"step": 3156
},
{
"epoch": 3.855921855921856,
"grad_norm": 4.407171249389648,
"learning_rate": 5.162160342934939e-07,
"loss": 0.8968489170074463,
"step": 3158
},
{
"epoch": 3.8583638583638584,
"grad_norm": 8.075994491577148,
"learning_rate": 5.15676166502075e-07,
"loss": 0.09241821616888046,
"step": 3160
},
{
"epoch": 3.860805860805861,
"grad_norm": 2.5929574966430664,
"learning_rate": 5.151454067842417e-07,
"loss": 0.4451131224632263,
"step": 3162
},
{
"epoch": 3.8632478632478633,
"grad_norm": 1.8862788677215576,
"learning_rate": 5.146237573036012e-07,
"loss": 0.9212697148323059,
"step": 3164
},
{
"epoch": 3.8656898656898657,
"grad_norm": 2.396461248397827,
"learning_rate": 5.141112201866231e-07,
"loss": 0.9008550047874451,
"step": 3166
},
{
"epoch": 3.868131868131868,
"grad_norm": 2.7560782432556152,
"learning_rate": 5.136077975226314e-07,
"loss": 0.7847106456756592,
"step": 3168
},
{
"epoch": 3.8705738705738706,
"grad_norm": 5.181787014007568,
"learning_rate": 5.131134913637951e-07,
"loss": 0.5696348547935486,
"step": 3170
},
{
"epoch": 3.873015873015873,
"grad_norm": 8.310593605041504,
"learning_rate": 5.126283037251208e-07,
"loss": 0.5494756102561951,
"step": 3172
},
{
"epoch": 3.8754578754578755,
"grad_norm": 2.406679391860962,
"learning_rate": 5.121522365844436e-07,
"loss": 0.5918058156967163,
"step": 3174
},
{
"epoch": 3.877899877899878,
"grad_norm": 2.115579128265381,
"learning_rate": 5.116852918824199e-07,
"loss": 0.9309298396110535,
"step": 3176
},
{
"epoch": 3.8803418803418803,
"grad_norm": 1.9531852006912231,
"learning_rate": 5.112274715225194e-07,
"loss": 0.858812153339386,
"step": 3178
},
{
"epoch": 3.8827838827838828,
"grad_norm": 3.3092024326324463,
"learning_rate": 5.107787773710157e-07,
"loss": 0.8395816087722778,
"step": 3180
},
{
"epoch": 3.885225885225885,
"grad_norm": 4.282203197479248,
"learning_rate": 5.103392112569815e-07,
"loss": 0.8726351261138916,
"step": 3182
},
{
"epoch": 3.8876678876678876,
"grad_norm": 5.603507995605469,
"learning_rate": 5.099087749722788e-07,
"loss": 0.3810088336467743,
"step": 3184
},
{
"epoch": 3.89010989010989,
"grad_norm": 3.650843858718872,
"learning_rate": 5.094874702715529e-07,
"loss": 0.9510683417320251,
"step": 3186
},
{
"epoch": 3.8925518925518925,
"grad_norm": 2.743922472000122,
"learning_rate": 5.090752988722245e-07,
"loss": 0.40368887782096863,
"step": 3188
},
{
"epoch": 3.894993894993895,
"grad_norm": 0.3727673292160034,
"learning_rate": 5.086722624544829e-07,
"loss": 0.420103520154953,
"step": 3190
},
{
"epoch": 3.8974358974358974,
"grad_norm": 2.02138090133667,
"learning_rate": 5.082783626612797e-07,
"loss": 0.8819708824157715,
"step": 3192
},
{
"epoch": 3.8998778998779,
"grad_norm": 3.9244892597198486,
"learning_rate": 5.078936010983213e-07,
"loss": 1.0119850635528564,
"step": 3194
},
{
"epoch": 3.9023199023199022,
"grad_norm": 6.452670574188232,
"learning_rate": 5.075179793340628e-07,
"loss": 0.5983652472496033,
"step": 3196
},
{
"epoch": 3.9047619047619047,
"grad_norm": 5.412775993347168,
"learning_rate": 5.071514988997016e-07,
"loss": 0.1550082117319107,
"step": 3198
},
{
"epoch": 3.907203907203907,
"grad_norm": 3.0789589881896973,
"learning_rate": 5.067941612891708e-07,
"loss": 0.9240917563438416,
"step": 3200
},
{
"epoch": 3.9096459096459095,
"grad_norm": 1.7385785579681396,
"learning_rate": 5.06445967959134e-07,
"loss": 0.6053808331489563,
"step": 3202
},
{
"epoch": 3.912087912087912,
"grad_norm": 2.076815605163574,
"learning_rate": 5.061069203289777e-07,
"loss": 0.9977898001670837,
"step": 3204
},
{
"epoch": 3.9145299145299144,
"grad_norm": 1.4593520164489746,
"learning_rate": 5.057770197808077e-07,
"loss": 0.9548913240432739,
"step": 3206
},
{
"epoch": 3.916971916971917,
"grad_norm": 2.623448371887207,
"learning_rate": 5.054562676594414e-07,
"loss": 1.132678508758545,
"step": 3208
},
{
"epoch": 3.9194139194139193,
"grad_norm": 1.8026434183120728,
"learning_rate": 5.051446652724042e-07,
"loss": 0.6159650087356567,
"step": 3210
},
{
"epoch": 3.9218559218559217,
"grad_norm": 2.9582080841064453,
"learning_rate": 5.048422138899222e-07,
"loss": 0.23612847924232483,
"step": 3212
},
{
"epoch": 3.9242979242979246,
"grad_norm": 1.8346482515335083,
"learning_rate": 5.045489147449187e-07,
"loss": 0.9001370668411255,
"step": 3214
},
{
"epoch": 3.926739926739927,
"grad_norm": 4.2038726806640625,
"learning_rate": 5.042647690330078e-07,
"loss": 0.921493411064148,
"step": 3216
},
{
"epoch": 3.9291819291819294,
"grad_norm": 6.33651065826416,
"learning_rate": 5.039897779124914e-07,
"loss": 0.6150534749031067,
"step": 3218
},
{
"epoch": 3.931623931623932,
"grad_norm": 2.513700246810913,
"learning_rate": 5.037239425043525e-07,
"loss": 0.6679733991622925,
"step": 3220
},
{
"epoch": 3.9340659340659343,
"grad_norm": 11.929234504699707,
"learning_rate": 5.034672638922512e-07,
"loss": 0.530619740486145,
"step": 3222
},
{
"epoch": 3.9365079365079367,
"grad_norm": 3.113684892654419,
"learning_rate": 5.032197431225214e-07,
"loss": 0.8231785297393799,
"step": 3224
},
{
"epoch": 3.938949938949939,
"grad_norm": 4.810062885284424,
"learning_rate": 5.029813812041649e-07,
"loss": 0.5280576944351196,
"step": 3226
},
{
"epoch": 3.9413919413919416,
"grad_norm": 2.087477922439575,
"learning_rate": 5.027521791088482e-07,
"loss": 0.9266934394836426,
"step": 3228
},
{
"epoch": 3.943833943833944,
"grad_norm": 4.400597095489502,
"learning_rate": 5.025321377708989e-07,
"loss": 0.5227733850479126,
"step": 3230
},
{
"epoch": 3.9462759462759465,
"grad_norm": 3.1473488807678223,
"learning_rate": 5.023212580873009e-07,
"loss": 0.952559769153595,
"step": 3232
},
{
"epoch": 3.948717948717949,
"grad_norm": 14.350162506103516,
"learning_rate": 5.02119540917691e-07,
"loss": 0.5347244143486023,
"step": 3234
},
{
"epoch": 3.9511599511599513,
"grad_norm": 2.0704898834228516,
"learning_rate": 5.01926987084356e-07,
"loss": 0.9426727294921875,
"step": 3236
},
{
"epoch": 3.9536019536019538,
"grad_norm": 3.468090057373047,
"learning_rate": 5.017435973722293e-07,
"loss": 0.7870326042175293,
"step": 3238
},
{
"epoch": 3.956043956043956,
"grad_norm": 2.9406344890594482,
"learning_rate": 5.015693725288866e-07,
"loss": 0.4789937436580658,
"step": 3240
},
{
"epoch": 3.9584859584859586,
"grad_norm": 15.776670455932617,
"learning_rate": 5.014043132645438e-07,
"loss": 0.6635629534721375,
"step": 3242
},
{
"epoch": 3.960927960927961,
"grad_norm": 4.5655083656311035,
"learning_rate": 5.012484202520545e-07,
"loss": 0.9738138914108276,
"step": 3244
},
{
"epoch": 3.9633699633699635,
"grad_norm": 2.4571170806884766,
"learning_rate": 5.01101694126906e-07,
"loss": 0.5079742670059204,
"step": 3246
},
{
"epoch": 3.965811965811966,
"grad_norm": 1.870768666267395,
"learning_rate": 5.009641354872178e-07,
"loss": 0.9230693578720093,
"step": 3248
},
{
"epoch": 3.9682539682539684,
"grad_norm": 2.229893445968628,
"learning_rate": 5.008357448937387e-07,
"loss": 0.6680663228034973,
"step": 3250
},
{
"epoch": 3.970695970695971,
"grad_norm": 25.037006378173828,
"learning_rate": 5.007165228698442e-07,
"loss": 0.4087255597114563,
"step": 3252
},
{
"epoch": 3.9731379731379732,
"grad_norm": 2.2511398792266846,
"learning_rate": 5.006064699015351e-07,
"loss": 0.8908025622367859,
"step": 3254
},
{
"epoch": 3.9755799755799757,
"grad_norm": 4.969597339630127,
"learning_rate": 5.005055864374352e-07,
"loss": 0.8304935693740845,
"step": 3256
},
{
"epoch": 3.978021978021978,
"grad_norm": 2.5601906776428223,
"learning_rate": 5.004138728887892e-07,
"loss": 0.40299245715141296,
"step": 3258
},
{
"epoch": 3.9804639804639805,
"grad_norm": 2.45926570892334,
"learning_rate": 5.003313296294612e-07,
"loss": 0.5143805146217346,
"step": 3260
},
{
"epoch": 3.982905982905983,
"grad_norm": 1.9476388692855835,
"learning_rate": 5.002579569959336e-07,
"loss": 0.5361751914024353,
"step": 3262
},
{
"epoch": 3.9853479853479854,
"grad_norm": 4.383269786834717,
"learning_rate": 5.001937552873049e-07,
"loss": 0.4276546835899353,
"step": 3264
},
{
"epoch": 3.987789987789988,
"grad_norm": 4.4012627601623535,
"learning_rate": 5.001387247652891e-07,
"loss": 0.8529163002967834,
"step": 3266
},
{
"epoch": 3.9902319902319903,
"grad_norm": 1.4377570152282715,
"learning_rate": 5.000928656542145e-07,
"loss": 0.9019818902015686,
"step": 3268
},
{
"epoch": 3.9926739926739927,
"grad_norm": 2.091071605682373,
"learning_rate": 5.000561781410232e-07,
"loss": 0.6819381713867188,
"step": 3270
},
{
"epoch": 3.995115995115995,
"grad_norm": 1.7002183198928833,
"learning_rate": 5.000286623752688e-07,
"loss": 0.9077348113059998,
"step": 3272
},
{
"epoch": 3.9975579975579976,
"grad_norm": 1.8634474277496338,
"learning_rate": 5.000103184691177e-07,
"loss": 0.8196188807487488,
"step": 3274
},
{
"epoch": 4.0,
"grad_norm": 8.092415809631348,
"learning_rate": 5.000011464973476e-07,
"loss": 0.480937659740448,
"step": 3276
},
{
"epoch": 4.0,
"step": 3276,
"total_flos": 3.438047841308639e+18,
"train_loss": 0.8972434957087567,
"train_runtime": 10632.4216,
"train_samples_per_second": 4.93,
"train_steps_per_second": 0.308
}
],
"logging_steps": 2,
"max_steps": 3276,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.438047841308639e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}