9b-133 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
4ccf64f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3564,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016835016835016834,
"grad_norm": 10.01240348815918,
"learning_rate": 5.5865921787709494e-09,
"loss": 1.7057493925094604,
"step": 2
},
{
"epoch": 0.003367003367003367,
"grad_norm": 14.913334846496582,
"learning_rate": 1.6759776536312847e-08,
"loss": 1.2436225414276123,
"step": 4
},
{
"epoch": 0.005050505050505051,
"grad_norm": 22.982995986938477,
"learning_rate": 2.7932960893854745e-08,
"loss": 1.686056137084961,
"step": 6
},
{
"epoch": 0.006734006734006734,
"grad_norm": 15.24986457824707,
"learning_rate": 3.910614525139665e-08,
"loss": 1.6055235862731934,
"step": 8
},
{
"epoch": 0.008417508417508417,
"grad_norm": 30.967639923095703,
"learning_rate": 5.027932960893855e-08,
"loss": 4.50665283203125,
"step": 10
},
{
"epoch": 0.010101010101010102,
"grad_norm": 4.303424835205078,
"learning_rate": 6.145251396648044e-08,
"loss": 1.9789408445358276,
"step": 12
},
{
"epoch": 0.011784511784511785,
"grad_norm": 5.598588466644287,
"learning_rate": 7.262569832402235e-08,
"loss": 1.6753730773925781,
"step": 14
},
{
"epoch": 0.013468013468013467,
"grad_norm": 4.323257923126221,
"learning_rate": 8.379888268156423e-08,
"loss": 1.6596330404281616,
"step": 16
},
{
"epoch": 0.015151515151515152,
"grad_norm": 26.17571258544922,
"learning_rate": 9.497206703910614e-08,
"loss": 2.7241992950439453,
"step": 18
},
{
"epoch": 0.016835016835016835,
"grad_norm": 9.184181213378906,
"learning_rate": 1.0614525139664805e-07,
"loss": 1.9634017944335938,
"step": 20
},
{
"epoch": 0.018518518518518517,
"grad_norm": 4.683750152587891,
"learning_rate": 1.1731843575418994e-07,
"loss": 1.8491621017456055,
"step": 22
},
{
"epoch": 0.020202020202020204,
"grad_norm": 14.232526779174805,
"learning_rate": 1.2849162011173183e-07,
"loss": 3.537993907928467,
"step": 24
},
{
"epoch": 0.021885521885521887,
"grad_norm": 11.717961311340332,
"learning_rate": 1.3966480446927373e-07,
"loss": 2.8410818576812744,
"step": 26
},
{
"epoch": 0.02356902356902357,
"grad_norm": 11.476764678955078,
"learning_rate": 1.5083798882681565e-07,
"loss": 2.1707875728607178,
"step": 28
},
{
"epoch": 0.025252525252525252,
"grad_norm": 42.536720275878906,
"learning_rate": 1.6201117318435754e-07,
"loss": 3.401388645172119,
"step": 30
},
{
"epoch": 0.026936026936026935,
"grad_norm": 15.799206733703613,
"learning_rate": 1.7318435754189943e-07,
"loss": 1.8762117624282837,
"step": 32
},
{
"epoch": 0.02861952861952862,
"grad_norm": 56.47621154785156,
"learning_rate": 1.8435754189944133e-07,
"loss": 4.025151252746582,
"step": 34
},
{
"epoch": 0.030303030303030304,
"grad_norm": 8.71907901763916,
"learning_rate": 1.9553072625698322e-07,
"loss": 1.9956148862838745,
"step": 36
},
{
"epoch": 0.03198653198653199,
"grad_norm": 13.315755844116211,
"learning_rate": 2.0670391061452514e-07,
"loss": 1.5647544860839844,
"step": 38
},
{
"epoch": 0.03367003367003367,
"grad_norm": 18.28321647644043,
"learning_rate": 2.17877094972067e-07,
"loss": 2.4461331367492676,
"step": 40
},
{
"epoch": 0.03535353535353535,
"grad_norm": 7.177945137023926,
"learning_rate": 2.2905027932960893e-07,
"loss": 3.1400742530822754,
"step": 42
},
{
"epoch": 0.037037037037037035,
"grad_norm": 11.345965385437012,
"learning_rate": 2.402234636871508e-07,
"loss": 2.982694149017334,
"step": 44
},
{
"epoch": 0.03872053872053872,
"grad_norm": 18.986379623413086,
"learning_rate": 2.5139664804469275e-07,
"loss": 1.7094351053237915,
"step": 46
},
{
"epoch": 0.04040404040404041,
"grad_norm": 25.200927734375,
"learning_rate": 2.6256983240223464e-07,
"loss": 3.4711947441101074,
"step": 48
},
{
"epoch": 0.04208754208754209,
"grad_norm": 25.79502296447754,
"learning_rate": 2.7374301675977653e-07,
"loss": 2.5125930309295654,
"step": 50
},
{
"epoch": 0.04377104377104377,
"grad_norm": 26.86095428466797,
"learning_rate": 2.849162011173184e-07,
"loss": 2.5184483528137207,
"step": 52
},
{
"epoch": 0.045454545454545456,
"grad_norm": 23.869613647460938,
"learning_rate": 2.960893854748603e-07,
"loss": 2.1967999935150146,
"step": 54
},
{
"epoch": 0.04713804713804714,
"grad_norm": 4.752484321594238,
"learning_rate": 3.072625698324022e-07,
"loss": 1.6605415344238281,
"step": 56
},
{
"epoch": 0.04882154882154882,
"grad_norm": 30.32961654663086,
"learning_rate": 3.184357541899441e-07,
"loss": 2.6820101737976074,
"step": 58
},
{
"epoch": 0.050505050505050504,
"grad_norm": 4.937363624572754,
"learning_rate": 3.29608938547486e-07,
"loss": 2.046969175338745,
"step": 60
},
{
"epoch": 0.05218855218855219,
"grad_norm": 26.058670043945312,
"learning_rate": 3.407821229050279e-07,
"loss": 2.126314163208008,
"step": 62
},
{
"epoch": 0.05387205387205387,
"grad_norm": 3.972296714782715,
"learning_rate": 3.5195530726256984e-07,
"loss": 1.469801902770996,
"step": 64
},
{
"epoch": 0.05555555555555555,
"grad_norm": 36.323368072509766,
"learning_rate": 3.6312849162011174e-07,
"loss": 2.0382440090179443,
"step": 66
},
{
"epoch": 0.05723905723905724,
"grad_norm": 5.039744853973389,
"learning_rate": 3.7430167597765363e-07,
"loss": 1.679071068763733,
"step": 68
},
{
"epoch": 0.058922558922558925,
"grad_norm": 5.542041778564453,
"learning_rate": 3.8547486033519547e-07,
"loss": 1.7368519306182861,
"step": 70
},
{
"epoch": 0.06060606060606061,
"grad_norm": 11.228593826293945,
"learning_rate": 3.966480446927374e-07,
"loss": 1.9073054790496826,
"step": 72
},
{
"epoch": 0.06228956228956229,
"grad_norm": 6.521553993225098,
"learning_rate": 4.078212290502793e-07,
"loss": 1.7021303176879883,
"step": 74
},
{
"epoch": 0.06397306397306397,
"grad_norm": 4.614531993865967,
"learning_rate": 4.189944134078212e-07,
"loss": 1.3584303855895996,
"step": 76
},
{
"epoch": 0.06565656565656566,
"grad_norm": 4.567502021789551,
"learning_rate": 4.301675977653631e-07,
"loss": 1.7855596542358398,
"step": 78
},
{
"epoch": 0.06734006734006734,
"grad_norm": 4.453341484069824,
"learning_rate": 4.41340782122905e-07,
"loss": 1.5260930061340332,
"step": 80
},
{
"epoch": 0.06902356902356903,
"grad_norm": 9.207719802856445,
"learning_rate": 4.5251396648044694e-07,
"loss": 1.7678306102752686,
"step": 82
},
{
"epoch": 0.0707070707070707,
"grad_norm": 11.142820358276367,
"learning_rate": 4.6368715083798884e-07,
"loss": 1.4878003597259521,
"step": 84
},
{
"epoch": 0.0723905723905724,
"grad_norm": 6.588044166564941,
"learning_rate": 4.7486033519553073e-07,
"loss": 1.6655892133712769,
"step": 86
},
{
"epoch": 0.07407407407407407,
"grad_norm": 7.762340068817139,
"learning_rate": 4.860335195530726e-07,
"loss": 1.4147857427597046,
"step": 88
},
{
"epoch": 0.07575757575757576,
"grad_norm": 19.327587127685547,
"learning_rate": 4.972067039106145e-07,
"loss": 1.6009736061096191,
"step": 90
},
{
"epoch": 0.07744107744107744,
"grad_norm": 16.781408309936523,
"learning_rate": 5.083798882681564e-07,
"loss": 1.331944227218628,
"step": 92
},
{
"epoch": 0.07912457912457913,
"grad_norm": 5.269062042236328,
"learning_rate": 5.195530726256983e-07,
"loss": 1.3683245182037354,
"step": 94
},
{
"epoch": 0.08080808080808081,
"grad_norm": 2.652998685836792,
"learning_rate": 5.307262569832402e-07,
"loss": 1.4645051956176758,
"step": 96
},
{
"epoch": 0.08249158249158249,
"grad_norm": 26.370506286621094,
"learning_rate": 5.418994413407821e-07,
"loss": 1.4499703645706177,
"step": 98
},
{
"epoch": 0.08417508417508418,
"grad_norm": 4.437371253967285,
"learning_rate": 5.53072625698324e-07,
"loss": 1.3694539070129395,
"step": 100
},
{
"epoch": 0.08585858585858586,
"grad_norm": 7.602840900421143,
"learning_rate": 5.642458100558659e-07,
"loss": 1.2753658294677734,
"step": 102
},
{
"epoch": 0.08754208754208755,
"grad_norm": 5.345534801483154,
"learning_rate": 5.754189944134078e-07,
"loss": 0.9927137494087219,
"step": 104
},
{
"epoch": 0.08922558922558922,
"grad_norm": 57.12667465209961,
"learning_rate": 5.865921787709497e-07,
"loss": 1.144801378250122,
"step": 106
},
{
"epoch": 0.09090909090909091,
"grad_norm": 3.486433267593384,
"learning_rate": 5.977653631284916e-07,
"loss": 1.3661882877349854,
"step": 108
},
{
"epoch": 0.09259259259259259,
"grad_norm": 8.98828411102295,
"learning_rate": 6.089385474860335e-07,
"loss": 0.9164130687713623,
"step": 110
},
{
"epoch": 0.09427609427609428,
"grad_norm": 4.9939141273498535,
"learning_rate": 6.201117318435754e-07,
"loss": 1.3426786661148071,
"step": 112
},
{
"epoch": 0.09595959595959595,
"grad_norm": 29.148103713989258,
"learning_rate": 6.312849162011172e-07,
"loss": 1.138382911682129,
"step": 114
},
{
"epoch": 0.09764309764309764,
"grad_norm": 34.31653594970703,
"learning_rate": 6.424581005586592e-07,
"loss": 0.7960847616195679,
"step": 116
},
{
"epoch": 0.09932659932659933,
"grad_norm": 4.712627410888672,
"learning_rate": 6.536312849162011e-07,
"loss": 1.2441091537475586,
"step": 118
},
{
"epoch": 0.10101010101010101,
"grad_norm": 5.5220794677734375,
"learning_rate": 6.64804469273743e-07,
"loss": 1.0892267227172852,
"step": 120
},
{
"epoch": 0.1026936026936027,
"grad_norm": 10.08218765258789,
"learning_rate": 6.759776536312849e-07,
"loss": 1.266754150390625,
"step": 122
},
{
"epoch": 0.10437710437710437,
"grad_norm": 7.951529026031494,
"learning_rate": 6.871508379888268e-07,
"loss": 0.8909415006637573,
"step": 124
},
{
"epoch": 0.10606060606060606,
"grad_norm": 3.5433144569396973,
"learning_rate": 6.983240223463687e-07,
"loss": 0.7614157795906067,
"step": 126
},
{
"epoch": 0.10774410774410774,
"grad_norm": 84.19695281982422,
"learning_rate": 7.094972067039106e-07,
"loss": 1.1203527450561523,
"step": 128
},
{
"epoch": 0.10942760942760943,
"grad_norm": 6.779047966003418,
"learning_rate": 7.206703910614524e-07,
"loss": 1.0394889116287231,
"step": 130
},
{
"epoch": 0.1111111111111111,
"grad_norm": 3.759247303009033,
"learning_rate": 7.318435754189943e-07,
"loss": 0.9934459328651428,
"step": 132
},
{
"epoch": 0.1127946127946128,
"grad_norm": 4.790719032287598,
"learning_rate": 7.430167597765363e-07,
"loss": 1.2970447540283203,
"step": 134
},
{
"epoch": 0.11447811447811448,
"grad_norm": 11.66688346862793,
"learning_rate": 7.541899441340782e-07,
"loss": 1.2734112739562988,
"step": 136
},
{
"epoch": 0.11616161616161616,
"grad_norm": 5.437692642211914,
"learning_rate": 7.653631284916201e-07,
"loss": 1.7463512420654297,
"step": 138
},
{
"epoch": 0.11784511784511785,
"grad_norm": 2.954306125640869,
"learning_rate": 7.76536312849162e-07,
"loss": 1.2036831378936768,
"step": 140
},
{
"epoch": 0.11952861952861953,
"grad_norm": 3.9827589988708496,
"learning_rate": 7.877094972067039e-07,
"loss": 1.1270943880081177,
"step": 142
},
{
"epoch": 0.12121212121212122,
"grad_norm": 19.19826316833496,
"learning_rate": 7.988826815642458e-07,
"loss": 1.0638954639434814,
"step": 144
},
{
"epoch": 0.12289562289562289,
"grad_norm": 2.969254970550537,
"learning_rate": 8.100558659217876e-07,
"loss": 1.2084304094314575,
"step": 146
},
{
"epoch": 0.12457912457912458,
"grad_norm": 3.5464372634887695,
"learning_rate": 8.212290502793295e-07,
"loss": 1.0377205610275269,
"step": 148
},
{
"epoch": 0.12626262626262627,
"grad_norm": 26.851030349731445,
"learning_rate": 8.324022346368714e-07,
"loss": 1.298867106437683,
"step": 150
},
{
"epoch": 0.12794612794612795,
"grad_norm": 12.729865074157715,
"learning_rate": 8.435754189944134e-07,
"loss": 1.0469536781311035,
"step": 152
},
{
"epoch": 0.12962962962962962,
"grad_norm": 39.720340728759766,
"learning_rate": 8.547486033519553e-07,
"loss": 1.3842543363571167,
"step": 154
},
{
"epoch": 0.13131313131313133,
"grad_norm": 30.861583709716797,
"learning_rate": 8.659217877094972e-07,
"loss": 1.2696869373321533,
"step": 156
},
{
"epoch": 0.132996632996633,
"grad_norm": 2.758213520050049,
"learning_rate": 8.770949720670391e-07,
"loss": 1.1152485609054565,
"step": 158
},
{
"epoch": 0.13468013468013468,
"grad_norm": 5.129064559936523,
"learning_rate": 8.88268156424581e-07,
"loss": 1.21260666847229,
"step": 160
},
{
"epoch": 0.13636363636363635,
"grad_norm": 2.200296640396118,
"learning_rate": 8.994413407821229e-07,
"loss": 1.0739009380340576,
"step": 162
},
{
"epoch": 0.13804713804713806,
"grad_norm": 22.802173614501953,
"learning_rate": 9.106145251396647e-07,
"loss": 1.0534250736236572,
"step": 164
},
{
"epoch": 0.13973063973063973,
"grad_norm": 8.334705352783203,
"learning_rate": 9.217877094972066e-07,
"loss": 0.9987061023712158,
"step": 166
},
{
"epoch": 0.1414141414141414,
"grad_norm": 3.1446645259857178,
"learning_rate": 9.329608938547485e-07,
"loss": 1.2239556312561035,
"step": 168
},
{
"epoch": 0.14309764309764308,
"grad_norm": 11.334406852722168,
"learning_rate": 9.441340782122904e-07,
"loss": 1.1194162368774414,
"step": 170
},
{
"epoch": 0.1447811447811448,
"grad_norm": 3.408362865447998,
"learning_rate": 9.553072625698324e-07,
"loss": 1.085777997970581,
"step": 172
},
{
"epoch": 0.14646464646464646,
"grad_norm": 6.2441534996032715,
"learning_rate": 9.664804469273742e-07,
"loss": 0.7717651128768921,
"step": 174
},
{
"epoch": 0.14814814814814814,
"grad_norm": 3.749255895614624,
"learning_rate": 9.776536312849163e-07,
"loss": 1.1312694549560547,
"step": 176
},
{
"epoch": 0.14983164983164984,
"grad_norm": 3.902320384979248,
"learning_rate": 9.888268156424581e-07,
"loss": 1.3509280681610107,
"step": 178
},
{
"epoch": 0.15151515151515152,
"grad_norm": 6.110651969909668,
"learning_rate": 1e-06,
"loss": 1.075784683227539,
"step": 180
},
{
"epoch": 0.1531986531986532,
"grad_norm": 9.884479522705078,
"learning_rate": 9.999992247803292e-07,
"loss": 1.4511303901672363,
"step": 182
},
{
"epoch": 0.15488215488215487,
"grad_norm": 22.860551834106445,
"learning_rate": 9.999968991239885e-07,
"loss": 1.0601496696472168,
"step": 184
},
{
"epoch": 0.15656565656565657,
"grad_norm": 47.76069641113281,
"learning_rate": 9.9999302303899e-07,
"loss": 1.175671100616455,
"step": 186
},
{
"epoch": 0.15824915824915825,
"grad_norm": 7.632693290710449,
"learning_rate": 9.999875965386889e-07,
"loss": 0.9617436528205872,
"step": 188
},
{
"epoch": 0.15993265993265993,
"grad_norm": 14.18217945098877,
"learning_rate": 9.999806196417815e-07,
"loss": 0.8225744962692261,
"step": 190
},
{
"epoch": 0.16161616161616163,
"grad_norm": 3.5702500343322754,
"learning_rate": 9.999720923723065e-07,
"loss": 1.3951547145843506,
"step": 192
},
{
"epoch": 0.1632996632996633,
"grad_norm": 6.512271881103516,
"learning_rate": 9.999620147596435e-07,
"loss": 1.3134064674377441,
"step": 194
},
{
"epoch": 0.16498316498316498,
"grad_norm": 4.347053050994873,
"learning_rate": 9.999503868385147e-07,
"loss": 1.1201355457305908,
"step": 196
},
{
"epoch": 0.16666666666666666,
"grad_norm": 4.274275779724121,
"learning_rate": 9.999372086489827e-07,
"loss": 1.2217128276824951,
"step": 198
},
{
"epoch": 0.16835016835016836,
"grad_norm": 36.957733154296875,
"learning_rate": 9.999224802364522e-07,
"loss": 0.9089727997779846,
"step": 200
},
{
"epoch": 0.17003367003367004,
"grad_norm": 10.688148498535156,
"learning_rate": 9.999062016516683e-07,
"loss": 0.9836642742156982,
"step": 202
},
{
"epoch": 0.1717171717171717,
"grad_norm": 5.000755310058594,
"learning_rate": 9.998883729507182e-07,
"loss": 1.0589679479599,
"step": 204
},
{
"epoch": 0.1734006734006734,
"grad_norm": 3.18554425239563,
"learning_rate": 9.998689941950286e-07,
"loss": 1.1106410026550293,
"step": 206
},
{
"epoch": 0.1750841750841751,
"grad_norm": 3.399953842163086,
"learning_rate": 9.99848065451368e-07,
"loss": 1.259597897529602,
"step": 208
},
{
"epoch": 0.17676767676767677,
"grad_norm": 34.06399917602539,
"learning_rate": 9.998255867918447e-07,
"loss": 0.7958086729049683,
"step": 210
},
{
"epoch": 0.17845117845117844,
"grad_norm": 11.635184288024902,
"learning_rate": 9.99801558293907e-07,
"loss": 0.974760115146637,
"step": 212
},
{
"epoch": 0.18013468013468015,
"grad_norm": 3.804048776626587,
"learning_rate": 9.997759800403432e-07,
"loss": 1.4053202867507935,
"step": 214
},
{
"epoch": 0.18181818181818182,
"grad_norm": 3.969377279281616,
"learning_rate": 9.99748852119281e-07,
"loss": 0.8879891633987427,
"step": 216
},
{
"epoch": 0.1835016835016835,
"grad_norm": 13.216470718383789,
"learning_rate": 9.997201746241877e-07,
"loss": 0.7051749229431152,
"step": 218
},
{
"epoch": 0.18518518518518517,
"grad_norm": 21.844314575195312,
"learning_rate": 9.996899476538694e-07,
"loss": 1.4015233516693115,
"step": 220
},
{
"epoch": 0.18686868686868688,
"grad_norm": 4.534096717834473,
"learning_rate": 9.996581713124706e-07,
"loss": 0.972633957862854,
"step": 222
},
{
"epoch": 0.18855218855218855,
"grad_norm": 3.273697853088379,
"learning_rate": 9.99624845709474e-07,
"loss": 1.2434642314910889,
"step": 224
},
{
"epoch": 0.19023569023569023,
"grad_norm": 4.797500133514404,
"learning_rate": 9.995899709597006e-07,
"loss": 1.0040223598480225,
"step": 226
},
{
"epoch": 0.1919191919191919,
"grad_norm": 12.437410354614258,
"learning_rate": 9.995535471833086e-07,
"loss": 1.2370095252990723,
"step": 228
},
{
"epoch": 0.1936026936026936,
"grad_norm": 7.460165023803711,
"learning_rate": 9.995155745057929e-07,
"loss": 1.4212405681610107,
"step": 230
},
{
"epoch": 0.19528619528619529,
"grad_norm": 9.647342681884766,
"learning_rate": 9.994760530579857e-07,
"loss": 1.1002936363220215,
"step": 232
},
{
"epoch": 0.19696969696969696,
"grad_norm": 11.12820053100586,
"learning_rate": 9.994349829760549e-07,
"loss": 1.237018346786499,
"step": 234
},
{
"epoch": 0.19865319865319866,
"grad_norm": 5.350140571594238,
"learning_rate": 9.993923644015042e-07,
"loss": 1.0195953845977783,
"step": 236
},
{
"epoch": 0.20033670033670034,
"grad_norm": 3.050861358642578,
"learning_rate": 9.993481974811725e-07,
"loss": 1.22686767578125,
"step": 238
},
{
"epoch": 0.20202020202020202,
"grad_norm": 7.857388019561768,
"learning_rate": 9.993024823672335e-07,
"loss": 1.0028936862945557,
"step": 240
},
{
"epoch": 0.2037037037037037,
"grad_norm": 7.335727214813232,
"learning_rate": 9.99255219217195e-07,
"loss": 1.2266963720321655,
"step": 242
},
{
"epoch": 0.2053872053872054,
"grad_norm": 6.673895359039307,
"learning_rate": 9.992064081938982e-07,
"loss": 1.0401980876922607,
"step": 244
},
{
"epoch": 0.20707070707070707,
"grad_norm": 11.121489524841309,
"learning_rate": 9.99156049465518e-07,
"loss": 0.704534649848938,
"step": 246
},
{
"epoch": 0.20875420875420875,
"grad_norm": 6.052087306976318,
"learning_rate": 9.99104143205561e-07,
"loss": 1.2733914852142334,
"step": 248
},
{
"epoch": 0.21043771043771045,
"grad_norm": 8.680047988891602,
"learning_rate": 9.990506895928664e-07,
"loss": 1.0285900831222534,
"step": 250
},
{
"epoch": 0.21212121212121213,
"grad_norm": 3.77591609954834,
"learning_rate": 9.989956888116044e-07,
"loss": 0.925588071346283,
"step": 252
},
{
"epoch": 0.2138047138047138,
"grad_norm": 17.994216918945312,
"learning_rate": 9.989391410512756e-07,
"loss": 1.09348726272583,
"step": 254
},
{
"epoch": 0.21548821548821548,
"grad_norm": 3.3857617378234863,
"learning_rate": 9.988810465067111e-07,
"loss": 1.2375221252441406,
"step": 256
},
{
"epoch": 0.21717171717171718,
"grad_norm": 2.9572367668151855,
"learning_rate": 9.988214053780707e-07,
"loss": 0.8651703000068665,
"step": 258
},
{
"epoch": 0.21885521885521886,
"grad_norm": 3.476825714111328,
"learning_rate": 9.987602178708435e-07,
"loss": 1.0651121139526367,
"step": 260
},
{
"epoch": 0.22053872053872053,
"grad_norm": 3.713834047317505,
"learning_rate": 9.986974841958463e-07,
"loss": 1.0779788494110107,
"step": 262
},
{
"epoch": 0.2222222222222222,
"grad_norm": 4.407167911529541,
"learning_rate": 9.986332045692227e-07,
"loss": 1.1462655067443848,
"step": 264
},
{
"epoch": 0.2239057239057239,
"grad_norm": 3.255230665206909,
"learning_rate": 9.98567379212443e-07,
"loss": 1.245474100112915,
"step": 266
},
{
"epoch": 0.2255892255892256,
"grad_norm": 30.358354568481445,
"learning_rate": 9.985000083523037e-07,
"loss": 0.6667277216911316,
"step": 268
},
{
"epoch": 0.22727272727272727,
"grad_norm": 5.595312595367432,
"learning_rate": 9.984310922209254e-07,
"loss": 1.0221211910247803,
"step": 270
},
{
"epoch": 0.22895622895622897,
"grad_norm": 16.317052841186523,
"learning_rate": 9.983606310557533e-07,
"loss": 1.3395957946777344,
"step": 272
},
{
"epoch": 0.23063973063973064,
"grad_norm": 11.24964714050293,
"learning_rate": 9.982886250995556e-07,
"loss": 1.1954050064086914,
"step": 274
},
{
"epoch": 0.23232323232323232,
"grad_norm": 44.74198913574219,
"learning_rate": 9.982150746004232e-07,
"loss": 0.9265189170837402,
"step": 276
},
{
"epoch": 0.234006734006734,
"grad_norm": 3.8386383056640625,
"learning_rate": 9.981399798117685e-07,
"loss": 1.198085069656372,
"step": 278
},
{
"epoch": 0.2356902356902357,
"grad_norm": 44.37248992919922,
"learning_rate": 9.980633409923247e-07,
"loss": 1.0136717557907104,
"step": 280
},
{
"epoch": 0.23737373737373738,
"grad_norm": 7.57785701751709,
"learning_rate": 9.979851584061449e-07,
"loss": 0.9574207663536072,
"step": 282
},
{
"epoch": 0.23905723905723905,
"grad_norm": 8.24811840057373,
"learning_rate": 9.97905432322601e-07,
"loss": 1.3114678859710693,
"step": 284
},
{
"epoch": 0.24074074074074073,
"grad_norm": 5.775442600250244,
"learning_rate": 9.978241630163826e-07,
"loss": 0.9548346400260925,
"step": 286
},
{
"epoch": 0.24242424242424243,
"grad_norm": 11.149243354797363,
"learning_rate": 9.977413507674968e-07,
"loss": 0.8632457852363586,
"step": 288
},
{
"epoch": 0.2441077441077441,
"grad_norm": 78.07566833496094,
"learning_rate": 9.976569958612667e-07,
"loss": 1.2243592739105225,
"step": 290
},
{
"epoch": 0.24579124579124578,
"grad_norm": 4.65302848815918,
"learning_rate": 9.975710985883304e-07,
"loss": 0.6913841366767883,
"step": 292
},
{
"epoch": 0.2474747474747475,
"grad_norm": 15.239048957824707,
"learning_rate": 9.974836592446402e-07,
"loss": 1.3095204830169678,
"step": 294
},
{
"epoch": 0.24915824915824916,
"grad_norm": 13.059560775756836,
"learning_rate": 9.973946781314614e-07,
"loss": 1.106144666671753,
"step": 296
},
{
"epoch": 0.25084175084175087,
"grad_norm": 5.432850360870361,
"learning_rate": 9.973041555553712e-07,
"loss": 0.6466901898384094,
"step": 298
},
{
"epoch": 0.25252525252525254,
"grad_norm": 9.237662315368652,
"learning_rate": 9.972120918282583e-07,
"loss": 0.8612852096557617,
"step": 300
},
{
"epoch": 0.2542087542087542,
"grad_norm": 19.600900650024414,
"learning_rate": 9.971184872673208e-07,
"loss": 1.105349063873291,
"step": 302
},
{
"epoch": 0.2558922558922559,
"grad_norm": 40.91580581665039,
"learning_rate": 9.970233421950659e-07,
"loss": 0.9198004603385925,
"step": 304
},
{
"epoch": 0.25757575757575757,
"grad_norm": 4.66962194442749,
"learning_rate": 9.969266569393081e-07,
"loss": 1.3845856189727783,
"step": 306
},
{
"epoch": 0.25925925925925924,
"grad_norm": 60.427490234375,
"learning_rate": 9.968284318331692e-07,
"loss": 1.1327593326568604,
"step": 308
},
{
"epoch": 0.2609427609427609,
"grad_norm": 22.725788116455078,
"learning_rate": 9.967286672150757e-07,
"loss": 1.1523091793060303,
"step": 310
},
{
"epoch": 0.26262626262626265,
"grad_norm": 24.43414878845215,
"learning_rate": 9.96627363428759e-07,
"loss": 1.234093189239502,
"step": 312
},
{
"epoch": 0.26430976430976433,
"grad_norm": 3.773989200592041,
"learning_rate": 9.965245208232528e-07,
"loss": 1.123462200164795,
"step": 314
},
{
"epoch": 0.265993265993266,
"grad_norm": 4.06792688369751,
"learning_rate": 9.964201397528935e-07,
"loss": 1.274748682975769,
"step": 316
},
{
"epoch": 0.2676767676767677,
"grad_norm": 6.183606147766113,
"learning_rate": 9.963142205773178e-07,
"loss": 1.0359277725219727,
"step": 318
},
{
"epoch": 0.26936026936026936,
"grad_norm": 17.0985164642334,
"learning_rate": 9.962067636614617e-07,
"loss": 0.7821587920188904,
"step": 320
},
{
"epoch": 0.27104377104377103,
"grad_norm": 8.39433765411377,
"learning_rate": 9.960977693755597e-07,
"loss": 1.007806420326233,
"step": 322
},
{
"epoch": 0.2727272727272727,
"grad_norm": 6.79010534286499,
"learning_rate": 9.959872380951425e-07,
"loss": 1.306843638420105,
"step": 324
},
{
"epoch": 0.27441077441077444,
"grad_norm": 3.4290051460266113,
"learning_rate": 9.958751702010373e-07,
"loss": 1.0737717151641846,
"step": 326
},
{
"epoch": 0.2760942760942761,
"grad_norm": 3.778372287750244,
"learning_rate": 9.957615660793653e-07,
"loss": 0.842218816280365,
"step": 328
},
{
"epoch": 0.2777777777777778,
"grad_norm": 4.193020343780518,
"learning_rate": 9.9564642612154e-07,
"loss": 0.9259565472602844,
"step": 330
},
{
"epoch": 0.27946127946127947,
"grad_norm": 5.208146572113037,
"learning_rate": 9.955297507242673e-07,
"loss": 1.1419891119003296,
"step": 332
},
{
"epoch": 0.28114478114478114,
"grad_norm": 5.717302322387695,
"learning_rate": 9.95411540289543e-07,
"loss": 1.1330386400222778,
"step": 334
},
{
"epoch": 0.2828282828282828,
"grad_norm": 5.831217288970947,
"learning_rate": 9.952917952246516e-07,
"loss": 1.0413146018981934,
"step": 336
},
{
"epoch": 0.2845117845117845,
"grad_norm": 3.645052433013916,
"learning_rate": 9.951705159421654e-07,
"loss": 1.235117793083191,
"step": 338
},
{
"epoch": 0.28619528619528617,
"grad_norm": 22.020658493041992,
"learning_rate": 9.950477028599428e-07,
"loss": 1.043231725692749,
"step": 340
},
{
"epoch": 0.2878787878787879,
"grad_norm": 21.875652313232422,
"learning_rate": 9.94923356401126e-07,
"loss": 1.175392985343933,
"step": 342
},
{
"epoch": 0.2895622895622896,
"grad_norm": 27.17024803161621,
"learning_rate": 9.947974769941413e-07,
"loss": 0.9123649001121521,
"step": 344
},
{
"epoch": 0.29124579124579125,
"grad_norm": 6.494509220123291,
"learning_rate": 9.946700650726963e-07,
"loss": 1.1428461074829102,
"step": 346
},
{
"epoch": 0.29292929292929293,
"grad_norm": 6.450991630554199,
"learning_rate": 9.94541121075778e-07,
"loss": 1.08597731590271,
"step": 348
},
{
"epoch": 0.2946127946127946,
"grad_norm": 4.014670372009277,
"learning_rate": 9.944106454476535e-07,
"loss": 0.8208044171333313,
"step": 350
},
{
"epoch": 0.2962962962962963,
"grad_norm": 5.806086540222168,
"learning_rate": 9.94278638637866e-07,
"loss": 0.6253402829170227,
"step": 352
},
{
"epoch": 0.29797979797979796,
"grad_norm": 3.053389310836792,
"learning_rate": 9.941451011012342e-07,
"loss": 1.0509334802627563,
"step": 354
},
{
"epoch": 0.2996632996632997,
"grad_norm": 7.8727521896362305,
"learning_rate": 9.940100332978513e-07,
"loss": 1.0956045389175415,
"step": 356
},
{
"epoch": 0.30134680134680136,
"grad_norm": 32.331748962402344,
"learning_rate": 9.938734356930828e-07,
"loss": 1.004880666732788,
"step": 358
},
{
"epoch": 0.30303030303030304,
"grad_norm": 4.172276020050049,
"learning_rate": 9.93735308757565e-07,
"loss": 0.8379921317100525,
"step": 360
},
{
"epoch": 0.3047138047138047,
"grad_norm": 6.15704870223999,
"learning_rate": 9.93595652967203e-07,
"loss": 1.0078097581863403,
"step": 362
},
{
"epoch": 0.3063973063973064,
"grad_norm": 4.966274261474609,
"learning_rate": 9.9345446880317e-07,
"loss": 1.2672780752182007,
"step": 364
},
{
"epoch": 0.30808080808080807,
"grad_norm": 8.712943077087402,
"learning_rate": 9.933117567519048e-07,
"loss": 0.8534368276596069,
"step": 366
},
{
"epoch": 0.30976430976430974,
"grad_norm": 6.715219020843506,
"learning_rate": 9.931675173051105e-07,
"loss": 0.8929988145828247,
"step": 368
},
{
"epoch": 0.3114478114478115,
"grad_norm": 8.526223182678223,
"learning_rate": 9.930217509597527e-07,
"loss": 1.1088082790374756,
"step": 370
},
{
"epoch": 0.31313131313131315,
"grad_norm": 13.495247840881348,
"learning_rate": 9.928744582180574e-07,
"loss": 1.2500221729278564,
"step": 372
},
{
"epoch": 0.3148148148148148,
"grad_norm": 23.23642921447754,
"learning_rate": 9.927256395875107e-07,
"loss": 0.7106721997261047,
"step": 374
},
{
"epoch": 0.3164983164983165,
"grad_norm": 6.651264190673828,
"learning_rate": 9.925752955808548e-07,
"loss": 1.0243923664093018,
"step": 376
},
{
"epoch": 0.3181818181818182,
"grad_norm": 5.95202112197876,
"learning_rate": 9.924234267160885e-07,
"loss": 1.370633840560913,
"step": 378
},
{
"epoch": 0.31986531986531985,
"grad_norm": 11.883193016052246,
"learning_rate": 9.922700335164638e-07,
"loss": 0.7322716116905212,
"step": 380
},
{
"epoch": 0.32154882154882153,
"grad_norm": 7.471388816833496,
"learning_rate": 9.92115116510485e-07,
"loss": 0.913710355758667,
"step": 382
},
{
"epoch": 0.32323232323232326,
"grad_norm": 44.468502044677734,
"learning_rate": 9.919586762319058e-07,
"loss": 1.1375393867492676,
"step": 384
},
{
"epoch": 0.32491582491582494,
"grad_norm": 73.37066650390625,
"learning_rate": 9.918007132197294e-07,
"loss": 0.750845193862915,
"step": 386
},
{
"epoch": 0.3265993265993266,
"grad_norm": 13.170440673828125,
"learning_rate": 9.916412280182047e-07,
"loss": 0.9285147190093994,
"step": 388
},
{
"epoch": 0.3282828282828283,
"grad_norm": 27.329137802124023,
"learning_rate": 9.91480221176825e-07,
"loss": 1.1141570806503296,
"step": 390
},
{
"epoch": 0.32996632996632996,
"grad_norm": 23.576858520507812,
"learning_rate": 9.913176932503269e-07,
"loss": 0.8426070809364319,
"step": 392
},
{
"epoch": 0.33164983164983164,
"grad_norm": 4.582382678985596,
"learning_rate": 9.911536447986874e-07,
"loss": 1.3466606140136719,
"step": 394
},
{
"epoch": 0.3333333333333333,
"grad_norm": 18.690176010131836,
"learning_rate": 9.909880763871225e-07,
"loss": 1.2158761024475098,
"step": 396
},
{
"epoch": 0.335016835016835,
"grad_norm": 12.741125106811523,
"learning_rate": 9.90820988586085e-07,
"loss": 0.7843135595321655,
"step": 398
},
{
"epoch": 0.3367003367003367,
"grad_norm": 8.261248588562012,
"learning_rate": 9.906523819712627e-07,
"loss": 0.9648294448852539,
"step": 400
},
{
"epoch": 0.3383838383838384,
"grad_norm": 13.866211891174316,
"learning_rate": 9.904822571235764e-07,
"loss": 0.9860712289810181,
"step": 402
},
{
"epoch": 0.3400673400673401,
"grad_norm": 7.611033916473389,
"learning_rate": 9.903106146291776e-07,
"loss": 1.0380196571350098,
"step": 404
},
{
"epoch": 0.34175084175084175,
"grad_norm": 4.44096565246582,
"learning_rate": 9.901374550794471e-07,
"loss": 1.0885226726531982,
"step": 406
},
{
"epoch": 0.3434343434343434,
"grad_norm": 7.336009502410889,
"learning_rate": 9.899627790709922e-07,
"loss": 0.978155255317688,
"step": 408
},
{
"epoch": 0.3451178451178451,
"grad_norm": 35.349571228027344,
"learning_rate": 9.897865872056454e-07,
"loss": 0.5597323179244995,
"step": 410
},
{
"epoch": 0.3468013468013468,
"grad_norm": 5.807060718536377,
"learning_rate": 9.896088800904617e-07,
"loss": 0.8961684703826904,
"step": 412
},
{
"epoch": 0.3484848484848485,
"grad_norm": 18.415029525756836,
"learning_rate": 9.894296583377171e-07,
"loss": 0.9247993230819702,
"step": 414
},
{
"epoch": 0.3501683501683502,
"grad_norm": 16.985078811645508,
"learning_rate": 9.892489225649058e-07,
"loss": 1.2044103145599365,
"step": 416
},
{
"epoch": 0.35185185185185186,
"grad_norm": 6.910268306732178,
"learning_rate": 9.890666733947386e-07,
"loss": 0.7405315637588501,
"step": 418
},
{
"epoch": 0.35353535353535354,
"grad_norm": 9.06907844543457,
"learning_rate": 9.888829114551404e-07,
"loss": 0.9250643253326416,
"step": 420
},
{
"epoch": 0.3552188552188552,
"grad_norm": 10.192124366760254,
"learning_rate": 9.886976373792488e-07,
"loss": 1.1218069791793823,
"step": 422
},
{
"epoch": 0.3569023569023569,
"grad_norm": 9.159024238586426,
"learning_rate": 9.885108518054106e-07,
"loss": 0.6351463794708252,
"step": 424
},
{
"epoch": 0.35858585858585856,
"grad_norm": 29.38273811340332,
"learning_rate": 9.883225553771807e-07,
"loss": 1.0669465065002441,
"step": 426
},
{
"epoch": 0.3602693602693603,
"grad_norm": 8.669297218322754,
"learning_rate": 9.881327487433198e-07,
"loss": 0.8117149472236633,
"step": 428
},
{
"epoch": 0.36195286195286197,
"grad_norm": 6.67222785949707,
"learning_rate": 9.879414325577916e-07,
"loss": 1.2592154741287231,
"step": 430
},
{
"epoch": 0.36363636363636365,
"grad_norm": 6.638124942779541,
"learning_rate": 9.877486074797602e-07,
"loss": 0.9993456602096558,
"step": 432
},
{
"epoch": 0.3653198653198653,
"grad_norm": 3.7449495792388916,
"learning_rate": 9.8755427417359e-07,
"loss": 0.8662674427032471,
"step": 434
},
{
"epoch": 0.367003367003367,
"grad_norm": 4.553740978240967,
"learning_rate": 9.873584333088407e-07,
"loss": 1.0476055145263672,
"step": 436
},
{
"epoch": 0.3686868686868687,
"grad_norm": 9.034341812133789,
"learning_rate": 9.871610855602662e-07,
"loss": 1.1130859851837158,
"step": 438
},
{
"epoch": 0.37037037037037035,
"grad_norm": 7.609111785888672,
"learning_rate": 9.869622316078128e-07,
"loss": 0.9781308770179749,
"step": 440
},
{
"epoch": 0.3720538720538721,
"grad_norm": 15.675320625305176,
"learning_rate": 9.86761872136616e-07,
"loss": 0.9868993759155273,
"step": 442
},
{
"epoch": 0.37373737373737376,
"grad_norm": 4.52480936050415,
"learning_rate": 9.865600078369985e-07,
"loss": 0.7887587547302246,
"step": 444
},
{
"epoch": 0.37542087542087543,
"grad_norm": 21.339006423950195,
"learning_rate": 9.863566394044677e-07,
"loss": 0.6558203101158142,
"step": 446
},
{
"epoch": 0.3771043771043771,
"grad_norm": 5.116230010986328,
"learning_rate": 9.861517675397135e-07,
"loss": 1.1714262962341309,
"step": 448
},
{
"epoch": 0.3787878787878788,
"grad_norm": 16.112041473388672,
"learning_rate": 9.859453929486054e-07,
"loss": 1.1047420501708984,
"step": 450
},
{
"epoch": 0.38047138047138046,
"grad_norm": 3.787045478820801,
"learning_rate": 9.857375163421912e-07,
"loss": 0.7425003051757812,
"step": 452
},
{
"epoch": 0.38215488215488214,
"grad_norm": 11.478412628173828,
"learning_rate": 9.855281384366928e-07,
"loss": 1.0151433944702148,
"step": 454
},
{
"epoch": 0.3838383838383838,
"grad_norm": 3.6095988750457764,
"learning_rate": 9.853172599535054e-07,
"loss": 0.8090977668762207,
"step": 456
},
{
"epoch": 0.38552188552188554,
"grad_norm": 7.952422618865967,
"learning_rate": 9.85104881619194e-07,
"loss": 0.9961310625076294,
"step": 458
},
{
"epoch": 0.3872053872053872,
"grad_norm": 2.0787007808685303,
"learning_rate": 9.848910041654915e-07,
"loss": 1.1424083709716797,
"step": 460
},
{
"epoch": 0.3888888888888889,
"grad_norm": 3.276982545852661,
"learning_rate": 9.846756283292955e-07,
"loss": 0.8972825407981873,
"step": 462
},
{
"epoch": 0.39057239057239057,
"grad_norm": 6.26957368850708,
"learning_rate": 9.844587548526665e-07,
"loss": 0.8542879223823547,
"step": 464
},
{
"epoch": 0.39225589225589225,
"grad_norm": 32.88930892944336,
"learning_rate": 9.842403844828249e-07,
"loss": 0.9769890308380127,
"step": 466
},
{
"epoch": 0.3939393939393939,
"grad_norm": 10.898834228515625,
"learning_rate": 9.840205179721486e-07,
"loss": 0.9689866304397583,
"step": 468
},
{
"epoch": 0.3956228956228956,
"grad_norm": 3.45035457611084,
"learning_rate": 9.837991560781698e-07,
"loss": 0.9729927778244019,
"step": 470
},
{
"epoch": 0.39730639730639733,
"grad_norm": 7.222962379455566,
"learning_rate": 9.835762995635739e-07,
"loss": 0.8332297801971436,
"step": 472
},
{
"epoch": 0.398989898989899,
"grad_norm": 3.249415636062622,
"learning_rate": 9.833519491961951e-07,
"loss": 1.0173261165618896,
"step": 474
},
{
"epoch": 0.4006734006734007,
"grad_norm": 6.285678863525391,
"learning_rate": 9.831261057490148e-07,
"loss": 0.7735811471939087,
"step": 476
},
{
"epoch": 0.40235690235690236,
"grad_norm": 3.5245249271392822,
"learning_rate": 9.82898770000159e-07,
"loss": 0.9958957433700562,
"step": 478
},
{
"epoch": 0.40404040404040403,
"grad_norm": 13.678420066833496,
"learning_rate": 9.826699427328944e-07,
"loss": 1.0717885494232178,
"step": 480
},
{
"epoch": 0.4057239057239057,
"grad_norm": 12.059322357177734,
"learning_rate": 9.824396247356276e-07,
"loss": 1.0886049270629883,
"step": 482
},
{
"epoch": 0.4074074074074074,
"grad_norm": 9.42127513885498,
"learning_rate": 9.822078168019012e-07,
"loss": 0.8954146504402161,
"step": 484
},
{
"epoch": 0.4090909090909091,
"grad_norm": 13.108272552490234,
"learning_rate": 9.819745197303907e-07,
"loss": 0.881049633026123,
"step": 486
},
{
"epoch": 0.4107744107744108,
"grad_norm": 3.574754238128662,
"learning_rate": 9.817397343249028e-07,
"loss": 1.1146478652954102,
"step": 488
},
{
"epoch": 0.41245791245791247,
"grad_norm": 3.4290618896484375,
"learning_rate": 9.815034613943722e-07,
"loss": 1.118224859237671,
"step": 490
},
{
"epoch": 0.41414141414141414,
"grad_norm": 16.978740692138672,
"learning_rate": 9.812657017528584e-07,
"loss": 1.0728644132614136,
"step": 492
},
{
"epoch": 0.4158249158249158,
"grad_norm": 5.449537754058838,
"learning_rate": 9.810264562195432e-07,
"loss": 0.9440809488296509,
"step": 494
},
{
"epoch": 0.4175084175084175,
"grad_norm": 2.756265640258789,
"learning_rate": 9.807857256187283e-07,
"loss": 1.1065900325775146,
"step": 496
},
{
"epoch": 0.41919191919191917,
"grad_norm": 3.4030373096466064,
"learning_rate": 9.805435107798322e-07,
"loss": 1.0974758863449097,
"step": 498
},
{
"epoch": 0.4208754208754209,
"grad_norm": 9.233179092407227,
"learning_rate": 9.802998125373864e-07,
"loss": 0.851800799369812,
"step": 500
},
{
"epoch": 0.4225589225589226,
"grad_norm": 11.157843589782715,
"learning_rate": 9.800546317310343e-07,
"loss": 0.6602354645729065,
"step": 502
},
{
"epoch": 0.42424242424242425,
"grad_norm": 15.149531364440918,
"learning_rate": 9.798079692055267e-07,
"loss": 0.9472991228103638,
"step": 504
},
{
"epoch": 0.42592592592592593,
"grad_norm": 30.058595657348633,
"learning_rate": 9.7955982581072e-07,
"loss": 0.9938538670539856,
"step": 506
},
{
"epoch": 0.4276094276094276,
"grad_norm": 23.41927719116211,
"learning_rate": 9.793102024015724e-07,
"loss": 1.4200940132141113,
"step": 508
},
{
"epoch": 0.4292929292929293,
"grad_norm": 3.9220423698425293,
"learning_rate": 9.790590998381417e-07,
"loss": 1.0478514432907104,
"step": 510
},
{
"epoch": 0.43097643097643096,
"grad_norm": 3.723065137863159,
"learning_rate": 9.788065189855817e-07,
"loss": 1.2064735889434814,
"step": 512
},
{
"epoch": 0.43265993265993263,
"grad_norm": 3.486267566680908,
"learning_rate": 9.7855246071414e-07,
"loss": 1.140267014503479,
"step": 514
},
{
"epoch": 0.43434343434343436,
"grad_norm": 8.95257568359375,
"learning_rate": 9.78296925899154e-07,
"loss": 1.0755705833435059,
"step": 516
},
{
"epoch": 0.43602693602693604,
"grad_norm": 4.213111400604248,
"learning_rate": 9.780399154210487e-07,
"loss": 1.0637681484222412,
"step": 518
},
{
"epoch": 0.4377104377104377,
"grad_norm": 26.23670196533203,
"learning_rate": 9.777814301653336e-07,
"loss": 0.9591152667999268,
"step": 520
},
{
"epoch": 0.4393939393939394,
"grad_norm": 2.839754343032837,
"learning_rate": 9.775214710225987e-07,
"loss": 0.8415237665176392,
"step": 522
},
{
"epoch": 0.44107744107744107,
"grad_norm": 5.952809810638428,
"learning_rate": 9.77260038888513e-07,
"loss": 1.133270502090454,
"step": 524
},
{
"epoch": 0.44276094276094274,
"grad_norm": 8.995283126831055,
"learning_rate": 9.769971346638203e-07,
"loss": 0.7777677774429321,
"step": 526
},
{
"epoch": 0.4444444444444444,
"grad_norm": 3.4373066425323486,
"learning_rate": 9.767327592543359e-07,
"loss": 1.2248082160949707,
"step": 528
},
{
"epoch": 0.44612794612794615,
"grad_norm": 7.905541896820068,
"learning_rate": 9.764669135709443e-07,
"loss": 0.8326348066329956,
"step": 530
},
{
"epoch": 0.4478114478114478,
"grad_norm": 2.997097969055176,
"learning_rate": 9.76199598529596e-07,
"loss": 0.8697119355201721,
"step": 532
},
{
"epoch": 0.4494949494949495,
"grad_norm": 3.4758172035217285,
"learning_rate": 9.759308150513039e-07,
"loss": 0.9715222716331482,
"step": 534
},
{
"epoch": 0.4511784511784512,
"grad_norm": 4.66405725479126,
"learning_rate": 9.756605640621397e-07,
"loss": 1.2556489706039429,
"step": 536
},
{
"epoch": 0.45286195286195285,
"grad_norm": 11.930469512939453,
"learning_rate": 9.753888464932322e-07,
"loss": 1.1018869876861572,
"step": 538
},
{
"epoch": 0.45454545454545453,
"grad_norm": 13.772843360900879,
"learning_rate": 9.751156632807626e-07,
"loss": 0.8878042101860046,
"step": 540
},
{
"epoch": 0.4562289562289562,
"grad_norm": 4.206384181976318,
"learning_rate": 9.748410153659618e-07,
"loss": 1.0389076471328735,
"step": 542
},
{
"epoch": 0.45791245791245794,
"grad_norm": 3.624582052230835,
"learning_rate": 9.745649036951079e-07,
"loss": 1.1431198120117188,
"step": 544
},
{
"epoch": 0.4595959595959596,
"grad_norm": 12.98609733581543,
"learning_rate": 9.742873292195213e-07,
"loss": 1.1605827808380127,
"step": 546
},
{
"epoch": 0.4612794612794613,
"grad_norm": 7.910975456237793,
"learning_rate": 9.740082928955634e-07,
"loss": 1.3202755451202393,
"step": 548
},
{
"epoch": 0.46296296296296297,
"grad_norm": 5.325044631958008,
"learning_rate": 9.737277956846313e-07,
"loss": 0.9252653121948242,
"step": 550
},
{
"epoch": 0.46464646464646464,
"grad_norm": 14.551304817199707,
"learning_rate": 9.73445838553156e-07,
"loss": 0.876882791519165,
"step": 552
},
{
"epoch": 0.4663299663299663,
"grad_norm": 3.202234983444214,
"learning_rate": 9.731624224725986e-07,
"loss": 1.0558652877807617,
"step": 554
},
{
"epoch": 0.468013468013468,
"grad_norm": 4.0583086013793945,
"learning_rate": 9.728775484194464e-07,
"loss": 0.740475594997406,
"step": 556
},
{
"epoch": 0.4696969696969697,
"grad_norm": 3.9330027103424072,
"learning_rate": 9.725912173752106e-07,
"loss": 1.2117640972137451,
"step": 558
},
{
"epoch": 0.4713804713804714,
"grad_norm": 7.207095146179199,
"learning_rate": 9.723034303264225e-07,
"loss": 0.4382402002811432,
"step": 560
},
{
"epoch": 0.4730639730639731,
"grad_norm": 4.947695255279541,
"learning_rate": 9.72014188264629e-07,
"loss": 0.6228041648864746,
"step": 562
},
{
"epoch": 0.47474747474747475,
"grad_norm": 9.088849067687988,
"learning_rate": 9.71723492186391e-07,
"loss": 1.3076156377792358,
"step": 564
},
{
"epoch": 0.4764309764309764,
"grad_norm": 4.49135160446167,
"learning_rate": 9.714313430932785e-07,
"loss": 1.1357098817825317,
"step": 566
},
{
"epoch": 0.4781144781144781,
"grad_norm": 9.188185691833496,
"learning_rate": 9.711377419918683e-07,
"loss": 0.4768811762332916,
"step": 568
},
{
"epoch": 0.4797979797979798,
"grad_norm": 23.397979736328125,
"learning_rate": 9.708426898937399e-07,
"loss": 1.1221351623535156,
"step": 570
},
{
"epoch": 0.48148148148148145,
"grad_norm": 27.77615737915039,
"learning_rate": 9.705461878154714e-07,
"loss": 0.7149933576583862,
"step": 572
},
{
"epoch": 0.4831649831649832,
"grad_norm": 4.684352874755859,
"learning_rate": 9.702482367786377e-07,
"loss": 0.9776611924171448,
"step": 574
},
{
"epoch": 0.48484848484848486,
"grad_norm": 7.567526817321777,
"learning_rate": 9.699488378098055e-07,
"loss": 0.8799599409103394,
"step": 576
},
{
"epoch": 0.48653198653198654,
"grad_norm": 9.130019187927246,
"learning_rate": 9.696479919405298e-07,
"loss": 1.1031641960144043,
"step": 578
},
{
"epoch": 0.4882154882154882,
"grad_norm": 9.574334144592285,
"learning_rate": 9.693457002073517e-07,
"loss": 0.8267420530319214,
"step": 580
},
{
"epoch": 0.4898989898989899,
"grad_norm": 4.069400787353516,
"learning_rate": 9.69041963651793e-07,
"loss": 1.3716950416564941,
"step": 582
},
{
"epoch": 0.49158249158249157,
"grad_norm": 4.066318988800049,
"learning_rate": 9.68736783320354e-07,
"loss": 1.017892837524414,
"step": 584
},
{
"epoch": 0.49326599326599324,
"grad_norm": 2.714144706726074,
"learning_rate": 9.684301602645098e-07,
"loss": 0.861703097820282,
"step": 586
},
{
"epoch": 0.494949494949495,
"grad_norm": 3.8651719093322754,
"learning_rate": 9.681220955407053e-07,
"loss": 0.6647518873214722,
"step": 588
},
{
"epoch": 0.49663299663299665,
"grad_norm": 3.4340827465057373,
"learning_rate": 9.67812590210353e-07,
"loss": 1.1181421279907227,
"step": 590
},
{
"epoch": 0.4983164983164983,
"grad_norm": 3.8552682399749756,
"learning_rate": 9.675016453398296e-07,
"loss": 1.1666280031204224,
"step": 592
},
{
"epoch": 0.5,
"grad_norm": 13.408713340759277,
"learning_rate": 9.671892620004706e-07,
"loss": 0.8374857902526855,
"step": 594
},
{
"epoch": 0.5016835016835017,
"grad_norm": 7.0116424560546875,
"learning_rate": 9.66875441268568e-07,
"loss": 0.960757851600647,
"step": 596
},
{
"epoch": 0.5033670033670034,
"grad_norm": 2.764244556427002,
"learning_rate": 9.665601842253666e-07,
"loss": 1.3247270584106445,
"step": 598
},
{
"epoch": 0.5050505050505051,
"grad_norm": 13.236382484436035,
"learning_rate": 9.662434919570592e-07,
"loss": 0.8124715685844421,
"step": 600
},
{
"epoch": 0.5067340067340067,
"grad_norm": 39.5108528137207,
"learning_rate": 9.659253655547843e-07,
"loss": 1.0799833536148071,
"step": 602
},
{
"epoch": 0.5084175084175084,
"grad_norm": 13.359992027282715,
"learning_rate": 9.656058061146207e-07,
"loss": 1.0351530313491821,
"step": 604
},
{
"epoch": 0.51010101010101,
"grad_norm": 4.374532699584961,
"learning_rate": 9.652848147375853e-07,
"loss": 1.1660369634628296,
"step": 606
},
{
"epoch": 0.5117845117845118,
"grad_norm": 7.170238018035889,
"learning_rate": 9.649623925296288e-07,
"loss": 0.6313941478729248,
"step": 608
},
{
"epoch": 0.5134680134680135,
"grad_norm": 2.792412519454956,
"learning_rate": 9.646385406016313e-07,
"loss": 0.9415972232818604,
"step": 610
},
{
"epoch": 0.5151515151515151,
"grad_norm": 13.598429679870605,
"learning_rate": 9.643132600693983e-07,
"loss": 0.9117315411567688,
"step": 612
},
{
"epoch": 0.5168350168350169,
"grad_norm": 4.011415481567383,
"learning_rate": 9.639865520536588e-07,
"loss": 0.7065603137016296,
"step": 614
},
{
"epoch": 0.5185185185185185,
"grad_norm": 9.801816940307617,
"learning_rate": 9.636584176800593e-07,
"loss": 1.1204071044921875,
"step": 616
},
{
"epoch": 0.5202020202020202,
"grad_norm": 10.913689613342285,
"learning_rate": 9.633288580791603e-07,
"loss": 1.031501054763794,
"step": 618
},
{
"epoch": 0.5218855218855218,
"grad_norm": 2.2291688919067383,
"learning_rate": 9.62997874386434e-07,
"loss": 1.0308109521865845,
"step": 620
},
{
"epoch": 0.5235690235690236,
"grad_norm": 12.420637130737305,
"learning_rate": 9.62665467742258e-07,
"loss": 1.0678637027740479,
"step": 622
},
{
"epoch": 0.5252525252525253,
"grad_norm": 17.982452392578125,
"learning_rate": 9.623316392919132e-07,
"loss": 0.7635082006454468,
"step": 624
},
{
"epoch": 0.5269360269360269,
"grad_norm": 31.17810821533203,
"learning_rate": 9.619963901855789e-07,
"loss": 0.9803504943847656,
"step": 626
},
{
"epoch": 0.5286195286195287,
"grad_norm": 3.1647303104400635,
"learning_rate": 9.616597215783295e-07,
"loss": 0.8586722612380981,
"step": 628
},
{
"epoch": 0.5303030303030303,
"grad_norm": 10.497335433959961,
"learning_rate": 9.6132163463013e-07,
"loss": 0.7892797589302063,
"step": 630
},
{
"epoch": 0.531986531986532,
"grad_norm": 11.274188995361328,
"learning_rate": 9.609821305058324e-07,
"loss": 1.1465822458267212,
"step": 632
},
{
"epoch": 0.5336700336700336,
"grad_norm": 4.127675533294678,
"learning_rate": 9.606412103751707e-07,
"loss": 0.9373839497566223,
"step": 634
},
{
"epoch": 0.5353535353535354,
"grad_norm": 4.121032238006592,
"learning_rate": 9.602988754127585e-07,
"loss": 0.8166585564613342,
"step": 636
},
{
"epoch": 0.5370370370370371,
"grad_norm": 29.52313804626465,
"learning_rate": 9.59955126798084e-07,
"loss": 1.0028636455535889,
"step": 638
},
{
"epoch": 0.5387205387205387,
"grad_norm": 4.636293888092041,
"learning_rate": 9.596099657155056e-07,
"loss": 0.8631769418716431,
"step": 640
},
{
"epoch": 0.5404040404040404,
"grad_norm": 2.6743357181549072,
"learning_rate": 9.592633933542484e-07,
"loss": 0.9822747707366943,
"step": 642
},
{
"epoch": 0.5420875420875421,
"grad_norm": 12.097616195678711,
"learning_rate": 9.589154109084e-07,
"loss": 0.9199867844581604,
"step": 644
},
{
"epoch": 0.5437710437710438,
"grad_norm": 4.201647758483887,
"learning_rate": 9.585660195769066e-07,
"loss": 0.9225333333015442,
"step": 646
},
{
"epoch": 0.5454545454545454,
"grad_norm": 7.826382160186768,
"learning_rate": 9.582152205635682e-07,
"loss": 1.0213161706924438,
"step": 648
},
{
"epoch": 0.5471380471380471,
"grad_norm": 8.643582344055176,
"learning_rate": 9.578630150770348e-07,
"loss": 1.1659046411514282,
"step": 650
},
{
"epoch": 0.5488215488215489,
"grad_norm": 16.885889053344727,
"learning_rate": 9.575094043308027e-07,
"loss": 1.0685768127441406,
"step": 652
},
{
"epoch": 0.5505050505050505,
"grad_norm": 3.666364908218384,
"learning_rate": 9.5715438954321e-07,
"loss": 1.0853323936462402,
"step": 654
},
{
"epoch": 0.5521885521885522,
"grad_norm": 21.654556274414062,
"learning_rate": 9.567979719374313e-07,
"loss": 0.9922153353691101,
"step": 656
},
{
"epoch": 0.5538720538720538,
"grad_norm": 7.106581211090088,
"learning_rate": 9.564401527414757e-07,
"loss": 0.8094037771224976,
"step": 658
},
{
"epoch": 0.5555555555555556,
"grad_norm": 6.885115146636963,
"learning_rate": 9.56080933188181e-07,
"loss": 0.7689495086669922,
"step": 660
},
{
"epoch": 0.5572390572390572,
"grad_norm": 3.9134387969970703,
"learning_rate": 9.557203145152093e-07,
"loss": 1.064096212387085,
"step": 662
},
{
"epoch": 0.5589225589225589,
"grad_norm": 3.955990791320801,
"learning_rate": 9.55358297965044e-07,
"loss": 1.1137442588806152,
"step": 664
},
{
"epoch": 0.5606060606060606,
"grad_norm": 4.690779209136963,
"learning_rate": 9.549948847849842e-07,
"loss": 0.5054531693458557,
"step": 666
},
{
"epoch": 0.5622895622895623,
"grad_norm": 32.8538818359375,
"learning_rate": 9.546300762271414e-07,
"loss": 0.6846545934677124,
"step": 668
},
{
"epoch": 0.563973063973064,
"grad_norm": 18.116151809692383,
"learning_rate": 9.542638735484346e-07,
"loss": 1.099835991859436,
"step": 670
},
{
"epoch": 0.5656565656565656,
"grad_norm": 26.123899459838867,
"learning_rate": 9.538962780105855e-07,
"loss": 0.6106569766998291,
"step": 672
},
{
"epoch": 0.5673400673400674,
"grad_norm": 6.80141019821167,
"learning_rate": 9.535272908801164e-07,
"loss": 0.6078236103057861,
"step": 674
},
{
"epoch": 0.569023569023569,
"grad_norm": 3.6088900566101074,
"learning_rate": 9.531569134283426e-07,
"loss": 0.6979132890701294,
"step": 676
},
{
"epoch": 0.5707070707070707,
"grad_norm": 35.824989318847656,
"learning_rate": 9.527851469313703e-07,
"loss": 1.3292642831802368,
"step": 678
},
{
"epoch": 0.5723905723905723,
"grad_norm": 13.528051376342773,
"learning_rate": 9.524119926700916e-07,
"loss": 0.41806691884994507,
"step": 680
},
{
"epoch": 0.5740740740740741,
"grad_norm": 10.345752716064453,
"learning_rate": 9.520374519301801e-07,
"loss": 1.0647339820861816,
"step": 682
},
{
"epoch": 0.5757575757575758,
"grad_norm": 5.383781433105469,
"learning_rate": 9.516615260020859e-07,
"loss": 1.1695669889450073,
"step": 684
},
{
"epoch": 0.5774410774410774,
"grad_norm": 4.6796770095825195,
"learning_rate": 9.512842161810322e-07,
"loss": 1.1320273876190186,
"step": 686
},
{
"epoch": 0.5791245791245792,
"grad_norm": 3.494124412536621,
"learning_rate": 9.509055237670101e-07,
"loss": 0.8368796706199646,
"step": 688
},
{
"epoch": 0.5808080808080808,
"grad_norm": 18.290544509887695,
"learning_rate": 9.505254500647742e-07,
"loss": 0.7732558250427246,
"step": 690
},
{
"epoch": 0.5824915824915825,
"grad_norm": 3.7307989597320557,
"learning_rate": 9.501439963838383e-07,
"loss": 0.8185931444168091,
"step": 692
},
{
"epoch": 0.5841750841750841,
"grad_norm": 5.913649559020996,
"learning_rate": 9.497611640384712e-07,
"loss": 1.0147478580474854,
"step": 694
},
{
"epoch": 0.5858585858585859,
"grad_norm": 14.875641822814941,
"learning_rate": 9.493769543476909e-07,
"loss": 0.9212662577629089,
"step": 696
},
{
"epoch": 0.5875420875420876,
"grad_norm": 10.849754333496094,
"learning_rate": 9.489913686352616e-07,
"loss": 0.8869191408157349,
"step": 698
},
{
"epoch": 0.5892255892255892,
"grad_norm": 7.326023578643799,
"learning_rate": 9.486044082296886e-07,
"loss": 0.8455855846405029,
"step": 700
},
{
"epoch": 0.5909090909090909,
"grad_norm": 8.260787010192871,
"learning_rate": 9.48216074464213e-07,
"loss": 0.944000780582428,
"step": 702
},
{
"epoch": 0.5925925925925926,
"grad_norm": 4.983551979064941,
"learning_rate": 9.47826368676808e-07,
"loss": 1.0806821584701538,
"step": 704
},
{
"epoch": 0.5942760942760943,
"grad_norm": 7.548647880554199,
"learning_rate": 9.474352922101741e-07,
"loss": 1.0155982971191406,
"step": 706
},
{
"epoch": 0.5959595959595959,
"grad_norm": 9.95559024810791,
"learning_rate": 9.470428464117344e-07,
"loss": 0.8041818141937256,
"step": 708
},
{
"epoch": 0.5976430976430976,
"grad_norm": 22.083297729492188,
"learning_rate": 9.466490326336298e-07,
"loss": 0.8329028487205505,
"step": 710
},
{
"epoch": 0.5993265993265994,
"grad_norm": 3.1762735843658447,
"learning_rate": 9.462538522327144e-07,
"loss": 1.1545898914337158,
"step": 712
},
{
"epoch": 0.601010101010101,
"grad_norm": 2.4671504497528076,
"learning_rate": 9.458573065705507e-07,
"loss": 1.081796407699585,
"step": 714
},
{
"epoch": 0.6026936026936027,
"grad_norm": 4.517568111419678,
"learning_rate": 9.454593970134058e-07,
"loss": 0.7743735313415527,
"step": 716
},
{
"epoch": 0.6043771043771043,
"grad_norm": 11.208656311035156,
"learning_rate": 9.45060124932245e-07,
"loss": 0.9187523126602173,
"step": 718
},
{
"epoch": 0.6060606060606061,
"grad_norm": 10.369696617126465,
"learning_rate": 9.446594917027293e-07,
"loss": 0.965773344039917,
"step": 720
},
{
"epoch": 0.6077441077441077,
"grad_norm": 11.875805854797363,
"learning_rate": 9.442574987052082e-07,
"loss": 0.9600865840911865,
"step": 722
},
{
"epoch": 0.6094276094276094,
"grad_norm": 9.8040189743042,
"learning_rate": 9.438541473247169e-07,
"loss": 0.9117884635925293,
"step": 724
},
{
"epoch": 0.6111111111111112,
"grad_norm": 61.72325134277344,
"learning_rate": 9.434494389509707e-07,
"loss": 1.0104196071624756,
"step": 726
},
{
"epoch": 0.6127946127946128,
"grad_norm": 19.176124572753906,
"learning_rate": 9.430433749783601e-07,
"loss": 0.9295721650123596,
"step": 728
},
{
"epoch": 0.6144781144781145,
"grad_norm": 6.085904121398926,
"learning_rate": 9.426359568059465e-07,
"loss": 1.1639102697372437,
"step": 730
},
{
"epoch": 0.6161616161616161,
"grad_norm": 2.7430849075317383,
"learning_rate": 9.422271858374567e-07,
"loss": 1.1210119724273682,
"step": 732
},
{
"epoch": 0.6178451178451179,
"grad_norm": 6.412540435791016,
"learning_rate": 9.418170634812789e-07,
"loss": 0.8046259880065918,
"step": 734
},
{
"epoch": 0.6195286195286195,
"grad_norm": 15.164510726928711,
"learning_rate": 9.41405591150457e-07,
"loss": 0.8280715942382812,
"step": 736
},
{
"epoch": 0.6212121212121212,
"grad_norm": 13.97409725189209,
"learning_rate": 9.409927702626865e-07,
"loss": 0.6932380199432373,
"step": 738
},
{
"epoch": 0.622895622895623,
"grad_norm": 2.947274684906006,
"learning_rate": 9.405786022403089e-07,
"loss": 1.2565734386444092,
"step": 740
},
{
"epoch": 0.6245791245791246,
"grad_norm": 4.588158130645752,
"learning_rate": 9.401630885103074e-07,
"loss": 1.0269739627838135,
"step": 742
},
{
"epoch": 0.6262626262626263,
"grad_norm": 4.135093688964844,
"learning_rate": 9.397462305043016e-07,
"loss": 1.2328283786773682,
"step": 744
},
{
"epoch": 0.6279461279461279,
"grad_norm": 3.079167127609253,
"learning_rate": 9.393280296585427e-07,
"loss": 0.968951404094696,
"step": 746
},
{
"epoch": 0.6296296296296297,
"grad_norm": 2.28676176071167,
"learning_rate": 9.389084874139085e-07,
"loss": 1.2347244024276733,
"step": 748
},
{
"epoch": 0.6313131313131313,
"grad_norm": 8.729804992675781,
"learning_rate": 9.384876052158987e-07,
"loss": 1.3113691806793213,
"step": 750
},
{
"epoch": 0.632996632996633,
"grad_norm": 7.039168357849121,
"learning_rate": 9.380653845146294e-07,
"loss": 0.7496945858001709,
"step": 752
},
{
"epoch": 0.6346801346801347,
"grad_norm": 14.870685577392578,
"learning_rate": 9.37641826764829e-07,
"loss": 1.0088348388671875,
"step": 754
},
{
"epoch": 0.6363636363636364,
"grad_norm": 3.9592251777648926,
"learning_rate": 9.372169334258315e-07,
"loss": 0.7920987606048584,
"step": 756
},
{
"epoch": 0.6380471380471381,
"grad_norm": 10.84424114227295,
"learning_rate": 9.367907059615737e-07,
"loss": 0.85060054063797,
"step": 758
},
{
"epoch": 0.6397306397306397,
"grad_norm": 235.0703582763672,
"learning_rate": 9.363631458405885e-07,
"loss": 0.6581774353981018,
"step": 760
},
{
"epoch": 0.6414141414141414,
"grad_norm": 5.294051647186279,
"learning_rate": 9.359342545360002e-07,
"loss": 0.46980541944503784,
"step": 762
},
{
"epoch": 0.6430976430976431,
"grad_norm": 29.527233123779297,
"learning_rate": 9.355040335255201e-07,
"loss": 1.0372706651687622,
"step": 764
},
{
"epoch": 0.6447811447811448,
"grad_norm": 4.027895927429199,
"learning_rate": 9.350724842914403e-07,
"loss": 1.104457139968872,
"step": 766
},
{
"epoch": 0.6464646464646465,
"grad_norm": 45.400699615478516,
"learning_rate": 9.346396083206297e-07,
"loss": 0.8071002960205078,
"step": 768
},
{
"epoch": 0.6481481481481481,
"grad_norm": 4.046747207641602,
"learning_rate": 9.342054071045281e-07,
"loss": 0.8214056491851807,
"step": 770
},
{
"epoch": 0.6498316498316499,
"grad_norm": 4.489753723144531,
"learning_rate": 9.337698821391413e-07,
"loss": 0.8206263780593872,
"step": 772
},
{
"epoch": 0.6515151515151515,
"grad_norm": 3.739696502685547,
"learning_rate": 9.333330349250363e-07,
"loss": 0.7051388025283813,
"step": 774
},
{
"epoch": 0.6531986531986532,
"grad_norm": 5.395556449890137,
"learning_rate": 9.328948669673353e-07,
"loss": 0.9473454356193542,
"step": 776
},
{
"epoch": 0.6548821548821548,
"grad_norm": 3.432518720626831,
"learning_rate": 9.324553797757113e-07,
"loss": 1.0663374662399292,
"step": 778
},
{
"epoch": 0.6565656565656566,
"grad_norm": 15.647449493408203,
"learning_rate": 9.320145748643827e-07,
"loss": 1.015528678894043,
"step": 780
},
{
"epoch": 0.6582491582491582,
"grad_norm": 18.728303909301758,
"learning_rate": 9.315724537521078e-07,
"loss": 1.0769071578979492,
"step": 782
},
{
"epoch": 0.6599326599326599,
"grad_norm": 9.825349807739258,
"learning_rate": 9.311290179621801e-07,
"loss": 1.0078058242797852,
"step": 784
},
{
"epoch": 0.6616161616161617,
"grad_norm": 8.568079948425293,
"learning_rate": 9.306842690224221e-07,
"loss": 1.1149715185165405,
"step": 786
},
{
"epoch": 0.6632996632996633,
"grad_norm": 14.216378211975098,
"learning_rate": 9.302382084651813e-07,
"loss": 0.9104188680648804,
"step": 788
},
{
"epoch": 0.664983164983165,
"grad_norm": 3.3533241748809814,
"learning_rate": 9.297908378273238e-07,
"loss": 0.9613898992538452,
"step": 790
},
{
"epoch": 0.6666666666666666,
"grad_norm": 22.51508140563965,
"learning_rate": 9.293421586502299e-07,
"loss": 1.0459431409835815,
"step": 792
},
{
"epoch": 0.6683501683501684,
"grad_norm": 4.43943977355957,
"learning_rate": 9.288921724797881e-07,
"loss": 0.6562730073928833,
"step": 794
},
{
"epoch": 0.67003367003367,
"grad_norm": 3.550076484680176,
"learning_rate": 9.2844088086639e-07,
"loss": 0.9962120056152344,
"step": 796
},
{
"epoch": 0.6717171717171717,
"grad_norm": 4.60956335067749,
"learning_rate": 9.279882853649251e-07,
"loss": 1.0277674198150635,
"step": 798
},
{
"epoch": 0.6734006734006734,
"grad_norm": 4.80393648147583,
"learning_rate": 9.275343875347754e-07,
"loss": 0.6581063866615295,
"step": 800
},
{
"epoch": 0.6750841750841751,
"grad_norm": 5.648859024047852,
"learning_rate": 9.270791889398098e-07,
"loss": 1.016190528869629,
"step": 802
},
{
"epoch": 0.6767676767676768,
"grad_norm": 27.92025375366211,
"learning_rate": 9.266226911483792e-07,
"loss": 0.77015221118927,
"step": 804
},
{
"epoch": 0.6784511784511784,
"grad_norm": 15.88348388671875,
"learning_rate": 9.261648957333104e-07,
"loss": 0.7054531574249268,
"step": 806
},
{
"epoch": 0.6801346801346801,
"grad_norm": 11.626742362976074,
"learning_rate": 9.257058042719014e-07,
"loss": 1.162412405014038,
"step": 808
},
{
"epoch": 0.6818181818181818,
"grad_norm": 3.297008752822876,
"learning_rate": 9.252454183459151e-07,
"loss": 1.0062317848205566,
"step": 810
},
{
"epoch": 0.6835016835016835,
"grad_norm": 3.699937343597412,
"learning_rate": 9.24783739541575e-07,
"loss": 1.1737666130065918,
"step": 812
},
{
"epoch": 0.6851851851851852,
"grad_norm": 12.622026443481445,
"learning_rate": 9.243207694495587e-07,
"loss": 0.5980294942855835,
"step": 814
},
{
"epoch": 0.6868686868686869,
"grad_norm": 2.4388279914855957,
"learning_rate": 9.238565096649931e-07,
"loss": 1.1263744831085205,
"step": 816
},
{
"epoch": 0.6885521885521886,
"grad_norm": 5.467193603515625,
"learning_rate": 9.233909617874485e-07,
"loss": 0.8187447786331177,
"step": 818
},
{
"epoch": 0.6902356902356902,
"grad_norm": 19.933046340942383,
"learning_rate": 9.229241274209331e-07,
"loss": 0.7387347221374512,
"step": 820
},
{
"epoch": 0.6919191919191919,
"grad_norm": 4.639487266540527,
"learning_rate": 9.224560081738876e-07,
"loss": 0.8205159902572632,
"step": 822
},
{
"epoch": 0.6936026936026936,
"grad_norm": 5.4859089851379395,
"learning_rate": 9.219866056591803e-07,
"loss": 0.8951364755630493,
"step": 824
},
{
"epoch": 0.6952861952861953,
"grad_norm": 10.06679916381836,
"learning_rate": 9.215159214940999e-07,
"loss": 0.924353837966919,
"step": 826
},
{
"epoch": 0.696969696969697,
"grad_norm": 4.803708076477051,
"learning_rate": 9.210439573003513e-07,
"loss": 0.8230616450309753,
"step": 828
},
{
"epoch": 0.6986531986531986,
"grad_norm": 2.6663763523101807,
"learning_rate": 9.205707147040502e-07,
"loss": 1.2476671934127808,
"step": 830
},
{
"epoch": 0.7003367003367004,
"grad_norm": 11.887960433959961,
"learning_rate": 9.200961953357161e-07,
"loss": 0.9090033173561096,
"step": 832
},
{
"epoch": 0.702020202020202,
"grad_norm": 12.790818214416504,
"learning_rate": 9.196204008302679e-07,
"loss": 0.7313128709793091,
"step": 834
},
{
"epoch": 0.7037037037037037,
"grad_norm": 19.932966232299805,
"learning_rate": 9.191433328270181e-07,
"loss": 0.9331467151641846,
"step": 836
},
{
"epoch": 0.7053872053872053,
"grad_norm": 14.500178337097168,
"learning_rate": 9.186649929696663e-07,
"loss": 0.6199721097946167,
"step": 838
},
{
"epoch": 0.7070707070707071,
"grad_norm": 3.49665904045105,
"learning_rate": 9.181853829062953e-07,
"loss": 1.2793331146240234,
"step": 840
},
{
"epoch": 0.7087542087542088,
"grad_norm": 4.229721546173096,
"learning_rate": 9.177045042893626e-07,
"loss": 1.1469063758850098,
"step": 842
},
{
"epoch": 0.7104377104377104,
"grad_norm": 110.08422088623047,
"learning_rate": 9.172223587756982e-07,
"loss": 1.1059083938598633,
"step": 844
},
{
"epoch": 0.7121212121212122,
"grad_norm": 18.97850799560547,
"learning_rate": 9.167389480264958e-07,
"loss": 0.8827245235443115,
"step": 846
},
{
"epoch": 0.7138047138047138,
"grad_norm": 17.975536346435547,
"learning_rate": 9.162542737073089e-07,
"loss": 0.8001298904418945,
"step": 848
},
{
"epoch": 0.7154882154882155,
"grad_norm": 7.855954647064209,
"learning_rate": 9.157683374880446e-07,
"loss": 0.9649063348770142,
"step": 850
},
{
"epoch": 0.7171717171717171,
"grad_norm": 8.463844299316406,
"learning_rate": 9.152811410429576e-07,
"loss": 0.972816526889801,
"step": 852
},
{
"epoch": 0.7188552188552189,
"grad_norm": 12.091350555419922,
"learning_rate": 9.147926860506445e-07,
"loss": 0.7975931763648987,
"step": 854
},
{
"epoch": 0.7205387205387206,
"grad_norm": 4.873641014099121,
"learning_rate": 9.143029741940385e-07,
"loss": 1.1548885107040405,
"step": 856
},
{
"epoch": 0.7222222222222222,
"grad_norm": 11.703914642333984,
"learning_rate": 9.138120071604027e-07,
"loss": 0.7869529724121094,
"step": 858
},
{
"epoch": 0.7239057239057239,
"grad_norm": 8.07150650024414,
"learning_rate": 9.133197866413254e-07,
"loss": 1.0205129384994507,
"step": 860
},
{
"epoch": 0.7255892255892256,
"grad_norm": 9.105744361877441,
"learning_rate": 9.128263143327132e-07,
"loss": 1.2168781757354736,
"step": 862
},
{
"epoch": 0.7272727272727273,
"grad_norm": 13.992351531982422,
"learning_rate": 9.12331591934786e-07,
"loss": 1.0841448307037354,
"step": 864
},
{
"epoch": 0.7289562289562289,
"grad_norm": 44.512203216552734,
"learning_rate": 9.118356211520704e-07,
"loss": 1.0125892162322998,
"step": 866
},
{
"epoch": 0.7306397306397306,
"grad_norm": 3.5231881141662598,
"learning_rate": 9.113384036933945e-07,
"loss": 1.2724123001098633,
"step": 868
},
{
"epoch": 0.7323232323232324,
"grad_norm": 5.931739330291748,
"learning_rate": 9.108399412718818e-07,
"loss": 1.1999413967132568,
"step": 870
},
{
"epoch": 0.734006734006734,
"grad_norm": 5.34647798538208,
"learning_rate": 9.103402356049452e-07,
"loss": 1.127119541168213,
"step": 872
},
{
"epoch": 0.7356902356902357,
"grad_norm": 4.207188606262207,
"learning_rate": 9.098392884142805e-07,
"loss": 1.114919900894165,
"step": 874
},
{
"epoch": 0.7373737373737373,
"grad_norm": 21.882280349731445,
"learning_rate": 9.093371014258618e-07,
"loss": 0.9378777742385864,
"step": 876
},
{
"epoch": 0.7390572390572391,
"grad_norm": 127.14752197265625,
"learning_rate": 9.088336763699347e-07,
"loss": 0.6694403886795044,
"step": 878
},
{
"epoch": 0.7407407407407407,
"grad_norm": 81.61506652832031,
"learning_rate": 9.083290149810101e-07,
"loss": 0.6651909351348877,
"step": 880
},
{
"epoch": 0.7424242424242424,
"grad_norm": 3.663316488265991,
"learning_rate": 9.07823118997859e-07,
"loss": 1.182866096496582,
"step": 882
},
{
"epoch": 0.7441077441077442,
"grad_norm": 3.8022303581237793,
"learning_rate": 9.07315990163506e-07,
"loss": 1.1220306158065796,
"step": 884
},
{
"epoch": 0.7457912457912458,
"grad_norm": 3.328054189682007,
"learning_rate": 9.06807630225223e-07,
"loss": 0.6599295139312744,
"step": 886
},
{
"epoch": 0.7474747474747475,
"grad_norm": 3.3686916828155518,
"learning_rate": 9.062980409345242e-07,
"loss": 1.0259349346160889,
"step": 888
},
{
"epoch": 0.7491582491582491,
"grad_norm": 20.480480194091797,
"learning_rate": 9.05787224047159e-07,
"loss": 0.9568924903869629,
"step": 890
},
{
"epoch": 0.7508417508417509,
"grad_norm": 6.441938877105713,
"learning_rate": 9.052751813231064e-07,
"loss": 0.9797095060348511,
"step": 892
},
{
"epoch": 0.7525252525252525,
"grad_norm": 9.020792007446289,
"learning_rate": 9.047619145265693e-07,
"loss": 0.786825954914093,
"step": 894
},
{
"epoch": 0.7542087542087542,
"grad_norm": 12.181696891784668,
"learning_rate": 9.042474254259673e-07,
"loss": 0.9024474620819092,
"step": 896
},
{
"epoch": 0.7558922558922558,
"grad_norm": 28.832189559936523,
"learning_rate": 9.037317157939322e-07,
"loss": 0.6734418869018555,
"step": 898
},
{
"epoch": 0.7575757575757576,
"grad_norm": 3.2818045616149902,
"learning_rate": 9.032147874073007e-07,
"loss": 0.9285035133361816,
"step": 900
},
{
"epoch": 0.7592592592592593,
"grad_norm": 18.371009826660156,
"learning_rate": 9.026966420471087e-07,
"loss": 0.7218674421310425,
"step": 902
},
{
"epoch": 0.7609427609427609,
"grad_norm": 2.9429922103881836,
"learning_rate": 9.021772814985844e-07,
"loss": 1.222078800201416,
"step": 904
},
{
"epoch": 0.7626262626262627,
"grad_norm": 2.7464704513549805,
"learning_rate": 9.016567075511441e-07,
"loss": 0.9446361064910889,
"step": 906
},
{
"epoch": 0.7643097643097643,
"grad_norm": 6.568495750427246,
"learning_rate": 9.011349219983836e-07,
"loss": 0.929685115814209,
"step": 908
},
{
"epoch": 0.765993265993266,
"grad_norm": 4.631781578063965,
"learning_rate": 9.006119266380738e-07,
"loss": 0.8691076040267944,
"step": 910
},
{
"epoch": 0.7676767676767676,
"grad_norm": 19.05845069885254,
"learning_rate": 9.000877232721539e-07,
"loss": 1.0112216472625732,
"step": 912
},
{
"epoch": 0.7693602693602694,
"grad_norm": 14.539863586425781,
"learning_rate": 8.99562313706725e-07,
"loss": 0.890055775642395,
"step": 914
},
{
"epoch": 0.7710437710437711,
"grad_norm": 5.530696868896484,
"learning_rate": 8.99035699752044e-07,
"loss": 1.0191471576690674,
"step": 916
},
{
"epoch": 0.7727272727272727,
"grad_norm": 14.078718185424805,
"learning_rate": 8.985078832225178e-07,
"loss": 0.6652472019195557,
"step": 918
},
{
"epoch": 0.7744107744107744,
"grad_norm": 28.123485565185547,
"learning_rate": 8.979788659366963e-07,
"loss": 0.5262911319732666,
"step": 920
},
{
"epoch": 0.7760942760942761,
"grad_norm": 12.658363342285156,
"learning_rate": 8.974486497172664e-07,
"loss": 0.6195323467254639,
"step": 922
},
{
"epoch": 0.7777777777777778,
"grad_norm": 9.243937492370605,
"learning_rate": 8.969172363910464e-07,
"loss": 0.9786189198493958,
"step": 924
},
{
"epoch": 0.7794612794612794,
"grad_norm": 6.694032669067383,
"learning_rate": 8.963846277889788e-07,
"loss": 1.1813392639160156,
"step": 926
},
{
"epoch": 0.7811447811447811,
"grad_norm": 2.7287495136260986,
"learning_rate": 8.95850825746124e-07,
"loss": 0.5288863182067871,
"step": 928
},
{
"epoch": 0.7828282828282829,
"grad_norm": 12.440982818603516,
"learning_rate": 8.953158321016549e-07,
"loss": 1.3665971755981445,
"step": 930
},
{
"epoch": 0.7845117845117845,
"grad_norm": 6.197256565093994,
"learning_rate": 8.947796486988499e-07,
"loss": 0.934798002243042,
"step": 932
},
{
"epoch": 0.7861952861952862,
"grad_norm": 5.526829719543457,
"learning_rate": 8.942422773850861e-07,
"loss": 1.0153696537017822,
"step": 934
},
{
"epoch": 0.7878787878787878,
"grad_norm": 3.1978728771209717,
"learning_rate": 8.937037200118339e-07,
"loss": 0.8981832265853882,
"step": 936
},
{
"epoch": 0.7895622895622896,
"grad_norm": 2.9995744228363037,
"learning_rate": 8.931639784346499e-07,
"loss": 0.8695104718208313,
"step": 938
},
{
"epoch": 0.7912457912457912,
"grad_norm": 6.706093788146973,
"learning_rate": 8.926230545131711e-07,
"loss": 1.0370559692382812,
"step": 940
},
{
"epoch": 0.7929292929292929,
"grad_norm": 17.68717384338379,
"learning_rate": 8.920809501111082e-07,
"loss": 0.43204930424690247,
"step": 942
},
{
"epoch": 0.7946127946127947,
"grad_norm": 4.556012153625488,
"learning_rate": 8.915376670962384e-07,
"loss": 0.934272289276123,
"step": 944
},
{
"epoch": 0.7962962962962963,
"grad_norm": 4.898090362548828,
"learning_rate": 8.90993207340401e-07,
"loss": 0.910577654838562,
"step": 946
},
{
"epoch": 0.797979797979798,
"grad_norm": 3.080552577972412,
"learning_rate": 8.904475727194881e-07,
"loss": 1.0652995109558105,
"step": 948
},
{
"epoch": 0.7996632996632996,
"grad_norm": 6.745321273803711,
"learning_rate": 8.899007651134413e-07,
"loss": 0.8568437099456787,
"step": 950
},
{
"epoch": 0.8013468013468014,
"grad_norm": 13.813915252685547,
"learning_rate": 8.893527864062427e-07,
"loss": 0.47221675515174866,
"step": 952
},
{
"epoch": 0.803030303030303,
"grad_norm": 5.70471715927124,
"learning_rate": 8.88803638485909e-07,
"loss": 0.9682356119155884,
"step": 954
},
{
"epoch": 0.8047138047138047,
"grad_norm": 9.05542278289795,
"learning_rate": 8.882533232444864e-07,
"loss": 0.9946258068084717,
"step": 956
},
{
"epoch": 0.8063973063973064,
"grad_norm": 30.702098846435547,
"learning_rate": 8.877018425780425e-07,
"loss": 1.1317826509475708,
"step": 958
},
{
"epoch": 0.8080808080808081,
"grad_norm": 14.64018726348877,
"learning_rate": 8.8714919838666e-07,
"loss": 0.7012873888015747,
"step": 960
},
{
"epoch": 0.8097643097643098,
"grad_norm": 3.149690866470337,
"learning_rate": 8.865953925744305e-07,
"loss": 0.795744776725769,
"step": 962
},
{
"epoch": 0.8114478114478114,
"grad_norm": 6.090580463409424,
"learning_rate": 8.860404270494483e-07,
"loss": 0.7089242935180664,
"step": 964
},
{
"epoch": 0.8131313131313131,
"grad_norm": 3.53495192527771,
"learning_rate": 8.85484303723803e-07,
"loss": 1.0081251859664917,
"step": 966
},
{
"epoch": 0.8148148148148148,
"grad_norm": 4.274377346038818,
"learning_rate": 8.849270245135737e-07,
"loss": 1.2170288562774658,
"step": 968
},
{
"epoch": 0.8164983164983165,
"grad_norm": 3.169619560241699,
"learning_rate": 8.843685913388216e-07,
"loss": 1.0120604038238525,
"step": 970
},
{
"epoch": 0.8181818181818182,
"grad_norm": 9.656790733337402,
"learning_rate": 8.838090061235839e-07,
"loss": 1.0408661365509033,
"step": 972
},
{
"epoch": 0.8198653198653199,
"grad_norm": 3.6206579208374023,
"learning_rate": 8.832482707958671e-07,
"loss": 0.7572422027587891,
"step": 974
},
{
"epoch": 0.8215488215488216,
"grad_norm": 4.2206034660339355,
"learning_rate": 8.826863872876405e-07,
"loss": 0.9668401479721069,
"step": 976
},
{
"epoch": 0.8232323232323232,
"grad_norm": 2.5796895027160645,
"learning_rate": 8.82123357534829e-07,
"loss": 1.2220442295074463,
"step": 978
},
{
"epoch": 0.8249158249158249,
"grad_norm": 3.009799003601074,
"learning_rate": 8.815591834773073e-07,
"loss": 1.1853399276733398,
"step": 980
},
{
"epoch": 0.8265993265993266,
"grad_norm": 3.527939796447754,
"learning_rate": 8.80993867058892e-07,
"loss": 1.1044703722000122,
"step": 982
},
{
"epoch": 0.8282828282828283,
"grad_norm": 5.415159225463867,
"learning_rate": 8.804274102273362e-07,
"loss": 1.0707950592041016,
"step": 984
},
{
"epoch": 0.82996632996633,
"grad_norm": 41.4835205078125,
"learning_rate": 8.798598149343223e-07,
"loss": 0.9894696474075317,
"step": 986
},
{
"epoch": 0.8316498316498316,
"grad_norm": 11.90714168548584,
"learning_rate": 8.792910831354544e-07,
"loss": 0.8949055671691895,
"step": 988
},
{
"epoch": 0.8333333333333334,
"grad_norm": 19.156835556030273,
"learning_rate": 8.787212167902533e-07,
"loss": 0.847869336605072,
"step": 990
},
{
"epoch": 0.835016835016835,
"grad_norm": 9.593557357788086,
"learning_rate": 8.781502178621481e-07,
"loss": 0.7175034284591675,
"step": 992
},
{
"epoch": 0.8367003367003367,
"grad_norm": 7.256720066070557,
"learning_rate": 8.775780883184705e-07,
"loss": 0.9604957103729248,
"step": 994
},
{
"epoch": 0.8383838383838383,
"grad_norm": 6.030484676361084,
"learning_rate": 8.770048301304473e-07,
"loss": 0.69129478931427,
"step": 996
},
{
"epoch": 0.8400673400673401,
"grad_norm": 19.379892349243164,
"learning_rate": 8.764304452731941e-07,
"loss": 0.9693500399589539,
"step": 998
},
{
"epoch": 0.8417508417508418,
"grad_norm": 5.4144086837768555,
"learning_rate": 8.758549357257088e-07,
"loss": 1.0944030284881592,
"step": 1000
},
{
"epoch": 0.8434343434343434,
"grad_norm": 3.4778013229370117,
"learning_rate": 8.752783034708636e-07,
"loss": 0.7972965240478516,
"step": 1002
},
{
"epoch": 0.8451178451178452,
"grad_norm": 18.509031295776367,
"learning_rate": 8.747005504953994e-07,
"loss": 0.7230968475341797,
"step": 1004
},
{
"epoch": 0.8468013468013468,
"grad_norm": 4.210479736328125,
"learning_rate": 8.741216787899185e-07,
"loss": 1.1015040874481201,
"step": 1006
},
{
"epoch": 0.8484848484848485,
"grad_norm": 2.3543701171875,
"learning_rate": 8.73541690348877e-07,
"loss": 0.6013465523719788,
"step": 1008
},
{
"epoch": 0.8501683501683501,
"grad_norm": 4.900216579437256,
"learning_rate": 8.729605871705794e-07,
"loss": 0.9569622278213501,
"step": 1010
},
{
"epoch": 0.8518518518518519,
"grad_norm": 13.174873352050781,
"learning_rate": 8.723783712571706e-07,
"loss": 0.891572117805481,
"step": 1012
},
{
"epoch": 0.8535353535353535,
"grad_norm": 7.153807163238525,
"learning_rate": 8.717950446146296e-07,
"loss": 0.7898436784744263,
"step": 1014
},
{
"epoch": 0.8552188552188552,
"grad_norm": 17.859582901000977,
"learning_rate": 8.712106092527618e-07,
"loss": 0.6778484582901001,
"step": 1016
},
{
"epoch": 0.8569023569023569,
"grad_norm": 25.399763107299805,
"learning_rate": 8.706250671851929e-07,
"loss": 1.0100421905517578,
"step": 1018
},
{
"epoch": 0.8585858585858586,
"grad_norm": 4.458539962768555,
"learning_rate": 8.70038420429362e-07,
"loss": 1.280473232269287,
"step": 1020
},
{
"epoch": 0.8602693602693603,
"grad_norm": 13.934873580932617,
"learning_rate": 8.694506710065139e-07,
"loss": 0.9307641386985779,
"step": 1022
},
{
"epoch": 0.8619528619528619,
"grad_norm": 6.230085372924805,
"learning_rate": 8.688618209416927e-07,
"loss": 0.9810340404510498,
"step": 1024
},
{
"epoch": 0.8636363636363636,
"grad_norm": 7.749796390533447,
"learning_rate": 8.682718722637344e-07,
"loss": 0.9103548526763916,
"step": 1026
},
{
"epoch": 0.8653198653198653,
"grad_norm": 5.378295421600342,
"learning_rate": 8.676808270052607e-07,
"loss": 1.0003798007965088,
"step": 1028
},
{
"epoch": 0.867003367003367,
"grad_norm": 5.721936225891113,
"learning_rate": 8.670886872026711e-07,
"loss": 0.6671168804168701,
"step": 1030
},
{
"epoch": 0.8686868686868687,
"grad_norm": 10.666192054748535,
"learning_rate": 8.664954548961363e-07,
"loss": 0.8651524782180786,
"step": 1032
},
{
"epoch": 0.8703703703703703,
"grad_norm": 7.22635555267334,
"learning_rate": 8.659011321295913e-07,
"loss": 0.9622019529342651,
"step": 1034
},
{
"epoch": 0.8720538720538721,
"grad_norm": 4.455495357513428,
"learning_rate": 8.65305720950728e-07,
"loss": 0.9549316167831421,
"step": 1036
},
{
"epoch": 0.8737373737373737,
"grad_norm": 7.26788854598999,
"learning_rate": 8.647092234109884e-07,
"loss": 1.1264393329620361,
"step": 1038
},
{
"epoch": 0.8754208754208754,
"grad_norm": 6.3819499015808105,
"learning_rate": 8.64111641565558e-07,
"loss": 1.0972923040390015,
"step": 1040
},
{
"epoch": 0.877104377104377,
"grad_norm": 4.891845226287842,
"learning_rate": 8.63512977473357e-07,
"loss": 0.9982548952102661,
"step": 1042
},
{
"epoch": 0.8787878787878788,
"grad_norm": 16.61280059814453,
"learning_rate": 8.629132331970353e-07,
"loss": 1.1183404922485352,
"step": 1044
},
{
"epoch": 0.8804713804713805,
"grad_norm": 3.0736172199249268,
"learning_rate": 8.623124108029645e-07,
"loss": 1.0902597904205322,
"step": 1046
},
{
"epoch": 0.8821548821548821,
"grad_norm": 15.772442817687988,
"learning_rate": 8.617105123612304e-07,
"loss": 0.9946341514587402,
"step": 1048
},
{
"epoch": 0.8838383838383839,
"grad_norm": 22.210824966430664,
"learning_rate": 8.611075399456263e-07,
"loss": 0.8030619025230408,
"step": 1050
},
{
"epoch": 0.8855218855218855,
"grad_norm": 13.653421401977539,
"learning_rate": 8.605034956336462e-07,
"loss": 1.084486484527588,
"step": 1052
},
{
"epoch": 0.8872053872053872,
"grad_norm": 13.737056732177734,
"learning_rate": 8.598983815064766e-07,
"loss": 0.5944472551345825,
"step": 1054
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.5293309688568115,
"learning_rate": 8.592921996489902e-07,
"loss": 0.9724396467208862,
"step": 1056
},
{
"epoch": 0.8905723905723906,
"grad_norm": 2.733849287033081,
"learning_rate": 8.586849521497389e-07,
"loss": 0.9384986162185669,
"step": 1058
},
{
"epoch": 0.8922558922558923,
"grad_norm": 18.489913940429688,
"learning_rate": 8.580766411009455e-07,
"loss": 0.9987908601760864,
"step": 1060
},
{
"epoch": 0.8939393939393939,
"grad_norm": 5.748605251312256,
"learning_rate": 8.574672685984979e-07,
"loss": 0.9200767278671265,
"step": 1062
},
{
"epoch": 0.8956228956228957,
"grad_norm": 11.951451301574707,
"learning_rate": 8.568568367419404e-07,
"loss": 0.844304621219635,
"step": 1064
},
{
"epoch": 0.8973063973063973,
"grad_norm": 2.693372964859009,
"learning_rate": 8.562453476344677e-07,
"loss": 1.1123064756393433,
"step": 1066
},
{
"epoch": 0.898989898989899,
"grad_norm": 3.8241171836853027,
"learning_rate": 8.556328033829172e-07,
"loss": 0.8062398433685303,
"step": 1068
},
{
"epoch": 0.9006734006734006,
"grad_norm": 11.482207298278809,
"learning_rate": 8.550192060977614e-07,
"loss": 0.9785133600234985,
"step": 1070
},
{
"epoch": 0.9023569023569024,
"grad_norm": 3.1708807945251465,
"learning_rate": 8.544045578931013e-07,
"loss": 1.2256948947906494,
"step": 1072
},
{
"epoch": 0.9040404040404041,
"grad_norm": 3.0588254928588867,
"learning_rate": 8.537888608866584e-07,
"loss": 0.8702206611633301,
"step": 1074
},
{
"epoch": 0.9057239057239057,
"grad_norm": 6.964415073394775,
"learning_rate": 8.531721171997681e-07,
"loss": 0.5286012291908264,
"step": 1076
},
{
"epoch": 0.9074074074074074,
"grad_norm": 19.570329666137695,
"learning_rate": 8.525543289573718e-07,
"loss": 1.1106371879577637,
"step": 1078
},
{
"epoch": 0.9090909090909091,
"grad_norm": 3.5319879055023193,
"learning_rate": 8.519354982880099e-07,
"loss": 0.9486319422721863,
"step": 1080
},
{
"epoch": 0.9107744107744108,
"grad_norm": 3.6544623374938965,
"learning_rate": 8.513156273238146e-07,
"loss": 0.9495224356651306,
"step": 1082
},
{
"epoch": 0.9124579124579124,
"grad_norm": 27.266931533813477,
"learning_rate": 8.50694718200502e-07,
"loss": 0.766098141670227,
"step": 1084
},
{
"epoch": 0.9141414141414141,
"grad_norm": 4.358726978302002,
"learning_rate": 8.500727730573655e-07,
"loss": 1.1725554466247559,
"step": 1086
},
{
"epoch": 0.9158249158249159,
"grad_norm": 13.713922500610352,
"learning_rate": 8.494497940372675e-07,
"loss": 0.9348576068878174,
"step": 1088
},
{
"epoch": 0.9175084175084175,
"grad_norm": 2.8525874614715576,
"learning_rate": 8.488257832866332e-07,
"loss": 0.9388105869293213,
"step": 1090
},
{
"epoch": 0.9191919191919192,
"grad_norm": 19.22268295288086,
"learning_rate": 8.482007429554419e-07,
"loss": 1.0528115034103394,
"step": 1092
},
{
"epoch": 0.9208754208754208,
"grad_norm": 7.083608627319336,
"learning_rate": 8.475746751972207e-07,
"loss": 0.9258947968482971,
"step": 1094
},
{
"epoch": 0.9225589225589226,
"grad_norm": 17.767122268676758,
"learning_rate": 8.469475821690364e-07,
"loss": 0.7900251746177673,
"step": 1096
},
{
"epoch": 0.9242424242424242,
"grad_norm": 11.199775695800781,
"learning_rate": 8.463194660314884e-07,
"loss": 0.43797174096107483,
"step": 1098
},
{
"epoch": 0.9259259259259259,
"grad_norm": 6.160865306854248,
"learning_rate": 8.456903289487008e-07,
"loss": 1.0159149169921875,
"step": 1100
},
{
"epoch": 0.9276094276094277,
"grad_norm": 23.086267471313477,
"learning_rate": 8.45060173088316e-07,
"loss": 0.5812975168228149,
"step": 1102
},
{
"epoch": 0.9292929292929293,
"grad_norm": 5.783674240112305,
"learning_rate": 8.444290006214858e-07,
"loss": 1.1394703388214111,
"step": 1104
},
{
"epoch": 0.930976430976431,
"grad_norm": 5.126986026763916,
"learning_rate": 8.43796813722865e-07,
"loss": 1.0383517742156982,
"step": 1106
},
{
"epoch": 0.9326599326599326,
"grad_norm": 16.552364349365234,
"learning_rate": 8.431636145706035e-07,
"loss": 0.8570190072059631,
"step": 1108
},
{
"epoch": 0.9343434343434344,
"grad_norm": 6.3068037033081055,
"learning_rate": 8.425294053463387e-07,
"loss": 1.227846384048462,
"step": 1110
},
{
"epoch": 0.936026936026936,
"grad_norm": 21.39204978942871,
"learning_rate": 8.418941882351883e-07,
"loss": 1.2234206199645996,
"step": 1112
},
{
"epoch": 0.9377104377104377,
"grad_norm": 3.4600205421447754,
"learning_rate": 8.412579654257424e-07,
"loss": 1.0893580913543701,
"step": 1114
},
{
"epoch": 0.9393939393939394,
"grad_norm": 9.739093780517578,
"learning_rate": 8.406207391100564e-07,
"loss": 1.1603511571884155,
"step": 1116
},
{
"epoch": 0.9410774410774411,
"grad_norm": 9.583012580871582,
"learning_rate": 8.399825114836431e-07,
"loss": 1.036285161972046,
"step": 1118
},
{
"epoch": 0.9427609427609428,
"grad_norm": 3.670794725418091,
"learning_rate": 8.393432847454651e-07,
"loss": 1.2967090606689453,
"step": 1120
},
{
"epoch": 0.9444444444444444,
"grad_norm": 3.190880060195923,
"learning_rate": 8.387030610979276e-07,
"loss": 0.7892323732376099,
"step": 1122
},
{
"epoch": 0.9461279461279462,
"grad_norm": 2.7288999557495117,
"learning_rate": 8.380618427468703e-07,
"loss": 0.8631899356842041,
"step": 1124
},
{
"epoch": 0.9478114478114478,
"grad_norm": 80.42435455322266,
"learning_rate": 8.374196319015605e-07,
"loss": 0.8700990080833435,
"step": 1126
},
{
"epoch": 0.9494949494949495,
"grad_norm": 2.7032294273376465,
"learning_rate": 8.367764307746843e-07,
"loss": 0.9584017992019653,
"step": 1128
},
{
"epoch": 0.9511784511784511,
"grad_norm": 29.493919372558594,
"learning_rate": 8.361322415823407e-07,
"loss": 0.9330191016197205,
"step": 1130
},
{
"epoch": 0.9528619528619529,
"grad_norm": 2.8431601524353027,
"learning_rate": 8.354870665440322e-07,
"loss": 0.9470508098602295,
"step": 1132
},
{
"epoch": 0.9545454545454546,
"grad_norm": 4.1329240798950195,
"learning_rate": 8.348409078826586e-07,
"loss": 1.003962755203247,
"step": 1134
},
{
"epoch": 0.9562289562289562,
"grad_norm": 21.232402801513672,
"learning_rate": 8.341937678245078e-07,
"loss": 0.8706526756286621,
"step": 1136
},
{
"epoch": 0.9579124579124579,
"grad_norm": 6.638863563537598,
"learning_rate": 8.335456485992501e-07,
"loss": 0.7324610948562622,
"step": 1138
},
{
"epoch": 0.9595959595959596,
"grad_norm": 9.82058048248291,
"learning_rate": 8.328965524399288e-07,
"loss": 0.5701298713684082,
"step": 1140
},
{
"epoch": 0.9612794612794613,
"grad_norm": 4.2321672439575195,
"learning_rate": 8.322464815829531e-07,
"loss": 0.8950085639953613,
"step": 1142
},
{
"epoch": 0.9629629629629629,
"grad_norm": 4.749987602233887,
"learning_rate": 8.315954382680909e-07,
"loss": 0.6259889602661133,
"step": 1144
},
{
"epoch": 0.9646464646464646,
"grad_norm": 3.1439943313598633,
"learning_rate": 8.309434247384601e-07,
"loss": 0.9208143949508667,
"step": 1146
},
{
"epoch": 0.9663299663299664,
"grad_norm": 9.139312744140625,
"learning_rate": 8.302904432405219e-07,
"loss": 0.7828265428543091,
"step": 1148
},
{
"epoch": 0.968013468013468,
"grad_norm": 8.519466400146484,
"learning_rate": 8.296364960240722e-07,
"loss": 0.9561738967895508,
"step": 1150
},
{
"epoch": 0.9696969696969697,
"grad_norm": 21.469980239868164,
"learning_rate": 8.289815853422342e-07,
"loss": 0.608352541923523,
"step": 1152
},
{
"epoch": 0.9713804713804713,
"grad_norm": 6.825742721557617,
"learning_rate": 8.283257134514507e-07,
"loss": 0.9338740110397339,
"step": 1154
},
{
"epoch": 0.9730639730639731,
"grad_norm": 4.129487991333008,
"learning_rate": 8.276688826114768e-07,
"loss": 0.5884324312210083,
"step": 1156
},
{
"epoch": 0.9747474747474747,
"grad_norm": 5.313873291015625,
"learning_rate": 8.270110950853706e-07,
"loss": 0.9547237753868103,
"step": 1158
},
{
"epoch": 0.9764309764309764,
"grad_norm": 6.063114643096924,
"learning_rate": 8.263523531394872e-07,
"loss": 0.44445914030075073,
"step": 1160
},
{
"epoch": 0.9781144781144782,
"grad_norm": 17.088842391967773,
"learning_rate": 8.256926590434696e-07,
"loss": 1.1655336618423462,
"step": 1162
},
{
"epoch": 0.9797979797979798,
"grad_norm": 2.781656265258789,
"learning_rate": 8.250320150702416e-07,
"loss": 0.6978096961975098,
"step": 1164
},
{
"epoch": 0.9814814814814815,
"grad_norm": 4.57460355758667,
"learning_rate": 8.243704234959996e-07,
"loss": 0.8053257465362549,
"step": 1166
},
{
"epoch": 0.9831649831649831,
"grad_norm": 7.392634391784668,
"learning_rate": 8.237078866002051e-07,
"loss": 0.8369849920272827,
"step": 1168
},
{
"epoch": 0.9848484848484849,
"grad_norm": 7.21369743347168,
"learning_rate": 8.230444066655763e-07,
"loss": 0.8643122911453247,
"step": 1170
},
{
"epoch": 0.9865319865319865,
"grad_norm": 8.024483680725098,
"learning_rate": 8.223799859780808e-07,
"loss": 0.6412187814712524,
"step": 1172
},
{
"epoch": 0.9882154882154882,
"grad_norm": 25.19280433654785,
"learning_rate": 8.217146268269274e-07,
"loss": 0.917904257774353,
"step": 1174
},
{
"epoch": 0.98989898989899,
"grad_norm": 9.128271102905273,
"learning_rate": 8.210483315045584e-07,
"loss": 0.4360630214214325,
"step": 1176
},
{
"epoch": 0.9915824915824916,
"grad_norm": 10.149953842163086,
"learning_rate": 8.203811023066416e-07,
"loss": 1.071942925453186,
"step": 1178
},
{
"epoch": 0.9932659932659933,
"grad_norm": 8.710041999816895,
"learning_rate": 8.197129415320622e-07,
"loss": 0.4572172164916992,
"step": 1180
},
{
"epoch": 0.9949494949494949,
"grad_norm": 3.669222116470337,
"learning_rate": 8.190438514829151e-07,
"loss": 0.9243024587631226,
"step": 1182
},
{
"epoch": 0.9966329966329966,
"grad_norm": 12.717865943908691,
"learning_rate": 8.183738344644973e-07,
"loss": 1.0385701656341553,
"step": 1184
},
{
"epoch": 0.9983164983164983,
"grad_norm": 4.85836935043335,
"learning_rate": 8.177028927852992e-07,
"loss": 0.6608575582504272,
"step": 1186
},
{
"epoch": 1.0,
"grad_norm": 11.576709747314453,
"learning_rate": 8.170310287569973e-07,
"loss": 0.7577022910118103,
"step": 1188
},
{
"epoch": 1.0016835016835017,
"grad_norm": 9.695958137512207,
"learning_rate": 8.163582446944456e-07,
"loss": 0.4615962505340576,
"step": 1190
},
{
"epoch": 1.0033670033670035,
"grad_norm": 6.610690116882324,
"learning_rate": 8.156845429156687e-07,
"loss": 0.4831297993659973,
"step": 1192
},
{
"epoch": 1.005050505050505,
"grad_norm": 3.4326443672180176,
"learning_rate": 8.150099257418522e-07,
"loss": 1.146728515625,
"step": 1194
},
{
"epoch": 1.0067340067340067,
"grad_norm": 20.49312973022461,
"learning_rate": 8.143343954973366e-07,
"loss": 0.8859339356422424,
"step": 1196
},
{
"epoch": 1.0084175084175084,
"grad_norm": 3.5065126419067383,
"learning_rate": 8.136579545096076e-07,
"loss": 1.0677597522735596,
"step": 1198
},
{
"epoch": 1.0101010101010102,
"grad_norm": 13.90986156463623,
"learning_rate": 8.129806051092889e-07,
"loss": 1.1894700527191162,
"step": 1200
},
{
"epoch": 1.0117845117845117,
"grad_norm": 3.6254143714904785,
"learning_rate": 8.123023496301343e-07,
"loss": 0.985792338848114,
"step": 1202
},
{
"epoch": 1.0134680134680134,
"grad_norm": 2.666475296020508,
"learning_rate": 8.116231904090192e-07,
"loss": 1.0036242008209229,
"step": 1204
},
{
"epoch": 1.0151515151515151,
"grad_norm": 15.559446334838867,
"learning_rate": 8.109431297859332e-07,
"loss": 1.0831941366195679,
"step": 1206
},
{
"epoch": 1.0168350168350169,
"grad_norm": 16.54594612121582,
"learning_rate": 8.10262170103971e-07,
"loss": 0.6582114696502686,
"step": 1208
},
{
"epoch": 1.0185185185185186,
"grad_norm": 4.971505641937256,
"learning_rate": 8.095803137093252e-07,
"loss": 0.7359082698822021,
"step": 1210
},
{
"epoch": 1.02020202020202,
"grad_norm": 3.355790853500366,
"learning_rate": 8.088975629512781e-07,
"loss": 0.5685245990753174,
"step": 1212
},
{
"epoch": 1.0218855218855218,
"grad_norm": 9.155191421508789,
"learning_rate": 8.082139201821933e-07,
"loss": 0.8225246667861938,
"step": 1214
},
{
"epoch": 1.0235690235690236,
"grad_norm": 12.392461776733398,
"learning_rate": 8.075293877575079e-07,
"loss": 0.4670335352420807,
"step": 1216
},
{
"epoch": 1.0252525252525253,
"grad_norm": 15.242469787597656,
"learning_rate": 8.068439680357239e-07,
"loss": 0.9990904331207275,
"step": 1218
},
{
"epoch": 1.026936026936027,
"grad_norm": 5.101475238800049,
"learning_rate": 8.06157663378401e-07,
"loss": 0.8169501423835754,
"step": 1220
},
{
"epoch": 1.0286195286195285,
"grad_norm": 45.69724655151367,
"learning_rate": 8.054704761501471e-07,
"loss": 0.9720203280448914,
"step": 1222
},
{
"epoch": 1.0303030303030303,
"grad_norm": 9.621611595153809,
"learning_rate": 8.047824087186116e-07,
"loss": 1.1497771739959717,
"step": 1224
},
{
"epoch": 1.031986531986532,
"grad_norm": 17.078630447387695,
"learning_rate": 8.040934634544761e-07,
"loss": 0.6966054439544678,
"step": 1226
},
{
"epoch": 1.0336700336700337,
"grad_norm": 12.26323413848877,
"learning_rate": 8.03403642731447e-07,
"loss": 0.9055821299552917,
"step": 1228
},
{
"epoch": 1.0353535353535352,
"grad_norm": 4.618709564208984,
"learning_rate": 8.027129489262472e-07,
"loss": 0.8367654085159302,
"step": 1230
},
{
"epoch": 1.037037037037037,
"grad_norm": 14.03416919708252,
"learning_rate": 8.020213844186071e-07,
"loss": 0.5471811294555664,
"step": 1232
},
{
"epoch": 1.0387205387205387,
"grad_norm": 2.462353229522705,
"learning_rate": 8.013289515912575e-07,
"loss": 0.9337582588195801,
"step": 1234
},
{
"epoch": 1.0404040404040404,
"grad_norm": 3.580676794052124,
"learning_rate": 8.006356528299211e-07,
"loss": 0.9284713268280029,
"step": 1236
},
{
"epoch": 1.0420875420875422,
"grad_norm": 14.55753231048584,
"learning_rate": 7.999414905233035e-07,
"loss": 0.5675897598266602,
"step": 1238
},
{
"epoch": 1.0437710437710437,
"grad_norm": 3.7598259449005127,
"learning_rate": 7.992464670630862e-07,
"loss": 1.0432960987091064,
"step": 1240
},
{
"epoch": 1.0454545454545454,
"grad_norm": 6.506076335906982,
"learning_rate": 7.985505848439171e-07,
"loss": 1.0147356986999512,
"step": 1242
},
{
"epoch": 1.0471380471380471,
"grad_norm": 4.660027503967285,
"learning_rate": 7.978538462634036e-07,
"loss": 0.7054228782653809,
"step": 1244
},
{
"epoch": 1.0488215488215489,
"grad_norm": 15.017945289611816,
"learning_rate": 7.971562537221032e-07,
"loss": 0.7315689325332642,
"step": 1246
},
{
"epoch": 1.0505050505050506,
"grad_norm": 11.619869232177734,
"learning_rate": 7.964578096235156e-07,
"loss": 0.9787733554840088,
"step": 1248
},
{
"epoch": 1.0521885521885521,
"grad_norm": 25.001440048217773,
"learning_rate": 7.957585163740746e-07,
"loss": 0.7732163667678833,
"step": 1250
},
{
"epoch": 1.0538720538720538,
"grad_norm": 13.280570030212402,
"learning_rate": 7.950583763831398e-07,
"loss": 0.7055392861366272,
"step": 1252
},
{
"epoch": 1.0555555555555556,
"grad_norm": 8.0188627243042,
"learning_rate": 7.943573920629879e-07,
"loss": 1.0268526077270508,
"step": 1254
},
{
"epoch": 1.0572390572390573,
"grad_norm": 8.311823844909668,
"learning_rate": 7.936555658288051e-07,
"loss": 0.7499762177467346,
"step": 1256
},
{
"epoch": 1.0589225589225588,
"grad_norm": 12.510072708129883,
"learning_rate": 7.929529000986778e-07,
"loss": 0.5642093420028687,
"step": 1258
},
{
"epoch": 1.0606060606060606,
"grad_norm": 8.302406311035156,
"learning_rate": 7.922493972935851e-07,
"loss": 0.8775455355644226,
"step": 1260
},
{
"epoch": 1.0622895622895623,
"grad_norm": 4.110003471374512,
"learning_rate": 7.915450598373903e-07,
"loss": 0.6986871957778931,
"step": 1262
},
{
"epoch": 1.063973063973064,
"grad_norm": 5.865422248840332,
"learning_rate": 7.908398901568324e-07,
"loss": 0.8195330500602722,
"step": 1264
},
{
"epoch": 1.0656565656565657,
"grad_norm": 9.913485527038574,
"learning_rate": 7.901338906815174e-07,
"loss": 0.8037704229354858,
"step": 1266
},
{
"epoch": 1.0673400673400673,
"grad_norm": 111.66101837158203,
"learning_rate": 7.894270638439106e-07,
"loss": 0.6612458825111389,
"step": 1268
},
{
"epoch": 1.069023569023569,
"grad_norm": 6.807026386260986,
"learning_rate": 7.88719412079328e-07,
"loss": 0.6571763157844543,
"step": 1270
},
{
"epoch": 1.0707070707070707,
"grad_norm": 6.202319622039795,
"learning_rate": 7.880109378259274e-07,
"loss": 0.7407518625259399,
"step": 1272
},
{
"epoch": 1.0723905723905724,
"grad_norm": 18.488807678222656,
"learning_rate": 7.873016435247011e-07,
"loss": 0.5137653350830078,
"step": 1274
},
{
"epoch": 1.074074074074074,
"grad_norm": 6.398234844207764,
"learning_rate": 7.865915316194661e-07,
"loss": 0.7220208644866943,
"step": 1276
},
{
"epoch": 1.0757575757575757,
"grad_norm": 24.44901466369629,
"learning_rate": 7.858806045568568e-07,
"loss": 1.0816729068756104,
"step": 1278
},
{
"epoch": 1.0774410774410774,
"grad_norm": 42.94617462158203,
"learning_rate": 7.85168864786316e-07,
"loss": 0.569089412689209,
"step": 1280
},
{
"epoch": 1.0791245791245792,
"grad_norm": 17.059085845947266,
"learning_rate": 7.844563147600869e-07,
"loss": 0.34395474195480347,
"step": 1282
},
{
"epoch": 1.0808080808080809,
"grad_norm": 5.726075172424316,
"learning_rate": 7.837429569332038e-07,
"loss": 1.104400873184204,
"step": 1284
},
{
"epoch": 1.0824915824915824,
"grad_norm": 5.970583915710449,
"learning_rate": 7.830287937634848e-07,
"loss": 0.9108725786209106,
"step": 1286
},
{
"epoch": 1.0841750841750841,
"grad_norm": 3.13798451423645,
"learning_rate": 7.823138277115227e-07,
"loss": 0.6928012371063232,
"step": 1288
},
{
"epoch": 1.0858585858585859,
"grad_norm": 3.2338767051696777,
"learning_rate": 7.81598061240676e-07,
"loss": 0.6945496797561646,
"step": 1290
},
{
"epoch": 1.0875420875420876,
"grad_norm": 9.174521446228027,
"learning_rate": 7.808814968170612e-07,
"loss": 1.177178144454956,
"step": 1292
},
{
"epoch": 1.0892255892255893,
"grad_norm": 2.838789463043213,
"learning_rate": 7.801641369095449e-07,
"loss": 0.8742045164108276,
"step": 1294
},
{
"epoch": 1.0909090909090908,
"grad_norm": 33.68141555786133,
"learning_rate": 7.794459839897334e-07,
"loss": 0.5730578899383545,
"step": 1296
},
{
"epoch": 1.0925925925925926,
"grad_norm": 8.239413261413574,
"learning_rate": 7.787270405319656e-07,
"loss": 0.6627512574195862,
"step": 1298
},
{
"epoch": 1.0942760942760943,
"grad_norm": 10.630107879638672,
"learning_rate": 7.780073090133045e-07,
"loss": 0.6856255531311035,
"step": 1300
},
{
"epoch": 1.095959595959596,
"grad_norm": 8.586835861206055,
"learning_rate": 7.772867919135278e-07,
"loss": 0.7367527484893799,
"step": 1302
},
{
"epoch": 1.0976430976430978,
"grad_norm": 26.07152557373047,
"learning_rate": 7.765654917151201e-07,
"loss": 0.6313869953155518,
"step": 1304
},
{
"epoch": 1.0993265993265993,
"grad_norm": 26.481813430786133,
"learning_rate": 7.758434109032642e-07,
"loss": 0.6839025020599365,
"step": 1306
},
{
"epoch": 1.101010101010101,
"grad_norm": 11.492305755615234,
"learning_rate": 7.751205519658321e-07,
"loss": 0.5959317684173584,
"step": 1308
},
{
"epoch": 1.1026936026936027,
"grad_norm": 5.645211219787598,
"learning_rate": 7.743969173933771e-07,
"loss": 0.5784125924110413,
"step": 1310
},
{
"epoch": 1.1043771043771045,
"grad_norm": 4.408408164978027,
"learning_rate": 7.736725096791249e-07,
"loss": 1.2098188400268555,
"step": 1312
},
{
"epoch": 1.106060606060606,
"grad_norm": 9.238399505615234,
"learning_rate": 7.729473313189647e-07,
"loss": 0.9550820589065552,
"step": 1314
},
{
"epoch": 1.1077441077441077,
"grad_norm": 15.260536193847656,
"learning_rate": 7.722213848114411e-07,
"loss": 0.9281185865402222,
"step": 1316
},
{
"epoch": 1.1094276094276094,
"grad_norm": 11.378418922424316,
"learning_rate": 7.714946726577453e-07,
"loss": 0.9321832656860352,
"step": 1318
},
{
"epoch": 1.1111111111111112,
"grad_norm": 27.803199768066406,
"learning_rate": 7.707671973617066e-07,
"loss": 0.7850360870361328,
"step": 1320
},
{
"epoch": 1.112794612794613,
"grad_norm": 11.86633586883545,
"learning_rate": 7.700389614297832e-07,
"loss": 0.8705657124519348,
"step": 1322
},
{
"epoch": 1.1144781144781144,
"grad_norm": 13.372186660766602,
"learning_rate": 7.693099673710545e-07,
"loss": 0.5348168015480042,
"step": 1324
},
{
"epoch": 1.1161616161616161,
"grad_norm": 7.737417697906494,
"learning_rate": 7.685802176972117e-07,
"loss": 0.8875303268432617,
"step": 1326
},
{
"epoch": 1.1178451178451179,
"grad_norm": 4.609512805938721,
"learning_rate": 7.678497149225494e-07,
"loss": 0.7146286964416504,
"step": 1328
},
{
"epoch": 1.1195286195286196,
"grad_norm": 3.953033447265625,
"learning_rate": 7.671184615639573e-07,
"loss": 1.0624680519104004,
"step": 1330
},
{
"epoch": 1.121212121212121,
"grad_norm": 15.329386711120605,
"learning_rate": 7.663864601409106e-07,
"loss": 0.7291280031204224,
"step": 1332
},
{
"epoch": 1.1228956228956228,
"grad_norm": 5.592386722564697,
"learning_rate": 7.656537131754621e-07,
"loss": 1.146779179573059,
"step": 1334
},
{
"epoch": 1.1245791245791246,
"grad_norm": 19.50740623474121,
"learning_rate": 7.649202231922338e-07,
"loss": 0.6419116258621216,
"step": 1336
},
{
"epoch": 1.1262626262626263,
"grad_norm": 3.845174789428711,
"learning_rate": 7.641859927184071e-07,
"loss": 0.7372583150863647,
"step": 1338
},
{
"epoch": 1.127946127946128,
"grad_norm": 8.609213829040527,
"learning_rate": 7.634510242837149e-07,
"loss": 0.603482723236084,
"step": 1340
},
{
"epoch": 1.1296296296296295,
"grad_norm": 7.67048454284668,
"learning_rate": 7.627153204204329e-07,
"loss": 0.9267317056655884,
"step": 1342
},
{
"epoch": 1.1313131313131313,
"grad_norm": 3.1689493656158447,
"learning_rate": 7.619788836633701e-07,
"loss": 1.1948891878128052,
"step": 1344
},
{
"epoch": 1.132996632996633,
"grad_norm": 44.90256118774414,
"learning_rate": 7.612417165498611e-07,
"loss": 1.0813300609588623,
"step": 1346
},
{
"epoch": 1.1346801346801347,
"grad_norm": 30.334089279174805,
"learning_rate": 7.605038216197569e-07,
"loss": 0.7344606518745422,
"step": 1348
},
{
"epoch": 1.1363636363636362,
"grad_norm": 7.781182765960693,
"learning_rate": 7.597652014154162e-07,
"loss": 0.5709810256958008,
"step": 1350
},
{
"epoch": 1.138047138047138,
"grad_norm": 17.377174377441406,
"learning_rate": 7.590258584816957e-07,
"loss": 0.32737797498703003,
"step": 1352
},
{
"epoch": 1.1397306397306397,
"grad_norm": 3.968998908996582,
"learning_rate": 7.582857953659437e-07,
"loss": 1.0901448726654053,
"step": 1354
},
{
"epoch": 1.1414141414141414,
"grad_norm": 4.9800801277160645,
"learning_rate": 7.575450146179887e-07,
"loss": 1.098610281944275,
"step": 1356
},
{
"epoch": 1.1430976430976432,
"grad_norm": 11.949906349182129,
"learning_rate": 7.56803518790132e-07,
"loss": 0.8105623722076416,
"step": 1358
},
{
"epoch": 1.144781144781145,
"grad_norm": 3.4032137393951416,
"learning_rate": 7.560613104371386e-07,
"loss": 0.7330828905105591,
"step": 1360
},
{
"epoch": 1.1464646464646464,
"grad_norm": 2.8660380840301514,
"learning_rate": 7.553183921162289e-07,
"loss": 0.9020315408706665,
"step": 1362
},
{
"epoch": 1.1481481481481481,
"grad_norm": 12.72059154510498,
"learning_rate": 7.545747663870687e-07,
"loss": 0.9371917843818665,
"step": 1364
},
{
"epoch": 1.1498316498316499,
"grad_norm": 23.1413631439209,
"learning_rate": 7.53830435811762e-07,
"loss": 0.7397361993789673,
"step": 1366
},
{
"epoch": 1.1515151515151516,
"grad_norm": 13.042642593383789,
"learning_rate": 7.530854029548404e-07,
"loss": 0.8247054815292358,
"step": 1368
},
{
"epoch": 1.1531986531986531,
"grad_norm": 4.0835795402526855,
"learning_rate": 7.523396703832557e-07,
"loss": 1.090425968170166,
"step": 1370
},
{
"epoch": 1.1548821548821548,
"grad_norm": 3.6361794471740723,
"learning_rate": 7.515932406663705e-07,
"loss": 1.0872161388397217,
"step": 1372
},
{
"epoch": 1.1565656565656566,
"grad_norm": 13.066899299621582,
"learning_rate": 7.508461163759493e-07,
"loss": 0.49930015206336975,
"step": 1374
},
{
"epoch": 1.1582491582491583,
"grad_norm": 5.910285472869873,
"learning_rate": 7.500983000861493e-07,
"loss": 0.46187859773635864,
"step": 1376
},
{
"epoch": 1.15993265993266,
"grad_norm": 12.718847274780273,
"learning_rate": 7.493497943735124e-07,
"loss": 0.9587620496749878,
"step": 1378
},
{
"epoch": 1.1616161616161615,
"grad_norm": 2.7174603939056396,
"learning_rate": 7.48600601816956e-07,
"loss": 0.7705467939376831,
"step": 1380
},
{
"epoch": 1.1632996632996633,
"grad_norm": 10.425454139709473,
"learning_rate": 7.478507249977632e-07,
"loss": 0.5908098220825195,
"step": 1382
},
{
"epoch": 1.164983164983165,
"grad_norm": 4.83370304107666,
"learning_rate": 7.471001664995757e-07,
"loss": 0.4560571312904358,
"step": 1384
},
{
"epoch": 1.1666666666666667,
"grad_norm": 16.3512020111084,
"learning_rate": 7.46348928908383e-07,
"loss": 0.6046204566955566,
"step": 1386
},
{
"epoch": 1.1683501683501682,
"grad_norm": 3.3071091175079346,
"learning_rate": 7.455970148125145e-07,
"loss": 0.6498188972473145,
"step": 1388
},
{
"epoch": 1.17003367003367,
"grad_norm": 3.1778576374053955,
"learning_rate": 7.44844426802631e-07,
"loss": 0.9177660942077637,
"step": 1390
},
{
"epoch": 1.1717171717171717,
"grad_norm": 6.8912577629089355,
"learning_rate": 7.440911674717148e-07,
"loss": 0.9661788940429688,
"step": 1392
},
{
"epoch": 1.1734006734006734,
"grad_norm": 2.982248306274414,
"learning_rate": 7.433372394150613e-07,
"loss": 0.7623599171638489,
"step": 1394
},
{
"epoch": 1.1750841750841752,
"grad_norm": 6.73823356628418,
"learning_rate": 7.425826452302695e-07,
"loss": 0.6162515878677368,
"step": 1396
},
{
"epoch": 1.1767676767676767,
"grad_norm": 7.467746734619141,
"learning_rate": 7.418273875172344e-07,
"loss": 0.7228857278823853,
"step": 1398
},
{
"epoch": 1.1784511784511784,
"grad_norm": 10.521594047546387,
"learning_rate": 7.410714688781362e-07,
"loss": 0.547920823097229,
"step": 1400
},
{
"epoch": 1.1801346801346801,
"grad_norm": 4.692141056060791,
"learning_rate": 7.403148919174327e-07,
"loss": 1.011480450630188,
"step": 1402
},
{
"epoch": 1.1818181818181819,
"grad_norm": 6.844545841217041,
"learning_rate": 7.3955765924185e-07,
"loss": 0.7596945762634277,
"step": 1404
},
{
"epoch": 1.1835016835016834,
"grad_norm": 8.648809432983398,
"learning_rate": 7.387997734603734e-07,
"loss": 0.771956205368042,
"step": 1406
},
{
"epoch": 1.1851851851851851,
"grad_norm": 15.440680503845215,
"learning_rate": 7.38041237184238e-07,
"loss": 1.2356925010681152,
"step": 1408
},
{
"epoch": 1.1868686868686869,
"grad_norm": 5.456315040588379,
"learning_rate": 7.372820530269203e-07,
"loss": 0.727834165096283,
"step": 1410
},
{
"epoch": 1.1885521885521886,
"grad_norm": 33.579254150390625,
"learning_rate": 7.365222236041298e-07,
"loss": 0.780275821685791,
"step": 1412
},
{
"epoch": 1.1902356902356903,
"grad_norm": 14.142115592956543,
"learning_rate": 7.35761751533798e-07,
"loss": 0.9008167386054993,
"step": 1414
},
{
"epoch": 1.1919191919191918,
"grad_norm": 9.76620864868164,
"learning_rate": 7.350006394360716e-07,
"loss": 0.7642953991889954,
"step": 1416
},
{
"epoch": 1.1936026936026936,
"grad_norm": 17.838695526123047,
"learning_rate": 7.342388899333014e-07,
"loss": 1.0995585918426514,
"step": 1418
},
{
"epoch": 1.1952861952861953,
"grad_norm": 65.10449981689453,
"learning_rate": 7.334765056500356e-07,
"loss": 0.947974443435669,
"step": 1420
},
{
"epoch": 1.196969696969697,
"grad_norm": 4.257263660430908,
"learning_rate": 7.327134892130085e-07,
"loss": 0.7925307750701904,
"step": 1422
},
{
"epoch": 1.1986531986531987,
"grad_norm": 3.5786726474761963,
"learning_rate": 7.319498432511329e-07,
"loss": 0.6507192850112915,
"step": 1424
},
{
"epoch": 1.2003367003367003,
"grad_norm": 9.806020736694336,
"learning_rate": 7.311855703954901e-07,
"loss": 0.9374374747276306,
"step": 1426
},
{
"epoch": 1.202020202020202,
"grad_norm": 16.49274253845215,
"learning_rate": 7.304206732793222e-07,
"loss": 0.5745439529418945,
"step": 1428
},
{
"epoch": 1.2037037037037037,
"grad_norm": 10.744287490844727,
"learning_rate": 7.296551545380213e-07,
"loss": 0.9440407752990723,
"step": 1430
},
{
"epoch": 1.2053872053872055,
"grad_norm": 4.190220832824707,
"learning_rate": 7.288890168091214e-07,
"loss": 0.7019326686859131,
"step": 1432
},
{
"epoch": 1.2070707070707072,
"grad_norm": 4.626961708068848,
"learning_rate": 7.281222627322897e-07,
"loss": 1.2138803005218506,
"step": 1434
},
{
"epoch": 1.2087542087542087,
"grad_norm": 29.172809600830078,
"learning_rate": 7.273548949493166e-07,
"loss": 0.6954021453857422,
"step": 1436
},
{
"epoch": 1.2104377104377104,
"grad_norm": 6.540690898895264,
"learning_rate": 7.265869161041065e-07,
"loss": 0.5005062818527222,
"step": 1438
},
{
"epoch": 1.2121212121212122,
"grad_norm": 1.7837268114089966,
"learning_rate": 7.258183288426703e-07,
"loss": 0.4664597511291504,
"step": 1440
},
{
"epoch": 1.2138047138047139,
"grad_norm": 6.852010250091553,
"learning_rate": 7.25049135813114e-07,
"loss": 0.7454104423522949,
"step": 1442
},
{
"epoch": 1.2154882154882154,
"grad_norm": 3.7926137447357178,
"learning_rate": 7.242793396656315e-07,
"loss": 0.9171748757362366,
"step": 1444
},
{
"epoch": 1.2171717171717171,
"grad_norm": 4.602051734924316,
"learning_rate": 7.235089430524943e-07,
"loss": 0.9297394156455994,
"step": 1446
},
{
"epoch": 1.2188552188552189,
"grad_norm": 8.485408782958984,
"learning_rate": 7.227379486280432e-07,
"loss": 0.6902468800544739,
"step": 1448
},
{
"epoch": 1.2205387205387206,
"grad_norm": 3.0322980880737305,
"learning_rate": 7.219663590486778e-07,
"loss": 0.9321104288101196,
"step": 1450
},
{
"epoch": 1.2222222222222223,
"grad_norm": 21.71652603149414,
"learning_rate": 7.211941769728493e-07,
"loss": 0.9111616611480713,
"step": 1452
},
{
"epoch": 1.2239057239057238,
"grad_norm": 5.9835405349731445,
"learning_rate": 7.204214050610498e-07,
"loss": 0.6736348867416382,
"step": 1454
},
{
"epoch": 1.2255892255892256,
"grad_norm": 5.00324010848999,
"learning_rate": 7.196480459758035e-07,
"loss": 0.8823907375335693,
"step": 1456
},
{
"epoch": 1.2272727272727273,
"grad_norm": 9.083556175231934,
"learning_rate": 7.188741023816581e-07,
"loss": 0.8732795715332031,
"step": 1458
},
{
"epoch": 1.228956228956229,
"grad_norm": 6.7020440101623535,
"learning_rate": 7.180995769451747e-07,
"loss": 0.9818441867828369,
"step": 1460
},
{
"epoch": 1.2306397306397305,
"grad_norm": 13.26759147644043,
"learning_rate": 7.173244723349194e-07,
"loss": 0.7110154628753662,
"step": 1462
},
{
"epoch": 1.2323232323232323,
"grad_norm": 3.6703689098358154,
"learning_rate": 7.165487912214538e-07,
"loss": 0.6870818138122559,
"step": 1464
},
{
"epoch": 1.234006734006734,
"grad_norm": 4.100058078765869,
"learning_rate": 7.157725362773258e-07,
"loss": 0.8629697561264038,
"step": 1466
},
{
"epoch": 1.2356902356902357,
"grad_norm": 7.570556640625,
"learning_rate": 7.1499571017706e-07,
"loss": 0.9524326324462891,
"step": 1468
},
{
"epoch": 1.2373737373737375,
"grad_norm": 3.626100778579712,
"learning_rate": 7.142183155971493e-07,
"loss": 1.1208899021148682,
"step": 1470
},
{
"epoch": 1.239057239057239,
"grad_norm": 6.774829387664795,
"learning_rate": 7.13440355216045e-07,
"loss": 0.6910721063613892,
"step": 1472
},
{
"epoch": 1.2407407407407407,
"grad_norm": 12.941313743591309,
"learning_rate": 7.126618317141482e-07,
"loss": 0.6839091777801514,
"step": 1474
},
{
"epoch": 1.2424242424242424,
"grad_norm": 13.288043022155762,
"learning_rate": 7.118827477737999e-07,
"loss": 0.4849187135696411,
"step": 1476
},
{
"epoch": 1.2441077441077442,
"grad_norm": 14.330803871154785,
"learning_rate": 7.111031060792719e-07,
"loss": 0.7669592499732971,
"step": 1478
},
{
"epoch": 1.2457912457912457,
"grad_norm": 3.7719264030456543,
"learning_rate": 7.103229093167579e-07,
"loss": 0.7678747773170471,
"step": 1480
},
{
"epoch": 1.2474747474747474,
"grad_norm": 5.733471393585205,
"learning_rate": 7.095421601743643e-07,
"loss": 0.7603921890258789,
"step": 1482
},
{
"epoch": 1.2491582491582491,
"grad_norm": 3.3023183345794678,
"learning_rate": 7.087608613421e-07,
"loss": 0.475089430809021,
"step": 1484
},
{
"epoch": 1.2508417508417509,
"grad_norm": 6.135479927062988,
"learning_rate": 7.079790155118684e-07,
"loss": 0.6280136108398438,
"step": 1486
},
{
"epoch": 1.2525252525252526,
"grad_norm": 14.41522216796875,
"learning_rate": 7.071966253774575e-07,
"loss": 0.7469892501831055,
"step": 1488
},
{
"epoch": 1.2542087542087543,
"grad_norm": 2.5887715816497803,
"learning_rate": 7.064136936345304e-07,
"loss": 0.7018432021141052,
"step": 1490
},
{
"epoch": 1.2558922558922558,
"grad_norm": 3.5334408283233643,
"learning_rate": 7.056302229806163e-07,
"loss": 0.825816810131073,
"step": 1492
},
{
"epoch": 1.2575757575757576,
"grad_norm": 3.581906795501709,
"learning_rate": 7.048462161151012e-07,
"loss": 0.8269777297973633,
"step": 1494
},
{
"epoch": 1.2592592592592593,
"grad_norm": 8.52226734161377,
"learning_rate": 7.040616757392188e-07,
"loss": 0.7199699282646179,
"step": 1496
},
{
"epoch": 1.2609427609427608,
"grad_norm": 2.9323740005493164,
"learning_rate": 7.032766045560408e-07,
"loss": 0.9787487387657166,
"step": 1498
},
{
"epoch": 1.2626262626262625,
"grad_norm": 6.500389099121094,
"learning_rate": 7.024910052704677e-07,
"loss": 1.0706979036331177,
"step": 1500
},
{
"epoch": 1.2643097643097643,
"grad_norm": 5.391655445098877,
"learning_rate": 7.017048805892194e-07,
"loss": 0.5319828987121582,
"step": 1502
},
{
"epoch": 1.265993265993266,
"grad_norm": 5.92175817489624,
"learning_rate": 7.009182332208266e-07,
"loss": 0.7819663286209106,
"step": 1504
},
{
"epoch": 1.2676767676767677,
"grad_norm": 4.497714996337891,
"learning_rate": 7.001310658756201e-07,
"loss": 1.1338582038879395,
"step": 1506
},
{
"epoch": 1.2693602693602695,
"grad_norm": 4.954183578491211,
"learning_rate": 6.993433812657226e-07,
"loss": 1.1781617403030396,
"step": 1508
},
{
"epoch": 1.271043771043771,
"grad_norm": 17.044879913330078,
"learning_rate": 6.985551821050395e-07,
"loss": 0.5676237344741821,
"step": 1510
},
{
"epoch": 1.2727272727272727,
"grad_norm": 17.09630012512207,
"learning_rate": 6.97766471109248e-07,
"loss": 0.6173258423805237,
"step": 1512
},
{
"epoch": 1.2744107744107744,
"grad_norm": 5.917657375335693,
"learning_rate": 6.969772509957895e-07,
"loss": 0.8361184597015381,
"step": 1514
},
{
"epoch": 1.2760942760942762,
"grad_norm": 4.721149921417236,
"learning_rate": 6.961875244838596e-07,
"loss": 0.8495975732803345,
"step": 1516
},
{
"epoch": 1.2777777777777777,
"grad_norm": 11.31229019165039,
"learning_rate": 6.953972942943981e-07,
"loss": 0.7243598699569702,
"step": 1518
},
{
"epoch": 1.2794612794612794,
"grad_norm": 3.162838935852051,
"learning_rate": 6.946065631500806e-07,
"loss": 0.9145760536193848,
"step": 1520
},
{
"epoch": 1.2811447811447811,
"grad_norm": 9.259127616882324,
"learning_rate": 6.938153337753088e-07,
"loss": 0.6645021438598633,
"step": 1522
},
{
"epoch": 1.2828282828282829,
"grad_norm": 26.91777229309082,
"learning_rate": 6.930236088962004e-07,
"loss": 0.651879072189331,
"step": 1524
},
{
"epoch": 1.2845117845117846,
"grad_norm": 5.544670104980469,
"learning_rate": 6.922313912405811e-07,
"loss": 0.8310514688491821,
"step": 1526
},
{
"epoch": 1.2861952861952861,
"grad_norm": 16.370214462280273,
"learning_rate": 6.914386835379738e-07,
"loss": 0.7569658756256104,
"step": 1528
},
{
"epoch": 1.2878787878787878,
"grad_norm": 8.142780303955078,
"learning_rate": 6.906454885195904e-07,
"loss": 0.4488654136657715,
"step": 1530
},
{
"epoch": 1.2895622895622896,
"grad_norm": 5.413924217224121,
"learning_rate": 6.898518089183211e-07,
"loss": 0.8656577467918396,
"step": 1532
},
{
"epoch": 1.2912457912457913,
"grad_norm": 4.607274532318115,
"learning_rate": 6.890576474687263e-07,
"loss": 1.0356013774871826,
"step": 1534
},
{
"epoch": 1.2929292929292928,
"grad_norm": 6.521271705627441,
"learning_rate": 6.882630069070262e-07,
"loss": 0.9825664758682251,
"step": 1536
},
{
"epoch": 1.2946127946127945,
"grad_norm": 5.2521071434021,
"learning_rate": 6.874678899710923e-07,
"loss": 0.6595628261566162,
"step": 1538
},
{
"epoch": 1.2962962962962963,
"grad_norm": 16.32155990600586,
"learning_rate": 6.866722994004364e-07,
"loss": 0.7686331868171692,
"step": 1540
},
{
"epoch": 1.297979797979798,
"grad_norm": 15.72677230834961,
"learning_rate": 6.858762379362032e-07,
"loss": 0.8358673453330994,
"step": 1542
},
{
"epoch": 1.2996632996632997,
"grad_norm": 5.651062965393066,
"learning_rate": 6.850797083211591e-07,
"loss": 0.9706641435623169,
"step": 1544
},
{
"epoch": 1.3013468013468015,
"grad_norm": 10.415674209594727,
"learning_rate": 6.842827132996841e-07,
"loss": 0.8287351131439209,
"step": 1546
},
{
"epoch": 1.303030303030303,
"grad_norm": 16.539886474609375,
"learning_rate": 6.83485255617761e-07,
"loss": 0.8370147943496704,
"step": 1548
},
{
"epoch": 1.3047138047138047,
"grad_norm": 6.127871036529541,
"learning_rate": 6.826873380229673e-07,
"loss": 0.6265941858291626,
"step": 1550
},
{
"epoch": 1.3063973063973064,
"grad_norm": 6.429442882537842,
"learning_rate": 6.818889632644649e-07,
"loss": 0.9182727336883545,
"step": 1552
},
{
"epoch": 1.308080808080808,
"grad_norm": 4.870426654815674,
"learning_rate": 6.810901340929906e-07,
"loss": 0.962719202041626,
"step": 1554
},
{
"epoch": 1.3097643097643097,
"grad_norm": 4.017622947692871,
"learning_rate": 6.802908532608472e-07,
"loss": 1.0228416919708252,
"step": 1556
},
{
"epoch": 1.3114478114478114,
"grad_norm": 6.815629482269287,
"learning_rate": 6.794911235218932e-07,
"loss": 0.9271608591079712,
"step": 1558
},
{
"epoch": 1.3131313131313131,
"grad_norm": 18.521018981933594,
"learning_rate": 6.786909476315342e-07,
"loss": 0.473792165517807,
"step": 1560
},
{
"epoch": 1.3148148148148149,
"grad_norm": 7.367074966430664,
"learning_rate": 6.778903283467128e-07,
"loss": 0.5411000847816467,
"step": 1562
},
{
"epoch": 1.3164983164983166,
"grad_norm": 23.994110107421875,
"learning_rate": 6.770892684258995e-07,
"loss": 0.5685646533966064,
"step": 1564
},
{
"epoch": 1.3181818181818181,
"grad_norm": 2.764239549636841,
"learning_rate": 6.762877706290823e-07,
"loss": 1.0790038108825684,
"step": 1566
},
{
"epoch": 1.3198653198653199,
"grad_norm": 13.226496696472168,
"learning_rate": 6.754858377177587e-07,
"loss": 0.6365941762924194,
"step": 1568
},
{
"epoch": 1.3215488215488216,
"grad_norm": 8.614484786987305,
"learning_rate": 6.74683472454925e-07,
"loss": 0.9468154907226562,
"step": 1570
},
{
"epoch": 1.3232323232323233,
"grad_norm": 6.47797966003418,
"learning_rate": 6.738806776050672e-07,
"loss": 0.8475841283798218,
"step": 1572
},
{
"epoch": 1.3249158249158248,
"grad_norm": 5.899251461029053,
"learning_rate": 6.730774559341512e-07,
"loss": 0.7157614231109619,
"step": 1574
},
{
"epoch": 1.3265993265993266,
"grad_norm": 2.9437053203582764,
"learning_rate": 6.722738102096135e-07,
"loss": 1.0155985355377197,
"step": 1576
},
{
"epoch": 1.3282828282828283,
"grad_norm": 7.690277576446533,
"learning_rate": 6.714697432003519e-07,
"loss": 0.8999049663543701,
"step": 1578
},
{
"epoch": 1.32996632996633,
"grad_norm": 8.396078109741211,
"learning_rate": 6.706652576767156e-07,
"loss": 0.6247600317001343,
"step": 1580
},
{
"epoch": 1.3316498316498318,
"grad_norm": 7.930534362792969,
"learning_rate": 6.698603564104958e-07,
"loss": 0.6954329013824463,
"step": 1582
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.4395411014556885,
"learning_rate": 6.690550421749157e-07,
"loss": 1.0694022178649902,
"step": 1584
},
{
"epoch": 1.335016835016835,
"grad_norm": 4.19919490814209,
"learning_rate": 6.682493177446221e-07,
"loss": 0.946961522102356,
"step": 1586
},
{
"epoch": 1.3367003367003367,
"grad_norm": 6.434517860412598,
"learning_rate": 6.674431858956743e-07,
"loss": 0.5836731195449829,
"step": 1588
},
{
"epoch": 1.3383838383838385,
"grad_norm": 4.509551048278809,
"learning_rate": 6.666366494055358e-07,
"loss": 0.72353595495224,
"step": 1590
},
{
"epoch": 1.34006734006734,
"grad_norm": 17.6490478515625,
"learning_rate": 6.658297110530646e-07,
"loss": 0.598315954208374,
"step": 1592
},
{
"epoch": 1.3417508417508417,
"grad_norm": 8.337894439697266,
"learning_rate": 6.650223736185023e-07,
"loss": 1.0166845321655273,
"step": 1594
},
{
"epoch": 1.3434343434343434,
"grad_norm": 27.206972122192383,
"learning_rate": 6.642146398834663e-07,
"loss": 0.4449620842933655,
"step": 1596
},
{
"epoch": 1.3451178451178452,
"grad_norm": 8.197900772094727,
"learning_rate": 6.63406512630939e-07,
"loss": 0.9014108180999756,
"step": 1598
},
{
"epoch": 1.3468013468013469,
"grad_norm": 7.576647758483887,
"learning_rate": 6.625979946452592e-07,
"loss": 0.809765100479126,
"step": 1600
},
{
"epoch": 1.3484848484848486,
"grad_norm": 44.33702087402344,
"learning_rate": 6.617890887121111e-07,
"loss": 0.7150375843048096,
"step": 1602
},
{
"epoch": 1.3501683501683501,
"grad_norm": 7.649331569671631,
"learning_rate": 6.60979797618516e-07,
"loss": 0.8225715756416321,
"step": 1604
},
{
"epoch": 1.3518518518518519,
"grad_norm": 4.691690444946289,
"learning_rate": 6.601701241528228e-07,
"loss": 1.2066047191619873,
"step": 1606
},
{
"epoch": 1.3535353535353536,
"grad_norm": 4.63446569442749,
"learning_rate": 6.593600711046969e-07,
"loss": 0.924203097820282,
"step": 1608
},
{
"epoch": 1.355218855218855,
"grad_norm": 10.622723579406738,
"learning_rate": 6.585496412651116e-07,
"loss": 0.5192527770996094,
"step": 1610
},
{
"epoch": 1.3569023569023568,
"grad_norm": 3.938314914703369,
"learning_rate": 6.57738837426339e-07,
"loss": 1.000133752822876,
"step": 1612
},
{
"epoch": 1.3585858585858586,
"grad_norm": 4.946197986602783,
"learning_rate": 6.569276623819396e-07,
"loss": 0.6809890270233154,
"step": 1614
},
{
"epoch": 1.3602693602693603,
"grad_norm": 10.934066772460938,
"learning_rate": 6.561161189267526e-07,
"loss": 0.6985521912574768,
"step": 1616
},
{
"epoch": 1.361952861952862,
"grad_norm": 3.1545114517211914,
"learning_rate": 6.553042098568865e-07,
"loss": 0.916617214679718,
"step": 1618
},
{
"epoch": 1.3636363636363638,
"grad_norm": 3.2825722694396973,
"learning_rate": 6.544919379697099e-07,
"loss": 0.729028582572937,
"step": 1620
},
{
"epoch": 1.3653198653198653,
"grad_norm": 3.8294615745544434,
"learning_rate": 6.536793060638412e-07,
"loss": 1.0753536224365234,
"step": 1622
},
{
"epoch": 1.367003367003367,
"grad_norm": 2.503497838973999,
"learning_rate": 6.528663169391391e-07,
"loss": 0.9852238893508911,
"step": 1624
},
{
"epoch": 1.3686868686868687,
"grad_norm": 8.0145263671875,
"learning_rate": 6.520529733966932e-07,
"loss": 0.6827946901321411,
"step": 1626
},
{
"epoch": 1.3703703703703702,
"grad_norm": 3.5943119525909424,
"learning_rate": 6.512392782388144e-07,
"loss": 0.9226878881454468,
"step": 1628
},
{
"epoch": 1.372053872053872,
"grad_norm": 4.993966102600098,
"learning_rate": 6.504252342690247e-07,
"loss": 0.9282613396644592,
"step": 1630
},
{
"epoch": 1.3737373737373737,
"grad_norm": 8.258445739746094,
"learning_rate": 6.496108442920482e-07,
"loss": 1.0419143438339233,
"step": 1632
},
{
"epoch": 1.3754208754208754,
"grad_norm": 11.405352592468262,
"learning_rate": 6.48796111113801e-07,
"loss": 0.7039163112640381,
"step": 1634
},
{
"epoch": 1.3771043771043772,
"grad_norm": 2.947396755218506,
"learning_rate": 6.479810375413819e-07,
"loss": 0.39542487263679504,
"step": 1636
},
{
"epoch": 1.378787878787879,
"grad_norm": 8.117940902709961,
"learning_rate": 6.471656263830618e-07,
"loss": 0.6473898887634277,
"step": 1638
},
{
"epoch": 1.3804713804713804,
"grad_norm": 3.3129611015319824,
"learning_rate": 6.463498804482757e-07,
"loss": 0.7153133153915405,
"step": 1640
},
{
"epoch": 1.3821548821548821,
"grad_norm": 7.839447498321533,
"learning_rate": 6.455338025476116e-07,
"loss": 0.9829051494598389,
"step": 1642
},
{
"epoch": 1.3838383838383839,
"grad_norm": 6.2321343421936035,
"learning_rate": 6.447173954928011e-07,
"loss": 1.191624641418457,
"step": 1644
},
{
"epoch": 1.3855218855218856,
"grad_norm": 8.412511825561523,
"learning_rate": 6.439006620967097e-07,
"loss": 0.8809744715690613,
"step": 1646
},
{
"epoch": 1.387205387205387,
"grad_norm": 3.1402454376220703,
"learning_rate": 6.430836051733282e-07,
"loss": 1.0235364437103271,
"step": 1648
},
{
"epoch": 1.3888888888888888,
"grad_norm": 4.362932205200195,
"learning_rate": 6.42266227537761e-07,
"loss": 0.9193822741508484,
"step": 1650
},
{
"epoch": 1.3905723905723906,
"grad_norm": 2.9106979370117188,
"learning_rate": 6.414485320062181e-07,
"loss": 1.2303351163864136,
"step": 1652
},
{
"epoch": 1.3922558922558923,
"grad_norm": 3.156247854232788,
"learning_rate": 6.406305213960045e-07,
"loss": 1.0456502437591553,
"step": 1654
},
{
"epoch": 1.393939393939394,
"grad_norm": 13.951912879943848,
"learning_rate": 6.398121985255116e-07,
"loss": 0.6429623365402222,
"step": 1656
},
{
"epoch": 1.3956228956228955,
"grad_norm": 2.782015323638916,
"learning_rate": 6.389935662142053e-07,
"loss": 0.6639566421508789,
"step": 1658
},
{
"epoch": 1.3973063973063973,
"grad_norm": 9.586030960083008,
"learning_rate": 6.381746272826186e-07,
"loss": 0.9411950707435608,
"step": 1660
},
{
"epoch": 1.398989898989899,
"grad_norm": 25.26241111755371,
"learning_rate": 6.373553845523407e-07,
"loss": 0.8540170192718506,
"step": 1662
},
{
"epoch": 1.4006734006734007,
"grad_norm": 40.64924240112305,
"learning_rate": 6.365358408460076e-07,
"loss": 0.7800917625427246,
"step": 1664
},
{
"epoch": 1.4023569023569022,
"grad_norm": 5.472312927246094,
"learning_rate": 6.35715998987292e-07,
"loss": 0.5686221718788147,
"step": 1666
},
{
"epoch": 1.404040404040404,
"grad_norm": 15.37363338470459,
"learning_rate": 6.348958618008943e-07,
"loss": 0.8799217939376831,
"step": 1668
},
{
"epoch": 1.4057239057239057,
"grad_norm": 2.726579189300537,
"learning_rate": 6.340754321125318e-07,
"loss": 0.8866001963615417,
"step": 1670
},
{
"epoch": 1.4074074074074074,
"grad_norm": 2.6039483547210693,
"learning_rate": 6.332547127489305e-07,
"loss": 0.8179314136505127,
"step": 1672
},
{
"epoch": 1.4090909090909092,
"grad_norm": 3.4876205921173096,
"learning_rate": 6.324337065378136e-07,
"loss": 1.2043547630310059,
"step": 1674
},
{
"epoch": 1.410774410774411,
"grad_norm": 13.763050079345703,
"learning_rate": 6.316124163078927e-07,
"loss": 0.488219678401947,
"step": 1676
},
{
"epoch": 1.4124579124579124,
"grad_norm": 8.983017921447754,
"learning_rate": 6.307908448888588e-07,
"loss": 1.0192590951919556,
"step": 1678
},
{
"epoch": 1.4141414141414141,
"grad_norm": 25.50023651123047,
"learning_rate": 6.299689951113709e-07,
"loss": 1.12066650390625,
"step": 1680
},
{
"epoch": 1.4158249158249159,
"grad_norm": 2.510024309158325,
"learning_rate": 6.29146869807047e-07,
"loss": 0.6489291191101074,
"step": 1682
},
{
"epoch": 1.4175084175084174,
"grad_norm": 20.36254119873047,
"learning_rate": 6.283244718084551e-07,
"loss": 0.5022568702697754,
"step": 1684
},
{
"epoch": 1.4191919191919191,
"grad_norm": 25.578750610351562,
"learning_rate": 6.27501803949102e-07,
"loss": 0.6631441712379456,
"step": 1686
},
{
"epoch": 1.4208754208754208,
"grad_norm": 3.3692312240600586,
"learning_rate": 6.266788690634247e-07,
"loss": 1.16062593460083,
"step": 1688
},
{
"epoch": 1.4225589225589226,
"grad_norm": 34.72169876098633,
"learning_rate": 6.258556699867804e-07,
"loss": 0.5728762149810791,
"step": 1690
},
{
"epoch": 1.4242424242424243,
"grad_norm": 6.119333744049072,
"learning_rate": 6.25032209555436e-07,
"loss": 0.6605233550071716,
"step": 1692
},
{
"epoch": 1.425925925925926,
"grad_norm": 5.281041622161865,
"learning_rate": 6.242084906065592e-07,
"loss": 0.6033918261528015,
"step": 1694
},
{
"epoch": 1.4276094276094276,
"grad_norm": 7.152311325073242,
"learning_rate": 6.233845159782085e-07,
"loss": 1.2653751373291016,
"step": 1696
},
{
"epoch": 1.4292929292929293,
"grad_norm": 5.30517053604126,
"learning_rate": 6.22560288509323e-07,
"loss": 1.0920519828796387,
"step": 1698
},
{
"epoch": 1.430976430976431,
"grad_norm": 2.4701943397521973,
"learning_rate": 6.217358110397133e-07,
"loss": 0.8582168817520142,
"step": 1700
},
{
"epoch": 1.4326599326599325,
"grad_norm": 3.8614046573638916,
"learning_rate": 6.209110864100511e-07,
"loss": 0.8965442776679993,
"step": 1702
},
{
"epoch": 1.4343434343434343,
"grad_norm": 6.42789888381958,
"learning_rate": 6.200861174618599e-07,
"loss": 0.570695698261261,
"step": 1704
},
{
"epoch": 1.436026936026936,
"grad_norm": 35.690406799316406,
"learning_rate": 6.192609070375045e-07,
"loss": 0.4622350335121155,
"step": 1706
},
{
"epoch": 1.4377104377104377,
"grad_norm": 12.375661849975586,
"learning_rate": 6.184354579801825e-07,
"loss": 1.0623770952224731,
"step": 1708
},
{
"epoch": 1.4393939393939394,
"grad_norm": 80.91400146484375,
"learning_rate": 6.176097731339128e-07,
"loss": 1.1389422416687012,
"step": 1710
},
{
"epoch": 1.4410774410774412,
"grad_norm": 10.364182472229004,
"learning_rate": 6.167838553435273e-07,
"loss": 1.0922863483428955,
"step": 1712
},
{
"epoch": 1.4427609427609427,
"grad_norm": 2.8876967430114746,
"learning_rate": 6.159577074546601e-07,
"loss": 1.0083891153335571,
"step": 1714
},
{
"epoch": 1.4444444444444444,
"grad_norm": 26.479930877685547,
"learning_rate": 6.151313323137387e-07,
"loss": 0.958626925945282,
"step": 1716
},
{
"epoch": 1.4461279461279462,
"grad_norm": 2.9834275245666504,
"learning_rate": 6.14304732767973e-07,
"loss": 0.8797729015350342,
"step": 1718
},
{
"epoch": 1.4478114478114479,
"grad_norm": 7.285353660583496,
"learning_rate": 6.134779116653459e-07,
"loss": 0.7979905605316162,
"step": 1720
},
{
"epoch": 1.4494949494949494,
"grad_norm": 13.242645263671875,
"learning_rate": 6.126508718546044e-07,
"loss": 0.6679774522781372,
"step": 1722
},
{
"epoch": 1.4511784511784511,
"grad_norm": 5.430975437164307,
"learning_rate": 6.118236161852486e-07,
"loss": 0.7967842221260071,
"step": 1724
},
{
"epoch": 1.4528619528619529,
"grad_norm": 15.7615385055542,
"learning_rate": 6.10996147507522e-07,
"loss": 0.9348810911178589,
"step": 1726
},
{
"epoch": 1.4545454545454546,
"grad_norm": 14.294349670410156,
"learning_rate": 6.101684686724027e-07,
"loss": 0.7149630188941956,
"step": 1728
},
{
"epoch": 1.4562289562289563,
"grad_norm": 4.464022636413574,
"learning_rate": 6.093405825315923e-07,
"loss": 1.0214498043060303,
"step": 1730
},
{
"epoch": 1.457912457912458,
"grad_norm": 3.1939473152160645,
"learning_rate": 6.08512491937507e-07,
"loss": 1.2640581130981445,
"step": 1732
},
{
"epoch": 1.4595959595959596,
"grad_norm": 4.230099678039551,
"learning_rate": 6.076841997432677e-07,
"loss": 0.9663617014884949,
"step": 1734
},
{
"epoch": 1.4612794612794613,
"grad_norm": 11.766712188720703,
"learning_rate": 6.06855708802689e-07,
"loss": 0.7833054065704346,
"step": 1736
},
{
"epoch": 1.462962962962963,
"grad_norm": 6.552529811859131,
"learning_rate": 6.060270219702709e-07,
"loss": 0.6994054317474365,
"step": 1738
},
{
"epoch": 1.4646464646464645,
"grad_norm": 2.931861400604248,
"learning_rate": 6.051981421011882e-07,
"loss": 1.1358039379119873,
"step": 1740
},
{
"epoch": 1.4663299663299663,
"grad_norm": 9.284839630126953,
"learning_rate": 6.043690720512812e-07,
"loss": 0.7364188432693481,
"step": 1742
},
{
"epoch": 1.468013468013468,
"grad_norm": 5.37172794342041,
"learning_rate": 6.035398146770444e-07,
"loss": 0.5165277123451233,
"step": 1744
},
{
"epoch": 1.4696969696969697,
"grad_norm": 5.121616363525391,
"learning_rate": 6.027103728356189e-07,
"loss": 1.0125455856323242,
"step": 1746
},
{
"epoch": 1.4713804713804715,
"grad_norm": 2.773219347000122,
"learning_rate": 6.018807493847804e-07,
"loss": 1.035334825515747,
"step": 1748
},
{
"epoch": 1.4730639730639732,
"grad_norm": 7.262451171875,
"learning_rate": 6.010509471829312e-07,
"loss": 0.7966405153274536,
"step": 1750
},
{
"epoch": 1.4747474747474747,
"grad_norm": 7.338104248046875,
"learning_rate": 6.002209690890889e-07,
"loss": 0.7077836990356445,
"step": 1752
},
{
"epoch": 1.4764309764309764,
"grad_norm": 7.950678825378418,
"learning_rate": 5.993908179628772e-07,
"loss": 0.7144612073898315,
"step": 1754
},
{
"epoch": 1.4781144781144782,
"grad_norm": 9.630928039550781,
"learning_rate": 5.985604966645159e-07,
"loss": 0.8856356143951416,
"step": 1756
},
{
"epoch": 1.4797979797979797,
"grad_norm": 15.059102058410645,
"learning_rate": 5.977300080548113e-07,
"loss": 0.7022537589073181,
"step": 1758
},
{
"epoch": 1.4814814814814814,
"grad_norm": 8.7070894241333,
"learning_rate": 5.968993549951463e-07,
"loss": 0.764058530330658,
"step": 1760
},
{
"epoch": 1.4831649831649831,
"grad_norm": 8.469696998596191,
"learning_rate": 5.9606854034747e-07,
"loss": 0.9842470288276672,
"step": 1762
},
{
"epoch": 1.4848484848484849,
"grad_norm": 3.3772764205932617,
"learning_rate": 5.952375669742885e-07,
"loss": 0.9660754799842834,
"step": 1764
},
{
"epoch": 1.4865319865319866,
"grad_norm": 15.527210235595703,
"learning_rate": 5.944064377386546e-07,
"loss": 0.7293991446495056,
"step": 1766
},
{
"epoch": 1.4882154882154883,
"grad_norm": 7.509492874145508,
"learning_rate": 5.935751555041584e-07,
"loss": 0.8063384294509888,
"step": 1768
},
{
"epoch": 1.4898989898989898,
"grad_norm": 4.355234622955322,
"learning_rate": 5.927437231349168e-07,
"loss": 1.001720666885376,
"step": 1770
},
{
"epoch": 1.4915824915824916,
"grad_norm": 12.318822860717773,
"learning_rate": 5.919121434955643e-07,
"loss": 0.4859294295310974,
"step": 1772
},
{
"epoch": 1.4932659932659933,
"grad_norm": 2.495269536972046,
"learning_rate": 5.910804194512425e-07,
"loss": 0.8450926542282104,
"step": 1774
},
{
"epoch": 1.494949494949495,
"grad_norm": 11.203375816345215,
"learning_rate": 5.902485538675909e-07,
"loss": 0.8008178472518921,
"step": 1776
},
{
"epoch": 1.4966329966329965,
"grad_norm": 7.061748504638672,
"learning_rate": 5.894165496107362e-07,
"loss": 0.9183659553527832,
"step": 1778
},
{
"epoch": 1.4983164983164983,
"grad_norm": 10.182241439819336,
"learning_rate": 5.885844095472832e-07,
"loss": 0.9454483985900879,
"step": 1780
},
{
"epoch": 1.5,
"grad_norm": 10.898093223571777,
"learning_rate": 5.877521365443047e-07,
"loss": 0.612937331199646,
"step": 1782
},
{
"epoch": 1.5016835016835017,
"grad_norm": 4.307864665985107,
"learning_rate": 5.869197334693311e-07,
"loss": 1.2052326202392578,
"step": 1784
},
{
"epoch": 1.5033670033670035,
"grad_norm": 5.633955478668213,
"learning_rate": 5.860872031903415e-07,
"loss": 0.8493650555610657,
"step": 1786
},
{
"epoch": 1.5050505050505052,
"grad_norm": 4.648436069488525,
"learning_rate": 5.85254548575753e-07,
"loss": 1.030457615852356,
"step": 1788
},
{
"epoch": 1.5067340067340067,
"grad_norm": 19.26193618774414,
"learning_rate": 5.84421772494411e-07,
"loss": 0.6253769397735596,
"step": 1790
},
{
"epoch": 1.5084175084175084,
"grad_norm": 18.525527954101562,
"learning_rate": 5.835888778155793e-07,
"loss": 0.6486117839813232,
"step": 1792
},
{
"epoch": 1.51010101010101,
"grad_norm": 11.307801246643066,
"learning_rate": 5.827558674089309e-07,
"loss": 0.9593780636787415,
"step": 1794
},
{
"epoch": 1.5117845117845117,
"grad_norm": 8.364538192749023,
"learning_rate": 5.81922744144537e-07,
"loss": 0.9520887136459351,
"step": 1796
},
{
"epoch": 1.5134680134680134,
"grad_norm": 20.700618743896484,
"learning_rate": 5.810895108928576e-07,
"loss": 1.0315901041030884,
"step": 1798
},
{
"epoch": 1.5151515151515151,
"grad_norm": 5.827000617980957,
"learning_rate": 5.802561705247322e-07,
"loss": 0.8540360331535339,
"step": 1800
},
{
"epoch": 1.5168350168350169,
"grad_norm": 5.0441365242004395,
"learning_rate": 5.794227259113688e-07,
"loss": 1.0596797466278076,
"step": 1802
},
{
"epoch": 1.5185185185185186,
"grad_norm": 15.39765453338623,
"learning_rate": 5.785891799243345e-07,
"loss": 0.9995817542076111,
"step": 1804
},
{
"epoch": 1.5202020202020203,
"grad_norm": 3.5208356380462646,
"learning_rate": 5.777555354355465e-07,
"loss": 0.8799208402633667,
"step": 1806
},
{
"epoch": 1.5218855218855218,
"grad_norm": 19.627885818481445,
"learning_rate": 5.769217953172606e-07,
"loss": 0.7398556470870972,
"step": 1808
},
{
"epoch": 1.5235690235690236,
"grad_norm": 6.568966865539551,
"learning_rate": 5.760879624420619e-07,
"loss": 0.7647089958190918,
"step": 1810
},
{
"epoch": 1.5252525252525253,
"grad_norm": 1.675675868988037,
"learning_rate": 5.752540396828562e-07,
"loss": 0.31169167160987854,
"step": 1812
},
{
"epoch": 1.5269360269360268,
"grad_norm": 2.579169273376465,
"learning_rate": 5.744200299128579e-07,
"loss": 1.1429425477981567,
"step": 1814
},
{
"epoch": 1.5286195286195285,
"grad_norm": 23.35523796081543,
"learning_rate": 5.735859360055814e-07,
"loss": 0.8635933995246887,
"step": 1816
},
{
"epoch": 1.5303030303030303,
"grad_norm": 21.272926330566406,
"learning_rate": 5.727517608348317e-07,
"loss": 0.947623610496521,
"step": 1818
},
{
"epoch": 1.531986531986532,
"grad_norm": 5.091054916381836,
"learning_rate": 5.719175072746926e-07,
"loss": 0.8388112187385559,
"step": 1820
},
{
"epoch": 1.5336700336700337,
"grad_norm": 5.891815185546875,
"learning_rate": 5.710831781995191e-07,
"loss": 0.7908442616462708,
"step": 1822
},
{
"epoch": 1.5353535353535355,
"grad_norm": 5.613356113433838,
"learning_rate": 5.702487764839258e-07,
"loss": 1.0302139520645142,
"step": 1824
},
{
"epoch": 1.5370370370370372,
"grad_norm": 5.067226886749268,
"learning_rate": 5.694143050027778e-07,
"loss": 0.9267786145210266,
"step": 1826
},
{
"epoch": 1.5387205387205387,
"grad_norm": 5.48887300491333,
"learning_rate": 5.685797666311801e-07,
"loss": 0.9696795344352722,
"step": 1828
},
{
"epoch": 1.5404040404040404,
"grad_norm": 6.487977981567383,
"learning_rate": 5.677451642444689e-07,
"loss": 0.7679098844528198,
"step": 1830
},
{
"epoch": 1.542087542087542,
"grad_norm": 8.134166717529297,
"learning_rate": 5.669105007182005e-07,
"loss": 0.7442073822021484,
"step": 1832
},
{
"epoch": 1.5437710437710437,
"grad_norm": 4.254693984985352,
"learning_rate": 5.660757789281417e-07,
"loss": 1.0978777408599854,
"step": 1834
},
{
"epoch": 1.5454545454545454,
"grad_norm": 11.371539115905762,
"learning_rate": 5.652410017502606e-07,
"loss": 0.9501652717590332,
"step": 1836
},
{
"epoch": 1.5471380471380471,
"grad_norm": 16.133960723876953,
"learning_rate": 5.644061720607157e-07,
"loss": 0.536079466342926,
"step": 1838
},
{
"epoch": 1.5488215488215489,
"grad_norm": 3.337813377380371,
"learning_rate": 5.635712927358466e-07,
"loss": 0.7914686799049377,
"step": 1840
},
{
"epoch": 1.5505050505050506,
"grad_norm": 3.199794292449951,
"learning_rate": 5.627363666521635e-07,
"loss": 0.6903548240661621,
"step": 1842
},
{
"epoch": 1.5521885521885523,
"grad_norm": 6.261875152587891,
"learning_rate": 5.619013966863388e-07,
"loss": 0.5220504403114319,
"step": 1844
},
{
"epoch": 1.5538720538720538,
"grad_norm": 3.182934284210205,
"learning_rate": 5.610663857151945e-07,
"loss": 0.9434134364128113,
"step": 1846
},
{
"epoch": 1.5555555555555556,
"grad_norm": 10.680120468139648,
"learning_rate": 5.602313366156953e-07,
"loss": 1.0320630073547363,
"step": 1848
},
{
"epoch": 1.557239057239057,
"grad_norm": 3.8439457416534424,
"learning_rate": 5.593962522649366e-07,
"loss": 0.837065577507019,
"step": 1850
},
{
"epoch": 1.5589225589225588,
"grad_norm": 3.5652201175689697,
"learning_rate": 5.585611355401352e-07,
"loss": 0.9864023923873901,
"step": 1852
},
{
"epoch": 1.5606060606060606,
"grad_norm": 20.397172927856445,
"learning_rate": 5.577259893186196e-07,
"loss": 0.6269755363464355,
"step": 1854
},
{
"epoch": 1.5622895622895623,
"grad_norm": 5.430056571960449,
"learning_rate": 5.568908164778201e-07,
"loss": 0.6682024598121643,
"step": 1856
},
{
"epoch": 1.563973063973064,
"grad_norm": 3.305800199508667,
"learning_rate": 5.560556198952585e-07,
"loss": 1.017985224723816,
"step": 1858
},
{
"epoch": 1.5656565656565657,
"grad_norm": 10.345197677612305,
"learning_rate": 5.552204024485382e-07,
"loss": 0.46250391006469727,
"step": 1860
},
{
"epoch": 1.5673400673400675,
"grad_norm": 3.9617226123809814,
"learning_rate": 5.543851670153353e-07,
"loss": 1.0285084247589111,
"step": 1862
},
{
"epoch": 1.569023569023569,
"grad_norm": 4.295073509216309,
"learning_rate": 5.535499164733869e-07,
"loss": 0.44839808344841003,
"step": 1864
},
{
"epoch": 1.5707070707070707,
"grad_norm": 9.806756973266602,
"learning_rate": 5.527146537004823e-07,
"loss": 1.037379503250122,
"step": 1866
},
{
"epoch": 1.5723905723905722,
"grad_norm": 7.301255702972412,
"learning_rate": 5.518793815744538e-07,
"loss": 0.6518345475196838,
"step": 1868
},
{
"epoch": 1.574074074074074,
"grad_norm": 2.5327539443969727,
"learning_rate": 5.510441029731648e-07,
"loss": 0.8190163969993591,
"step": 1870
},
{
"epoch": 1.5757575757575757,
"grad_norm": 5.190461158752441,
"learning_rate": 5.502088207745018e-07,
"loss": 0.8958265781402588,
"step": 1872
},
{
"epoch": 1.5774410774410774,
"grad_norm": 4.127246379852295,
"learning_rate": 5.493735378563634e-07,
"loss": 1.0178121328353882,
"step": 1874
},
{
"epoch": 1.5791245791245792,
"grad_norm": 6.272322654724121,
"learning_rate": 5.485382570966506e-07,
"loss": 0.6380331516265869,
"step": 1876
},
{
"epoch": 1.5808080808080809,
"grad_norm": 4.318612575531006,
"learning_rate": 5.477029813732572e-07,
"loss": 1.184647798538208,
"step": 1878
},
{
"epoch": 1.5824915824915826,
"grad_norm": 13.378433227539062,
"learning_rate": 5.468677135640595e-07,
"loss": 0.8356841802597046,
"step": 1880
},
{
"epoch": 1.5841750841750841,
"grad_norm": 6.793831825256348,
"learning_rate": 5.460324565469065e-07,
"loss": 0.5384290218353271,
"step": 1882
},
{
"epoch": 1.5858585858585859,
"grad_norm": 10.296432495117188,
"learning_rate": 5.4519721319961e-07,
"loss": 0.574350893497467,
"step": 1884
},
{
"epoch": 1.5875420875420876,
"grad_norm": 2.8328824043273926,
"learning_rate": 5.443619863999349e-07,
"loss": 0.7007859945297241,
"step": 1886
},
{
"epoch": 1.589225589225589,
"grad_norm": 4.71426248550415,
"learning_rate": 5.435267790255889e-07,
"loss": 1.0490843057632446,
"step": 1888
},
{
"epoch": 1.5909090909090908,
"grad_norm": 4.223301887512207,
"learning_rate": 5.426915939542127e-07,
"loss": 0.2507448196411133,
"step": 1890
},
{
"epoch": 1.5925925925925926,
"grad_norm": 4.74931526184082,
"learning_rate": 5.418564340633704e-07,
"loss": 1.1350317001342773,
"step": 1892
},
{
"epoch": 1.5942760942760943,
"grad_norm": 4.932158470153809,
"learning_rate": 5.410213022305395e-07,
"loss": 0.8503820300102234,
"step": 1894
},
{
"epoch": 1.595959595959596,
"grad_norm": 14.765481948852539,
"learning_rate": 5.401862013331e-07,
"loss": 1.011979103088379,
"step": 1896
},
{
"epoch": 1.5976430976430978,
"grad_norm": 19.991121292114258,
"learning_rate": 5.393511342483262e-07,
"loss": 0.9245116710662842,
"step": 1898
},
{
"epoch": 1.5993265993265995,
"grad_norm": 8.519593238830566,
"learning_rate": 5.385161038533756e-07,
"loss": 1.0895578861236572,
"step": 1900
},
{
"epoch": 1.601010101010101,
"grad_norm": 7.841440200805664,
"learning_rate": 5.376811130252791e-07,
"loss": 0.9659103155136108,
"step": 1902
},
{
"epoch": 1.6026936026936027,
"grad_norm": 20.247276306152344,
"learning_rate": 5.368461646409316e-07,
"loss": 0.796362042427063,
"step": 1904
},
{
"epoch": 1.6043771043771042,
"grad_norm": 3.420994281768799,
"learning_rate": 5.360112615770814e-07,
"loss": 1.1793514490127563,
"step": 1906
},
{
"epoch": 1.606060606060606,
"grad_norm": 3.9010415077209473,
"learning_rate": 5.351764067103209e-07,
"loss": 0.9917897582054138,
"step": 1908
},
{
"epoch": 1.6077441077441077,
"grad_norm": 2.7143442630767822,
"learning_rate": 5.343416029170767e-07,
"loss": 0.6407607793807983,
"step": 1910
},
{
"epoch": 1.6094276094276094,
"grad_norm": 30.25970458984375,
"learning_rate": 5.335068530735986e-07,
"loss": 0.6329153776168823,
"step": 1912
},
{
"epoch": 1.6111111111111112,
"grad_norm": 24.24694061279297,
"learning_rate": 5.326721600559513e-07,
"loss": 0.8712905645370483,
"step": 1914
},
{
"epoch": 1.612794612794613,
"grad_norm": 8.489051818847656,
"learning_rate": 5.318375267400035e-07,
"loss": 0.7373044490814209,
"step": 1916
},
{
"epoch": 1.6144781144781146,
"grad_norm": 17.98837661743164,
"learning_rate": 5.310029560014182e-07,
"loss": 0.6858376860618591,
"step": 1918
},
{
"epoch": 1.6161616161616161,
"grad_norm": 8.963407516479492,
"learning_rate": 5.301684507156424e-07,
"loss": 0.7940559983253479,
"step": 1920
},
{
"epoch": 1.6178451178451179,
"grad_norm": 3.7711410522460938,
"learning_rate": 5.293340137578983e-07,
"loss": 0.9433008432388306,
"step": 1922
},
{
"epoch": 1.6195286195286194,
"grad_norm": 3.9224212169647217,
"learning_rate": 5.284996480031722e-07,
"loss": 0.7148711085319519,
"step": 1924
},
{
"epoch": 1.621212121212121,
"grad_norm": 4.903892993927002,
"learning_rate": 5.276653563262053e-07,
"loss": 0.6378931403160095,
"step": 1926
},
{
"epoch": 1.6228956228956228,
"grad_norm": 2.4689173698425293,
"learning_rate": 5.268311416014831e-07,
"loss": 0.8439034223556519,
"step": 1928
},
{
"epoch": 1.6245791245791246,
"grad_norm": 10.568015098571777,
"learning_rate": 5.259970067032267e-07,
"loss": 0.8784427642822266,
"step": 1930
},
{
"epoch": 1.6262626262626263,
"grad_norm": 3.8736679553985596,
"learning_rate": 5.251629545053817e-07,
"loss": 0.711959958076477,
"step": 1932
},
{
"epoch": 1.627946127946128,
"grad_norm": 8.50756549835205,
"learning_rate": 5.243289878816088e-07,
"loss": 1.071230173110962,
"step": 1934
},
{
"epoch": 1.6296296296296298,
"grad_norm": 7.397336006164551,
"learning_rate": 5.23495109705274e-07,
"loss": 1.075880765914917,
"step": 1936
},
{
"epoch": 1.6313131313131313,
"grad_norm": 4.465485572814941,
"learning_rate": 5.226613228494383e-07,
"loss": 1.1016345024108887,
"step": 1938
},
{
"epoch": 1.632996632996633,
"grad_norm": 3.5152881145477295,
"learning_rate": 5.218276301868484e-07,
"loss": 0.8878377676010132,
"step": 1940
},
{
"epoch": 1.6346801346801347,
"grad_norm": 2.7900075912475586,
"learning_rate": 5.209940345899263e-07,
"loss": 1.0775192975997925,
"step": 1942
},
{
"epoch": 1.6363636363636362,
"grad_norm": 16.28611183166504,
"learning_rate": 5.201605389307595e-07,
"loss": 0.8081328868865967,
"step": 1944
},
{
"epoch": 1.638047138047138,
"grad_norm": 4.536927223205566,
"learning_rate": 5.193271460810912e-07,
"loss": 0.5076104998588562,
"step": 1946
},
{
"epoch": 1.6397306397306397,
"grad_norm": 7.646073341369629,
"learning_rate": 5.184938589123105e-07,
"loss": 1.030837059020996,
"step": 1948
},
{
"epoch": 1.6414141414141414,
"grad_norm": 3.751291036605835,
"learning_rate": 5.176606802954427e-07,
"loss": 1.0447328090667725,
"step": 1950
},
{
"epoch": 1.6430976430976432,
"grad_norm": 10.364418983459473,
"learning_rate": 5.168276131011378e-07,
"loss": 0.5750001072883606,
"step": 1952
},
{
"epoch": 1.644781144781145,
"grad_norm": 12.142930030822754,
"learning_rate": 5.159946601996638e-07,
"loss": 0.5072500705718994,
"step": 1954
},
{
"epoch": 1.6464646464646466,
"grad_norm": 26.362775802612305,
"learning_rate": 5.151618244608931e-07,
"loss": 0.3224486708641052,
"step": 1956
},
{
"epoch": 1.6481481481481481,
"grad_norm": 5.14034366607666,
"learning_rate": 5.143291087542957e-07,
"loss": 0.7505396604537964,
"step": 1958
},
{
"epoch": 1.6498316498316499,
"grad_norm": 3.361147880554199,
"learning_rate": 5.134965159489276e-07,
"loss": 0.8362823128700256,
"step": 1960
},
{
"epoch": 1.6515151515151514,
"grad_norm": 19.640413284301758,
"learning_rate": 5.126640489134211e-07,
"loss": 0.7406565546989441,
"step": 1962
},
{
"epoch": 1.6531986531986531,
"grad_norm": 8.70249080657959,
"learning_rate": 5.118317105159754e-07,
"loss": 0.5722910761833191,
"step": 1964
},
{
"epoch": 1.6548821548821548,
"grad_norm": 4.43184232711792,
"learning_rate": 5.109995036243469e-07,
"loss": 0.6934190392494202,
"step": 1966
},
{
"epoch": 1.6565656565656566,
"grad_norm": 6.205933094024658,
"learning_rate": 5.10167431105838e-07,
"loss": 0.8717750310897827,
"step": 1968
},
{
"epoch": 1.6582491582491583,
"grad_norm": 11.131174087524414,
"learning_rate": 5.093354958272888e-07,
"loss": 0.8401749730110168,
"step": 1970
},
{
"epoch": 1.65993265993266,
"grad_norm": 7.66545295715332,
"learning_rate": 5.085037006550664e-07,
"loss": 0.9823508858680725,
"step": 1972
},
{
"epoch": 1.6616161616161618,
"grad_norm": 2.336907148361206,
"learning_rate": 5.076720484550552e-07,
"loss": 0.8289145231246948,
"step": 1974
},
{
"epoch": 1.6632996632996633,
"grad_norm": 4.420996189117432,
"learning_rate": 5.068405420926468e-07,
"loss": 0.787537693977356,
"step": 1976
},
{
"epoch": 1.664983164983165,
"grad_norm": 16.187654495239258,
"learning_rate": 5.060091844327308e-07,
"loss": 0.8101489543914795,
"step": 1978
},
{
"epoch": 1.6666666666666665,
"grad_norm": 6.166725158691406,
"learning_rate": 5.051779783396839e-07,
"loss": 0.9080666303634644,
"step": 1980
},
{
"epoch": 1.6683501683501682,
"grad_norm": 14.882169723510742,
"learning_rate": 5.043469266773607e-07,
"loss": 0.5505136251449585,
"step": 1982
},
{
"epoch": 1.67003367003367,
"grad_norm": 20.98061180114746,
"learning_rate": 5.035160323090842e-07,
"loss": 0.4539128839969635,
"step": 1984
},
{
"epoch": 1.6717171717171717,
"grad_norm": 3.427556276321411,
"learning_rate": 5.026852980976348e-07,
"loss": 1.0426026582717896,
"step": 1986
},
{
"epoch": 1.6734006734006734,
"grad_norm": 13.226459503173828,
"learning_rate": 5.018547269052416e-07,
"loss": 0.9861583113670349,
"step": 1988
},
{
"epoch": 1.6750841750841752,
"grad_norm": 3.2640278339385986,
"learning_rate": 5.010243215935715e-07,
"loss": 0.6827632784843445,
"step": 1990
},
{
"epoch": 1.676767676767677,
"grad_norm": 3.51690673828125,
"learning_rate": 5.001940850237208e-07,
"loss": 1.151839256286621,
"step": 1992
},
{
"epoch": 1.6784511784511784,
"grad_norm": 9.070838928222656,
"learning_rate": 4.993640200562031e-07,
"loss": 0.7563179731369019,
"step": 1994
},
{
"epoch": 1.6801346801346801,
"grad_norm": 7.1710896492004395,
"learning_rate": 4.985341295509421e-07,
"loss": 0.6942537426948547,
"step": 1996
},
{
"epoch": 1.6818181818181817,
"grad_norm": 2.580467939376831,
"learning_rate": 4.977044163672595e-07,
"loss": 0.9790170192718506,
"step": 1998
},
{
"epoch": 1.6835016835016834,
"grad_norm": 13.908555030822754,
"learning_rate": 4.968748833638661e-07,
"loss": 0.7780789136886597,
"step": 2000
},
{
"epoch": 1.6851851851851851,
"grad_norm": 4.1657209396362305,
"learning_rate": 4.960455333988525e-07,
"loss": 0.6467783451080322,
"step": 2002
},
{
"epoch": 1.6868686868686869,
"grad_norm": 8.925399780273438,
"learning_rate": 4.952163693296782e-07,
"loss": 0.7447915077209473,
"step": 2004
},
{
"epoch": 1.6885521885521886,
"grad_norm": 9.181722640991211,
"learning_rate": 4.943873940131618e-07,
"loss": 0.6678234338760376,
"step": 2006
},
{
"epoch": 1.6902356902356903,
"grad_norm": 4.147680282592773,
"learning_rate": 4.935586103054729e-07,
"loss": 0.9828382730484009,
"step": 2008
},
{
"epoch": 1.691919191919192,
"grad_norm": 4.527743339538574,
"learning_rate": 4.927300210621198e-07,
"loss": 0.6916370987892151,
"step": 2010
},
{
"epoch": 1.6936026936026936,
"grad_norm": 8.658282279968262,
"learning_rate": 4.919016291379407e-07,
"loss": 0.9242024421691895,
"step": 2012
},
{
"epoch": 1.6952861952861953,
"grad_norm": 4.856821537017822,
"learning_rate": 4.910734373870946e-07,
"loss": 0.6717578172683716,
"step": 2014
},
{
"epoch": 1.696969696969697,
"grad_norm": 6.037668704986572,
"learning_rate": 4.902454486630506e-07,
"loss": 0.8340665102005005,
"step": 2016
},
{
"epoch": 1.6986531986531985,
"grad_norm": 46.316307067871094,
"learning_rate": 4.894176658185781e-07,
"loss": 0.8020853996276855,
"step": 2018
},
{
"epoch": 1.7003367003367003,
"grad_norm": 7.765172958374023,
"learning_rate": 4.885900917057374e-07,
"loss": 0.8143132328987122,
"step": 2020
},
{
"epoch": 1.702020202020202,
"grad_norm": 2.976177930831909,
"learning_rate": 4.877627291758697e-07,
"loss": 1.0872082710266113,
"step": 2022
},
{
"epoch": 1.7037037037037037,
"grad_norm": 7.9460225105285645,
"learning_rate": 4.869355810795866e-07,
"loss": 0.8318688273429871,
"step": 2024
},
{
"epoch": 1.7053872053872055,
"grad_norm": 5.210888385772705,
"learning_rate": 4.861086502667617e-07,
"loss": 0.9813876152038574,
"step": 2026
},
{
"epoch": 1.7070707070707072,
"grad_norm": 6.269561767578125,
"learning_rate": 4.852819395865196e-07,
"loss": 1.1104636192321777,
"step": 2028
},
{
"epoch": 1.708754208754209,
"grad_norm": 14.44339656829834,
"learning_rate": 4.844554518872261e-07,
"loss": 0.6626958847045898,
"step": 2030
},
{
"epoch": 1.7104377104377104,
"grad_norm": 22.18317413330078,
"learning_rate": 4.836291900164793e-07,
"loss": 0.5179702639579773,
"step": 2032
},
{
"epoch": 1.7121212121212122,
"grad_norm": 4.272150039672852,
"learning_rate": 4.82803156821099e-07,
"loss": 1.0629268884658813,
"step": 2034
},
{
"epoch": 1.7138047138047137,
"grad_norm": 4.388714790344238,
"learning_rate": 4.81977355147117e-07,
"loss": 0.8111241459846497,
"step": 2036
},
{
"epoch": 1.7154882154882154,
"grad_norm": 10.06100082397461,
"learning_rate": 4.811517878397676e-07,
"loss": 0.4932488799095154,
"step": 2038
},
{
"epoch": 1.7171717171717171,
"grad_norm": 6.3692474365234375,
"learning_rate": 4.803264577434778e-07,
"loss": 0.5541532039642334,
"step": 2040
},
{
"epoch": 1.7188552188552189,
"grad_norm": 3.8727078437805176,
"learning_rate": 4.795013677018567e-07,
"loss": 0.9600075483322144,
"step": 2042
},
{
"epoch": 1.7205387205387206,
"grad_norm": 3.6546130180358887,
"learning_rate": 4.786765205576866e-07,
"loss": 0.9439678192138672,
"step": 2044
},
{
"epoch": 1.7222222222222223,
"grad_norm": 2.5347650051116943,
"learning_rate": 4.778519191529133e-07,
"loss": 1.1322201490402222,
"step": 2046
},
{
"epoch": 1.723905723905724,
"grad_norm": 3.2225019931793213,
"learning_rate": 4.770275663286354e-07,
"loss": 1.0858080387115479,
"step": 2048
},
{
"epoch": 1.7255892255892256,
"grad_norm": 7.802936553955078,
"learning_rate": 4.762034649250951e-07,
"loss": 0.4231239855289459,
"step": 2050
},
{
"epoch": 1.7272727272727273,
"grad_norm": 2.6615946292877197,
"learning_rate": 4.753796177816688e-07,
"loss": 1.0833523273468018,
"step": 2052
},
{
"epoch": 1.7289562289562288,
"grad_norm": 12.313030242919922,
"learning_rate": 4.745560277368563e-07,
"loss": 0.9946305751800537,
"step": 2054
},
{
"epoch": 1.7306397306397305,
"grad_norm": 4.885106563568115,
"learning_rate": 4.7373269762827196e-07,
"loss": 0.8092712163925171,
"step": 2056
},
{
"epoch": 1.7323232323232323,
"grad_norm": 6.8623809814453125,
"learning_rate": 4.7290963029263453e-07,
"loss": 1.1297715902328491,
"step": 2058
},
{
"epoch": 1.734006734006734,
"grad_norm": 3.386683702468872,
"learning_rate": 4.720868285657571e-07,
"loss": 0.6623663902282715,
"step": 2060
},
{
"epoch": 1.7356902356902357,
"grad_norm": 7.138562202453613,
"learning_rate": 4.7126429528253775e-07,
"loss": 1.0328242778778076,
"step": 2062
},
{
"epoch": 1.7373737373737375,
"grad_norm": 12.482364654541016,
"learning_rate": 4.7044203327694995e-07,
"loss": 0.7162414789199829,
"step": 2064
},
{
"epoch": 1.7390572390572392,
"grad_norm": 2.729790449142456,
"learning_rate": 4.6962004538203224e-07,
"loss": 0.74675053358078,
"step": 2066
},
{
"epoch": 1.7407407407407407,
"grad_norm": 11.824975967407227,
"learning_rate": 4.687983344298786e-07,
"loss": 0.8567626476287842,
"step": 2068
},
{
"epoch": 1.7424242424242424,
"grad_norm": 18.96659278869629,
"learning_rate": 4.679769032516293e-07,
"loss": 0.7988073825836182,
"step": 2070
},
{
"epoch": 1.7441077441077442,
"grad_norm": 9.409902572631836,
"learning_rate": 4.6715575467746014e-07,
"loss": 0.6924943923950195,
"step": 2072
},
{
"epoch": 1.7457912457912457,
"grad_norm": 4.546468257904053,
"learning_rate": 4.663348915365735e-07,
"loss": 0.5785316228866577,
"step": 2074
},
{
"epoch": 1.7474747474747474,
"grad_norm": 19.581872940063477,
"learning_rate": 4.6551431665718833e-07,
"loss": 1.1338218450546265,
"step": 2076
},
{
"epoch": 1.7491582491582491,
"grad_norm": 5.740807056427002,
"learning_rate": 4.646940328665302e-07,
"loss": 0.8011679649353027,
"step": 2078
},
{
"epoch": 1.7508417508417509,
"grad_norm": 11.178342819213867,
"learning_rate": 4.638740429908222e-07,
"loss": 1.0102814435958862,
"step": 2080
},
{
"epoch": 1.7525252525252526,
"grad_norm": 5.02017879486084,
"learning_rate": 4.6305434985527437e-07,
"loss": 0.7039767503738403,
"step": 2082
},
{
"epoch": 1.7542087542087543,
"grad_norm": 2.7927052974700928,
"learning_rate": 4.6223495628407427e-07,
"loss": 1.2280118465423584,
"step": 2084
},
{
"epoch": 1.7558922558922558,
"grad_norm": 3.4108667373657227,
"learning_rate": 4.614158651003778e-07,
"loss": 0.8403428196907043,
"step": 2086
},
{
"epoch": 1.7575757575757576,
"grad_norm": 7.315975189208984,
"learning_rate": 4.605970791262984e-07,
"loss": 0.5117719769477844,
"step": 2088
},
{
"epoch": 1.7592592592592593,
"grad_norm": 26.32462501525879,
"learning_rate": 4.5977860118289846e-07,
"loss": 0.5781146287918091,
"step": 2090
},
{
"epoch": 1.7609427609427608,
"grad_norm": 6.7758965492248535,
"learning_rate": 4.5896043409017895e-07,
"loss": 0.6854249238967896,
"step": 2092
},
{
"epoch": 1.7626262626262625,
"grad_norm": 8.735897064208984,
"learning_rate": 4.5814258066706946e-07,
"loss": 0.4588479995727539,
"step": 2094
},
{
"epoch": 1.7643097643097643,
"grad_norm": 3.5783393383026123,
"learning_rate": 4.5732504373141957e-07,
"loss": 0.6785897612571716,
"step": 2096
},
{
"epoch": 1.765993265993266,
"grad_norm": 3.7991697788238525,
"learning_rate": 4.5650782609998785e-07,
"loss": 1.091996192932129,
"step": 2098
},
{
"epoch": 1.7676767676767677,
"grad_norm": 4.503328800201416,
"learning_rate": 4.556909305884327e-07,
"loss": 0.9916384816169739,
"step": 2100
},
{
"epoch": 1.7693602693602695,
"grad_norm": 4.141926288604736,
"learning_rate": 4.5487436001130295e-07,
"loss": 0.9449851512908936,
"step": 2102
},
{
"epoch": 1.7710437710437712,
"grad_norm": 13.318826675415039,
"learning_rate": 4.5405811718202804e-07,
"loss": 0.5735121369361877,
"step": 2104
},
{
"epoch": 1.7727272727272727,
"grad_norm": 4.924741268157959,
"learning_rate": 4.5324220491290765e-07,
"loss": 0.7375026941299438,
"step": 2106
},
{
"epoch": 1.7744107744107744,
"grad_norm": 7.583310127258301,
"learning_rate": 4.5242662601510305e-07,
"loss": 0.9382034540176392,
"step": 2108
},
{
"epoch": 1.776094276094276,
"grad_norm": 60.219932556152344,
"learning_rate": 4.516113832986267e-07,
"loss": 0.6118134260177612,
"step": 2110
},
{
"epoch": 1.7777777777777777,
"grad_norm": 2.8085687160491943,
"learning_rate": 4.5079647957233256e-07,
"loss": 0.869990348815918,
"step": 2112
},
{
"epoch": 1.7794612794612794,
"grad_norm": 2.663541078567505,
"learning_rate": 4.499819176439071e-07,
"loss": 0.9881576299667358,
"step": 2114
},
{
"epoch": 1.7811447811447811,
"grad_norm": 3.3883938789367676,
"learning_rate": 4.4916770031985887e-07,
"loss": 0.9770991206169128,
"step": 2116
},
{
"epoch": 1.7828282828282829,
"grad_norm": 3.3858611583709717,
"learning_rate": 4.48353830405509e-07,
"loss": 1.073500394821167,
"step": 2118
},
{
"epoch": 1.7845117845117846,
"grad_norm": 29.282451629638672,
"learning_rate": 4.475403107049819e-07,
"loss": 0.6810465455055237,
"step": 2120
},
{
"epoch": 1.7861952861952863,
"grad_norm": 12.527602195739746,
"learning_rate": 4.4672714402119514e-07,
"loss": 0.682815432548523,
"step": 2122
},
{
"epoch": 1.7878787878787878,
"grad_norm": 8.23759937286377,
"learning_rate": 4.4591433315585025e-07,
"loss": 0.7326172590255737,
"step": 2124
},
{
"epoch": 1.7895622895622896,
"grad_norm": 2.9576361179351807,
"learning_rate": 4.4510188090942246e-07,
"loss": 0.736370861530304,
"step": 2126
},
{
"epoch": 1.791245791245791,
"grad_norm": 2.443329095840454,
"learning_rate": 4.4428979008115173e-07,
"loss": 0.7781453728675842,
"step": 2128
},
{
"epoch": 1.7929292929292928,
"grad_norm": 8.095796585083008,
"learning_rate": 4.434780634690326e-07,
"loss": 0.7423359155654907,
"step": 2130
},
{
"epoch": 1.7946127946127945,
"grad_norm": 4.694947719573975,
"learning_rate": 4.426667038698049e-07,
"loss": 0.5872843265533447,
"step": 2132
},
{
"epoch": 1.7962962962962963,
"grad_norm": 4.841182708740234,
"learning_rate": 4.418557140789436e-07,
"loss": 0.769493579864502,
"step": 2134
},
{
"epoch": 1.797979797979798,
"grad_norm": 35.13887023925781,
"learning_rate": 4.4104509689065016e-07,
"loss": 0.40486854314804077,
"step": 2136
},
{
"epoch": 1.7996632996632997,
"grad_norm": 4.075418472290039,
"learning_rate": 4.402348550978414e-07,
"loss": 1.0084233283996582,
"step": 2138
},
{
"epoch": 1.8013468013468015,
"grad_norm": 5.782071590423584,
"learning_rate": 4.394249914921415e-07,
"loss": 0.852903425693512,
"step": 2140
},
{
"epoch": 1.803030303030303,
"grad_norm": 5.194396018981934,
"learning_rate": 4.3861550886387133e-07,
"loss": 0.8081188201904297,
"step": 2142
},
{
"epoch": 1.8047138047138047,
"grad_norm": 5.665356636047363,
"learning_rate": 4.378064100020391e-07,
"loss": 0.7818201780319214,
"step": 2144
},
{
"epoch": 1.8063973063973064,
"grad_norm": 13.495172500610352,
"learning_rate": 4.369976976943307e-07,
"loss": 0.9256261587142944,
"step": 2146
},
{
"epoch": 1.808080808080808,
"grad_norm": 4.425163269042969,
"learning_rate": 4.361893747271005e-07,
"loss": 0.9166650772094727,
"step": 2148
},
{
"epoch": 1.8097643097643097,
"grad_norm": 4.725872039794922,
"learning_rate": 4.3538144388536105e-07,
"loss": 1.0181063413619995,
"step": 2150
},
{
"epoch": 1.8114478114478114,
"grad_norm": 11.999753952026367,
"learning_rate": 4.3457390795277415e-07,
"loss": 1.053621530532837,
"step": 2152
},
{
"epoch": 1.8131313131313131,
"grad_norm": 7.518166542053223,
"learning_rate": 4.3376676971164096e-07,
"loss": 0.8652574419975281,
"step": 2154
},
{
"epoch": 1.8148148148148149,
"grad_norm": 4.472687244415283,
"learning_rate": 4.3296003194289224e-07,
"loss": 0.7134494781494141,
"step": 2156
},
{
"epoch": 1.8164983164983166,
"grad_norm": 5.567774772644043,
"learning_rate": 4.321536974260788e-07,
"loss": 0.5291237831115723,
"step": 2158
},
{
"epoch": 1.8181818181818183,
"grad_norm": 3.6279776096343994,
"learning_rate": 4.313477689393628e-07,
"loss": 0.9376990795135498,
"step": 2160
},
{
"epoch": 1.8198653198653199,
"grad_norm": 5.327882766723633,
"learning_rate": 4.305422492595063e-07,
"loss": 0.8061087131500244,
"step": 2162
},
{
"epoch": 1.8215488215488216,
"grad_norm": 8.221955299377441,
"learning_rate": 4.2973714116186433e-07,
"loss": 0.9052633047103882,
"step": 2164
},
{
"epoch": 1.823232323232323,
"grad_norm": 5.31283712387085,
"learning_rate": 4.289324474203726e-07,
"loss": 0.9636974930763245,
"step": 2166
},
{
"epoch": 1.8249158249158248,
"grad_norm": 4.392337799072266,
"learning_rate": 4.281281708075397e-07,
"loss": 0.9123021364212036,
"step": 2168
},
{
"epoch": 1.8265993265993266,
"grad_norm": 5.4881744384765625,
"learning_rate": 4.2732431409443694e-07,
"loss": 0.6539809703826904,
"step": 2170
},
{
"epoch": 1.8282828282828283,
"grad_norm": 4.425382614135742,
"learning_rate": 4.26520880050689e-07,
"loss": 0.7706068158149719,
"step": 2172
},
{
"epoch": 1.82996632996633,
"grad_norm": 5.568687915802002,
"learning_rate": 4.25717871444464e-07,
"loss": 0.47670307755470276,
"step": 2174
},
{
"epoch": 1.8316498316498318,
"grad_norm": 3.117622137069702,
"learning_rate": 4.249152910424648e-07,
"loss": 0.49261391162872314,
"step": 2176
},
{
"epoch": 1.8333333333333335,
"grad_norm": 14.853697776794434,
"learning_rate": 4.2411314160991827e-07,
"loss": 0.7614182233810425,
"step": 2178
},
{
"epoch": 1.835016835016835,
"grad_norm": 3.1323137283325195,
"learning_rate": 4.23311425910567e-07,
"loss": 0.8719555735588074,
"step": 2180
},
{
"epoch": 1.8367003367003367,
"grad_norm": 16.170852661132812,
"learning_rate": 4.225101467066587e-07,
"loss": 0.5341575741767883,
"step": 2182
},
{
"epoch": 1.8383838383838382,
"grad_norm": 32.08811950683594,
"learning_rate": 4.2170930675893745e-07,
"loss": 0.9574685096740723,
"step": 2184
},
{
"epoch": 1.84006734006734,
"grad_norm": 3.1121602058410645,
"learning_rate": 4.209089088266337e-07,
"loss": 1.0799657106399536,
"step": 2186
},
{
"epoch": 1.8417508417508417,
"grad_norm": 18.168685913085938,
"learning_rate": 4.201089556674553e-07,
"loss": 0.9567815065383911,
"step": 2188
},
{
"epoch": 1.8434343434343434,
"grad_norm": 11.96113109588623,
"learning_rate": 4.193094500375772e-07,
"loss": 0.6286576390266418,
"step": 2190
},
{
"epoch": 1.8451178451178452,
"grad_norm": 4.731752395629883,
"learning_rate": 4.1851039469163306e-07,
"loss": 0.8796607255935669,
"step": 2192
},
{
"epoch": 1.8468013468013469,
"grad_norm": 10.77106761932373,
"learning_rate": 4.177117923827046e-07,
"loss": 0.6798102855682373,
"step": 2194
},
{
"epoch": 1.8484848484848486,
"grad_norm": 7.331570148468018,
"learning_rate": 4.169136458623126e-07,
"loss": 0.8384144902229309,
"step": 2196
},
{
"epoch": 1.8501683501683501,
"grad_norm": 22.100555419921875,
"learning_rate": 4.161159578804079e-07,
"loss": 0.46593400835990906,
"step": 2198
},
{
"epoch": 1.8518518518518519,
"grad_norm": 3.0996837615966797,
"learning_rate": 4.153187311853611e-07,
"loss": 1.0288646221160889,
"step": 2200
},
{
"epoch": 1.8535353535353534,
"grad_norm": 8.950583457946777,
"learning_rate": 4.145219685239535e-07,
"loss": 0.7397197484970093,
"step": 2202
},
{
"epoch": 1.855218855218855,
"grad_norm": 4.474309921264648,
"learning_rate": 4.1372567264136806e-07,
"loss": 0.652114987373352,
"step": 2204
},
{
"epoch": 1.8569023569023568,
"grad_norm": 15.298965454101562,
"learning_rate": 4.129298462811789e-07,
"loss": 0.9816831350326538,
"step": 2206
},
{
"epoch": 1.8585858585858586,
"grad_norm": 6.661653518676758,
"learning_rate": 4.121344921853426e-07,
"loss": 0.782197892665863,
"step": 2208
},
{
"epoch": 1.8602693602693603,
"grad_norm": 13.741491317749023,
"learning_rate": 4.1133961309418885e-07,
"loss": 0.35760360956192017,
"step": 2210
},
{
"epoch": 1.861952861952862,
"grad_norm": 10.694735527038574,
"learning_rate": 4.1054521174641065e-07,
"loss": 0.9551196098327637,
"step": 2212
},
{
"epoch": 1.8636363636363638,
"grad_norm": 3.1088006496429443,
"learning_rate": 4.097512908790546e-07,
"loss": 1.12099027633667,
"step": 2214
},
{
"epoch": 1.8653198653198653,
"grad_norm": 25.677371978759766,
"learning_rate": 4.089578532275123e-07,
"loss": 0.3952019214630127,
"step": 2216
},
{
"epoch": 1.867003367003367,
"grad_norm": 9.417123794555664,
"learning_rate": 4.081649015255104e-07,
"loss": 0.6426748633384705,
"step": 2218
},
{
"epoch": 1.8686868686868687,
"grad_norm": 11.567337989807129,
"learning_rate": 4.0737243850510097e-07,
"loss": 0.6122760772705078,
"step": 2220
},
{
"epoch": 1.8703703703703702,
"grad_norm": 5.804279327392578,
"learning_rate": 4.065804668966527e-07,
"loss": 0.6974793672561646,
"step": 2222
},
{
"epoch": 1.872053872053872,
"grad_norm": 2.7025163173675537,
"learning_rate": 4.057889894288409e-07,
"loss": 1.073783040046692,
"step": 2224
},
{
"epoch": 1.8737373737373737,
"grad_norm": 4.876822471618652,
"learning_rate": 4.049980088286384e-07,
"loss": 0.6222144365310669,
"step": 2226
},
{
"epoch": 1.8754208754208754,
"grad_norm": 9.058050155639648,
"learning_rate": 4.042075278213065e-07,
"loss": 0.44170594215393066,
"step": 2228
},
{
"epoch": 1.8771043771043772,
"grad_norm": 7.270165920257568,
"learning_rate": 4.0341754913038463e-07,
"loss": 1.0838236808776855,
"step": 2230
},
{
"epoch": 1.878787878787879,
"grad_norm": 5.1195969581604,
"learning_rate": 4.0262807547768164e-07,
"loss": 0.9825941324234009,
"step": 2232
},
{
"epoch": 1.8804713804713806,
"grad_norm": 6.805301666259766,
"learning_rate": 4.018391095832665e-07,
"loss": 0.5576257705688477,
"step": 2234
},
{
"epoch": 1.8821548821548821,
"grad_norm": 5.000736236572266,
"learning_rate": 4.0105065416545904e-07,
"loss": 0.7729544639587402,
"step": 2236
},
{
"epoch": 1.8838383838383839,
"grad_norm": 26.782957077026367,
"learning_rate": 4.002627119408196e-07,
"loss": 0.9620450735092163,
"step": 2238
},
{
"epoch": 1.8855218855218854,
"grad_norm": 2.8780593872070312,
"learning_rate": 3.994752856241407e-07,
"loss": 1.1825776100158691,
"step": 2240
},
{
"epoch": 1.887205387205387,
"grad_norm": 22.294492721557617,
"learning_rate": 3.9868837792843744e-07,
"loss": 1.0324305295944214,
"step": 2242
},
{
"epoch": 1.8888888888888888,
"grad_norm": 4.332054615020752,
"learning_rate": 3.97901991564938e-07,
"loss": 0.9554680585861206,
"step": 2244
},
{
"epoch": 1.8905723905723906,
"grad_norm": 3.9427170753479004,
"learning_rate": 3.971161292430738e-07,
"loss": 1.0006061792373657,
"step": 2246
},
{
"epoch": 1.8922558922558923,
"grad_norm": 13.30428695678711,
"learning_rate": 3.9633079367047176e-07,
"loss": 0.9314384460449219,
"step": 2248
},
{
"epoch": 1.893939393939394,
"grad_norm": 7.6282057762146,
"learning_rate": 3.9554598755294313e-07,
"loss": 1.031144380569458,
"step": 2250
},
{
"epoch": 1.8956228956228958,
"grad_norm": 4.266255855560303,
"learning_rate": 3.947617135944751e-07,
"loss": 1.2106260061264038,
"step": 2252
},
{
"epoch": 1.8973063973063973,
"grad_norm": 4.872833251953125,
"learning_rate": 3.9397797449722157e-07,
"loss": 0.9372920393943787,
"step": 2254
},
{
"epoch": 1.898989898989899,
"grad_norm": 6.311352729797363,
"learning_rate": 3.931947729614935e-07,
"loss": 0.8530165553092957,
"step": 2256
},
{
"epoch": 1.9006734006734005,
"grad_norm": 4.680610656738281,
"learning_rate": 3.924121116857496e-07,
"loss": 1.026566505432129,
"step": 2258
},
{
"epoch": 1.9023569023569022,
"grad_norm": 9.47015380859375,
"learning_rate": 3.9162999336658754e-07,
"loss": 0.778825044631958,
"step": 2260
},
{
"epoch": 1.904040404040404,
"grad_norm": 4.094303607940674,
"learning_rate": 3.908484206987338e-07,
"loss": 0.837942361831665,
"step": 2262
},
{
"epoch": 1.9057239057239057,
"grad_norm": 4.3366522789001465,
"learning_rate": 3.9006739637503504e-07,
"loss": 0.5546213388442993,
"step": 2264
},
{
"epoch": 1.9074074074074074,
"grad_norm": 4.000308036804199,
"learning_rate": 3.8928692308644873e-07,
"loss": 0.8694909811019897,
"step": 2266
},
{
"epoch": 1.9090909090909092,
"grad_norm": 9.866999626159668,
"learning_rate": 3.8850700352203393e-07,
"loss": 0.7251837253570557,
"step": 2268
},
{
"epoch": 1.910774410774411,
"grad_norm": 25.70250129699707,
"learning_rate": 3.8772764036894135e-07,
"loss": 0.8718059659004211,
"step": 2270
},
{
"epoch": 1.9124579124579124,
"grad_norm": 3.6470611095428467,
"learning_rate": 3.8694883631240525e-07,
"loss": 0.9727774858474731,
"step": 2272
},
{
"epoch": 1.9141414141414141,
"grad_norm": 17.55716896057129,
"learning_rate": 3.8617059403573315e-07,
"loss": 0.7658140659332275,
"step": 2274
},
{
"epoch": 1.9158249158249159,
"grad_norm": 321.7969665527344,
"learning_rate": 3.8539291622029726e-07,
"loss": 0.9249438047409058,
"step": 2276
},
{
"epoch": 1.9175084175084174,
"grad_norm": 12.505878448486328,
"learning_rate": 3.8461580554552473e-07,
"loss": 0.6528811454772949,
"step": 2278
},
{
"epoch": 1.9191919191919191,
"grad_norm": 11.336592674255371,
"learning_rate": 3.8383926468888894e-07,
"loss": 0.4342978596687317,
"step": 2280
},
{
"epoch": 1.9208754208754208,
"grad_norm": 5.1289567947387695,
"learning_rate": 3.830632963258998e-07,
"loss": 1.0175809860229492,
"step": 2282
},
{
"epoch": 1.9225589225589226,
"grad_norm": 13.561434745788574,
"learning_rate": 3.82287903130095e-07,
"loss": 0.5333043336868286,
"step": 2284
},
{
"epoch": 1.9242424242424243,
"grad_norm": 1.7644530534744263,
"learning_rate": 3.815130877730299e-07,
"loss": 0.6704491376876831,
"step": 2286
},
{
"epoch": 1.925925925925926,
"grad_norm": 5.901256084442139,
"learning_rate": 3.807388529242699e-07,
"loss": 1.0216944217681885,
"step": 2288
},
{
"epoch": 1.9276094276094278,
"grad_norm": 7.095743179321289,
"learning_rate": 3.799652012513795e-07,
"loss": 0.9275904893875122,
"step": 2290
},
{
"epoch": 1.9292929292929293,
"grad_norm": 11.874740600585938,
"learning_rate": 3.791921354199145e-07,
"loss": 0.5191354751586914,
"step": 2292
},
{
"epoch": 1.930976430976431,
"grad_norm": 5.656970500946045,
"learning_rate": 3.784196580934117e-07,
"loss": 0.9575490951538086,
"step": 2294
},
{
"epoch": 1.9326599326599325,
"grad_norm": 37.43537521362305,
"learning_rate": 3.776477719333806e-07,
"loss": 0.8639167547225952,
"step": 2296
},
{
"epoch": 1.9343434343434343,
"grad_norm": 3.692530632019043,
"learning_rate": 3.768764795992939e-07,
"loss": 0.8566898107528687,
"step": 2298
},
{
"epoch": 1.936026936026936,
"grad_norm": 7.679093837738037,
"learning_rate": 3.761057837485782e-07,
"loss": 0.7409002780914307,
"step": 2300
},
{
"epoch": 1.9377104377104377,
"grad_norm": 4.928491592407227,
"learning_rate": 3.753356870366049e-07,
"loss": 1.2324477434158325,
"step": 2302
},
{
"epoch": 1.9393939393939394,
"grad_norm": 6.215064525604248,
"learning_rate": 3.745661921166813e-07,
"loss": 1.0157601833343506,
"step": 2304
},
{
"epoch": 1.9410774410774412,
"grad_norm": 9.274593353271484,
"learning_rate": 3.73797301640041e-07,
"loss": 0.39169979095458984,
"step": 2306
},
{
"epoch": 1.942760942760943,
"grad_norm": 5.436382293701172,
"learning_rate": 3.730290182558352e-07,
"loss": 0.9424724578857422,
"step": 2308
},
{
"epoch": 1.9444444444444444,
"grad_norm": 6.4669389724731445,
"learning_rate": 3.722613446111238e-07,
"loss": 1.0893113613128662,
"step": 2310
},
{
"epoch": 1.9461279461279462,
"grad_norm": 3.2764360904693604,
"learning_rate": 3.7149428335086505e-07,
"loss": 0.9788646697998047,
"step": 2312
},
{
"epoch": 1.9478114478114477,
"grad_norm": 6.531454086303711,
"learning_rate": 3.70727837117908e-07,
"loss": 0.9268249869346619,
"step": 2314
},
{
"epoch": 1.9494949494949494,
"grad_norm": 10.755637168884277,
"learning_rate": 3.6996200855298243e-07,
"loss": 0.7596557140350342,
"step": 2316
},
{
"epoch": 1.9511784511784511,
"grad_norm": 4.191674709320068,
"learning_rate": 3.691968002946899e-07,
"loss": 0.8969882130622864,
"step": 2318
},
{
"epoch": 1.9528619528619529,
"grad_norm": 3.586559772491455,
"learning_rate": 3.684322149794947e-07,
"loss": 0.926864743232727,
"step": 2320
},
{
"epoch": 1.9545454545454546,
"grad_norm": 3.739887237548828,
"learning_rate": 3.676682552417152e-07,
"loss": 1.0153056383132935,
"step": 2322
},
{
"epoch": 1.9562289562289563,
"grad_norm": 5.188506603240967,
"learning_rate": 3.669049237135139e-07,
"loss": 0.7965476512908936,
"step": 2324
},
{
"epoch": 1.957912457912458,
"grad_norm": 19.38260269165039,
"learning_rate": 3.6614222302488915e-07,
"loss": 0.5549055337905884,
"step": 2326
},
{
"epoch": 1.9595959595959596,
"grad_norm": 3.7028939723968506,
"learning_rate": 3.6538015580366585e-07,
"loss": 1.1440973281860352,
"step": 2328
},
{
"epoch": 1.9612794612794613,
"grad_norm": 3.7629427909851074,
"learning_rate": 3.6461872467548625e-07,
"loss": 1.0486090183258057,
"step": 2330
},
{
"epoch": 1.9629629629629628,
"grad_norm": 4.191500186920166,
"learning_rate": 3.638579322638007e-07,
"loss": 1.029564619064331,
"step": 2332
},
{
"epoch": 1.9646464646464645,
"grad_norm": 3.297617197036743,
"learning_rate": 3.6309778118985943e-07,
"loss": 1.0488507747650146,
"step": 2334
},
{
"epoch": 1.9663299663299663,
"grad_norm": 3.2955570220947266,
"learning_rate": 3.623382740727028e-07,
"loss": 0.9328145384788513,
"step": 2336
},
{
"epoch": 1.968013468013468,
"grad_norm": 3.8717432022094727,
"learning_rate": 3.61579413529152e-07,
"loss": 1.0710524320602417,
"step": 2338
},
{
"epoch": 1.9696969696969697,
"grad_norm": 16.362682342529297,
"learning_rate": 3.608212021738011e-07,
"loss": 0.565844714641571,
"step": 2340
},
{
"epoch": 1.9713804713804715,
"grad_norm": 4.6352057456970215,
"learning_rate": 3.600636426190075e-07,
"loss": 0.7352415919303894,
"step": 2342
},
{
"epoch": 1.9730639730639732,
"grad_norm": 30.211545944213867,
"learning_rate": 3.593067374748823e-07,
"loss": 0.5901581645011902,
"step": 2344
},
{
"epoch": 1.9747474747474747,
"grad_norm": 14.593511581420898,
"learning_rate": 3.585504893492821e-07,
"loss": 0.8802275657653809,
"step": 2346
},
{
"epoch": 1.9764309764309764,
"grad_norm": 6.9926438331604,
"learning_rate": 3.577949008478004e-07,
"loss": 0.7798852920532227,
"step": 2348
},
{
"epoch": 1.9781144781144782,
"grad_norm": 27.421436309814453,
"learning_rate": 3.57039974573757e-07,
"loss": 0.726132333278656,
"step": 2350
},
{
"epoch": 1.9797979797979797,
"grad_norm": 3.9214117527008057,
"learning_rate": 3.562857131281907e-07,
"loss": 0.7651845216751099,
"step": 2352
},
{
"epoch": 1.9814814814814814,
"grad_norm": 2.7648415565490723,
"learning_rate": 3.555321191098498e-07,
"loss": 0.4599582552909851,
"step": 2354
},
{
"epoch": 1.9831649831649831,
"grad_norm": 3.028148651123047,
"learning_rate": 3.547791951151824e-07,
"loss": 1.0578691959381104,
"step": 2356
},
{
"epoch": 1.9848484848484849,
"grad_norm": 3.8524043560028076,
"learning_rate": 3.5402694373832863e-07,
"loss": 0.9566428065299988,
"step": 2358
},
{
"epoch": 1.9865319865319866,
"grad_norm": 6.2976250648498535,
"learning_rate": 3.53275367571111e-07,
"loss": 0.9507308602333069,
"step": 2360
},
{
"epoch": 1.9882154882154883,
"grad_norm": 9.228460311889648,
"learning_rate": 3.525244692030256e-07,
"loss": 0.646575927734375,
"step": 2362
},
{
"epoch": 1.98989898989899,
"grad_norm": 3.0814363956451416,
"learning_rate": 3.517742512212333e-07,
"loss": 0.9748328924179077,
"step": 2364
},
{
"epoch": 1.9915824915824916,
"grad_norm": 15.091059684753418,
"learning_rate": 3.5102471621055083e-07,
"loss": 0.8788052797317505,
"step": 2366
},
{
"epoch": 1.9932659932659933,
"grad_norm": 2.8277087211608887,
"learning_rate": 3.5027586675344134e-07,
"loss": 1.026127576828003,
"step": 2368
},
{
"epoch": 1.9949494949494948,
"grad_norm": 155.9358673095703,
"learning_rate": 3.495277054300065e-07,
"loss": 0.41760489344596863,
"step": 2370
},
{
"epoch": 1.9966329966329965,
"grad_norm": 47.22990036010742,
"learning_rate": 3.487802348179771e-07,
"loss": 0.6611791849136353,
"step": 2372
},
{
"epoch": 1.9983164983164983,
"grad_norm": 3.25925612449646,
"learning_rate": 3.480334574927034e-07,
"loss": 0.9254864454269409,
"step": 2374
},
{
"epoch": 2.0,
"grad_norm": 3.941661834716797,
"learning_rate": 3.4728737602714777e-07,
"loss": 0.8802586793899536,
"step": 2376
},
{
"epoch": 2.0016835016835017,
"grad_norm": 49.501678466796875,
"learning_rate": 3.465419929918748e-07,
"loss": 0.709393322467804,
"step": 2378
},
{
"epoch": 2.0033670033670035,
"grad_norm": 5.433994770050049,
"learning_rate": 3.457973109550426e-07,
"loss": 1.1732385158538818,
"step": 2380
},
{
"epoch": 2.005050505050505,
"grad_norm": 29.29537582397461,
"learning_rate": 3.4505333248239437e-07,
"loss": 0.6126368641853333,
"step": 2382
},
{
"epoch": 2.006734006734007,
"grad_norm": 15.888188362121582,
"learning_rate": 3.443100601372486e-07,
"loss": 0.534448504447937,
"step": 2384
},
{
"epoch": 2.008417508417508,
"grad_norm": 4.571238040924072,
"learning_rate": 3.435674964804913e-07,
"loss": 0.6711810827255249,
"step": 2386
},
{
"epoch": 2.01010101010101,
"grad_norm": 5.0248517990112305,
"learning_rate": 3.4282564407056714e-07,
"loss": 0.856137752532959,
"step": 2388
},
{
"epoch": 2.0117845117845117,
"grad_norm": 3.410614490509033,
"learning_rate": 3.420845054634693e-07,
"loss": 1.0443634986877441,
"step": 2390
},
{
"epoch": 2.0134680134680134,
"grad_norm": 8.341497421264648,
"learning_rate": 3.413440832127323e-07,
"loss": 0.6617559194564819,
"step": 2392
},
{
"epoch": 2.015151515151515,
"grad_norm": 9.096837043762207,
"learning_rate": 3.406043798694226e-07,
"loss": 0.7012159824371338,
"step": 2394
},
{
"epoch": 2.016835016835017,
"grad_norm": 6.935766220092773,
"learning_rate": 3.39865397982129e-07,
"loss": 0.8126204013824463,
"step": 2396
},
{
"epoch": 2.0185185185185186,
"grad_norm": 10.908267974853516,
"learning_rate": 3.3912714009695525e-07,
"loss": 0.7988526225090027,
"step": 2398
},
{
"epoch": 2.0202020202020203,
"grad_norm": 3.9570505619049072,
"learning_rate": 3.3838960875751057e-07,
"loss": 0.6374803781509399,
"step": 2400
},
{
"epoch": 2.021885521885522,
"grad_norm": 149.55186462402344,
"learning_rate": 3.3765280650490043e-07,
"loss": 0.4227946400642395,
"step": 2402
},
{
"epoch": 2.0235690235690234,
"grad_norm": 9.027767181396484,
"learning_rate": 3.3691673587771866e-07,
"loss": 0.8504242897033691,
"step": 2404
},
{
"epoch": 2.025252525252525,
"grad_norm": 8.02371883392334,
"learning_rate": 3.361813994120386e-07,
"loss": 0.7173169851303101,
"step": 2406
},
{
"epoch": 2.026936026936027,
"grad_norm": 2.9558398723602295,
"learning_rate": 3.354467996414034e-07,
"loss": 0.8256983757019043,
"step": 2408
},
{
"epoch": 2.0286195286195285,
"grad_norm": 15.695834159851074,
"learning_rate": 3.3471293909681844e-07,
"loss": 0.8146846294403076,
"step": 2410
},
{
"epoch": 2.0303030303030303,
"grad_norm": 4.859415531158447,
"learning_rate": 3.339798203067422e-07,
"loss": 0.9352428913116455,
"step": 2412
},
{
"epoch": 2.031986531986532,
"grad_norm": 9.821430206298828,
"learning_rate": 3.332474457970773e-07,
"loss": 0.7644020318984985,
"step": 2414
},
{
"epoch": 2.0336700336700337,
"grad_norm": 2.8421833515167236,
"learning_rate": 3.32515818091162e-07,
"loss": 0.936759889125824,
"step": 2416
},
{
"epoch": 2.0353535353535355,
"grad_norm": 4.6429243087768555,
"learning_rate": 3.3178493970976183e-07,
"loss": 0.7487270832061768,
"step": 2418
},
{
"epoch": 2.037037037037037,
"grad_norm": 8.991228103637695,
"learning_rate": 3.310548131710601e-07,
"loss": 0.4855067729949951,
"step": 2420
},
{
"epoch": 2.038720538720539,
"grad_norm": 3.958752393722534,
"learning_rate": 3.3032544099065003e-07,
"loss": 0.7952554821968079,
"step": 2422
},
{
"epoch": 2.04040404040404,
"grad_norm": 4.862611293792725,
"learning_rate": 3.295968256815257e-07,
"loss": 0.36966073513031006,
"step": 2424
},
{
"epoch": 2.042087542087542,
"grad_norm": 10.325581550598145,
"learning_rate": 3.288689697540733e-07,
"loss": 0.4272541403770447,
"step": 2426
},
{
"epoch": 2.0437710437710437,
"grad_norm": 46.957489013671875,
"learning_rate": 3.281418757160629e-07,
"loss": 0.6797230839729309,
"step": 2428
},
{
"epoch": 2.0454545454545454,
"grad_norm": 3.5477898120880127,
"learning_rate": 3.274155460726392e-07,
"loss": 0.8319392204284668,
"step": 2430
},
{
"epoch": 2.047138047138047,
"grad_norm": 6.360466480255127,
"learning_rate": 3.2668998332631374e-07,
"loss": 0.6863579154014587,
"step": 2432
},
{
"epoch": 2.048821548821549,
"grad_norm": 6.163110733032227,
"learning_rate": 3.259651899769552e-07,
"loss": 0.845360279083252,
"step": 2434
},
{
"epoch": 2.0505050505050506,
"grad_norm": 4.161473274230957,
"learning_rate": 3.2524116852178163e-07,
"loss": 1.2110919952392578,
"step": 2436
},
{
"epoch": 2.0521885521885523,
"grad_norm": 9.452940940856934,
"learning_rate": 3.245179214553519e-07,
"loss": 0.7553325891494751,
"step": 2438
},
{
"epoch": 2.053872053872054,
"grad_norm": 2.847379207611084,
"learning_rate": 3.23795451269556e-07,
"loss": 0.8473318219184875,
"step": 2440
},
{
"epoch": 2.0555555555555554,
"grad_norm": 8.454484939575195,
"learning_rate": 3.2307376045360804e-07,
"loss": 0.7530231475830078,
"step": 2442
},
{
"epoch": 2.057239057239057,
"grad_norm": 3.292670965194702,
"learning_rate": 3.223528514940365e-07,
"loss": 0.8452006578445435,
"step": 2444
},
{
"epoch": 2.058922558922559,
"grad_norm": 5.7511820793151855,
"learning_rate": 3.216327268746759e-07,
"loss": 1.0079270601272583,
"step": 2446
},
{
"epoch": 2.0606060606060606,
"grad_norm": 3.7970666885375977,
"learning_rate": 3.2091338907665864e-07,
"loss": 0.8261886835098267,
"step": 2448
},
{
"epoch": 2.0622895622895623,
"grad_norm": 4.847807884216309,
"learning_rate": 3.201948405784062e-07,
"loss": 0.7386308908462524,
"step": 2450
},
{
"epoch": 2.063973063973064,
"grad_norm": 4.138136386871338,
"learning_rate": 3.1947708385562033e-07,
"loss": 0.967164158821106,
"step": 2452
},
{
"epoch": 2.0656565656565657,
"grad_norm": 6.592377185821533,
"learning_rate": 3.1876012138127525e-07,
"loss": 0.820540189743042,
"step": 2454
},
{
"epoch": 2.0673400673400675,
"grad_norm": 3.9689667224884033,
"learning_rate": 3.1804395562560795e-07,
"loss": 0.884551465511322,
"step": 2456
},
{
"epoch": 2.069023569023569,
"grad_norm": 3.013533353805542,
"learning_rate": 3.173285890561109e-07,
"loss": 0.7905436158180237,
"step": 2458
},
{
"epoch": 2.0707070707070705,
"grad_norm": 12.277018547058105,
"learning_rate": 3.166140241375233e-07,
"loss": 0.6569070219993591,
"step": 2460
},
{
"epoch": 2.0723905723905722,
"grad_norm": 6.178629398345947,
"learning_rate": 3.159002633318214e-07,
"loss": 0.6464763879776001,
"step": 2462
},
{
"epoch": 2.074074074074074,
"grad_norm": 41.42927932739258,
"learning_rate": 3.151873090982117e-07,
"loss": 0.7555403709411621,
"step": 2464
},
{
"epoch": 2.0757575757575757,
"grad_norm": 6.328332424163818,
"learning_rate": 3.144751638931219e-07,
"loss": 0.8773843050003052,
"step": 2466
},
{
"epoch": 2.0774410774410774,
"grad_norm": 3.656052589416504,
"learning_rate": 3.137638301701912e-07,
"loss": 0.5875815749168396,
"step": 2468
},
{
"epoch": 2.079124579124579,
"grad_norm": 5.517158031463623,
"learning_rate": 3.13053310380264e-07,
"loss": 1.0708808898925781,
"step": 2470
},
{
"epoch": 2.080808080808081,
"grad_norm": 3.563227891921997,
"learning_rate": 3.123436069713801e-07,
"loss": 1.0506317615509033,
"step": 2472
},
{
"epoch": 2.0824915824915826,
"grad_norm": 3.579038619995117,
"learning_rate": 3.116347223887658e-07,
"loss": 0.5262918472290039,
"step": 2474
},
{
"epoch": 2.0841750841750843,
"grad_norm": 4.360744953155518,
"learning_rate": 3.1092665907482705e-07,
"loss": 0.6860552430152893,
"step": 2476
},
{
"epoch": 2.0858585858585856,
"grad_norm": 5.244029521942139,
"learning_rate": 3.102194194691402e-07,
"loss": 0.8589056730270386,
"step": 2478
},
{
"epoch": 2.0875420875420874,
"grad_norm": 2.6665806770324707,
"learning_rate": 3.0951300600844277e-07,
"loss": 0.7219854593276978,
"step": 2480
},
{
"epoch": 2.089225589225589,
"grad_norm": 4.833096027374268,
"learning_rate": 3.088074211266265e-07,
"loss": 0.6794151067733765,
"step": 2482
},
{
"epoch": 2.090909090909091,
"grad_norm": 4.1560773849487305,
"learning_rate": 3.0810266725472843e-07,
"loss": 1.1254472732543945,
"step": 2484
},
{
"epoch": 2.0925925925925926,
"grad_norm": 13.069584846496582,
"learning_rate": 3.073987468209218e-07,
"loss": 0.7453956604003906,
"step": 2486
},
{
"epoch": 2.0942760942760943,
"grad_norm": 2.8535449504852295,
"learning_rate": 3.0669566225050904e-07,
"loss": 0.7250915765762329,
"step": 2488
},
{
"epoch": 2.095959595959596,
"grad_norm": 7.14851188659668,
"learning_rate": 3.059934159659122e-07,
"loss": 0.9290302991867065,
"step": 2490
},
{
"epoch": 2.0976430976430978,
"grad_norm": 4.216705799102783,
"learning_rate": 3.052920103866651e-07,
"loss": 0.9226129055023193,
"step": 2492
},
{
"epoch": 2.0993265993265995,
"grad_norm": 6.80454158782959,
"learning_rate": 3.0459144792940506e-07,
"loss": 0.6964681148529053,
"step": 2494
},
{
"epoch": 2.101010101010101,
"grad_norm": 4.916928291320801,
"learning_rate": 3.038917310078648e-07,
"loss": 0.9581238627433777,
"step": 2496
},
{
"epoch": 2.1026936026936025,
"grad_norm": 6.742152214050293,
"learning_rate": 3.031928620328632e-07,
"loss": 0.5878009796142578,
"step": 2498
},
{
"epoch": 2.1043771043771042,
"grad_norm": 8.78212833404541,
"learning_rate": 3.024948434122981e-07,
"loss": 0.6806055307388306,
"step": 2500
},
{
"epoch": 2.106060606060606,
"grad_norm": 5.455780029296875,
"learning_rate": 3.017976775511374e-07,
"loss": 1.1094366312026978,
"step": 2502
},
{
"epoch": 2.1077441077441077,
"grad_norm": 16.17888641357422,
"learning_rate": 3.011013668514106e-07,
"loss": 0.9498310089111328,
"step": 2504
},
{
"epoch": 2.1094276094276094,
"grad_norm": 33.786502838134766,
"learning_rate": 3.0040591371220126e-07,
"loss": 0.9682769775390625,
"step": 2506
},
{
"epoch": 2.111111111111111,
"grad_norm": 4.037204265594482,
"learning_rate": 2.997113205296381e-07,
"loss": 0.6556534171104431,
"step": 2508
},
{
"epoch": 2.112794612794613,
"grad_norm": 4.9933576583862305,
"learning_rate": 2.990175896968867e-07,
"loss": 0.6443968415260315,
"step": 2510
},
{
"epoch": 2.1144781144781146,
"grad_norm": 9.915764808654785,
"learning_rate": 2.983247236041416e-07,
"loss": 0.8275219202041626,
"step": 2512
},
{
"epoch": 2.1161616161616164,
"grad_norm": 8.728922843933105,
"learning_rate": 2.9763272463861846e-07,
"loss": 0.4485883116722107,
"step": 2514
},
{
"epoch": 2.1178451178451176,
"grad_norm": 4.324676513671875,
"learning_rate": 2.9694159518454436e-07,
"loss": 1.0087292194366455,
"step": 2516
},
{
"epoch": 2.1195286195286194,
"grad_norm": 29.97382164001465,
"learning_rate": 2.9625133762315134e-07,
"loss": 0.30623072385787964,
"step": 2518
},
{
"epoch": 2.121212121212121,
"grad_norm": 13.100899696350098,
"learning_rate": 2.9556195433266724e-07,
"loss": 0.5369913578033447,
"step": 2520
},
{
"epoch": 2.122895622895623,
"grad_norm": 5.657482147216797,
"learning_rate": 2.94873447688307e-07,
"loss": 0.3709213137626648,
"step": 2522
},
{
"epoch": 2.1245791245791246,
"grad_norm": 8.130796432495117,
"learning_rate": 2.9418582006226644e-07,
"loss": 0.528016209602356,
"step": 2524
},
{
"epoch": 2.1262626262626263,
"grad_norm": 12.914457321166992,
"learning_rate": 2.9349907382371175e-07,
"loss": 0.5530096888542175,
"step": 2526
},
{
"epoch": 2.127946127946128,
"grad_norm": 13.510022163391113,
"learning_rate": 2.9281321133877256e-07,
"loss": 0.4185825288295746,
"step": 2528
},
{
"epoch": 2.1296296296296298,
"grad_norm": 4.050384998321533,
"learning_rate": 2.921282349705338e-07,
"loss": 0.6386127471923828,
"step": 2530
},
{
"epoch": 2.1313131313131315,
"grad_norm": 6.590632915496826,
"learning_rate": 2.914441470790274e-07,
"loss": 0.9100687503814697,
"step": 2532
},
{
"epoch": 2.1329966329966332,
"grad_norm": 4.762322425842285,
"learning_rate": 2.9076095002122373e-07,
"loss": 0.5006492137908936,
"step": 2534
},
{
"epoch": 2.1346801346801345,
"grad_norm": 5.085036754608154,
"learning_rate": 2.900786461510243e-07,
"loss": 0.7980141639709473,
"step": 2536
},
{
"epoch": 2.1363636363636362,
"grad_norm": 7.086611270904541,
"learning_rate": 2.8939723781925304e-07,
"loss": 0.5176095962524414,
"step": 2538
},
{
"epoch": 2.138047138047138,
"grad_norm": 8.522965431213379,
"learning_rate": 2.8871672737364814e-07,
"loss": 0.4830123782157898,
"step": 2540
},
{
"epoch": 2.1397306397306397,
"grad_norm": 9.686579704284668,
"learning_rate": 2.8803711715885457e-07,
"loss": 0.7633793354034424,
"step": 2542
},
{
"epoch": 2.1414141414141414,
"grad_norm": 3.3301565647125244,
"learning_rate": 2.8735840951641566e-07,
"loss": 0.21130666136741638,
"step": 2544
},
{
"epoch": 2.143097643097643,
"grad_norm": 2.9118270874023438,
"learning_rate": 2.866806067847645e-07,
"loss": 0.4212937355041504,
"step": 2546
},
{
"epoch": 2.144781144781145,
"grad_norm": 3.857438087463379,
"learning_rate": 2.860037112992167e-07,
"loss": 0.7907487154006958,
"step": 2548
},
{
"epoch": 2.1464646464646466,
"grad_norm": 3.103694438934326,
"learning_rate": 2.8532772539196236e-07,
"loss": 0.9942638874053955,
"step": 2550
},
{
"epoch": 2.148148148148148,
"grad_norm": 5.230748653411865,
"learning_rate": 2.8465265139205696e-07,
"loss": 0.6354756951332092,
"step": 2552
},
{
"epoch": 2.1498316498316496,
"grad_norm": 22.947580337524414,
"learning_rate": 2.839784916254147e-07,
"loss": 0.3525312840938568,
"step": 2554
},
{
"epoch": 2.1515151515151514,
"grad_norm": 4.191389083862305,
"learning_rate": 2.8330524841479964e-07,
"loss": 0.6104186773300171,
"step": 2556
},
{
"epoch": 2.153198653198653,
"grad_norm": 3.742684841156006,
"learning_rate": 2.8263292407981777e-07,
"loss": 0.6527650356292725,
"step": 2558
},
{
"epoch": 2.154882154882155,
"grad_norm": 17.96328353881836,
"learning_rate": 2.819615209369093e-07,
"loss": 0.5300241112709045,
"step": 2560
},
{
"epoch": 2.1565656565656566,
"grad_norm": 27.818559646606445,
"learning_rate": 2.812910412993409e-07,
"loss": 0.5620636940002441,
"step": 2562
},
{
"epoch": 2.1582491582491583,
"grad_norm": 3.4521737098693848,
"learning_rate": 2.806214874771965e-07,
"loss": 0.8342366218566895,
"step": 2564
},
{
"epoch": 2.15993265993266,
"grad_norm": 7.760178089141846,
"learning_rate": 2.799528617773711e-07,
"loss": 0.6607711315155029,
"step": 2566
},
{
"epoch": 2.1616161616161618,
"grad_norm": 4.2295379638671875,
"learning_rate": 2.792851665035616e-07,
"loss": 0.5361987352371216,
"step": 2568
},
{
"epoch": 2.1632996632996635,
"grad_norm": 4.252224922180176,
"learning_rate": 2.7861840395625887e-07,
"loss": 1.0253345966339111,
"step": 2570
},
{
"epoch": 2.164983164983165,
"grad_norm": 5.132496356964111,
"learning_rate": 2.779525764327406e-07,
"loss": 1.1341686248779297,
"step": 2572
},
{
"epoch": 2.1666666666666665,
"grad_norm": 10.062471389770508,
"learning_rate": 2.7728768622706294e-07,
"loss": 0.8332287073135376,
"step": 2574
},
{
"epoch": 2.1683501683501682,
"grad_norm": 11.787501335144043,
"learning_rate": 2.7662373563005206e-07,
"loss": 0.3077271282672882,
"step": 2576
},
{
"epoch": 2.17003367003367,
"grad_norm": 6.277064323425293,
"learning_rate": 2.7596072692929724e-07,
"loss": 0.7766256332397461,
"step": 2578
},
{
"epoch": 2.1717171717171717,
"grad_norm": 22.47462272644043,
"learning_rate": 2.752986624091427e-07,
"loss": 0.32620465755462646,
"step": 2580
},
{
"epoch": 2.1734006734006734,
"grad_norm": 7.154234886169434,
"learning_rate": 2.746375443506788e-07,
"loss": 0.5342273116111755,
"step": 2582
},
{
"epoch": 2.175084175084175,
"grad_norm": 10.44245719909668,
"learning_rate": 2.739773750317358e-07,
"loss": 0.45068609714508057,
"step": 2584
},
{
"epoch": 2.176767676767677,
"grad_norm": 7.9054155349731445,
"learning_rate": 2.7331815672687476e-07,
"loss": 0.6677770614624023,
"step": 2586
},
{
"epoch": 2.1784511784511786,
"grad_norm": 10.807048797607422,
"learning_rate": 2.726598917073798e-07,
"loss": 0.7541825175285339,
"step": 2588
},
{
"epoch": 2.18013468013468,
"grad_norm": 7.9458746910095215,
"learning_rate": 2.720025822412512e-07,
"loss": 0.7445704340934753,
"step": 2590
},
{
"epoch": 2.1818181818181817,
"grad_norm": 12.376832962036133,
"learning_rate": 2.713462305931966e-07,
"loss": 0.5584303736686707,
"step": 2592
},
{
"epoch": 2.1835016835016834,
"grad_norm": 3.6782195568084717,
"learning_rate": 2.706908390246232e-07,
"loss": 0.42317822575569153,
"step": 2594
},
{
"epoch": 2.185185185185185,
"grad_norm": 33.567867279052734,
"learning_rate": 2.7003640979363133e-07,
"loss": 0.8278957605361938,
"step": 2596
},
{
"epoch": 2.186868686868687,
"grad_norm": 8.654823303222656,
"learning_rate": 2.6938294515500463e-07,
"loss": 0.8979749083518982,
"step": 2598
},
{
"epoch": 2.1885521885521886,
"grad_norm": 4.184791564941406,
"learning_rate": 2.687304473602039e-07,
"loss": 0.7217346429824829,
"step": 2600
},
{
"epoch": 2.1902356902356903,
"grad_norm": 22.945192337036133,
"learning_rate": 2.6807891865735865e-07,
"loss": 0.9164705276489258,
"step": 2602
},
{
"epoch": 2.191919191919192,
"grad_norm": 8.126714706420898,
"learning_rate": 2.674283612912591e-07,
"loss": 0.853008508682251,
"step": 2604
},
{
"epoch": 2.1936026936026938,
"grad_norm": 9.629704475402832,
"learning_rate": 2.6677877750334935e-07,
"loss": 0.6331396102905273,
"step": 2606
},
{
"epoch": 2.1952861952861955,
"grad_norm": 3.6879630088806152,
"learning_rate": 2.6613016953171894e-07,
"loss": 0.9496104121208191,
"step": 2608
},
{
"epoch": 2.196969696969697,
"grad_norm": 3.8279647827148438,
"learning_rate": 2.65482539611095e-07,
"loss": 0.7846404910087585,
"step": 2610
},
{
"epoch": 2.1986531986531985,
"grad_norm": 20.262147903442383,
"learning_rate": 2.648358899728351e-07,
"loss": 0.489252507686615,
"step": 2612
},
{
"epoch": 2.2003367003367003,
"grad_norm": 8.78268051147461,
"learning_rate": 2.6419022284491965e-07,
"loss": 0.8057292699813843,
"step": 2614
},
{
"epoch": 2.202020202020202,
"grad_norm": 4.244235038757324,
"learning_rate": 2.635455404519433e-07,
"loss": 0.6223278641700745,
"step": 2616
},
{
"epoch": 2.2037037037037037,
"grad_norm": 11.954898834228516,
"learning_rate": 2.629018450151081e-07,
"loss": 0.5752437114715576,
"step": 2618
},
{
"epoch": 2.2053872053872055,
"grad_norm": 13.8149995803833,
"learning_rate": 2.6225913875221594e-07,
"loss": 0.3817511796951294,
"step": 2620
},
{
"epoch": 2.207070707070707,
"grad_norm": 4.223384857177734,
"learning_rate": 2.6161742387766e-07,
"loss": 0.6272555589675903,
"step": 2622
},
{
"epoch": 2.208754208754209,
"grad_norm": 21.69821548461914,
"learning_rate": 2.609767026024182e-07,
"loss": 0.7172547578811646,
"step": 2624
},
{
"epoch": 2.2104377104377106,
"grad_norm": 8.276639938354492,
"learning_rate": 2.6033697713404514e-07,
"loss": 0.6655735373497009,
"step": 2626
},
{
"epoch": 2.212121212121212,
"grad_norm": 8.716046333312988,
"learning_rate": 2.5969824967666374e-07,
"loss": 0.6124321818351746,
"step": 2628
},
{
"epoch": 2.2138047138047137,
"grad_norm": 8.070818901062012,
"learning_rate": 2.590605224309592e-07,
"loss": 0.4091968536376953,
"step": 2630
},
{
"epoch": 2.2154882154882154,
"grad_norm": 5.510800361633301,
"learning_rate": 2.5842379759417023e-07,
"loss": 0.521186113357544,
"step": 2632
},
{
"epoch": 2.217171717171717,
"grad_norm": 39.2961540222168,
"learning_rate": 2.5778807736008153e-07,
"loss": 0.2366686761379242,
"step": 2634
},
{
"epoch": 2.218855218855219,
"grad_norm": 3.912489891052246,
"learning_rate": 2.5715336391901695e-07,
"loss": 0.8710294961929321,
"step": 2636
},
{
"epoch": 2.2205387205387206,
"grad_norm": 4.83061408996582,
"learning_rate": 2.565196594578315e-07,
"loss": 1.1489973068237305,
"step": 2638
},
{
"epoch": 2.2222222222222223,
"grad_norm": 4.629734516143799,
"learning_rate": 2.5588696615990336e-07,
"loss": 1.078352928161621,
"step": 2640
},
{
"epoch": 2.223905723905724,
"grad_norm": 7.968264102935791,
"learning_rate": 2.5525528620512737e-07,
"loss": 0.7425380349159241,
"step": 2642
},
{
"epoch": 2.225589225589226,
"grad_norm": 13.291003227233887,
"learning_rate": 2.5462462176990686e-07,
"loss": 0.7818918228149414,
"step": 2644
},
{
"epoch": 2.227272727272727,
"grad_norm": 11.734708786010742,
"learning_rate": 2.539949750271458e-07,
"loss": 0.7145400047302246,
"step": 2646
},
{
"epoch": 2.228956228956229,
"grad_norm": 5.949611186981201,
"learning_rate": 2.533663481462424e-07,
"loss": 0.4055989980697632,
"step": 2648
},
{
"epoch": 2.2306397306397305,
"grad_norm": 5.281031608581543,
"learning_rate": 2.5273874329308083e-07,
"loss": 1.0042195320129395,
"step": 2650
},
{
"epoch": 2.2323232323232323,
"grad_norm": 8.864117622375488,
"learning_rate": 2.5211216263002375e-07,
"loss": 0.604977011680603,
"step": 2652
},
{
"epoch": 2.234006734006734,
"grad_norm": 28.879344940185547,
"learning_rate": 2.514866083159053e-07,
"loss": 0.566184937953949,
"step": 2654
},
{
"epoch": 2.2356902356902357,
"grad_norm": 7.084741592407227,
"learning_rate": 2.508620825060231e-07,
"loss": 0.6506372094154358,
"step": 2656
},
{
"epoch": 2.2373737373737375,
"grad_norm": 22.613136291503906,
"learning_rate": 2.5023858735213156e-07,
"loss": 0.9167625904083252,
"step": 2658
},
{
"epoch": 2.239057239057239,
"grad_norm": 6.915469169616699,
"learning_rate": 2.4961612500243364e-07,
"loss": 0.7674777507781982,
"step": 2660
},
{
"epoch": 2.240740740740741,
"grad_norm": 8.177582740783691,
"learning_rate": 2.4899469760157413e-07,
"loss": 0.8097570538520813,
"step": 2662
},
{
"epoch": 2.242424242424242,
"grad_norm": 14.568964004516602,
"learning_rate": 2.48374307290632e-07,
"loss": 0.4266725182533264,
"step": 2664
},
{
"epoch": 2.244107744107744,
"grad_norm": 4.135527610778809,
"learning_rate": 2.4775495620711254e-07,
"loss": 0.7610059976577759,
"step": 2666
},
{
"epoch": 2.2457912457912457,
"grad_norm": 7.860456466674805,
"learning_rate": 2.4713664648494133e-07,
"loss": 0.6509280204772949,
"step": 2668
},
{
"epoch": 2.2474747474747474,
"grad_norm": 7.511784553527832,
"learning_rate": 2.465193802544552e-07,
"loss": 0.5061072111129761,
"step": 2670
},
{
"epoch": 2.249158249158249,
"grad_norm": 4.732418060302734,
"learning_rate": 2.4590315964239606e-07,
"loss": 0.36101067066192627,
"step": 2672
},
{
"epoch": 2.250841750841751,
"grad_norm": 10.225937843322754,
"learning_rate": 2.452879867719034e-07,
"loss": 0.6636744737625122,
"step": 2674
},
{
"epoch": 2.2525252525252526,
"grad_norm": 6.152078628540039,
"learning_rate": 2.4467386376250633e-07,
"loss": 0.8210121989250183,
"step": 2676
},
{
"epoch": 2.2542087542087543,
"grad_norm": 6.384221076965332,
"learning_rate": 2.440607927301171e-07,
"loss": 0.5604538917541504,
"step": 2678
},
{
"epoch": 2.255892255892256,
"grad_norm": 3.0290005207061768,
"learning_rate": 2.4344877578702355e-07,
"loss": 0.9680004119873047,
"step": 2680
},
{
"epoch": 2.257575757575758,
"grad_norm": 8.649748802185059,
"learning_rate": 2.4283781504188126e-07,
"loss": 0.2856512665748596,
"step": 2682
},
{
"epoch": 2.259259259259259,
"grad_norm": 12.650278091430664,
"learning_rate": 2.422279125997073e-07,
"loss": 0.21757878363132477,
"step": 2684
},
{
"epoch": 2.260942760942761,
"grad_norm": 5.625198841094971,
"learning_rate": 2.416190705618722e-07,
"loss": 0.7161245346069336,
"step": 2686
},
{
"epoch": 2.2626262626262625,
"grad_norm": 3.8364768028259277,
"learning_rate": 2.4101129102609273e-07,
"loss": 0.44631901383399963,
"step": 2688
},
{
"epoch": 2.2643097643097643,
"grad_norm": 2.489049196243286,
"learning_rate": 2.404045760864253e-07,
"loss": 1.060034155845642,
"step": 2690
},
{
"epoch": 2.265993265993266,
"grad_norm": 22.317943572998047,
"learning_rate": 2.397989278332583e-07,
"loss": 0.8590011596679688,
"step": 2692
},
{
"epoch": 2.2676767676767677,
"grad_norm": 3.2131800651550293,
"learning_rate": 2.391943483533044e-07,
"loss": 0.7794303297996521,
"step": 2694
},
{
"epoch": 2.2693602693602695,
"grad_norm": 3.656132936477661,
"learning_rate": 2.385908397295945e-07,
"loss": 0.6720019578933716,
"step": 2696
},
{
"epoch": 2.271043771043771,
"grad_norm": 3.8519668579101562,
"learning_rate": 2.3798840404146995e-07,
"loss": 0.7614578008651733,
"step": 2698
},
{
"epoch": 2.2727272727272725,
"grad_norm": 4.142553329467773,
"learning_rate": 2.3738704336457484e-07,
"loss": 0.8712958097457886,
"step": 2700
},
{
"epoch": 2.274410774410774,
"grad_norm": 6.8363237380981445,
"learning_rate": 2.3678675977084986e-07,
"loss": 0.5424622297286987,
"step": 2702
},
{
"epoch": 2.276094276094276,
"grad_norm": 3.5155107975006104,
"learning_rate": 2.3618755532852466e-07,
"loss": 0.973854660987854,
"step": 2704
},
{
"epoch": 2.2777777777777777,
"grad_norm": 7.004105091094971,
"learning_rate": 2.3558943210211047e-07,
"loss": 1.0108654499053955,
"step": 2706
},
{
"epoch": 2.2794612794612794,
"grad_norm": 1.2474193572998047,
"learning_rate": 2.3499239215239357e-07,
"loss": 0.5368537306785583,
"step": 2708
},
{
"epoch": 2.281144781144781,
"grad_norm": 5.437285423278809,
"learning_rate": 2.3439643753642798e-07,
"loss": 0.690973162651062,
"step": 2710
},
{
"epoch": 2.282828282828283,
"grad_norm": 11.235260009765625,
"learning_rate": 2.3380157030752775e-07,
"loss": 0.6230310201644897,
"step": 2712
},
{
"epoch": 2.2845117845117846,
"grad_norm": 9.484489440917969,
"learning_rate": 2.33207792515261e-07,
"loss": 0.5481805205345154,
"step": 2714
},
{
"epoch": 2.2861952861952863,
"grad_norm": 9.018638610839844,
"learning_rate": 2.3261510620544208e-07,
"loss": 0.8037227392196655,
"step": 2716
},
{
"epoch": 2.287878787878788,
"grad_norm": 12.419392585754395,
"learning_rate": 2.3202351342012452e-07,
"loss": 0.6880577802658081,
"step": 2718
},
{
"epoch": 2.28956228956229,
"grad_norm": 29.25603485107422,
"learning_rate": 2.3143301619759456e-07,
"loss": 0.579788327217102,
"step": 2720
},
{
"epoch": 2.291245791245791,
"grad_norm": 12.553728103637695,
"learning_rate": 2.308436165723636e-07,
"loss": 0.7886263132095337,
"step": 2722
},
{
"epoch": 2.292929292929293,
"grad_norm": 14.242766380310059,
"learning_rate": 2.3025531657516115e-07,
"loss": 0.7852193117141724,
"step": 2724
},
{
"epoch": 2.2946127946127945,
"grad_norm": 7.794075012207031,
"learning_rate": 2.2966811823292842e-07,
"loss": 0.7775453925132751,
"step": 2726
},
{
"epoch": 2.2962962962962963,
"grad_norm": 7.859867572784424,
"learning_rate": 2.2908202356881075e-07,
"loss": 0.6673729419708252,
"step": 2728
},
{
"epoch": 2.297979797979798,
"grad_norm": 6.257922172546387,
"learning_rate": 2.2849703460215077e-07,
"loss": 1.060187816619873,
"step": 2730
},
{
"epoch": 2.2996632996632997,
"grad_norm": 5.627756595611572,
"learning_rate": 2.2791315334848162e-07,
"loss": 0.6064283847808838,
"step": 2732
},
{
"epoch": 2.3013468013468015,
"grad_norm": 6.193628787994385,
"learning_rate": 2.2733038181952e-07,
"loss": 0.648173451423645,
"step": 2734
},
{
"epoch": 2.303030303030303,
"grad_norm": 10.281158447265625,
"learning_rate": 2.2674872202315892e-07,
"loss": 0.49927544593811035,
"step": 2736
},
{
"epoch": 2.3047138047138045,
"grad_norm": 7.590847969055176,
"learning_rate": 2.2616817596346103e-07,
"loss": 0.7152895927429199,
"step": 2738
},
{
"epoch": 2.3063973063973062,
"grad_norm": 7.842513084411621,
"learning_rate": 2.2558874564065215e-07,
"loss": 0.5551795959472656,
"step": 2740
},
{
"epoch": 2.308080808080808,
"grad_norm": 5.1881890296936035,
"learning_rate": 2.2501043305111313e-07,
"loss": 0.8357152938842773,
"step": 2742
},
{
"epoch": 2.3097643097643097,
"grad_norm": 5.037477493286133,
"learning_rate": 2.2443324018737436e-07,
"loss": 0.8395123481750488,
"step": 2744
},
{
"epoch": 2.3114478114478114,
"grad_norm": 4.545862674713135,
"learning_rate": 2.2385716903810822e-07,
"loss": 0.8929284811019897,
"step": 2746
},
{
"epoch": 2.313131313131313,
"grad_norm": 10.017370223999023,
"learning_rate": 2.2328222158812198e-07,
"loss": 0.707942008972168,
"step": 2748
},
{
"epoch": 2.314814814814815,
"grad_norm": 7.563255310058594,
"learning_rate": 2.227083998183516e-07,
"loss": 0.12098832428455353,
"step": 2750
},
{
"epoch": 2.3164983164983166,
"grad_norm": 7.330215930938721,
"learning_rate": 2.221357057058546e-07,
"loss": 0.4100933074951172,
"step": 2752
},
{
"epoch": 2.3181818181818183,
"grad_norm": 4.7282185554504395,
"learning_rate": 2.2156414122380307e-07,
"loss": 0.5965608358383179,
"step": 2754
},
{
"epoch": 2.31986531986532,
"grad_norm": 3.0822274684906006,
"learning_rate": 2.2099370834147712e-07,
"loss": 0.945094645023346,
"step": 2756
},
{
"epoch": 2.3215488215488214,
"grad_norm": 7.529977321624756,
"learning_rate": 2.2042440902425822e-07,
"loss": 0.7363934516906738,
"step": 2758
},
{
"epoch": 2.323232323232323,
"grad_norm": 13.28249740600586,
"learning_rate": 2.1985624523362185e-07,
"loss": 0.7786830067634583,
"step": 2760
},
{
"epoch": 2.324915824915825,
"grad_norm": 11.899820327758789,
"learning_rate": 2.1928921892713132e-07,
"loss": 0.6262949705123901,
"step": 2762
},
{
"epoch": 2.3265993265993266,
"grad_norm": 4.841851234436035,
"learning_rate": 2.187233320584311e-07,
"loss": 0.9699975252151489,
"step": 2764
},
{
"epoch": 2.3282828282828283,
"grad_norm": 9.435696601867676,
"learning_rate": 2.181585865772393e-07,
"loss": 0.8197389245033264,
"step": 2766
},
{
"epoch": 2.32996632996633,
"grad_norm": 4.551506042480469,
"learning_rate": 2.175949844293417e-07,
"loss": 0.6494600772857666,
"step": 2768
},
{
"epoch": 2.3316498316498318,
"grad_norm": 19.35220718383789,
"learning_rate": 2.1703252755658512e-07,
"loss": 0.7402999997138977,
"step": 2770
},
{
"epoch": 2.3333333333333335,
"grad_norm": 5.450087070465088,
"learning_rate": 2.1647121789686985e-07,
"loss": 0.7242530584335327,
"step": 2772
},
{
"epoch": 2.3350168350168348,
"grad_norm": 6.281241416931152,
"learning_rate": 2.1591105738414395e-07,
"loss": 0.7737699151039124,
"step": 2774
},
{
"epoch": 2.3367003367003365,
"grad_norm": 4.79439640045166,
"learning_rate": 2.153520479483962e-07,
"loss": 0.7753046751022339,
"step": 2776
},
{
"epoch": 2.3383838383838382,
"grad_norm": 7.926896095275879,
"learning_rate": 2.1479419151564908e-07,
"loss": 0.5965973138809204,
"step": 2778
},
{
"epoch": 2.34006734006734,
"grad_norm": 13.744224548339844,
"learning_rate": 2.1423749000795286e-07,
"loss": 0.7432798743247986,
"step": 2780
},
{
"epoch": 2.3417508417508417,
"grad_norm": 4.1591949462890625,
"learning_rate": 2.1368194534337864e-07,
"loss": 0.6963976621627808,
"step": 2782
},
{
"epoch": 2.3434343434343434,
"grad_norm": 5.26281213760376,
"learning_rate": 2.1312755943601113e-07,
"loss": 0.8363964557647705,
"step": 2784
},
{
"epoch": 2.345117845117845,
"grad_norm": 4.026867389678955,
"learning_rate": 2.1257433419594329e-07,
"loss": 0.6121779680252075,
"step": 2786
},
{
"epoch": 2.346801346801347,
"grad_norm": 3.700312614440918,
"learning_rate": 2.1202227152926898e-07,
"loss": 1.0569815635681152,
"step": 2788
},
{
"epoch": 2.3484848484848486,
"grad_norm": 5.786956310272217,
"learning_rate": 2.114713733380761e-07,
"loss": 0.8500775098800659,
"step": 2790
},
{
"epoch": 2.3501683501683504,
"grad_norm": 3.6336448192596436,
"learning_rate": 2.1092164152044082e-07,
"loss": 0.6126809120178223,
"step": 2792
},
{
"epoch": 2.351851851851852,
"grad_norm": 16.343307495117188,
"learning_rate": 2.1037307797042073e-07,
"loss": 0.7721902132034302,
"step": 2794
},
{
"epoch": 2.3535353535353534,
"grad_norm": 4.7194600105285645,
"learning_rate": 2.0982568457804772e-07,
"loss": 1.0643179416656494,
"step": 2796
},
{
"epoch": 2.355218855218855,
"grad_norm": 5.305932998657227,
"learning_rate": 2.0927946322932257e-07,
"loss": 0.6048824191093445,
"step": 2798
},
{
"epoch": 2.356902356902357,
"grad_norm": 2.0404253005981445,
"learning_rate": 2.0873441580620778e-07,
"loss": 1.1490514278411865,
"step": 2800
},
{
"epoch": 2.3585858585858586,
"grad_norm": 4.3384480476379395,
"learning_rate": 2.0819054418662068e-07,
"loss": 1.0097895860671997,
"step": 2802
},
{
"epoch": 2.3602693602693603,
"grad_norm": 7.471581935882568,
"learning_rate": 2.0764785024442816e-07,
"loss": 0.8789470791816711,
"step": 2804
},
{
"epoch": 2.361952861952862,
"grad_norm": 9.630654335021973,
"learning_rate": 2.071063358494392e-07,
"loss": 0.8657972812652588,
"step": 2806
},
{
"epoch": 2.3636363636363638,
"grad_norm": 9.908742904663086,
"learning_rate": 2.0656600286739846e-07,
"loss": 0.9500114917755127,
"step": 2808
},
{
"epoch": 2.3653198653198655,
"grad_norm": 3.0417370796203613,
"learning_rate": 2.060268531599806e-07,
"loss": 1.0881528854370117,
"step": 2810
},
{
"epoch": 2.3670033670033668,
"grad_norm": 15.979384422302246,
"learning_rate": 2.0548888858478314e-07,
"loss": 0.8370237350463867,
"step": 2812
},
{
"epoch": 2.3686868686868685,
"grad_norm": 2.701646327972412,
"learning_rate": 2.0495211099532051e-07,
"loss": 0.7017450332641602,
"step": 2814
},
{
"epoch": 2.3703703703703702,
"grad_norm": 3.518488645553589,
"learning_rate": 2.0441652224101739e-07,
"loss": 0.7352346777915955,
"step": 2816
},
{
"epoch": 2.372053872053872,
"grad_norm": 5.064514636993408,
"learning_rate": 2.038821241672022e-07,
"loss": 0.7799332141876221,
"step": 2818
},
{
"epoch": 2.3737373737373737,
"grad_norm": 4.822017192840576,
"learning_rate": 2.0334891861510124e-07,
"loss": 0.8013976812362671,
"step": 2820
},
{
"epoch": 2.3754208754208754,
"grad_norm": 13.20271110534668,
"learning_rate": 2.0281690742183214e-07,
"loss": 0.5635098814964294,
"step": 2822
},
{
"epoch": 2.377104377104377,
"grad_norm": 4.322653293609619,
"learning_rate": 2.0228609242039707e-07,
"loss": 1.05335533618927,
"step": 2824
},
{
"epoch": 2.378787878787879,
"grad_norm": 8.060440063476562,
"learning_rate": 2.017564754396771e-07,
"loss": 0.9288073778152466,
"step": 2826
},
{
"epoch": 2.3804713804713806,
"grad_norm": 6.93074369430542,
"learning_rate": 2.012280583044258e-07,
"loss": 0.49736571311950684,
"step": 2828
},
{
"epoch": 2.3821548821548824,
"grad_norm": 11.825316429138184,
"learning_rate": 2.0070084283526223e-07,
"loss": 1.044695258140564,
"step": 2830
},
{
"epoch": 2.3838383838383836,
"grad_norm": 7.59405517578125,
"learning_rate": 2.001748308486656e-07,
"loss": 0.8302027583122253,
"step": 2832
},
{
"epoch": 2.3855218855218854,
"grad_norm": 3.9063162803649902,
"learning_rate": 1.9965002415696878e-07,
"loss": 0.658703088760376,
"step": 2834
},
{
"epoch": 2.387205387205387,
"grad_norm": 9.86563491821289,
"learning_rate": 1.9912642456835125e-07,
"loss": 0.6858144998550415,
"step": 2836
},
{
"epoch": 2.388888888888889,
"grad_norm": 4.106326580047607,
"learning_rate": 1.9860403388683408e-07,
"loss": 0.5258500576019287,
"step": 2838
},
{
"epoch": 2.3905723905723906,
"grad_norm": 3.920785427093506,
"learning_rate": 1.980828539122731e-07,
"loss": 0.9032931327819824,
"step": 2840
},
{
"epoch": 2.3922558922558923,
"grad_norm": 1.4234728813171387,
"learning_rate": 1.9756288644035244e-07,
"loss": 0.43326181173324585,
"step": 2842
},
{
"epoch": 2.393939393939394,
"grad_norm": 4.104327201843262,
"learning_rate": 1.970441332625788e-07,
"loss": 1.114197015762329,
"step": 2844
},
{
"epoch": 2.3956228956228958,
"grad_norm": 7.699793815612793,
"learning_rate": 1.965265961662753e-07,
"loss": 0.8347800970077515,
"step": 2846
},
{
"epoch": 2.3973063973063975,
"grad_norm": 4.057286262512207,
"learning_rate": 1.9601027693457485e-07,
"loss": 1.1171047687530518,
"step": 2848
},
{
"epoch": 2.398989898989899,
"grad_norm": 4.676527976989746,
"learning_rate": 1.9549517734641453e-07,
"loss": 0.8913414478302002,
"step": 2850
},
{
"epoch": 2.4006734006734005,
"grad_norm": 5.339909076690674,
"learning_rate": 1.9498129917652917e-07,
"loss": 0.5060603022575378,
"step": 2852
},
{
"epoch": 2.4023569023569022,
"grad_norm": 7.147670269012451,
"learning_rate": 1.9446864419544517e-07,
"loss": 0.7295070886611938,
"step": 2854
},
{
"epoch": 2.404040404040404,
"grad_norm": 6.569252014160156,
"learning_rate": 1.9395721416947475e-07,
"loss": 0.6507788896560669,
"step": 2856
},
{
"epoch": 2.4057239057239057,
"grad_norm": 6.89575719833374,
"learning_rate": 1.9344701086070957e-07,
"loss": 0.7100333571434021,
"step": 2858
},
{
"epoch": 2.4074074074074074,
"grad_norm": 7.443199634552002,
"learning_rate": 1.9293803602701458e-07,
"loss": 0.49127644300460815,
"step": 2860
},
{
"epoch": 2.409090909090909,
"grad_norm": 3.398568868637085,
"learning_rate": 1.924302914220222e-07,
"loss": 0.8142455816268921,
"step": 2862
},
{
"epoch": 2.410774410774411,
"grad_norm": 7.437132835388184,
"learning_rate": 1.9192377879512656e-07,
"loss": 0.5337988138198853,
"step": 2864
},
{
"epoch": 2.4124579124579126,
"grad_norm": 4.250380516052246,
"learning_rate": 1.914184998914764e-07,
"loss": 0.7382153868675232,
"step": 2866
},
{
"epoch": 2.4141414141414144,
"grad_norm": 4.774903297424316,
"learning_rate": 1.9091445645197024e-07,
"loss": 0.9528558254241943,
"step": 2868
},
{
"epoch": 2.4158249158249157,
"grad_norm": 3.7426023483276367,
"learning_rate": 1.9041165021324986e-07,
"loss": 0.8381022214889526,
"step": 2870
},
{
"epoch": 2.4175084175084174,
"grad_norm": 2.178778648376465,
"learning_rate": 1.899100829076945e-07,
"loss": 0.5464705228805542,
"step": 2872
},
{
"epoch": 2.419191919191919,
"grad_norm": 4.025269031524658,
"learning_rate": 1.894097562634142e-07,
"loss": 1.0029910802841187,
"step": 2874
},
{
"epoch": 2.420875420875421,
"grad_norm": 5.291448593139648,
"learning_rate": 1.8891067200424498e-07,
"loss": 0.8049919605255127,
"step": 2876
},
{
"epoch": 2.4225589225589226,
"grad_norm": 3.155411720275879,
"learning_rate": 1.8841283184974216e-07,
"loss": 0.5165250301361084,
"step": 2878
},
{
"epoch": 2.4242424242424243,
"grad_norm": 3.550431251525879,
"learning_rate": 1.8791623751517432e-07,
"loss": 0.9810848832130432,
"step": 2880
},
{
"epoch": 2.425925925925926,
"grad_norm": 8.65785026550293,
"learning_rate": 1.8742089071151812e-07,
"loss": 0.6320451498031616,
"step": 2882
},
{
"epoch": 2.4276094276094278,
"grad_norm": 24.364227294921875,
"learning_rate": 1.8692679314545155e-07,
"loss": 0.691448450088501,
"step": 2884
},
{
"epoch": 2.429292929292929,
"grad_norm": 3.4331605434417725,
"learning_rate": 1.8643394651934867e-07,
"loss": 0.5786364078521729,
"step": 2886
},
{
"epoch": 2.430976430976431,
"grad_norm": 16.977510452270508,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.6802031993865967,
"step": 2888
},
{
"epoch": 2.4326599326599325,
"grad_norm": 5.363550662994385,
"learning_rate": 1.8545201287497442e-07,
"loss": 0.5717660188674927,
"step": 2890
},
{
"epoch": 2.4343434343434343,
"grad_norm": 23.09035873413086,
"learning_rate": 1.849629292398774e-07,
"loss": 0.7750734686851501,
"step": 2892
},
{
"epoch": 2.436026936026936,
"grad_norm": 2.4943952560424805,
"learning_rate": 1.8447510331108163e-07,
"loss": 0.9770002365112305,
"step": 2894
},
{
"epoch": 2.4377104377104377,
"grad_norm": 6.26854133605957,
"learning_rate": 1.839885367693526e-07,
"loss": 0.8726930618286133,
"step": 2896
},
{
"epoch": 2.4393939393939394,
"grad_norm": 11.332048416137695,
"learning_rate": 1.8350323129111672e-07,
"loss": 0.7943978309631348,
"step": 2898
},
{
"epoch": 2.441077441077441,
"grad_norm": 20.655651092529297,
"learning_rate": 1.8301918854845577e-07,
"loss": 0.5449969172477722,
"step": 2900
},
{
"epoch": 2.442760942760943,
"grad_norm": 3.659409761428833,
"learning_rate": 1.8253641020910043e-07,
"loss": 0.9310587644577026,
"step": 2902
},
{
"epoch": 2.4444444444444446,
"grad_norm": 11.209527969360352,
"learning_rate": 1.820548979364253e-07,
"loss": 0.5611803531646729,
"step": 2904
},
{
"epoch": 2.4461279461279464,
"grad_norm": 6.078222751617432,
"learning_rate": 1.815746533894429e-07,
"loss": 0.4734145998954773,
"step": 2906
},
{
"epoch": 2.4478114478114477,
"grad_norm": 43.15976333618164,
"learning_rate": 1.8109567822279753e-07,
"loss": 0.6027005910873413,
"step": 2908
},
{
"epoch": 2.4494949494949494,
"grad_norm": 8.55388355255127,
"learning_rate": 1.8061797408676023e-07,
"loss": 0.7029461860656738,
"step": 2910
},
{
"epoch": 2.451178451178451,
"grad_norm": 3.861863374710083,
"learning_rate": 1.801415426272229e-07,
"loss": 0.5813450813293457,
"step": 2912
},
{
"epoch": 2.452861952861953,
"grad_norm": 4.0164103507995605,
"learning_rate": 1.796663854856922e-07,
"loss": 0.8507091999053955,
"step": 2914
},
{
"epoch": 2.4545454545454546,
"grad_norm": 9.30286693572998,
"learning_rate": 1.7919250429928446e-07,
"loss": 0.7457901239395142,
"step": 2916
},
{
"epoch": 2.4562289562289563,
"grad_norm": 1.883726954460144,
"learning_rate": 1.7871990070071987e-07,
"loss": 0.45636504888534546,
"step": 2918
},
{
"epoch": 2.457912457912458,
"grad_norm": 4.311119079589844,
"learning_rate": 1.7824857631831648e-07,
"loss": 0.9269647002220154,
"step": 2920
},
{
"epoch": 2.45959595959596,
"grad_norm": 19.505645751953125,
"learning_rate": 1.7777853277598522e-07,
"loss": 0.5110766887664795,
"step": 2922
},
{
"epoch": 2.461279461279461,
"grad_norm": 3.043074131011963,
"learning_rate": 1.7730977169322397e-07,
"loss": 0.41358011960983276,
"step": 2924
},
{
"epoch": 2.462962962962963,
"grad_norm": 3.94612455368042,
"learning_rate": 1.768422946851117e-07,
"loss": 0.7347300052642822,
"step": 2926
},
{
"epoch": 2.4646464646464645,
"grad_norm": 13.160529136657715,
"learning_rate": 1.763761033623034e-07,
"loss": 0.652132511138916,
"step": 2928
},
{
"epoch": 2.4663299663299663,
"grad_norm": 7.081724643707275,
"learning_rate": 1.7591119933102455e-07,
"loss": 0.4731465280056,
"step": 2930
},
{
"epoch": 2.468013468013468,
"grad_norm": 7.2086358070373535,
"learning_rate": 1.7544758419306493e-07,
"loss": 0.8537788391113281,
"step": 2932
},
{
"epoch": 2.4696969696969697,
"grad_norm": 5.239010810852051,
"learning_rate": 1.749852595457738e-07,
"loss": 0.7587542533874512,
"step": 2934
},
{
"epoch": 2.4713804713804715,
"grad_norm": 7.071168899536133,
"learning_rate": 1.7452422698205427e-07,
"loss": 0.5985921621322632,
"step": 2936
},
{
"epoch": 2.473063973063973,
"grad_norm": 3.5129053592681885,
"learning_rate": 1.7406448809035723e-07,
"loss": 0.674223780632019,
"step": 2938
},
{
"epoch": 2.474747474747475,
"grad_norm": 4.072961807250977,
"learning_rate": 1.736060444546768e-07,
"loss": 0.6285250186920166,
"step": 2940
},
{
"epoch": 2.4764309764309766,
"grad_norm": 5.048702239990234,
"learning_rate": 1.731488976545442e-07,
"loss": 0.5890775918960571,
"step": 2942
},
{
"epoch": 2.478114478114478,
"grad_norm": 5.47603178024292,
"learning_rate": 1.726930492650223e-07,
"loss": 0.6147992610931396,
"step": 2944
},
{
"epoch": 2.4797979797979797,
"grad_norm": 3.560030221939087,
"learning_rate": 1.7223850085670082e-07,
"loss": 0.9968768358230591,
"step": 2946
},
{
"epoch": 2.4814814814814814,
"grad_norm": 2.8818583488464355,
"learning_rate": 1.7178525399569026e-07,
"loss": 1.031359314918518,
"step": 2948
},
{
"epoch": 2.483164983164983,
"grad_norm": 2.790241241455078,
"learning_rate": 1.7133331024361668e-07,
"loss": 1.090069055557251,
"step": 2950
},
{
"epoch": 2.484848484848485,
"grad_norm": 56.298179626464844,
"learning_rate": 1.7088267115761645e-07,
"loss": 0.9623196125030518,
"step": 2952
},
{
"epoch": 2.4865319865319866,
"grad_norm": 4.772038459777832,
"learning_rate": 1.7043333829033093e-07,
"loss": 0.6764428019523621,
"step": 2954
},
{
"epoch": 2.4882154882154883,
"grad_norm": 15.111934661865234,
"learning_rate": 1.6998531318990084e-07,
"loss": 0.9181029796600342,
"step": 2956
},
{
"epoch": 2.48989898989899,
"grad_norm": 17.119279861450195,
"learning_rate": 1.695385973999612e-07,
"loss": 0.603553056716919,
"step": 2958
},
{
"epoch": 2.4915824915824913,
"grad_norm": 3.2011559009552,
"learning_rate": 1.690931924596359e-07,
"loss": 0.9430979490280151,
"step": 2960
},
{
"epoch": 2.493265993265993,
"grad_norm": 10.394431114196777,
"learning_rate": 1.6864909990353222e-07,
"loss": 0.6838173866271973,
"step": 2962
},
{
"epoch": 2.494949494949495,
"grad_norm": 2.8894050121307373,
"learning_rate": 1.6820632126173595e-07,
"loss": 0.829933762550354,
"step": 2964
},
{
"epoch": 2.4966329966329965,
"grad_norm": 6.6212544441223145,
"learning_rate": 1.6776485805980593e-07,
"loss": 0.7385812997817993,
"step": 2966
},
{
"epoch": 2.4983164983164983,
"grad_norm": 3.0128917694091797,
"learning_rate": 1.673247118187685e-07,
"loss": 0.9367114901542664,
"step": 2968
},
{
"epoch": 2.5,
"grad_norm": 7.358500003814697,
"learning_rate": 1.6688588405511265e-07,
"loss": 0.9481908082962036,
"step": 2970
},
{
"epoch": 2.5016835016835017,
"grad_norm": 7.319785118103027,
"learning_rate": 1.6644837628078485e-07,
"loss": 0.4760739207267761,
"step": 2972
},
{
"epoch": 2.5033670033670035,
"grad_norm": 24.174762725830078,
"learning_rate": 1.6601219000318317e-07,
"loss": 0.529428243637085,
"step": 2974
},
{
"epoch": 2.505050505050505,
"grad_norm": 90.50502014160156,
"learning_rate": 1.6557732672515305e-07,
"loss": 0.8081066012382507,
"step": 2976
},
{
"epoch": 2.506734006734007,
"grad_norm": 9.112408638000488,
"learning_rate": 1.6514378794498152e-07,
"loss": 0.46742603182792664,
"step": 2978
},
{
"epoch": 2.5084175084175087,
"grad_norm": 3.2685351371765137,
"learning_rate": 1.6471157515639195e-07,
"loss": 0.8512880802154541,
"step": 2980
},
{
"epoch": 2.51010101010101,
"grad_norm": 11.603774070739746,
"learning_rate": 1.6428068984853923e-07,
"loss": 0.8741171360015869,
"step": 2982
},
{
"epoch": 2.5117845117845117,
"grad_norm": 6.443422317504883,
"learning_rate": 1.6385113350600476e-07,
"loss": 0.4871176779270172,
"step": 2984
},
{
"epoch": 2.5134680134680134,
"grad_norm": 22.373445510864258,
"learning_rate": 1.6342290760879064e-07,
"loss": 0.8540467023849487,
"step": 2986
},
{
"epoch": 2.515151515151515,
"grad_norm": 5.546900272369385,
"learning_rate": 1.6299601363231542e-07,
"loss": 0.7414556741714478,
"step": 2988
},
{
"epoch": 2.516835016835017,
"grad_norm": 4.198864459991455,
"learning_rate": 1.6257045304740842e-07,
"loss": 0.11034494638442993,
"step": 2990
},
{
"epoch": 2.5185185185185186,
"grad_norm": 3.5712265968322754,
"learning_rate": 1.6214622732030483e-07,
"loss": 0.988459050655365,
"step": 2992
},
{
"epoch": 2.5202020202020203,
"grad_norm": 6.247505187988281,
"learning_rate": 1.617233379126409e-07,
"loss": 0.6715781092643738,
"step": 2994
},
{
"epoch": 2.5218855218855216,
"grad_norm": 4.307699680328369,
"learning_rate": 1.6130178628144858e-07,
"loss": 0.7559702396392822,
"step": 2996
},
{
"epoch": 2.5235690235690234,
"grad_norm": 32.38378143310547,
"learning_rate": 1.6088157387915046e-07,
"loss": 0.61976158618927,
"step": 2998
},
{
"epoch": 2.525252525252525,
"grad_norm": 5.182736396789551,
"learning_rate": 1.6046270215355522e-07,
"loss": 0.5721726417541504,
"step": 3000
},
{
"epoch": 2.526936026936027,
"grad_norm": 11.062474250793457,
"learning_rate": 1.600451725478522e-07,
"loss": 0.5903807878494263,
"step": 3002
},
{
"epoch": 2.5286195286195285,
"grad_norm": 3.9762520790100098,
"learning_rate": 1.5962898650060646e-07,
"loss": 1.0528504848480225,
"step": 3004
},
{
"epoch": 2.5303030303030303,
"grad_norm": 12.059609413146973,
"learning_rate": 1.5921414544575406e-07,
"loss": 0.8805992603302002,
"step": 3006
},
{
"epoch": 2.531986531986532,
"grad_norm": 5.079036235809326,
"learning_rate": 1.5880065081259714e-07,
"loss": 0.8200486898422241,
"step": 3008
},
{
"epoch": 2.5336700336700337,
"grad_norm": 6.0202741622924805,
"learning_rate": 1.583885040257985e-07,
"loss": 0.5228027105331421,
"step": 3010
},
{
"epoch": 2.5353535353535355,
"grad_norm": 10.853965759277344,
"learning_rate": 1.579777065053773e-07,
"loss": 0.8398838639259338,
"step": 3012
},
{
"epoch": 2.537037037037037,
"grad_norm": 7.994739055633545,
"learning_rate": 1.5756825966670399e-07,
"loss": 0.7166822552680969,
"step": 3014
},
{
"epoch": 2.538720538720539,
"grad_norm": 8.935235977172852,
"learning_rate": 1.5716016492049495e-07,
"loss": 0.7087036371231079,
"step": 3016
},
{
"epoch": 2.5404040404040407,
"grad_norm": 11.376523971557617,
"learning_rate": 1.5675342367280838e-07,
"loss": 1.0162254571914673,
"step": 3018
},
{
"epoch": 2.542087542087542,
"grad_norm": 3.1120080947875977,
"learning_rate": 1.563480373250392e-07,
"loss": 0.7916754484176636,
"step": 3020
},
{
"epoch": 2.5437710437710437,
"grad_norm": 16.1352596282959,
"learning_rate": 1.559440072739137e-07,
"loss": 0.8983919024467468,
"step": 3022
},
{
"epoch": 2.5454545454545454,
"grad_norm": 5.045600891113281,
"learning_rate": 1.5554133491148556e-07,
"loss": 1.0679364204406738,
"step": 3024
},
{
"epoch": 2.547138047138047,
"grad_norm": 24.617691040039062,
"learning_rate": 1.5514002162513035e-07,
"loss": 0.3964739143848419,
"step": 3026
},
{
"epoch": 2.548821548821549,
"grad_norm": 4.4443359375,
"learning_rate": 1.5474006879754137e-07,
"loss": 0.7372143268585205,
"step": 3028
},
{
"epoch": 2.5505050505050506,
"grad_norm": 6.077057838439941,
"learning_rate": 1.5434147780672437e-07,
"loss": 0.6355978846549988,
"step": 3030
},
{
"epoch": 2.5521885521885523,
"grad_norm": 2.639094829559326,
"learning_rate": 1.539442500259929e-07,
"loss": 0.5554131269454956,
"step": 3032
},
{
"epoch": 2.5538720538720536,
"grad_norm": 5.577948093414307,
"learning_rate": 1.5354838682396384e-07,
"loss": 0.9816339612007141,
"step": 3034
},
{
"epoch": 2.5555555555555554,
"grad_norm": 4.363624572753906,
"learning_rate": 1.5315388956455266e-07,
"loss": 1.0391297340393066,
"step": 3036
},
{
"epoch": 2.557239057239057,
"grad_norm": 9.215215682983398,
"learning_rate": 1.5276075960696817e-07,
"loss": 0.7156937122344971,
"step": 3038
},
{
"epoch": 2.558922558922559,
"grad_norm": 2.8174784183502197,
"learning_rate": 1.5236899830570854e-07,
"loss": 1.0105350017547607,
"step": 3040
},
{
"epoch": 2.5606060606060606,
"grad_norm": 3.025399923324585,
"learning_rate": 1.5197860701055643e-07,
"loss": 0.767303466796875,
"step": 3042
},
{
"epoch": 2.5622895622895623,
"grad_norm": 8.97220230102539,
"learning_rate": 1.515895870665739e-07,
"loss": 0.99961256980896,
"step": 3044
},
{
"epoch": 2.563973063973064,
"grad_norm": 10.237662315368652,
"learning_rate": 1.5120193981409848e-07,
"loss": 0.7313355207443237,
"step": 3046
},
{
"epoch": 2.5656565656565657,
"grad_norm": 4.435790538787842,
"learning_rate": 1.508156665887381e-07,
"loss": 0.9470257759094238,
"step": 3048
},
{
"epoch": 2.5673400673400675,
"grad_norm": 8.973566055297852,
"learning_rate": 1.5043076872136646e-07,
"loss": 0.4554850459098816,
"step": 3050
},
{
"epoch": 2.569023569023569,
"grad_norm": 3.580697774887085,
"learning_rate": 1.5004724753811864e-07,
"loss": 1.0283160209655762,
"step": 3052
},
{
"epoch": 2.570707070707071,
"grad_norm": 3.4427924156188965,
"learning_rate": 1.496651043603866e-07,
"loss": 0.12811371684074402,
"step": 3054
},
{
"epoch": 2.5723905723905722,
"grad_norm": 30.826913833618164,
"learning_rate": 1.4928434050481424e-07,
"loss": 0.7465952634811401,
"step": 3056
},
{
"epoch": 2.574074074074074,
"grad_norm": 12.796523094177246,
"learning_rate": 1.4890495728329334e-07,
"loss": 0.4082253873348236,
"step": 3058
},
{
"epoch": 2.5757575757575757,
"grad_norm": 3.8712823390960693,
"learning_rate": 1.485269560029587e-07,
"loss": 0.8437204360961914,
"step": 3060
},
{
"epoch": 2.5774410774410774,
"grad_norm": 4.653648376464844,
"learning_rate": 1.481503379661838e-07,
"loss": 0.7468912601470947,
"step": 3062
},
{
"epoch": 2.579124579124579,
"grad_norm": 9.738509178161621,
"learning_rate": 1.4777510447057616e-07,
"loss": 0.6074585318565369,
"step": 3064
},
{
"epoch": 2.580808080808081,
"grad_norm": 4.1727495193481445,
"learning_rate": 1.4740125680897328e-07,
"loss": 0.7406507730484009,
"step": 3066
},
{
"epoch": 2.5824915824915826,
"grad_norm": 9.242506980895996,
"learning_rate": 1.470287962694373e-07,
"loss": 0.4214972257614136,
"step": 3068
},
{
"epoch": 2.584175084175084,
"grad_norm": 12.610301971435547,
"learning_rate": 1.4665772413525175e-07,
"loss": 0.17865464091300964,
"step": 3070
},
{
"epoch": 2.5858585858585856,
"grad_norm": 21.455978393554688,
"learning_rate": 1.4628804168491636e-07,
"loss": 0.6329761743545532,
"step": 3072
},
{
"epoch": 2.5875420875420874,
"grad_norm": 5.749107837677002,
"learning_rate": 1.4591975019214238e-07,
"loss": 1.0531988143920898,
"step": 3074
},
{
"epoch": 2.589225589225589,
"grad_norm": 6.151569366455078,
"learning_rate": 1.4555285092584917e-07,
"loss": 0.4620995819568634,
"step": 3076
},
{
"epoch": 2.590909090909091,
"grad_norm": 9.935331344604492,
"learning_rate": 1.451873451501592e-07,
"loss": 0.9808303117752075,
"step": 3078
},
{
"epoch": 2.5925925925925926,
"grad_norm": 8.355198860168457,
"learning_rate": 1.448232341243933e-07,
"loss": 0.7373911142349243,
"step": 3080
},
{
"epoch": 2.5942760942760943,
"grad_norm": 3.359959125518799,
"learning_rate": 1.4446051910306743e-07,
"loss": 1.0398435592651367,
"step": 3082
},
{
"epoch": 2.595959595959596,
"grad_norm": 6.27101469039917,
"learning_rate": 1.440992013358875e-07,
"loss": 0.6558928489685059,
"step": 3084
},
{
"epoch": 2.5976430976430978,
"grad_norm": 4.128625869750977,
"learning_rate": 1.4373928206774504e-07,
"loss": 0.6560384035110474,
"step": 3086
},
{
"epoch": 2.5993265993265995,
"grad_norm": 4.040182113647461,
"learning_rate": 1.4338076253871345e-07,
"loss": 0.9103618264198303,
"step": 3088
},
{
"epoch": 2.601010101010101,
"grad_norm": 3.0742857456207275,
"learning_rate": 1.4302364398404344e-07,
"loss": 0.9666507244110107,
"step": 3090
},
{
"epoch": 2.602693602693603,
"grad_norm": 6.105360507965088,
"learning_rate": 1.4266792763415863e-07,
"loss": 0.7367033362388611,
"step": 3092
},
{
"epoch": 2.6043771043771042,
"grad_norm": 4.493860244750977,
"learning_rate": 1.4231361471465143e-07,
"loss": 0.614148736000061,
"step": 3094
},
{
"epoch": 2.606060606060606,
"grad_norm": 2.295088052749634,
"learning_rate": 1.4196070644627903e-07,
"loss": 0.7760593891143799,
"step": 3096
},
{
"epoch": 2.6077441077441077,
"grad_norm": 3.1130990982055664,
"learning_rate": 1.4160920404495887e-07,
"loss": 0.6581928730010986,
"step": 3098
},
{
"epoch": 2.6094276094276094,
"grad_norm": 4.99691104888916,
"learning_rate": 1.4125910872176466e-07,
"loss": 0.7904366254806519,
"step": 3100
},
{
"epoch": 2.611111111111111,
"grad_norm": 5.191680908203125,
"learning_rate": 1.4091042168292211e-07,
"loss": 0.6951947212219238,
"step": 3102
},
{
"epoch": 2.612794612794613,
"grad_norm": 3.600395679473877,
"learning_rate": 1.4056314412980463e-07,
"loss": 0.9162784218788147,
"step": 3104
},
{
"epoch": 2.6144781144781146,
"grad_norm": 7.186698913574219,
"learning_rate": 1.402172772589297e-07,
"loss": 0.917360782623291,
"step": 3106
},
{
"epoch": 2.616161616161616,
"grad_norm": 19.83490753173828,
"learning_rate": 1.3987282226195416e-07,
"loss": 0.2932959198951721,
"step": 3108
},
{
"epoch": 2.6178451178451176,
"grad_norm": 3.4233388900756836,
"learning_rate": 1.395297803256703e-07,
"loss": 0.9224929809570312,
"step": 3110
},
{
"epoch": 2.6195286195286194,
"grad_norm": 5.625677585601807,
"learning_rate": 1.39188152632002e-07,
"loss": 0.526210367679596,
"step": 3112
},
{
"epoch": 2.621212121212121,
"grad_norm": 3.952099323272705,
"learning_rate": 1.3884794035800056e-07,
"loss": 0.610154926776886,
"step": 3114
},
{
"epoch": 2.622895622895623,
"grad_norm": 3.5759785175323486,
"learning_rate": 1.3850914467584013e-07,
"loss": 0.9689432382583618,
"step": 3116
},
{
"epoch": 2.6245791245791246,
"grad_norm": 3.9002864360809326,
"learning_rate": 1.3817176675281456e-07,
"loss": 1.0947141647338867,
"step": 3118
},
{
"epoch": 2.6262626262626263,
"grad_norm": 8.866259574890137,
"learning_rate": 1.378358077513328e-07,
"loss": 0.7083148956298828,
"step": 3120
},
{
"epoch": 2.627946127946128,
"grad_norm": 2.6722095012664795,
"learning_rate": 1.3750126882891475e-07,
"loss": 0.9863229393959045,
"step": 3122
},
{
"epoch": 2.6296296296296298,
"grad_norm": 3.054203510284424,
"learning_rate": 1.371681511381879e-07,
"loss": 0.9456894397735596,
"step": 3124
},
{
"epoch": 2.6313131313131315,
"grad_norm": 7.191009521484375,
"learning_rate": 1.3683645582688296e-07,
"loss": 0.7224574685096741,
"step": 3126
},
{
"epoch": 2.6329966329966332,
"grad_norm": 4.021665096282959,
"learning_rate": 1.3650618403782963e-07,
"loss": 0.8824139833450317,
"step": 3128
},
{
"epoch": 2.634680134680135,
"grad_norm": 8.395366668701172,
"learning_rate": 1.3617733690895327e-07,
"loss": 0.6597309112548828,
"step": 3130
},
{
"epoch": 2.6363636363636362,
"grad_norm": 5.360447883605957,
"learning_rate": 1.3584991557327076e-07,
"loss": 0.3653567433357239,
"step": 3132
},
{
"epoch": 2.638047138047138,
"grad_norm": 5.391804218292236,
"learning_rate": 1.355239211588861e-07,
"loss": 0.9479780793190002,
"step": 3134
},
{
"epoch": 2.6397306397306397,
"grad_norm": 5.806617736816406,
"learning_rate": 1.3519935478898732e-07,
"loss": 0.880384087562561,
"step": 3136
},
{
"epoch": 2.6414141414141414,
"grad_norm": 5.112968921661377,
"learning_rate": 1.348762175818422e-07,
"loss": 0.5330120921134949,
"step": 3138
},
{
"epoch": 2.643097643097643,
"grad_norm": 5.756229400634766,
"learning_rate": 1.345545106507943e-07,
"loss": 1.0363292694091797,
"step": 3140
},
{
"epoch": 2.644781144781145,
"grad_norm": 3.318345785140991,
"learning_rate": 1.3423423510425942e-07,
"loss": 0.6152174472808838,
"step": 3142
},
{
"epoch": 2.6464646464646466,
"grad_norm": 2.97463321685791,
"learning_rate": 1.3391539204572155e-07,
"loss": 0.9172265529632568,
"step": 3144
},
{
"epoch": 2.648148148148148,
"grad_norm": 3.3665931224823,
"learning_rate": 1.3359798257372913e-07,
"loss": 1.0443644523620605,
"step": 3146
},
{
"epoch": 2.6498316498316496,
"grad_norm": 18.04493522644043,
"learning_rate": 1.332820077818914e-07,
"loss": 0.6649324297904968,
"step": 3148
},
{
"epoch": 2.6515151515151514,
"grad_norm": 3.4756081104278564,
"learning_rate": 1.3296746875887445e-07,
"loss": 0.9889142513275146,
"step": 3150
},
{
"epoch": 2.653198653198653,
"grad_norm": 6.93226432800293,
"learning_rate": 1.3265436658839757e-07,
"loss": 0.3890528082847595,
"step": 3152
},
{
"epoch": 2.654882154882155,
"grad_norm": 26.966766357421875,
"learning_rate": 1.3234270234922947e-07,
"loss": 1.0187561511993408,
"step": 3154
},
{
"epoch": 2.6565656565656566,
"grad_norm": 7.550897121429443,
"learning_rate": 1.3203247711518466e-07,
"loss": 0.691092848777771,
"step": 3156
},
{
"epoch": 2.6582491582491583,
"grad_norm": 26.227054595947266,
"learning_rate": 1.3172369195511945e-07,
"loss": 0.5036376118659973,
"step": 3158
},
{
"epoch": 2.65993265993266,
"grad_norm": 28.069713592529297,
"learning_rate": 1.3141634793292868e-07,
"loss": 0.5947234034538269,
"step": 3160
},
{
"epoch": 2.6616161616161618,
"grad_norm": 4.802524566650391,
"learning_rate": 1.3111044610754202e-07,
"loss": 0.7470720410346985,
"step": 3162
},
{
"epoch": 2.6632996632996635,
"grad_norm": 7.207154273986816,
"learning_rate": 1.3080598753291972e-07,
"loss": 0.9500914812088013,
"step": 3164
},
{
"epoch": 2.6649831649831652,
"grad_norm": 9.974961280822754,
"learning_rate": 1.3050297325804975e-07,
"loss": 0.7958386540412903,
"step": 3166
},
{
"epoch": 2.6666666666666665,
"grad_norm": 4.640317440032959,
"learning_rate": 1.3020140432694386e-07,
"loss": 0.8439849615097046,
"step": 3168
},
{
"epoch": 2.6683501683501682,
"grad_norm": 8.00158977508545,
"learning_rate": 1.2990128177863372e-07,
"loss": 0.7472466230392456,
"step": 3170
},
{
"epoch": 2.67003367003367,
"grad_norm": 5.717433929443359,
"learning_rate": 1.2960260664716803e-07,
"loss": 1.0863356590270996,
"step": 3172
},
{
"epoch": 2.6717171717171717,
"grad_norm": 3.502814769744873,
"learning_rate": 1.293053799616082e-07,
"loss": 1.0433530807495117,
"step": 3174
},
{
"epoch": 2.6734006734006734,
"grad_norm": 8.597749710083008,
"learning_rate": 1.2900960274602512e-07,
"loss": 0.6032207608222961,
"step": 3176
},
{
"epoch": 2.675084175084175,
"grad_norm": 2.7261288166046143,
"learning_rate": 1.2871527601949583e-07,
"loss": 1.049224853515625,
"step": 3178
},
{
"epoch": 2.676767676767677,
"grad_norm": 3.249244213104248,
"learning_rate": 1.284224007960998e-07,
"loss": 0.7596105337142944,
"step": 3180
},
{
"epoch": 2.678451178451178,
"grad_norm": 3.4446780681610107,
"learning_rate": 1.281309780849153e-07,
"loss": 0.9340767860412598,
"step": 3182
},
{
"epoch": 2.68013468013468,
"grad_norm": 4.839624404907227,
"learning_rate": 1.278410088900162e-07,
"loss": 1.0885896682739258,
"step": 3184
},
{
"epoch": 2.6818181818181817,
"grad_norm": 4.542941093444824,
"learning_rate": 1.2755249421046854e-07,
"loss": 0.9115286469459534,
"step": 3186
},
{
"epoch": 2.6835016835016834,
"grad_norm": 42.488826751708984,
"learning_rate": 1.2726543504032654e-07,
"loss": 0.7943265438079834,
"step": 3188
},
{
"epoch": 2.685185185185185,
"grad_norm": 1.467315435409546,
"learning_rate": 1.2697983236862997e-07,
"loss": 0.7184177041053772,
"step": 3190
},
{
"epoch": 2.686868686868687,
"grad_norm": 4.0649213790893555,
"learning_rate": 1.2669568717940022e-07,
"loss": 0.7381956577301025,
"step": 3192
},
{
"epoch": 2.6885521885521886,
"grad_norm": 3.688559055328369,
"learning_rate": 1.2641300045163692e-07,
"loss": 0.8747034072875977,
"step": 3194
},
{
"epoch": 2.6902356902356903,
"grad_norm": 6.0027337074279785,
"learning_rate": 1.2613177315931483e-07,
"loss": 0.6696113348007202,
"step": 3196
},
{
"epoch": 2.691919191919192,
"grad_norm": 4.327197551727295,
"learning_rate": 1.258520062713804e-07,
"loss": 0.8139593601226807,
"step": 3198
},
{
"epoch": 2.6936026936026938,
"grad_norm": 3.2183837890625,
"learning_rate": 1.255737007517482e-07,
"loss": 0.9807404279708862,
"step": 3200
},
{
"epoch": 2.6952861952861955,
"grad_norm": 7.404758453369141,
"learning_rate": 1.2529685755929779e-07,
"loss": 0.8705126047134399,
"step": 3202
},
{
"epoch": 2.6969696969696972,
"grad_norm": 4.221065044403076,
"learning_rate": 1.250214776478705e-07,
"loss": 0.7467024326324463,
"step": 3204
},
{
"epoch": 2.6986531986531985,
"grad_norm": 8.078044891357422,
"learning_rate": 1.2474756196626604e-07,
"loss": 0.9621119499206543,
"step": 3206
},
{
"epoch": 2.7003367003367003,
"grad_norm": 4.394944190979004,
"learning_rate": 1.2447511145823904e-07,
"loss": 0.6447912454605103,
"step": 3208
},
{
"epoch": 2.702020202020202,
"grad_norm": 3.8557052612304688,
"learning_rate": 1.2420412706249637e-07,
"loss": 0.9262001514434814,
"step": 3210
},
{
"epoch": 2.7037037037037037,
"grad_norm": 4.75685977935791,
"learning_rate": 1.2393460971269306e-07,
"loss": 0.6955965161323547,
"step": 3212
},
{
"epoch": 2.7053872053872055,
"grad_norm": 4.87318229675293,
"learning_rate": 1.2366656033742985e-07,
"loss": 0.6773475408554077,
"step": 3214
},
{
"epoch": 2.707070707070707,
"grad_norm": 5.815934658050537,
"learning_rate": 1.233999798602498e-07,
"loss": 0.48384755849838257,
"step": 3216
},
{
"epoch": 2.708754208754209,
"grad_norm": 3.4917256832122803,
"learning_rate": 1.2313486919963455e-07,
"loss": 0.8545089960098267,
"step": 3218
},
{
"epoch": 2.71043771043771,
"grad_norm": 19.76070785522461,
"learning_rate": 1.2287122926900205e-07,
"loss": 0.4410606026649475,
"step": 3220
},
{
"epoch": 2.712121212121212,
"grad_norm": 5.099394798278809,
"learning_rate": 1.2260906097670272e-07,
"loss": 0.8183356523513794,
"step": 3222
},
{
"epoch": 2.7138047138047137,
"grad_norm": 4.549499034881592,
"learning_rate": 1.2234836522601667e-07,
"loss": 0.5615583062171936,
"step": 3224
},
{
"epoch": 2.7154882154882154,
"grad_norm": 4.583916187286377,
"learning_rate": 1.2208914291515035e-07,
"loss": 0.4506787657737732,
"step": 3226
},
{
"epoch": 2.717171717171717,
"grad_norm": 3.6649909019470215,
"learning_rate": 1.218313949372339e-07,
"loss": 0.8952913284301758,
"step": 3228
},
{
"epoch": 2.718855218855219,
"grad_norm": 69.62271118164062,
"learning_rate": 1.2157512218031732e-07,
"loss": 0.4370509088039398,
"step": 3230
},
{
"epoch": 2.7205387205387206,
"grad_norm": 8.150357246398926,
"learning_rate": 1.2132032552736818e-07,
"loss": 0.9717521071434021,
"step": 3232
},
{
"epoch": 2.7222222222222223,
"grad_norm": 3.7782340049743652,
"learning_rate": 1.2106700585626828e-07,
"loss": 0.7311519384384155,
"step": 3234
},
{
"epoch": 2.723905723905724,
"grad_norm": 3.866910219192505,
"learning_rate": 1.208151640398103e-07,
"loss": 0.8734760880470276,
"step": 3236
},
{
"epoch": 2.725589225589226,
"grad_norm": 4.73469877243042,
"learning_rate": 1.2056480094569536e-07,
"loss": 0.855620265007019,
"step": 3238
},
{
"epoch": 2.7272727272727275,
"grad_norm": 4.2583699226379395,
"learning_rate": 1.203159174365296e-07,
"loss": 0.8622401356697083,
"step": 3240
},
{
"epoch": 2.728956228956229,
"grad_norm": 3.042707920074463,
"learning_rate": 1.200685143698214e-07,
"loss": 0.8962169885635376,
"step": 3242
},
{
"epoch": 2.7306397306397305,
"grad_norm": 3.5346055030822754,
"learning_rate": 1.1982259259797856e-07,
"loss": 0.6588426232337952,
"step": 3244
},
{
"epoch": 2.7323232323232323,
"grad_norm": 3.266772747039795,
"learning_rate": 1.1957815296830494e-07,
"loss": 0.8494440317153931,
"step": 3246
},
{
"epoch": 2.734006734006734,
"grad_norm": 15.043532371520996,
"learning_rate": 1.1933519632299793e-07,
"loss": 0.9317235946655273,
"step": 3248
},
{
"epoch": 2.7356902356902357,
"grad_norm": 5.527817249298096,
"learning_rate": 1.1909372349914553e-07,
"loss": 0.9118114709854126,
"step": 3250
},
{
"epoch": 2.7373737373737375,
"grad_norm": 3.4315481185913086,
"learning_rate": 1.1885373532872297e-07,
"loss": 0.4174748957157135,
"step": 3252
},
{
"epoch": 2.739057239057239,
"grad_norm": 3.0668060779571533,
"learning_rate": 1.1861523263859069e-07,
"loss": 0.6279425621032715,
"step": 3254
},
{
"epoch": 2.7407407407407405,
"grad_norm": 5.9774651527404785,
"learning_rate": 1.1837821625049076e-07,
"loss": 0.6725097894668579,
"step": 3256
},
{
"epoch": 2.742424242424242,
"grad_norm": 3.119798183441162,
"learning_rate": 1.1814268698104425e-07,
"loss": 0.70163893699646,
"step": 3258
},
{
"epoch": 2.744107744107744,
"grad_norm": 5.229933261871338,
"learning_rate": 1.1790864564174873e-07,
"loss": 0.5799877643585205,
"step": 3260
},
{
"epoch": 2.7457912457912457,
"grad_norm": 8.287593841552734,
"learning_rate": 1.1767609303897506e-07,
"loss": 0.7188424468040466,
"step": 3262
},
{
"epoch": 2.7474747474747474,
"grad_norm": 19.248619079589844,
"learning_rate": 1.1744502997396474e-07,
"loss": 0.9669326543807983,
"step": 3264
},
{
"epoch": 2.749158249158249,
"grad_norm": 6.006091594696045,
"learning_rate": 1.1721545724282727e-07,
"loss": 1.0581872463226318,
"step": 3266
},
{
"epoch": 2.750841750841751,
"grad_norm": 20.20122528076172,
"learning_rate": 1.1698737563653745e-07,
"loss": 0.5354408621788025,
"step": 3268
},
{
"epoch": 2.7525252525252526,
"grad_norm": 3.2330803871154785,
"learning_rate": 1.1676078594093212e-07,
"loss": 1.0935049057006836,
"step": 3270
},
{
"epoch": 2.7542087542087543,
"grad_norm": 15.937528610229492,
"learning_rate": 1.1653568893670834e-07,
"loss": 0.5233392715454102,
"step": 3272
},
{
"epoch": 2.755892255892256,
"grad_norm": 5.933197498321533,
"learning_rate": 1.1631208539941993e-07,
"loss": 0.8539717197418213,
"step": 3274
},
{
"epoch": 2.757575757575758,
"grad_norm": 34.71628189086914,
"learning_rate": 1.1608997609947508e-07,
"loss": 0.35395973920822144,
"step": 3276
},
{
"epoch": 2.7592592592592595,
"grad_norm": 3.8929779529571533,
"learning_rate": 1.158693618021339e-07,
"loss": 0.09008853882551193,
"step": 3278
},
{
"epoch": 2.760942760942761,
"grad_norm": 3.974247694015503,
"learning_rate": 1.1565024326750545e-07,
"loss": 1.1840243339538574,
"step": 3280
},
{
"epoch": 2.7626262626262625,
"grad_norm": 4.763762474060059,
"learning_rate": 1.1543262125054523e-07,
"loss": 1.1094727516174316,
"step": 3282
},
{
"epoch": 2.7643097643097643,
"grad_norm": 26.06635093688965,
"learning_rate": 1.1521649650105264e-07,
"loss": 0.40256187319755554,
"step": 3284
},
{
"epoch": 2.765993265993266,
"grad_norm": 3.0916554927825928,
"learning_rate": 1.150018697636685e-07,
"loss": 0.9139037132263184,
"step": 3286
},
{
"epoch": 2.7676767676767677,
"grad_norm": 5.2920026779174805,
"learning_rate": 1.1478874177787204e-07,
"loss": 0.8635107278823853,
"step": 3288
},
{
"epoch": 2.7693602693602695,
"grad_norm": 7.0219573974609375,
"learning_rate": 1.1457711327797898e-07,
"loss": 0.3769862651824951,
"step": 3290
},
{
"epoch": 2.771043771043771,
"grad_norm": 4.999329090118408,
"learning_rate": 1.1436698499313855e-07,
"loss": 1.1161870956420898,
"step": 3292
},
{
"epoch": 2.7727272727272725,
"grad_norm": 3.669652223587036,
"learning_rate": 1.1415835764733103e-07,
"loss": 0.949033796787262,
"step": 3294
},
{
"epoch": 2.774410774410774,
"grad_norm": 3.6179237365722656,
"learning_rate": 1.1395123195936543e-07,
"loss": 0.9398729801177979,
"step": 3296
},
{
"epoch": 2.776094276094276,
"grad_norm": 4.2878947257995605,
"learning_rate": 1.1374560864287696e-07,
"loss": 0.3119538426399231,
"step": 3298
},
{
"epoch": 2.7777777777777777,
"grad_norm": 3.065671443939209,
"learning_rate": 1.1354148840632437e-07,
"loss": 0.5504776239395142,
"step": 3300
},
{
"epoch": 2.7794612794612794,
"grad_norm": 8.210501670837402,
"learning_rate": 1.1333887195298781e-07,
"loss": 0.6545171737670898,
"step": 3302
},
{
"epoch": 2.781144781144781,
"grad_norm": 24.731203079223633,
"learning_rate": 1.1313775998096624e-07,
"loss": 0.5451493263244629,
"step": 3304
},
{
"epoch": 2.782828282828283,
"grad_norm": 24.600292205810547,
"learning_rate": 1.1293815318317493e-07,
"loss": 0.8595808148384094,
"step": 3306
},
{
"epoch": 2.7845117845117846,
"grad_norm": 9.5689058303833,
"learning_rate": 1.1274005224734338e-07,
"loss": 0.573542058467865,
"step": 3308
},
{
"epoch": 2.7861952861952863,
"grad_norm": 7.985528945922852,
"learning_rate": 1.1254345785601264e-07,
"loss": 0.7355629205703735,
"step": 3310
},
{
"epoch": 2.787878787878788,
"grad_norm": 4.394935607910156,
"learning_rate": 1.1234837068653313e-07,
"loss": 0.5512019395828247,
"step": 3312
},
{
"epoch": 2.78956228956229,
"grad_norm": 3.483132839202881,
"learning_rate": 1.1215479141106207e-07,
"loss": 0.8127498626708984,
"step": 3314
},
{
"epoch": 2.791245791245791,
"grad_norm": 18.29281234741211,
"learning_rate": 1.119627206965618e-07,
"loss": 1.0899267196655273,
"step": 3316
},
{
"epoch": 2.792929292929293,
"grad_norm": 9.126869201660156,
"learning_rate": 1.1177215920479654e-07,
"loss": 0.8100671172142029,
"step": 3318
},
{
"epoch": 2.7946127946127945,
"grad_norm": 7.040780544281006,
"learning_rate": 1.1158310759233083e-07,
"loss": 0.43027007579803467,
"step": 3320
},
{
"epoch": 2.7962962962962963,
"grad_norm": 23.27480697631836,
"learning_rate": 1.113955665105271e-07,
"loss": 0.7666506767272949,
"step": 3322
},
{
"epoch": 2.797979797979798,
"grad_norm": 3.0902600288391113,
"learning_rate": 1.1120953660554319e-07,
"loss": 0.769917905330658,
"step": 3324
},
{
"epoch": 2.7996632996632997,
"grad_norm": 12.026223182678223,
"learning_rate": 1.110250185183305e-07,
"loss": 0.4246026277542114,
"step": 3326
},
{
"epoch": 2.8013468013468015,
"grad_norm": 3.31463360786438,
"learning_rate": 1.108420128846314e-07,
"loss": 0.8164354562759399,
"step": 3328
},
{
"epoch": 2.8030303030303028,
"grad_norm": 4.4698486328125,
"learning_rate": 1.1066052033497734e-07,
"loss": 0.8739584684371948,
"step": 3330
},
{
"epoch": 2.8047138047138045,
"grad_norm": 11.48538875579834,
"learning_rate": 1.1048054149468646e-07,
"loss": 0.6384426951408386,
"step": 3332
},
{
"epoch": 2.8063973063973062,
"grad_norm": 3.5100231170654297,
"learning_rate": 1.1030207698386169e-07,
"loss": 0.7716495990753174,
"step": 3334
},
{
"epoch": 2.808080808080808,
"grad_norm": 14.741443634033203,
"learning_rate": 1.1012512741738827e-07,
"loss": 0.7237218618392944,
"step": 3336
},
{
"epoch": 2.8097643097643097,
"grad_norm": 4.032273292541504,
"learning_rate": 1.0994969340493191e-07,
"loss": 0.4440898895263672,
"step": 3338
},
{
"epoch": 2.8114478114478114,
"grad_norm": 9.446307182312012,
"learning_rate": 1.0977577555093672e-07,
"loss": 0.791456937789917,
"step": 3340
},
{
"epoch": 2.813131313131313,
"grad_norm": 5.13400936126709,
"learning_rate": 1.0960337445462273e-07,
"loss": 0.897986650466919,
"step": 3342
},
{
"epoch": 2.814814814814815,
"grad_norm": 3.0911977291107178,
"learning_rate": 1.0943249070998429e-07,
"loss": 0.4430878162384033,
"step": 3344
},
{
"epoch": 2.8164983164983166,
"grad_norm": 3.849289655685425,
"learning_rate": 1.0926312490578795e-07,
"loss": 0.9019819498062134,
"step": 3346
},
{
"epoch": 2.8181818181818183,
"grad_norm": 11.346426010131836,
"learning_rate": 1.0909527762556997e-07,
"loss": 0.6365593671798706,
"step": 3348
},
{
"epoch": 2.81986531986532,
"grad_norm": 33.87907028198242,
"learning_rate": 1.089289494476349e-07,
"loss": 0.9726608395576477,
"step": 3350
},
{
"epoch": 2.821548821548822,
"grad_norm": 13.32236385345459,
"learning_rate": 1.0876414094505339e-07,
"loss": 0.9321683049201965,
"step": 3352
},
{
"epoch": 2.823232323232323,
"grad_norm": 3.9304733276367188,
"learning_rate": 1.0860085268566002e-07,
"loss": 0.8083049058914185,
"step": 3354
},
{
"epoch": 2.824915824915825,
"grad_norm": 14.073712348937988,
"learning_rate": 1.084390852320515e-07,
"loss": 0.8524267673492432,
"step": 3356
},
{
"epoch": 2.8265993265993266,
"grad_norm": 3.060366630554199,
"learning_rate": 1.0827883914158484e-07,
"loss": 0.6664683818817139,
"step": 3358
},
{
"epoch": 2.8282828282828283,
"grad_norm": 7.288575172424316,
"learning_rate": 1.0812011496637521e-07,
"loss": 0.6165136098861694,
"step": 3360
},
{
"epoch": 2.82996632996633,
"grad_norm": 9.925680160522461,
"learning_rate": 1.0796291325329419e-07,
"loss": 0.782645583152771,
"step": 3362
},
{
"epoch": 2.8316498316498318,
"grad_norm": 7.882792949676514,
"learning_rate": 1.0780723454396788e-07,
"loss": 0.6414890289306641,
"step": 3364
},
{
"epoch": 2.8333333333333335,
"grad_norm": 3.061633825302124,
"learning_rate": 1.0765307937477489e-07,
"loss": 0.5577088594436646,
"step": 3366
},
{
"epoch": 2.8350168350168348,
"grad_norm": 10.289335250854492,
"learning_rate": 1.0750044827684457e-07,
"loss": 0.5626717209815979,
"step": 3368
},
{
"epoch": 2.8367003367003365,
"grad_norm": 4.862819194793701,
"learning_rate": 1.073493417760554e-07,
"loss": 0.8943830132484436,
"step": 3370
},
{
"epoch": 2.8383838383838382,
"grad_norm": 5.187376499176025,
"learning_rate": 1.0719976039303275e-07,
"loss": 0.747265100479126,
"step": 3372
},
{
"epoch": 2.84006734006734,
"grad_norm": 11.796051979064941,
"learning_rate": 1.0705170464314741e-07,
"loss": 0.46709078550338745,
"step": 3374
},
{
"epoch": 2.8417508417508417,
"grad_norm": 2.545010805130005,
"learning_rate": 1.069051750365139e-07,
"loss": 1.0213559865951538,
"step": 3376
},
{
"epoch": 2.8434343434343434,
"grad_norm": 4.854709148406982,
"learning_rate": 1.0676017207798818e-07,
"loss": 0.872999906539917,
"step": 3378
},
{
"epoch": 2.845117845117845,
"grad_norm": 4.323747158050537,
"learning_rate": 1.0661669626716654e-07,
"loss": 0.6622998118400574,
"step": 3380
},
{
"epoch": 2.846801346801347,
"grad_norm": 15.708161354064941,
"learning_rate": 1.0647474809838358e-07,
"loss": 0.6927282810211182,
"step": 3382
},
{
"epoch": 2.8484848484848486,
"grad_norm": 5.333789348602295,
"learning_rate": 1.0633432806071032e-07,
"loss": 0.6410980224609375,
"step": 3384
},
{
"epoch": 2.8501683501683504,
"grad_norm": 5.245711803436279,
"learning_rate": 1.0619543663795291e-07,
"loss": 0.8350679874420166,
"step": 3386
},
{
"epoch": 2.851851851851852,
"grad_norm": 1.0896079540252686,
"learning_rate": 1.0605807430865085e-07,
"loss": 0.8719289302825928,
"step": 3388
},
{
"epoch": 2.8535353535353534,
"grad_norm": 6.591011047363281,
"learning_rate": 1.0592224154607507e-07,
"loss": 0.6173574328422546,
"step": 3390
},
{
"epoch": 2.855218855218855,
"grad_norm": 6.573550224304199,
"learning_rate": 1.0578793881822661e-07,
"loss": 0.5777392387390137,
"step": 3392
},
{
"epoch": 2.856902356902357,
"grad_norm": 2.5468170642852783,
"learning_rate": 1.056551665878349e-07,
"loss": 1.0248732566833496,
"step": 3394
},
{
"epoch": 2.8585858585858586,
"grad_norm": 3.26706862449646,
"learning_rate": 1.055239253123561e-07,
"loss": 1.0530986785888672,
"step": 3396
},
{
"epoch": 2.8602693602693603,
"grad_norm": 4.729337215423584,
"learning_rate": 1.0539421544397163e-07,
"loss": 0.5177785158157349,
"step": 3398
},
{
"epoch": 2.861952861952862,
"grad_norm": 10.192523002624512,
"learning_rate": 1.052660374295866e-07,
"loss": 0.48648959398269653,
"step": 3400
},
{
"epoch": 2.8636363636363638,
"grad_norm": 9.012269973754883,
"learning_rate": 1.0513939171082812e-07,
"loss": 0.5270302295684814,
"step": 3402
},
{
"epoch": 2.865319865319865,
"grad_norm": 4.075140476226807,
"learning_rate": 1.0501427872404407e-07,
"loss": 0.49075964093208313,
"step": 3404
},
{
"epoch": 2.8670033670033668,
"grad_norm": 5.544951438903809,
"learning_rate": 1.0489069890030129e-07,
"loss": 0.883784294128418,
"step": 3406
},
{
"epoch": 2.8686868686868685,
"grad_norm": 4.438905715942383,
"learning_rate": 1.0476865266538431e-07,
"loss": 0.43367594480514526,
"step": 3408
},
{
"epoch": 2.8703703703703702,
"grad_norm": 4.913120269775391,
"learning_rate": 1.0464814043979367e-07,
"loss": 0.9170664548873901,
"step": 3410
},
{
"epoch": 2.872053872053872,
"grad_norm": 4.941054821014404,
"learning_rate": 1.0452916263874477e-07,
"loss": 0.5428977608680725,
"step": 3412
},
{
"epoch": 2.8737373737373737,
"grad_norm": 5.263132572174072,
"learning_rate": 1.0441171967216618e-07,
"loss": 0.7901989817619324,
"step": 3414
},
{
"epoch": 2.8754208754208754,
"grad_norm": 9.088157653808594,
"learning_rate": 1.042958119446983e-07,
"loss": 0.2979390025138855,
"step": 3416
},
{
"epoch": 2.877104377104377,
"grad_norm": 10.997780799865723,
"learning_rate": 1.0418143985569209e-07,
"loss": 0.6635469198226929,
"step": 3418
},
{
"epoch": 2.878787878787879,
"grad_norm": 3.945129871368408,
"learning_rate": 1.0406860379920746e-07,
"loss": 0.4760744273662567,
"step": 3420
},
{
"epoch": 2.8804713804713806,
"grad_norm": 8.165509223937988,
"learning_rate": 1.0395730416401211e-07,
"loss": 0.8622602820396423,
"step": 3422
},
{
"epoch": 2.8821548821548824,
"grad_norm": 2.621253728866577,
"learning_rate": 1.0384754133358014e-07,
"loss": 0.6706223487854004,
"step": 3424
},
{
"epoch": 2.883838383838384,
"grad_norm": 7.00462532043457,
"learning_rate": 1.0373931568609063e-07,
"loss": 0.7515609264373779,
"step": 3426
},
{
"epoch": 2.8855218855218854,
"grad_norm": 6.707590103149414,
"learning_rate": 1.0363262759442654e-07,
"loss": 0.6428268551826477,
"step": 3428
},
{
"epoch": 2.887205387205387,
"grad_norm": 15.433223724365234,
"learning_rate": 1.0352747742617327e-07,
"loss": 0.4187021851539612,
"step": 3430
},
{
"epoch": 2.888888888888889,
"grad_norm": 5.754827976226807,
"learning_rate": 1.0342386554361728e-07,
"loss": 0.6734333634376526,
"step": 3432
},
{
"epoch": 2.8905723905723906,
"grad_norm": 16.424394607543945,
"learning_rate": 1.0332179230374509e-07,
"loss": 0.6447641253471375,
"step": 3434
},
{
"epoch": 2.8922558922558923,
"grad_norm": 7.980709552764893,
"learning_rate": 1.032212580582421e-07,
"loss": 1.063244104385376,
"step": 3436
},
{
"epoch": 2.893939393939394,
"grad_norm": 11.630234718322754,
"learning_rate": 1.0312226315349098e-07,
"loss": 0.9426344037055969,
"step": 3438
},
{
"epoch": 2.8956228956228958,
"grad_norm": 4.453112602233887,
"learning_rate": 1.0302480793057082e-07,
"loss": 0.8930955529212952,
"step": 3440
},
{
"epoch": 2.897306397306397,
"grad_norm": 4.323969841003418,
"learning_rate": 1.0292889272525597e-07,
"loss": 1.0264780521392822,
"step": 3442
},
{
"epoch": 2.898989898989899,
"grad_norm": 4.514182090759277,
"learning_rate": 1.0283451786801456e-07,
"loss": 0.4191988706588745,
"step": 3444
},
{
"epoch": 2.9006734006734005,
"grad_norm": 3.260342597961426,
"learning_rate": 1.0274168368400774e-07,
"loss": 0.5836988687515259,
"step": 3446
},
{
"epoch": 2.9023569023569022,
"grad_norm": 3.745016574859619,
"learning_rate": 1.0265039049308834e-07,
"loss": 1.1238579750061035,
"step": 3448
},
{
"epoch": 2.904040404040404,
"grad_norm": 12.539746284484863,
"learning_rate": 1.0256063860979977e-07,
"loss": 0.40760430693626404,
"step": 3450
},
{
"epoch": 2.9057239057239057,
"grad_norm": 2.8215339183807373,
"learning_rate": 1.0247242834337502e-07,
"loss": 0.7182443737983704,
"step": 3452
},
{
"epoch": 2.9074074074074074,
"grad_norm": 27.12503433227539,
"learning_rate": 1.0238575999773569e-07,
"loss": 0.6834052205085754,
"step": 3454
},
{
"epoch": 2.909090909090909,
"grad_norm": 2.9466779232025146,
"learning_rate": 1.0230063387149058e-07,
"loss": 1.065738320350647,
"step": 3456
},
{
"epoch": 2.910774410774411,
"grad_norm": 7.221368789672852,
"learning_rate": 1.0221705025793505e-07,
"loss": 0.8638687133789062,
"step": 3458
},
{
"epoch": 2.9124579124579126,
"grad_norm": 15.298805236816406,
"learning_rate": 1.021350094450498e-07,
"loss": 1.0362968444824219,
"step": 3460
},
{
"epoch": 2.9141414141414144,
"grad_norm": 2.772352695465088,
"learning_rate": 1.0205451171549999e-07,
"loss": 1.0920348167419434,
"step": 3462
},
{
"epoch": 2.915824915824916,
"grad_norm": 6.832037448883057,
"learning_rate": 1.0197555734663415e-07,
"loss": 0.8181166648864746,
"step": 3464
},
{
"epoch": 2.9175084175084174,
"grad_norm": 10.260382652282715,
"learning_rate": 1.0189814661048329e-07,
"loss": 1.0308600664138794,
"step": 3466
},
{
"epoch": 2.919191919191919,
"grad_norm": 8.912053108215332,
"learning_rate": 1.0182227977375995e-07,
"loss": 0.6785660982131958,
"step": 3468
},
{
"epoch": 2.920875420875421,
"grad_norm": 3.7292585372924805,
"learning_rate": 1.0174795709785737e-07,
"loss": 0.2668553590774536,
"step": 3470
},
{
"epoch": 2.9225589225589226,
"grad_norm": 15.397346496582031,
"learning_rate": 1.0167517883884837e-07,
"loss": 0.8357558250427246,
"step": 3472
},
{
"epoch": 2.9242424242424243,
"grad_norm": 5.987993240356445,
"learning_rate": 1.016039452474847e-07,
"loss": 0.7866486310958862,
"step": 3474
},
{
"epoch": 2.925925925925926,
"grad_norm": 5.408625602722168,
"learning_rate": 1.0153425656919609e-07,
"loss": 0.40831270813941956,
"step": 3476
},
{
"epoch": 2.9276094276094278,
"grad_norm": 5.647230625152588,
"learning_rate": 1.0146611304408931e-07,
"loss": 0.8993617296218872,
"step": 3478
},
{
"epoch": 2.929292929292929,
"grad_norm": 4.667529106140137,
"learning_rate": 1.0139951490694746e-07,
"loss": 0.570891261100769,
"step": 3480
},
{
"epoch": 2.930976430976431,
"grad_norm": 3.3206403255462646,
"learning_rate": 1.013344623872292e-07,
"loss": 0.8598926663398743,
"step": 3482
},
{
"epoch": 2.9326599326599325,
"grad_norm": 16.66160774230957,
"learning_rate": 1.0127095570906781e-07,
"loss": 0.6207292079925537,
"step": 3484
},
{
"epoch": 2.9343434343434343,
"grad_norm": 5.509361267089844,
"learning_rate": 1.0120899509127051e-07,
"loss": 0.7470987439155579,
"step": 3486
},
{
"epoch": 2.936026936026936,
"grad_norm": 4.825704097747803,
"learning_rate": 1.0114858074731771e-07,
"loss": 0.8294214606285095,
"step": 3488
},
{
"epoch": 2.9377104377104377,
"grad_norm": 6.74330997467041,
"learning_rate": 1.0108971288536224e-07,
"loss": 0.8246122598648071,
"step": 3490
},
{
"epoch": 2.9393939393939394,
"grad_norm": 14.303301811218262,
"learning_rate": 1.0103239170822867e-07,
"loss": 0.936402440071106,
"step": 3492
},
{
"epoch": 2.941077441077441,
"grad_norm": 11.945917129516602,
"learning_rate": 1.0097661741341254e-07,
"loss": 0.5219341516494751,
"step": 3494
},
{
"epoch": 2.942760942760943,
"grad_norm": 4.1998820304870605,
"learning_rate": 1.0092239019307974e-07,
"loss": 0.8593817949295044,
"step": 3496
},
{
"epoch": 2.9444444444444446,
"grad_norm": 11.838610649108887,
"learning_rate": 1.0086971023406596e-07,
"loss": 0.4355551600456238,
"step": 3498
},
{
"epoch": 2.9461279461279464,
"grad_norm": 7.149326801300049,
"learning_rate": 1.0081857771787575e-07,
"loss": 0.6722170114517212,
"step": 3500
},
{
"epoch": 2.9478114478114477,
"grad_norm": 6.950955390930176,
"learning_rate": 1.0076899282068215e-07,
"loss": 0.8052189350128174,
"step": 3502
},
{
"epoch": 2.9494949494949494,
"grad_norm": 8.336926460266113,
"learning_rate": 1.00720955713326e-07,
"loss": 0.3886244297027588,
"step": 3504
},
{
"epoch": 2.951178451178451,
"grad_norm": 2.504477024078369,
"learning_rate": 1.0067446656131536e-07,
"loss": 0.7975258231163025,
"step": 3506
},
{
"epoch": 2.952861952861953,
"grad_norm": 3.895413398742676,
"learning_rate": 1.0062952552482489e-07,
"loss": 0.9311509132385254,
"step": 3508
},
{
"epoch": 2.9545454545454546,
"grad_norm": 3.7988312244415283,
"learning_rate": 1.0058613275869534e-07,
"loss": 0.7745894193649292,
"step": 3510
},
{
"epoch": 2.9562289562289563,
"grad_norm": 5.594661235809326,
"learning_rate": 1.0054428841243314e-07,
"loss": 0.8809847235679626,
"step": 3512
},
{
"epoch": 2.957912457912458,
"grad_norm": 10.233572006225586,
"learning_rate": 1.0050399263020963e-07,
"loss": 0.5408470630645752,
"step": 3514
},
{
"epoch": 2.9595959595959593,
"grad_norm": 4.301779747009277,
"learning_rate": 1.0046524555086075e-07,
"loss": 0.9347457885742188,
"step": 3516
},
{
"epoch": 2.961279461279461,
"grad_norm": 3.945042610168457,
"learning_rate": 1.0042804730788647e-07,
"loss": 0.9306644797325134,
"step": 3518
},
{
"epoch": 2.962962962962963,
"grad_norm": 8.176459312438965,
"learning_rate": 1.0039239802945032e-07,
"loss": 0.5422787666320801,
"step": 3520
},
{
"epoch": 2.9646464646464645,
"grad_norm": 7.75977087020874,
"learning_rate": 1.003582978383792e-07,
"loss": 0.8068456649780273,
"step": 3522
},
{
"epoch": 2.9663299663299663,
"grad_norm": 4.503453731536865,
"learning_rate": 1.003257468521625e-07,
"loss": 0.329245924949646,
"step": 3524
},
{
"epoch": 2.968013468013468,
"grad_norm": 5.413825035095215,
"learning_rate": 1.0029474518295213e-07,
"loss": 0.9549334049224854,
"step": 3526
},
{
"epoch": 2.9696969696969697,
"grad_norm": 5.911332607269287,
"learning_rate": 1.0026529293756189e-07,
"loss": 0.8807719945907593,
"step": 3528
},
{
"epoch": 2.9713804713804715,
"grad_norm": 4.6141462326049805,
"learning_rate": 1.0023739021746709e-07,
"loss": 0.9357779026031494,
"step": 3530
},
{
"epoch": 2.973063973063973,
"grad_norm": 22.926517486572266,
"learning_rate": 1.002110371188044e-07,
"loss": 0.7154991626739502,
"step": 3532
},
{
"epoch": 2.974747474747475,
"grad_norm": 10.70749282836914,
"learning_rate": 1.0018623373237139e-07,
"loss": 0.3366190493106842,
"step": 3534
},
{
"epoch": 2.9764309764309766,
"grad_norm": 5.614308834075928,
"learning_rate": 1.0016298014362602e-07,
"loss": 0.9368351697921753,
"step": 3536
},
{
"epoch": 2.9781144781144784,
"grad_norm": 17.715473175048828,
"learning_rate": 1.0014127643268678e-07,
"loss": 0.5009272694587708,
"step": 3538
},
{
"epoch": 2.9797979797979797,
"grad_norm": 4.969324111938477,
"learning_rate": 1.0012112267433204e-07,
"loss": 1.1264997720718384,
"step": 3540
},
{
"epoch": 2.9814814814814814,
"grad_norm": 4.244232177734375,
"learning_rate": 1.0010251893799999e-07,
"loss": 0.9415320158004761,
"step": 3542
},
{
"epoch": 2.983164983164983,
"grad_norm": 5.749111652374268,
"learning_rate": 1.0008546528778836e-07,
"loss": 0.5878887176513672,
"step": 3544
},
{
"epoch": 2.984848484848485,
"grad_norm": 4.314813613891602,
"learning_rate": 1.0006996178245414e-07,
"loss": 0.934430718421936,
"step": 3546
},
{
"epoch": 2.9865319865319866,
"grad_norm": 11.60458755493164,
"learning_rate": 1.0005600847541344e-07,
"loss": 0.4331338703632355,
"step": 3548
},
{
"epoch": 2.9882154882154883,
"grad_norm": 16.16608428955078,
"learning_rate": 1.0004360541474121e-07,
"loss": 0.4102497398853302,
"step": 3550
},
{
"epoch": 2.98989898989899,
"grad_norm": 4.306326866149902,
"learning_rate": 1.0003275264317129e-07,
"loss": 0.6111245155334473,
"step": 3552
},
{
"epoch": 2.9915824915824913,
"grad_norm": 3.421985387802124,
"learning_rate": 1.00023450198096e-07,
"loss": 1.004423975944519,
"step": 3554
},
{
"epoch": 2.993265993265993,
"grad_norm": 5.036525249481201,
"learning_rate": 1.0001569811156621e-07,
"loss": 0.9042291045188904,
"step": 3556
},
{
"epoch": 2.994949494949495,
"grad_norm": 2.6041200160980225,
"learning_rate": 1.0000949641029108e-07,
"loss": 0.8039933443069458,
"step": 3558
},
{
"epoch": 2.9966329966329965,
"grad_norm": 4.743382930755615,
"learning_rate": 1.000048451156381e-07,
"loss": 0.5211207270622253,
"step": 3560
},
{
"epoch": 2.9983164983164983,
"grad_norm": 7.2076416015625,
"learning_rate": 1.0000174424363293e-07,
"loss": 0.7096606492996216,
"step": 3562
},
{
"epoch": 3.0,
"grad_norm": 8.714981079101562,
"learning_rate": 1.0000019380495939e-07,
"loss": 0.6827124953269958,
"step": 3564
},
{
"epoch": 3.0,
"step": 3564,
"total_flos": 4.2988160857187287e+18,
"train_loss": 0.8751117374819068,
"train_runtime": 6006.754,
"train_samples_per_second": 9.493,
"train_steps_per_second": 0.593
}
],
"logging_steps": 2,
"max_steps": 3564,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.2988160857187287e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}