9b-108 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
17a48a6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1896,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004219409282700422,
"grad_norm": 7.742424964904785,
"learning_rate": 1.263157894736842e-08,
"loss": 2.195801019668579,
"step": 2
},
{
"epoch": 0.008438818565400843,
"grad_norm": 9.064373016357422,
"learning_rate": 3.7894736842105265e-08,
"loss": 1.77604079246521,
"step": 4
},
{
"epoch": 0.012658227848101266,
"grad_norm": 1.5363776683807373,
"learning_rate": 6.31578947368421e-08,
"loss": 1.9369428157806396,
"step": 6
},
{
"epoch": 0.016877637130801686,
"grad_norm": 3.058716297149658,
"learning_rate": 8.842105263157893e-08,
"loss": 1.949758768081665,
"step": 8
},
{
"epoch": 0.02109704641350211,
"grad_norm": 2.559150457382202,
"learning_rate": 1.1368421052631579e-07,
"loss": 1.855597972869873,
"step": 10
},
{
"epoch": 0.02531645569620253,
"grad_norm": 1.0957772731781006,
"learning_rate": 1.3894736842105263e-07,
"loss": 1.3166148662567139,
"step": 12
},
{
"epoch": 0.029535864978902954,
"grad_norm": 2.902003049850464,
"learning_rate": 1.642105263157895e-07,
"loss": 1.662704348564148,
"step": 14
},
{
"epoch": 0.03375527426160337,
"grad_norm": 7.220351219177246,
"learning_rate": 1.894736842105263e-07,
"loss": 2.225470542907715,
"step": 16
},
{
"epoch": 0.0379746835443038,
"grad_norm": 1.3415447473526,
"learning_rate": 2.1473684210526315e-07,
"loss": 1.8447153568267822,
"step": 18
},
{
"epoch": 0.04219409282700422,
"grad_norm": 1.5747126340866089,
"learning_rate": 2.4e-07,
"loss": 1.9135501384735107,
"step": 20
},
{
"epoch": 0.046413502109704644,
"grad_norm": 1.7888033390045166,
"learning_rate": 2.6526315789473684e-07,
"loss": 1.6221210956573486,
"step": 22
},
{
"epoch": 0.05063291139240506,
"grad_norm": 2.052851915359497,
"learning_rate": 2.905263157894737e-07,
"loss": 1.8789974451065063,
"step": 24
},
{
"epoch": 0.05485232067510549,
"grad_norm": 2.472539186477661,
"learning_rate": 3.157894736842105e-07,
"loss": 1.7755436897277832,
"step": 26
},
{
"epoch": 0.05907172995780591,
"grad_norm": 2.0235300064086914,
"learning_rate": 3.4105263157894735e-07,
"loss": 1.9977495670318604,
"step": 28
},
{
"epoch": 0.06329113924050633,
"grad_norm": 3.731635808944702,
"learning_rate": 3.663157894736842e-07,
"loss": 2.019644021987915,
"step": 30
},
{
"epoch": 0.06751054852320675,
"grad_norm": 1.7156628370285034,
"learning_rate": 3.9157894736842107e-07,
"loss": 1.8407037258148193,
"step": 32
},
{
"epoch": 0.07172995780590717,
"grad_norm": 7.599488735198975,
"learning_rate": 4.168421052631579e-07,
"loss": 2.1601576805114746,
"step": 34
},
{
"epoch": 0.0759493670886076,
"grad_norm": 1.4482383728027344,
"learning_rate": 4.4210526315789467e-07,
"loss": 1.8958334922790527,
"step": 36
},
{
"epoch": 0.08016877637130802,
"grad_norm": 3.731816530227661,
"learning_rate": 4.6736842105263153e-07,
"loss": 2.1196088790893555,
"step": 38
},
{
"epoch": 0.08438818565400844,
"grad_norm": 1.4769682884216309,
"learning_rate": 4.926315789473684e-07,
"loss": 1.9134398698806763,
"step": 40
},
{
"epoch": 0.08860759493670886,
"grad_norm": 13.363183975219727,
"learning_rate": 5.178947368421052e-07,
"loss": 1.8113877773284912,
"step": 42
},
{
"epoch": 0.09282700421940929,
"grad_norm": 3.724055051803589,
"learning_rate": 5.431578947368421e-07,
"loss": 2.103419542312622,
"step": 44
},
{
"epoch": 0.0970464135021097,
"grad_norm": 2.3887927532196045,
"learning_rate": 5.684210526315788e-07,
"loss": 1.8661903142929077,
"step": 46
},
{
"epoch": 0.10126582278481013,
"grad_norm": 1.5525636672973633,
"learning_rate": 5.936842105263157e-07,
"loss": 1.8071659803390503,
"step": 48
},
{
"epoch": 0.10548523206751055,
"grad_norm": 5.839144229888916,
"learning_rate": 6.189473684210527e-07,
"loss": 1.561785340309143,
"step": 50
},
{
"epoch": 0.10970464135021098,
"grad_norm": 5.124746799468994,
"learning_rate": 6.442105263157894e-07,
"loss": 1.3494197130203247,
"step": 52
},
{
"epoch": 0.11392405063291139,
"grad_norm": 5.008734703063965,
"learning_rate": 6.694736842105263e-07,
"loss": 1.3085637092590332,
"step": 54
},
{
"epoch": 0.11814345991561181,
"grad_norm": 1.1680630445480347,
"learning_rate": 6.947368421052631e-07,
"loss": 1.710934042930603,
"step": 56
},
{
"epoch": 0.12236286919831224,
"grad_norm": 11.505062103271484,
"learning_rate": 7.2e-07,
"loss": 1.460722804069519,
"step": 58
},
{
"epoch": 0.12658227848101267,
"grad_norm": 7.362393856048584,
"learning_rate": 7.452631578947368e-07,
"loss": 1.740147590637207,
"step": 60
},
{
"epoch": 0.1308016877637131,
"grad_norm": 1.551930546760559,
"learning_rate": 7.705263157894736e-07,
"loss": 1.7590422630310059,
"step": 62
},
{
"epoch": 0.1350210970464135,
"grad_norm": 1.2569609880447388,
"learning_rate": 7.957894736842105e-07,
"loss": 1.2291865348815918,
"step": 64
},
{
"epoch": 0.13924050632911392,
"grad_norm": 2.3231699466705322,
"learning_rate": 8.210526315789473e-07,
"loss": 1.040055513381958,
"step": 66
},
{
"epoch": 0.14345991561181434,
"grad_norm": 1.0935379266738892,
"learning_rate": 8.463157894736842e-07,
"loss": 1.300035834312439,
"step": 68
},
{
"epoch": 0.14767932489451477,
"grad_norm": 4.188493728637695,
"learning_rate": 8.71578947368421e-07,
"loss": 1.1873421669006348,
"step": 70
},
{
"epoch": 0.1518987341772152,
"grad_norm": 1.0681216716766357,
"learning_rate": 8.968421052631579e-07,
"loss": 1.4782516956329346,
"step": 72
},
{
"epoch": 0.15611814345991562,
"grad_norm": 2.1197876930236816,
"learning_rate": 9.221052631578946e-07,
"loss": 1.2450737953186035,
"step": 74
},
{
"epoch": 0.16033755274261605,
"grad_norm": 4.197497844696045,
"learning_rate": 9.473684210526316e-07,
"loss": 1.0491926670074463,
"step": 76
},
{
"epoch": 0.16455696202531644,
"grad_norm": 1.161306619644165,
"learning_rate": 9.726315789473682e-07,
"loss": 1.60398268699646,
"step": 78
},
{
"epoch": 0.16877637130801687,
"grad_norm": 1.0192948579788208,
"learning_rate": 9.978947368421053e-07,
"loss": 1.5951966047286987,
"step": 80
},
{
"epoch": 0.1729957805907173,
"grad_norm": 2.497844934463501,
"learning_rate": 1.023157894736842e-06,
"loss": 1.564704179763794,
"step": 82
},
{
"epoch": 0.17721518987341772,
"grad_norm": 9.568504333496094,
"learning_rate": 1.048421052631579e-06,
"loss": 1.1448438167572021,
"step": 84
},
{
"epoch": 0.18143459915611815,
"grad_norm": 1.6581389904022217,
"learning_rate": 1.0736842105263157e-06,
"loss": 1.2066229581832886,
"step": 86
},
{
"epoch": 0.18565400843881857,
"grad_norm": 2.0455548763275146,
"learning_rate": 1.0989473684210525e-06,
"loss": 1.475752353668213,
"step": 88
},
{
"epoch": 0.189873417721519,
"grad_norm": 1.7110133171081543,
"learning_rate": 1.1242105263157894e-06,
"loss": 0.7750993967056274,
"step": 90
},
{
"epoch": 0.1940928270042194,
"grad_norm": 1.7819750308990479,
"learning_rate": 1.1494736842105262e-06,
"loss": 1.8029006719589233,
"step": 92
},
{
"epoch": 0.19831223628691982,
"grad_norm": 2.48787784576416,
"learning_rate": 1.174736842105263e-06,
"loss": 1.0352967977523804,
"step": 94
},
{
"epoch": 0.20253164556962025,
"grad_norm": 1.236476182937622,
"learning_rate": 1.2e-06,
"loss": 1.5603318214416504,
"step": 96
},
{
"epoch": 0.20675105485232068,
"grad_norm": 1.040940761566162,
"learning_rate": 1.1999967137875644e-06,
"loss": 1.6248691082000732,
"step": 98
},
{
"epoch": 0.2109704641350211,
"grad_norm": 2.421082019805908,
"learning_rate": 1.199986855190255e-06,
"loss": 1.3791676759719849,
"step": 100
},
{
"epoch": 0.21518987341772153,
"grad_norm": 1.4071706533432007,
"learning_rate": 1.1999704243280622e-06,
"loss": 1.1831879615783691,
"step": 102
},
{
"epoch": 0.21940928270042195,
"grad_norm": 0.912856936454773,
"learning_rate": 1.1999474214009684e-06,
"loss": 1.097001552581787,
"step": 104
},
{
"epoch": 0.22362869198312235,
"grad_norm": 2.3082234859466553,
"learning_rate": 1.1999178466889462e-06,
"loss": 1.089848518371582,
"step": 106
},
{
"epoch": 0.22784810126582278,
"grad_norm": 1.7334387302398682,
"learning_rate": 1.1998817005519536e-06,
"loss": 1.0864239931106567,
"step": 108
},
{
"epoch": 0.2320675105485232,
"grad_norm": 1.158636212348938,
"learning_rate": 1.1998389834299315e-06,
"loss": 1.135922908782959,
"step": 110
},
{
"epoch": 0.23628691983122363,
"grad_norm": 1.3626004457473755,
"learning_rate": 1.1997896958427962e-06,
"loss": 1.511846661567688,
"step": 112
},
{
"epoch": 0.24050632911392406,
"grad_norm": 2.417889356613159,
"learning_rate": 1.199733838390435e-06,
"loss": 1.387232780456543,
"step": 114
},
{
"epoch": 0.24472573839662448,
"grad_norm": 2.854128837585449,
"learning_rate": 1.1996714117526975e-06,
"loss": 1.7170121669769287,
"step": 116
},
{
"epoch": 0.2489451476793249,
"grad_norm": 1.1655317544937134,
"learning_rate": 1.1996024166893883e-06,
"loss": 1.4113752841949463,
"step": 118
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.6216784715652466,
"learning_rate": 1.199526854040257e-06,
"loss": 1.0603108406066895,
"step": 120
},
{
"epoch": 0.25738396624472576,
"grad_norm": 8.596741676330566,
"learning_rate": 1.1994447247249886e-06,
"loss": 1.3766067028045654,
"step": 122
},
{
"epoch": 0.2616033755274262,
"grad_norm": 6.650040149688721,
"learning_rate": 1.199356029743192e-06,
"loss": 1.3911750316619873,
"step": 124
},
{
"epoch": 0.26582278481012656,
"grad_norm": 1.3944730758666992,
"learning_rate": 1.1992607701743877e-06,
"loss": 1.479828953742981,
"step": 126
},
{
"epoch": 0.270042194092827,
"grad_norm": 1.8790662288665771,
"learning_rate": 1.1991589471779944e-06,
"loss": 1.0149238109588623,
"step": 128
},
{
"epoch": 0.2742616033755274,
"grad_norm": 2.3544161319732666,
"learning_rate": 1.1990505619933166e-06,
"loss": 1.2252846956253052,
"step": 130
},
{
"epoch": 0.27848101265822783,
"grad_norm": 1.664776086807251,
"learning_rate": 1.1989356159395268e-06,
"loss": 1.3019721508026123,
"step": 132
},
{
"epoch": 0.28270042194092826,
"grad_norm": 9.16057014465332,
"learning_rate": 1.1988141104156518e-06,
"loss": 0.8186183571815491,
"step": 134
},
{
"epoch": 0.2869198312236287,
"grad_norm": 1.194026231765747,
"learning_rate": 1.1986860469005543e-06,
"loss": 1.0649269819259644,
"step": 136
},
{
"epoch": 0.2911392405063291,
"grad_norm": 1.9782294034957886,
"learning_rate": 1.1985514269529155e-06,
"loss": 1.479400873184204,
"step": 138
},
{
"epoch": 0.29535864978902954,
"grad_norm": 5.742581367492676,
"learning_rate": 1.1984102522112159e-06,
"loss": 1.0161385536193848,
"step": 140
},
{
"epoch": 0.29957805907172996,
"grad_norm": 1.9590661525726318,
"learning_rate": 1.1982625243937158e-06,
"loss": 1.3290033340454102,
"step": 142
},
{
"epoch": 0.3037974683544304,
"grad_norm": 7.880887031555176,
"learning_rate": 1.198108245298433e-06,
"loss": 1.1459200382232666,
"step": 144
},
{
"epoch": 0.3080168776371308,
"grad_norm": 3.6530940532684326,
"learning_rate": 1.1979474168031232e-06,
"loss": 1.4865257740020752,
"step": 146
},
{
"epoch": 0.31223628691983124,
"grad_norm": 1.9205989837646484,
"learning_rate": 1.1977800408652552e-06,
"loss": 1.4399386644363403,
"step": 148
},
{
"epoch": 0.31645569620253167,
"grad_norm": 1.6300798654556274,
"learning_rate": 1.1976061195219877e-06,
"loss": 1.3092478513717651,
"step": 150
},
{
"epoch": 0.3206751054852321,
"grad_norm": 1.216039776802063,
"learning_rate": 1.1974256548901447e-06,
"loss": 1.3857874870300293,
"step": 152
},
{
"epoch": 0.32489451476793246,
"grad_norm": 1.1664661169052124,
"learning_rate": 1.1972386491661896e-06,
"loss": 1.5414711236953735,
"step": 154
},
{
"epoch": 0.3291139240506329,
"grad_norm": 6.513598918914795,
"learning_rate": 1.1970451046261986e-06,
"loss": 1.3435574769973755,
"step": 156
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.0399960279464722,
"learning_rate": 1.196845023625833e-06,
"loss": 1.4113006591796875,
"step": 158
},
{
"epoch": 0.33755274261603374,
"grad_norm": 1.2614532709121704,
"learning_rate": 1.196638408600309e-06,
"loss": 1.4127811193466187,
"step": 160
},
{
"epoch": 0.34177215189873417,
"grad_norm": 1.940744161605835,
"learning_rate": 1.1964252620643718e-06,
"loss": 0.9027857184410095,
"step": 162
},
{
"epoch": 0.3459915611814346,
"grad_norm": 2.2784762382507324,
"learning_rate": 1.1962055866122608e-06,
"loss": 1.305877447128296,
"step": 164
},
{
"epoch": 0.350210970464135,
"grad_norm": 2.4151477813720703,
"learning_rate": 1.1959793849176804e-06,
"loss": 0.8810802698135376,
"step": 166
},
{
"epoch": 0.35443037974683544,
"grad_norm": 1.4606540203094482,
"learning_rate": 1.195746659733767e-06,
"loss": 1.3153032064437866,
"step": 168
},
{
"epoch": 0.35864978902953587,
"grad_norm": 2.1664512157440186,
"learning_rate": 1.1955074138930558e-06,
"loss": 1.409055233001709,
"step": 170
},
{
"epoch": 0.3628691983122363,
"grad_norm": 1.8031892776489258,
"learning_rate": 1.1952616503074452e-06,
"loss": 1.288240909576416,
"step": 172
},
{
"epoch": 0.3670886075949367,
"grad_norm": 1.2246201038360596,
"learning_rate": 1.1950093719681623e-06,
"loss": 1.0962798595428467,
"step": 174
},
{
"epoch": 0.37130801687763715,
"grad_norm": 2.090580701828003,
"learning_rate": 1.1947505819457264e-06,
"loss": 1.4130232334136963,
"step": 176
},
{
"epoch": 0.3755274261603376,
"grad_norm": 0.9846044778823853,
"learning_rate": 1.1944852833899122e-06,
"loss": 1.4005430936813354,
"step": 178
},
{
"epoch": 0.379746835443038,
"grad_norm": 1.043843388557434,
"learning_rate": 1.1942134795297092e-06,
"loss": 1.0696699619293213,
"step": 180
},
{
"epoch": 0.38396624472573837,
"grad_norm": 1.0868744850158691,
"learning_rate": 1.1939351736732854e-06,
"loss": 1.3760430812835693,
"step": 182
},
{
"epoch": 0.3881856540084388,
"grad_norm": 1.203220009803772,
"learning_rate": 1.193650369207945e-06,
"loss": 1.3777748346328735,
"step": 184
},
{
"epoch": 0.3924050632911392,
"grad_norm": 2.734304666519165,
"learning_rate": 1.1933590696000883e-06,
"loss": 0.5890464186668396,
"step": 186
},
{
"epoch": 0.39662447257383965,
"grad_norm": 1.4362255334854126,
"learning_rate": 1.193061278395168e-06,
"loss": 1.0182992219924927,
"step": 188
},
{
"epoch": 0.4008438818565401,
"grad_norm": 1.4891494512557983,
"learning_rate": 1.1927569992176479e-06,
"loss": 1.1124638319015503,
"step": 190
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.0771912336349487,
"learning_rate": 1.1924462357709577e-06,
"loss": 1.3731889724731445,
"step": 192
},
{
"epoch": 0.4092827004219409,
"grad_norm": 1.2685896158218384,
"learning_rate": 1.1921289918374481e-06,
"loss": 1.1032942533493042,
"step": 194
},
{
"epoch": 0.41350210970464135,
"grad_norm": 1.8579025268554688,
"learning_rate": 1.1918052712783451e-06,
"loss": 1.364923357963562,
"step": 196
},
{
"epoch": 0.4177215189873418,
"grad_norm": 1.3035788536071777,
"learning_rate": 1.1914750780337023e-06,
"loss": 1.0887572765350342,
"step": 198
},
{
"epoch": 0.4219409282700422,
"grad_norm": 1.1359857320785522,
"learning_rate": 1.1911384161223538e-06,
"loss": 1.1425602436065674,
"step": 200
},
{
"epoch": 0.42616033755274263,
"grad_norm": 1.8307956457138062,
"learning_rate": 1.1907952896418643e-06,
"loss": 1.177668809890747,
"step": 202
},
{
"epoch": 0.43037974683544306,
"grad_norm": 1.2392957210540771,
"learning_rate": 1.1904457027684802e-06,
"loss": 0.9585235714912415,
"step": 204
},
{
"epoch": 0.4345991561181435,
"grad_norm": 1.794783115386963,
"learning_rate": 1.1900896597570784e-06,
"loss": 1.6650797128677368,
"step": 206
},
{
"epoch": 0.4388185654008439,
"grad_norm": 1.6542989015579224,
"learning_rate": 1.1897271649411145e-06,
"loss": 0.7469709515571594,
"step": 208
},
{
"epoch": 0.4430379746835443,
"grad_norm": 1.3410842418670654,
"learning_rate": 1.1893582227325694e-06,
"loss": 1.3532118797302246,
"step": 210
},
{
"epoch": 0.4472573839662447,
"grad_norm": 3.8952300548553467,
"learning_rate": 1.1889828376218972e-06,
"loss": 0.8239259719848633,
"step": 212
},
{
"epoch": 0.45147679324894513,
"grad_norm": 1.3895829916000366,
"learning_rate": 1.1886010141779688e-06,
"loss": 1.2587556838989258,
"step": 214
},
{
"epoch": 0.45569620253164556,
"grad_norm": 0.944612443447113,
"learning_rate": 1.1882127570480174e-06,
"loss": 1.3315932750701904,
"step": 216
},
{
"epoch": 0.459915611814346,
"grad_norm": 1.4940253496170044,
"learning_rate": 1.1878180709575815e-06,
"loss": 1.3548877239227295,
"step": 218
},
{
"epoch": 0.4641350210970464,
"grad_norm": 1.9045593738555908,
"learning_rate": 1.1874169607104478e-06,
"loss": 1.3191989660263062,
"step": 220
},
{
"epoch": 0.46835443037974683,
"grad_norm": 2.8294312953948975,
"learning_rate": 1.187009431188592e-06,
"loss": 1.270053505897522,
"step": 222
},
{
"epoch": 0.47257383966244726,
"grad_norm": 1.5677937269210815,
"learning_rate": 1.1865954873521197e-06,
"loss": 1.4200479984283447,
"step": 224
},
{
"epoch": 0.4767932489451477,
"grad_norm": 1.1733640432357788,
"learning_rate": 1.1861751342392067e-06,
"loss": 1.3603910207748413,
"step": 226
},
{
"epoch": 0.4810126582278481,
"grad_norm": 2.0203163623809814,
"learning_rate": 1.185748376966037e-06,
"loss": 0.8049441576004028,
"step": 228
},
{
"epoch": 0.48523206751054854,
"grad_norm": 1.1351509094238281,
"learning_rate": 1.18531522072674e-06,
"loss": 0.9551719427108765,
"step": 230
},
{
"epoch": 0.48945147679324896,
"grad_norm": 2.4217798709869385,
"learning_rate": 1.1848756707933284e-06,
"loss": 0.9277099967002869,
"step": 232
},
{
"epoch": 0.4936708860759494,
"grad_norm": 1.298305869102478,
"learning_rate": 1.1844297325156337e-06,
"loss": 1.334661602973938,
"step": 234
},
{
"epoch": 0.4978902953586498,
"grad_norm": 1.7692943811416626,
"learning_rate": 1.183977411321241e-06,
"loss": 1.372158169746399,
"step": 236
},
{
"epoch": 0.5021097046413502,
"grad_norm": 2.271902322769165,
"learning_rate": 1.1835187127154221e-06,
"loss": 1.036437749862671,
"step": 238
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2.205810070037842,
"learning_rate": 1.18305364228107e-06,
"loss": 0.470305472612381,
"step": 240
},
{
"epoch": 0.510548523206751,
"grad_norm": 1.814501404762268,
"learning_rate": 1.1825822056786304e-06,
"loss": 1.4641677141189575,
"step": 242
},
{
"epoch": 0.5147679324894515,
"grad_norm": 1.974550724029541,
"learning_rate": 1.182104408646032e-06,
"loss": 0.9201871156692505,
"step": 244
},
{
"epoch": 0.5189873417721519,
"grad_norm": 1.1704046726226807,
"learning_rate": 1.1816202569986176e-06,
"loss": 1.5398619174957275,
"step": 246
},
{
"epoch": 0.5232067510548524,
"grad_norm": 2.400852918624878,
"learning_rate": 1.181129756629073e-06,
"loss": 1.265621542930603,
"step": 248
},
{
"epoch": 0.5274261603375527,
"grad_norm": 1.2383782863616943,
"learning_rate": 1.1806329135073552e-06,
"loss": 1.3679600954055786,
"step": 250
},
{
"epoch": 0.5316455696202531,
"grad_norm": 1.7232836484909058,
"learning_rate": 1.18012973368062e-06,
"loss": 1.171999216079712,
"step": 252
},
{
"epoch": 0.5358649789029536,
"grad_norm": 1.5587044954299927,
"learning_rate": 1.1796202232731485e-06,
"loss": 1.2946254014968872,
"step": 254
},
{
"epoch": 0.540084388185654,
"grad_norm": 2.5242385864257812,
"learning_rate": 1.1791043884862711e-06,
"loss": 1.254220724105835,
"step": 256
},
{
"epoch": 0.5443037974683544,
"grad_norm": 2.730691432952881,
"learning_rate": 1.178582235598295e-06,
"loss": 1.078680396080017,
"step": 258
},
{
"epoch": 0.5485232067510548,
"grad_norm": 4.930171489715576,
"learning_rate": 1.1780537709644245e-06,
"loss": 1.0161340236663818,
"step": 260
},
{
"epoch": 0.5527426160337553,
"grad_norm": 1.1866450309753418,
"learning_rate": 1.177519001016686e-06,
"loss": 1.352670431137085,
"step": 262
},
{
"epoch": 0.5569620253164557,
"grad_norm": 1.242305040359497,
"learning_rate": 1.1769779322638483e-06,
"loss": 1.3570655584335327,
"step": 264
},
{
"epoch": 0.5611814345991561,
"grad_norm": 3.957782745361328,
"learning_rate": 1.1764305712913445e-06,
"loss": 1.311238169670105,
"step": 266
},
{
"epoch": 0.5654008438818565,
"grad_norm": 2.596217393875122,
"learning_rate": 1.1758769247611908e-06,
"loss": 1.5630828142166138,
"step": 268
},
{
"epoch": 0.569620253164557,
"grad_norm": 2.2337851524353027,
"learning_rate": 1.1753169994119063e-06,
"loss": 1.0898045301437378,
"step": 270
},
{
"epoch": 0.5738396624472574,
"grad_norm": 1.5337291955947876,
"learning_rate": 1.1747508020584302e-06,
"loss": 1.3198161125183105,
"step": 272
},
{
"epoch": 0.5780590717299579,
"grad_norm": 4.55377721786499,
"learning_rate": 1.17417833959204e-06,
"loss": 1.1360241174697876,
"step": 274
},
{
"epoch": 0.5822784810126582,
"grad_norm": 10.100658416748047,
"learning_rate": 1.173599618980266e-06,
"loss": 1.3351401090621948,
"step": 276
},
{
"epoch": 0.5864978902953587,
"grad_norm": 1.0856088399887085,
"learning_rate": 1.1730146472668075e-06,
"loss": 1.4669663906097412,
"step": 278
},
{
"epoch": 0.5907172995780591,
"grad_norm": 1.9721051454544067,
"learning_rate": 1.1724234315714474e-06,
"loss": 1.003104329109192,
"step": 280
},
{
"epoch": 0.5949367088607594,
"grad_norm": 1.5582189559936523,
"learning_rate": 1.1718259790899647e-06,
"loss": 1.405082106590271,
"step": 282
},
{
"epoch": 0.5991561181434599,
"grad_norm": 1.6864080429077148,
"learning_rate": 1.1712222970940478e-06,
"loss": 1.595037579536438,
"step": 284
},
{
"epoch": 0.6033755274261603,
"grad_norm": 0.3938112258911133,
"learning_rate": 1.1706123929312049e-06,
"loss": 1.1622782945632935,
"step": 286
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.0948325395584106,
"learning_rate": 1.1699962740246754e-06,
"loss": 1.325197458267212,
"step": 288
},
{
"epoch": 0.6118143459915611,
"grad_norm": 1.514491081237793,
"learning_rate": 1.1693739478733393e-06,
"loss": 0.8543146848678589,
"step": 290
},
{
"epoch": 0.6160337552742616,
"grad_norm": 1.247450590133667,
"learning_rate": 1.1687454220516262e-06,
"loss": 0.6629498600959778,
"step": 292
},
{
"epoch": 0.620253164556962,
"grad_norm": 0.9832797050476074,
"learning_rate": 1.1681107042094227e-06,
"loss": 1.3061555624008179,
"step": 294
},
{
"epoch": 0.6244725738396625,
"grad_norm": 1.6159130334854126,
"learning_rate": 1.1674698020719791e-06,
"loss": 0.7377375364303589,
"step": 296
},
{
"epoch": 0.6286919831223629,
"grad_norm": 2.47055983543396,
"learning_rate": 1.1668227234398165e-06,
"loss": 0.6730928421020508,
"step": 298
},
{
"epoch": 0.6329113924050633,
"grad_norm": 3.4460909366607666,
"learning_rate": 1.16616947618863e-06,
"loss": 1.8364771604537964,
"step": 300
},
{
"epoch": 0.6371308016877637,
"grad_norm": 1.025884985923767,
"learning_rate": 1.1655100682691951e-06,
"loss": 1.3243968486785889,
"step": 302
},
{
"epoch": 0.6413502109704642,
"grad_norm": 4.711573600769043,
"learning_rate": 1.1648445077072692e-06,
"loss": 0.9149092435836792,
"step": 304
},
{
"epoch": 0.6455696202531646,
"grad_norm": 1.3360319137573242,
"learning_rate": 1.164172802603494e-06,
"loss": 1.0405997037887573,
"step": 306
},
{
"epoch": 0.6497890295358649,
"grad_norm": 0.9437978267669678,
"learning_rate": 1.1634949611332986e-06,
"loss": 1.3173035383224487,
"step": 308
},
{
"epoch": 0.6540084388185654,
"grad_norm": 1.0983142852783203,
"learning_rate": 1.1628109915467975e-06,
"loss": 1.251430869102478,
"step": 310
},
{
"epoch": 0.6582278481012658,
"grad_norm": 2.2314326763153076,
"learning_rate": 1.1621209021686924e-06,
"loss": 1.0687130689620972,
"step": 312
},
{
"epoch": 0.6624472573839663,
"grad_norm": 1.2847200632095337,
"learning_rate": 1.1614247013981692e-06,
"loss": 1.2770864963531494,
"step": 314
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.491546630859375,
"learning_rate": 1.1607223977087972e-06,
"loss": 1.1677052974700928,
"step": 316
},
{
"epoch": 0.6708860759493671,
"grad_norm": 1.112823486328125,
"learning_rate": 1.160013999648425e-06,
"loss": 1.1452233791351318,
"step": 318
},
{
"epoch": 0.6751054852320675,
"grad_norm": 2.3609695434570312,
"learning_rate": 1.1592995158390764e-06,
"loss": 1.1290454864501953,
"step": 320
},
{
"epoch": 0.679324894514768,
"grad_norm": 1.2427384853363037,
"learning_rate": 1.1585789549768468e-06,
"loss": 0.9067545533180237,
"step": 322
},
{
"epoch": 0.6835443037974683,
"grad_norm": 1.1474452018737793,
"learning_rate": 1.157852325831795e-06,
"loss": 1.0441116094589233,
"step": 324
},
{
"epoch": 0.6877637130801688,
"grad_norm": 2.173767328262329,
"learning_rate": 1.157119637247839e-06,
"loss": 0.8966209292411804,
"step": 326
},
{
"epoch": 0.6919831223628692,
"grad_norm": 1.1117126941680908,
"learning_rate": 1.1563808981426463e-06,
"loss": 0.9047636985778809,
"step": 328
},
{
"epoch": 0.6962025316455697,
"grad_norm": 5.761388778686523,
"learning_rate": 1.155636117507527e-06,
"loss": 1.6347975730895996,
"step": 330
},
{
"epoch": 0.70042194092827,
"grad_norm": 2.036907434463501,
"learning_rate": 1.1548853044073231e-06,
"loss": 1.1312888860702515,
"step": 332
},
{
"epoch": 0.7046413502109705,
"grad_norm": 1.1030317544937134,
"learning_rate": 1.1541284679802987e-06,
"loss": 1.441202163696289,
"step": 334
},
{
"epoch": 0.7088607594936709,
"grad_norm": 1.0779141187667847,
"learning_rate": 1.1533656174380295e-06,
"loss": 1.3240406513214111,
"step": 336
},
{
"epoch": 0.7130801687763713,
"grad_norm": 1.0553290843963623,
"learning_rate": 1.1525967620652888e-06,
"loss": 1.355104684829712,
"step": 338
},
{
"epoch": 0.7172995780590717,
"grad_norm": 1.1798067092895508,
"learning_rate": 1.151821911219936e-06,
"loss": 1.3105881214141846,
"step": 340
},
{
"epoch": 0.7215189873417721,
"grad_norm": 1.144148349761963,
"learning_rate": 1.151041074332803e-06,
"loss": 1.3141815662384033,
"step": 342
},
{
"epoch": 0.7257383966244726,
"grad_norm": 7.016842365264893,
"learning_rate": 1.1502542609075783e-06,
"loss": 1.1324222087860107,
"step": 344
},
{
"epoch": 0.729957805907173,
"grad_norm": 2.61010479927063,
"learning_rate": 1.1494614805206915e-06,
"loss": 0.908640444278717,
"step": 346
},
{
"epoch": 0.7341772151898734,
"grad_norm": 1.0395554304122925,
"learning_rate": 1.1486627428211974e-06,
"loss": 1.308266282081604,
"step": 348
},
{
"epoch": 0.7383966244725738,
"grad_norm": 2.721623659133911,
"learning_rate": 1.147858057530658e-06,
"loss": 1.228571891784668,
"step": 350
},
{
"epoch": 0.7426160337552743,
"grad_norm": 22.471782684326172,
"learning_rate": 1.1470474344430244e-06,
"loss": 1.149246335029602,
"step": 352
},
{
"epoch": 0.7468354430379747,
"grad_norm": 2.0179569721221924,
"learning_rate": 1.1462308834245177e-06,
"loss": 0.6629557013511658,
"step": 354
},
{
"epoch": 0.7510548523206751,
"grad_norm": 2.0603108406066895,
"learning_rate": 1.1454084144135089e-06,
"loss": 1.0916632413864136,
"step": 356
},
{
"epoch": 0.7552742616033755,
"grad_norm": 12.69516372680664,
"learning_rate": 1.1445800374203972e-06,
"loss": 1.026712417602539,
"step": 358
},
{
"epoch": 0.759493670886076,
"grad_norm": 1.3232800960540771,
"learning_rate": 1.1437457625274893e-06,
"loss": 1.2708055973052979,
"step": 360
},
{
"epoch": 0.7637130801687764,
"grad_norm": 2.96313738822937,
"learning_rate": 1.1429055998888764e-06,
"loss": 1.0283684730529785,
"step": 362
},
{
"epoch": 0.7679324894514767,
"grad_norm": 2.2174739837646484,
"learning_rate": 1.1420595597303093e-06,
"loss": 1.6322853565216064,
"step": 364
},
{
"epoch": 0.7721518987341772,
"grad_norm": 3.5580825805664062,
"learning_rate": 1.1412076523490762e-06,
"loss": 0.7543882727622986,
"step": 366
},
{
"epoch": 0.7763713080168776,
"grad_norm": 1.3570228815078735,
"learning_rate": 1.140349888113876e-06,
"loss": 1.1952807903289795,
"step": 368
},
{
"epoch": 0.7805907172995781,
"grad_norm": 3.309746503829956,
"learning_rate": 1.1394862774646915e-06,
"loss": 1.5346460342407227,
"step": 370
},
{
"epoch": 0.7848101265822784,
"grad_norm": 2.107482433319092,
"learning_rate": 1.1386168309126637e-06,
"loss": 1.0200968980789185,
"step": 372
},
{
"epoch": 0.7890295358649789,
"grad_norm": 1.8146216869354248,
"learning_rate": 1.1377415590399635e-06,
"loss": 1.0982069969177246,
"step": 374
},
{
"epoch": 0.7932489451476793,
"grad_norm": 1.029420256614685,
"learning_rate": 1.1368604724996625e-06,
"loss": 1.197360873222351,
"step": 376
},
{
"epoch": 0.7974683544303798,
"grad_norm": 1.1954386234283447,
"learning_rate": 1.1359735820156029e-06,
"loss": 1.1520774364471436,
"step": 378
},
{
"epoch": 0.8016877637130801,
"grad_norm": 1.2728540897369385,
"learning_rate": 1.1350808983822688e-06,
"loss": 0.7453869581222534,
"step": 380
},
{
"epoch": 0.8059071729957806,
"grad_norm": 2.447286605834961,
"learning_rate": 1.134182432464653e-06,
"loss": 1.3108738660812378,
"step": 382
},
{
"epoch": 0.810126582278481,
"grad_norm": 5.362612247467041,
"learning_rate": 1.1332781951981248e-06,
"loss": 1.0827962160110474,
"step": 384
},
{
"epoch": 0.8143459915611815,
"grad_norm": 1.7063276767730713,
"learning_rate": 1.1323681975882984e-06,
"loss": 1.3062907457351685,
"step": 386
},
{
"epoch": 0.8185654008438819,
"grad_norm": 2.8370184898376465,
"learning_rate": 1.131452450710898e-06,
"loss": 0.9684048295021057,
"step": 388
},
{
"epoch": 0.8227848101265823,
"grad_norm": 1.1811680793762207,
"learning_rate": 1.1305309657116222e-06,
"loss": 1.2863088846206665,
"step": 390
},
{
"epoch": 0.8270042194092827,
"grad_norm": 3.667228937149048,
"learning_rate": 1.1296037538060104e-06,
"loss": 1.0412209033966064,
"step": 392
},
{
"epoch": 0.8312236286919831,
"grad_norm": 4.117892265319824,
"learning_rate": 1.128670826279304e-06,
"loss": 0.9639609456062317,
"step": 394
},
{
"epoch": 0.8354430379746836,
"grad_norm": 1.29248046875,
"learning_rate": 1.1277321944863108e-06,
"loss": 1.2934151887893677,
"step": 396
},
{
"epoch": 0.8396624472573839,
"grad_norm": 0.26427099108695984,
"learning_rate": 1.1267878698512655e-06,
"loss": 1.1188089847564697,
"step": 398
},
{
"epoch": 0.8438818565400844,
"grad_norm": 0.8574454188346863,
"learning_rate": 1.125837863867692e-06,
"loss": 0.9975463151931763,
"step": 400
},
{
"epoch": 0.8481012658227848,
"grad_norm": 1.629779577255249,
"learning_rate": 1.1248821880982622e-06,
"loss": 0.7363186478614807,
"step": 402
},
{
"epoch": 0.8523206751054853,
"grad_norm": 1.8325449228286743,
"learning_rate": 1.1239208541746565e-06,
"loss": 1.2270734310150146,
"step": 404
},
{
"epoch": 0.8565400843881856,
"grad_norm": 0.7708742618560791,
"learning_rate": 1.1229538737974207e-06,
"loss": 0.9653185606002808,
"step": 406
},
{
"epoch": 0.8607594936708861,
"grad_norm": 2.376756429672241,
"learning_rate": 1.1219812587358254e-06,
"loss": 0.997606098651886,
"step": 408
},
{
"epoch": 0.8649789029535865,
"grad_norm": 1.2060413360595703,
"learning_rate": 1.121003020827721e-06,
"loss": 1.2897322177886963,
"step": 410
},
{
"epoch": 0.869198312236287,
"grad_norm": 1.555523157119751,
"learning_rate": 1.1200191719793948e-06,
"loss": 0.876572847366333,
"step": 412
},
{
"epoch": 0.8734177215189873,
"grad_norm": 3.1254689693450928,
"learning_rate": 1.1190297241654262e-06,
"loss": 1.2611523866653442,
"step": 414
},
{
"epoch": 0.8776371308016878,
"grad_norm": 1.103677749633789,
"learning_rate": 1.1180346894285397e-06,
"loss": 1.0928722620010376,
"step": 416
},
{
"epoch": 0.8818565400843882,
"grad_norm": 2.221696615219116,
"learning_rate": 1.1170340798794594e-06,
"loss": 1.2073904275894165,
"step": 418
},
{
"epoch": 0.8860759493670886,
"grad_norm": 1.7576788663864136,
"learning_rate": 1.1160279076967616e-06,
"loss": 0.9891563057899475,
"step": 420
},
{
"epoch": 0.890295358649789,
"grad_norm": 2.0383450984954834,
"learning_rate": 1.1150161851267262e-06,
"loss": 1.399549126625061,
"step": 422
},
{
"epoch": 0.8945147679324894,
"grad_norm": 3.365711212158203,
"learning_rate": 1.1139989244831874e-06,
"loss": 1.029995083808899,
"step": 424
},
{
"epoch": 0.8987341772151899,
"grad_norm": 2.773817539215088,
"learning_rate": 1.1129761381473842e-06,
"loss": 1.2264801263809204,
"step": 426
},
{
"epoch": 0.9029535864978903,
"grad_norm": 2.2570652961730957,
"learning_rate": 1.11194783856781e-06,
"loss": 1.0824590921401978,
"step": 428
},
{
"epoch": 0.9071729957805907,
"grad_norm": 5.947412967681885,
"learning_rate": 1.1109140382600606e-06,
"loss": 1.057291865348816,
"step": 430
},
{
"epoch": 0.9113924050632911,
"grad_norm": 5.3977580070495605,
"learning_rate": 1.1098747498066824e-06,
"loss": 1.1226750612258911,
"step": 432
},
{
"epoch": 0.9156118143459916,
"grad_norm": 3.355656385421753,
"learning_rate": 1.108829985857018e-06,
"loss": 1.3119703531265259,
"step": 434
},
{
"epoch": 0.919831223628692,
"grad_norm": 3.1750526428222656,
"learning_rate": 1.1077797591270538e-06,
"loss": 0.9117200970649719,
"step": 436
},
{
"epoch": 0.9240506329113924,
"grad_norm": 1.7613136768341064,
"learning_rate": 1.1067240823992643e-06,
"loss": 1.2639193534851074,
"step": 438
},
{
"epoch": 0.9282700421940928,
"grad_norm": 1.1626863479614258,
"learning_rate": 1.105662968522457e-06,
"loss": 1.0154443979263306,
"step": 440
},
{
"epoch": 0.9324894514767933,
"grad_norm": 4.403058052062988,
"learning_rate": 1.1045964304116158e-06,
"loss": 0.9742609262466431,
"step": 442
},
{
"epoch": 0.9367088607594937,
"grad_norm": 4.4023237228393555,
"learning_rate": 1.1035244810477435e-06,
"loss": 1.161311388015747,
"step": 444
},
{
"epoch": 0.9409282700421941,
"grad_norm": 1.6864513158798218,
"learning_rate": 1.1024471334777044e-06,
"loss": 1.3747820854187012,
"step": 446
},
{
"epoch": 0.9451476793248945,
"grad_norm": 1.197826623916626,
"learning_rate": 1.1013644008140647e-06,
"loss": 1.0570836067199707,
"step": 448
},
{
"epoch": 0.9493670886075949,
"grad_norm": 4.425671577453613,
"learning_rate": 1.1002762962349342e-06,
"loss": 1.066590666770935,
"step": 450
},
{
"epoch": 0.9535864978902954,
"grad_norm": 1.4791566133499146,
"learning_rate": 1.0991828329838048e-06,
"loss": 1.3325567245483398,
"step": 452
},
{
"epoch": 0.9578059071729957,
"grad_norm": 1.0424067974090576,
"learning_rate": 1.0980840243693891e-06,
"loss": 1.0253040790557861,
"step": 454
},
{
"epoch": 0.9620253164556962,
"grad_norm": 1.6803632974624634,
"learning_rate": 1.0969798837654603e-06,
"loss": 1.2115472555160522,
"step": 456
},
{
"epoch": 0.9662447257383966,
"grad_norm": 1.7260364294052124,
"learning_rate": 1.0958704246106864e-06,
"loss": 0.9136871695518494,
"step": 458
},
{
"epoch": 0.9704641350210971,
"grad_norm": 4.201066493988037,
"learning_rate": 1.0947556604084698e-06,
"loss": 0.7265217304229736,
"step": 460
},
{
"epoch": 0.9746835443037974,
"grad_norm": 1.5730266571044922,
"learning_rate": 1.09363560472678e-06,
"loss": 0.9232859015464783,
"step": 462
},
{
"epoch": 0.9789029535864979,
"grad_norm": 1.9785159826278687,
"learning_rate": 1.0925102711979916e-06,
"loss": 1.2320111989974976,
"step": 464
},
{
"epoch": 0.9831223628691983,
"grad_norm": 2.112661123275757,
"learning_rate": 1.0913796735187152e-06,
"loss": 0.7564235925674438,
"step": 466
},
{
"epoch": 0.9873417721518988,
"grad_norm": 3.1255481243133545,
"learning_rate": 1.0902438254496335e-06,
"loss": 1.3790355920791626,
"step": 468
},
{
"epoch": 0.9915611814345991,
"grad_norm": 2.643756866455078,
"learning_rate": 1.0891027408153311e-06,
"loss": 0.8968592286109924,
"step": 470
},
{
"epoch": 0.9957805907172996,
"grad_norm": 1.2613961696624756,
"learning_rate": 1.087956433504129e-06,
"loss": 1.2724238634109497,
"step": 472
},
{
"epoch": 1.0,
"grad_norm": 1.897484302520752,
"learning_rate": 1.0868049174679133e-06,
"loss": 1.3249882459640503,
"step": 474
},
{
"epoch": 1.0042194092827004,
"grad_norm": 15.572712898254395,
"learning_rate": 1.0856482067219672e-06,
"loss": 1.1418063640594482,
"step": 476
},
{
"epoch": 1.0084388185654007,
"grad_norm": 0.9338006377220154,
"learning_rate": 1.0844863153447983e-06,
"loss": 1.2509591579437256,
"step": 478
},
{
"epoch": 1.0126582278481013,
"grad_norm": 1.7618024349212646,
"learning_rate": 1.0833192574779696e-06,
"loss": 1.2292466163635254,
"step": 480
},
{
"epoch": 1.0168776371308017,
"grad_norm": 2.1609816551208496,
"learning_rate": 1.0821470473259254e-06,
"loss": 0.9470843076705933,
"step": 482
},
{
"epoch": 1.021097046413502,
"grad_norm": 1.7773792743682861,
"learning_rate": 1.0809696991558202e-06,
"loss": 1.2175320386886597,
"step": 484
},
{
"epoch": 1.0253164556962024,
"grad_norm": 3.248727560043335,
"learning_rate": 1.0797872272973435e-06,
"loss": 0.5157210230827332,
"step": 486
},
{
"epoch": 1.029535864978903,
"grad_norm": 4.235684394836426,
"learning_rate": 1.078599646142546e-06,
"loss": 1.0747886896133423,
"step": 488
},
{
"epoch": 1.0337552742616034,
"grad_norm": 1.0086420774459839,
"learning_rate": 1.0774069701456646e-06,
"loss": 0.91233229637146,
"step": 490
},
{
"epoch": 1.0379746835443038,
"grad_norm": 1.760449767112732,
"learning_rate": 1.0762092138229461e-06,
"loss": 1.2355482578277588,
"step": 492
},
{
"epoch": 1.0421940928270041,
"grad_norm": 2.7897939682006836,
"learning_rate": 1.0750063917524715e-06,
"loss": 0.876376748085022,
"step": 494
},
{
"epoch": 1.0464135021097047,
"grad_norm": 1.583694577217102,
"learning_rate": 1.073798518573977e-06,
"loss": 0.9621012806892395,
"step": 496
},
{
"epoch": 1.0506329113924051,
"grad_norm": 1.2283833026885986,
"learning_rate": 1.0725856089886768e-06,
"loss": 1.3705410957336426,
"step": 498
},
{
"epoch": 1.0548523206751055,
"grad_norm": 1.892619013786316,
"learning_rate": 1.071367677759084e-06,
"loss": 1.2057194709777832,
"step": 500
},
{
"epoch": 1.0590717299578059,
"grad_norm": 1.5852138996124268,
"learning_rate": 1.0701447397088314e-06,
"loss": 1.225092887878418,
"step": 502
},
{
"epoch": 1.0632911392405062,
"grad_norm": 1.6359580755233765,
"learning_rate": 1.0689168097224896e-06,
"loss": 1.3359899520874023,
"step": 504
},
{
"epoch": 1.0675105485232068,
"grad_norm": 2.100905418395996,
"learning_rate": 1.0676839027453882e-06,
"loss": 0.8091757297515869,
"step": 506
},
{
"epoch": 1.0717299578059072,
"grad_norm": 0.9373227953910828,
"learning_rate": 1.0664460337834312e-06,
"loss": 1.20570969581604,
"step": 508
},
{
"epoch": 1.0759493670886076,
"grad_norm": 1.1788032054901123,
"learning_rate": 1.0652032179029165e-06,
"loss": 1.2286429405212402,
"step": 510
},
{
"epoch": 1.080168776371308,
"grad_norm": 2.070732355117798,
"learning_rate": 1.0639554702303516e-06,
"loss": 1.1464022397994995,
"step": 512
},
{
"epoch": 1.0843881856540085,
"grad_norm": 1.4905924797058105,
"learning_rate": 1.0627028059522697e-06,
"loss": 1.2270240783691406,
"step": 514
},
{
"epoch": 1.0886075949367089,
"grad_norm": 1.2873064279556274,
"learning_rate": 1.061445240315044e-06,
"loss": 1.2191872596740723,
"step": 516
},
{
"epoch": 1.0928270042194093,
"grad_norm": 1.5833098888397217,
"learning_rate": 1.060182788624704e-06,
"loss": 1.0899208784103394,
"step": 518
},
{
"epoch": 1.0970464135021096,
"grad_norm": 1.2056680917739868,
"learning_rate": 1.0589154662467476e-06,
"loss": 1.002990484237671,
"step": 520
},
{
"epoch": 1.1012658227848102,
"grad_norm": 2.3490617275238037,
"learning_rate": 1.0576432886059546e-06,
"loss": 0.9123169779777527,
"step": 522
},
{
"epoch": 1.1054852320675106,
"grad_norm": 1.398703694343567,
"learning_rate": 1.056366271186199e-06,
"loss": 1.1336543560028076,
"step": 524
},
{
"epoch": 1.109704641350211,
"grad_norm": 5.56015682220459,
"learning_rate": 1.0550844295302604e-06,
"loss": 0.8910406231880188,
"step": 526
},
{
"epoch": 1.1139240506329113,
"grad_norm": 1.501484751701355,
"learning_rate": 1.0537977792396352e-06,
"loss": 1.4902470111846924,
"step": 528
},
{
"epoch": 1.1181434599156117,
"grad_norm": 3.0271458625793457,
"learning_rate": 1.0525063359743461e-06,
"loss": 1.2566696405410767,
"step": 530
},
{
"epoch": 1.1223628691983123,
"grad_norm": 3.5508389472961426,
"learning_rate": 1.0512101154527524e-06,
"loss": 0.6722557544708252,
"step": 532
},
{
"epoch": 1.1265822784810127,
"grad_norm": 5.183070182800293,
"learning_rate": 1.049909133451358e-06,
"loss": 1.16892409324646,
"step": 534
},
{
"epoch": 1.130801687763713,
"grad_norm": 1.5916978120803833,
"learning_rate": 1.0486034058046184e-06,
"loss": 1.2602534294128418,
"step": 536
},
{
"epoch": 1.1350210970464134,
"grad_norm": 2.219564914703369,
"learning_rate": 1.0472929484047508e-06,
"loss": 0.9274411797523499,
"step": 538
},
{
"epoch": 1.139240506329114,
"grad_norm": 2.6550791263580322,
"learning_rate": 1.0459777772015377e-06,
"loss": 0.7955924868583679,
"step": 540
},
{
"epoch": 1.1434599156118144,
"grad_norm": 1.0860302448272705,
"learning_rate": 1.044657908202135e-06,
"loss": 0.8460701704025269,
"step": 542
},
{
"epoch": 1.1476793248945147,
"grad_norm": 2.0455169677734375,
"learning_rate": 1.0433333574708754e-06,
"loss": 0.9194719791412354,
"step": 544
},
{
"epoch": 1.1518987341772151,
"grad_norm": 0.8281774520874023,
"learning_rate": 1.042004141129074e-06,
"loss": 1.209435224533081,
"step": 546
},
{
"epoch": 1.1561181434599157,
"grad_norm": 3.4284002780914307,
"learning_rate": 1.040670275354832e-06,
"loss": 1.1639091968536377,
"step": 548
},
{
"epoch": 1.160337552742616,
"grad_norm": 3.964017152786255,
"learning_rate": 1.0393317763828394e-06,
"loss": 1.0248503684997559,
"step": 550
},
{
"epoch": 1.1645569620253164,
"grad_norm": 1.5311849117279053,
"learning_rate": 1.0379886605041773e-06,
"loss": 1.5549976825714111,
"step": 552
},
{
"epoch": 1.1687763713080168,
"grad_norm": 1.1133424043655396,
"learning_rate": 1.0366409440661203e-06,
"loss": 1.2537164688110352,
"step": 554
},
{
"epoch": 1.1729957805907172,
"grad_norm": 2.7838144302368164,
"learning_rate": 1.035288643471937e-06,
"loss": 0.6379430890083313,
"step": 556
},
{
"epoch": 1.1772151898734178,
"grad_norm": 1.4451053142547607,
"learning_rate": 1.0339317751806905e-06,
"loss": 1.1707175970077515,
"step": 558
},
{
"epoch": 1.1814345991561181,
"grad_norm": 1.5942399501800537,
"learning_rate": 1.0325703557070377e-06,
"loss": 0.7751450538635254,
"step": 560
},
{
"epoch": 1.1856540084388185,
"grad_norm": 0.90716153383255,
"learning_rate": 1.0312044016210299e-06,
"loss": 0.9596038460731506,
"step": 562
},
{
"epoch": 1.189873417721519,
"grad_norm": 3.479564905166626,
"learning_rate": 1.029833929547908e-06,
"loss": 1.4486083984375,
"step": 564
},
{
"epoch": 1.1940928270042195,
"grad_norm": 1.8199161291122437,
"learning_rate": 1.028458956167903e-06,
"loss": 1.2960246801376343,
"step": 566
},
{
"epoch": 1.1983122362869199,
"grad_norm": 2.1159839630126953,
"learning_rate": 1.0270794982160328e-06,
"loss": 0.9260680079460144,
"step": 568
},
{
"epoch": 1.2025316455696202,
"grad_norm": 1.1287225484848022,
"learning_rate": 1.0256955724818963e-06,
"loss": 1.1793110370635986,
"step": 570
},
{
"epoch": 1.2067510548523206,
"grad_norm": 3.6729793548583984,
"learning_rate": 1.0243071958094713e-06,
"loss": 1.1447832584381104,
"step": 572
},
{
"epoch": 1.2109704641350212,
"grad_norm": 1.18032705783844,
"learning_rate": 1.0229143850969086e-06,
"loss": 1.230734944343567,
"step": 574
},
{
"epoch": 1.2151898734177216,
"grad_norm": 1.3310970067977905,
"learning_rate": 1.0215171572963262e-06,
"loss": 1.0188127756118774,
"step": 576
},
{
"epoch": 1.219409282700422,
"grad_norm": 1.4861373901367188,
"learning_rate": 1.020115529413603e-06,
"loss": 0.6179706454277039,
"step": 578
},
{
"epoch": 1.2236286919831223,
"grad_norm": 1.791669249534607,
"learning_rate": 1.0187095185081726e-06,
"loss": 1.0826208591461182,
"step": 580
},
{
"epoch": 1.2278481012658227,
"grad_norm": 2.172065258026123,
"learning_rate": 1.0172991416928149e-06,
"loss": 0.9076665639877319,
"step": 582
},
{
"epoch": 1.2320675105485233,
"grad_norm": 1.2151978015899658,
"learning_rate": 1.0158844161334472e-06,
"loss": 0.9629290103912354,
"step": 584
},
{
"epoch": 1.2362869198312236,
"grad_norm": 3.7680106163024902,
"learning_rate": 1.014465359048917e-06,
"loss": 1.0498627424240112,
"step": 586
},
{
"epoch": 1.240506329113924,
"grad_norm": 1.3523688316345215,
"learning_rate": 1.0130419877107911e-06,
"loss": 0.8714591860771179,
"step": 588
},
{
"epoch": 1.2447257383966246,
"grad_norm": 1.3897624015808105,
"learning_rate": 1.0116143194431453e-06,
"loss": 1.247403860092163,
"step": 590
},
{
"epoch": 1.248945147679325,
"grad_norm": 0.8131434917449951,
"learning_rate": 1.0101823716223555e-06,
"loss": 0.6486424207687378,
"step": 592
},
{
"epoch": 1.2531645569620253,
"grad_norm": 0.7691041827201843,
"learning_rate": 1.0087461616768827e-06,
"loss": 0.8923141956329346,
"step": 594
},
{
"epoch": 1.2573839662447257,
"grad_norm": 3.8645286560058594,
"learning_rate": 1.0073057070870643e-06,
"loss": 0.8870598673820496,
"step": 596
},
{
"epoch": 1.261603375527426,
"grad_norm": 1.9958916902542114,
"learning_rate": 1.0058610253848993e-06,
"loss": 0.6330664753913879,
"step": 598
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.7626611590385437,
"learning_rate": 1.0044121341538363e-06,
"loss": 1.0315228700637817,
"step": 600
},
{
"epoch": 1.270042194092827,
"grad_norm": 3.175126791000366,
"learning_rate": 1.0029590510285573e-06,
"loss": 1.4815832376480103,
"step": 602
},
{
"epoch": 1.2742616033755274,
"grad_norm": 1.2133333683013916,
"learning_rate": 1.001501793694766e-06,
"loss": 1.1330845355987549,
"step": 604
},
{
"epoch": 1.2784810126582278,
"grad_norm": 6.052526473999023,
"learning_rate": 1.0000403798889702e-06,
"loss": 1.0692338943481445,
"step": 606
},
{
"epoch": 1.2827004219409281,
"grad_norm": 1.9215449094772339,
"learning_rate": 9.985748273982674e-07,
"loss": 0.8957496285438538,
"step": 608
},
{
"epoch": 1.2869198312236287,
"grad_norm": 2.6900336742401123,
"learning_rate": 9.97105154060127e-07,
"loss": 1.0588608980178833,
"step": 610
},
{
"epoch": 1.2911392405063291,
"grad_norm": 4.2595014572143555,
"learning_rate": 9.956313777621743e-07,
"loss": 0.6556817293167114,
"step": 612
},
{
"epoch": 1.2953586497890295,
"grad_norm": 8.070136070251465,
"learning_rate": 9.941535164419721e-07,
"loss": 0.718927800655365,
"step": 614
},
{
"epoch": 1.29957805907173,
"grad_norm": 1.7044512033462524,
"learning_rate": 9.926715880868028e-07,
"loss": 1.1856049299240112,
"step": 616
},
{
"epoch": 1.3037974683544304,
"grad_norm": 1.7254352569580078,
"learning_rate": 9.911856107334497e-07,
"loss": 1.2073801755905151,
"step": 618
},
{
"epoch": 1.3080168776371308,
"grad_norm": 1.0367379188537598,
"learning_rate": 9.896956024679761e-07,
"loss": 0.7765376567840576,
"step": 620
},
{
"epoch": 1.3122362869198312,
"grad_norm": 1.8108422756195068,
"learning_rate": 9.882015814255073e-07,
"loss": 1.221542477607727,
"step": 622
},
{
"epoch": 1.3164556962025316,
"grad_norm": 1.8170989751815796,
"learning_rate": 9.867035657900079e-07,
"loss": 0.9256758689880371,
"step": 624
},
{
"epoch": 1.3206751054852321,
"grad_norm": 1.397377610206604,
"learning_rate": 9.852015737940618e-07,
"loss": 1.1996105909347534,
"step": 626
},
{
"epoch": 1.3248945147679325,
"grad_norm": 2.3642630577087402,
"learning_rate": 9.836956237186495e-07,
"loss": 1.7291648387908936,
"step": 628
},
{
"epoch": 1.3291139240506329,
"grad_norm": 1.3181530237197876,
"learning_rate": 9.821857338929266e-07,
"loss": 0.9664700627326965,
"step": 630
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.441937804222107,
"learning_rate": 9.806719226939986e-07,
"loss": 1.2257553339004517,
"step": 632
},
{
"epoch": 1.3375527426160336,
"grad_norm": 0.5612751841545105,
"learning_rate": 9.791542085467003e-07,
"loss": 0.9133172035217285,
"step": 634
},
{
"epoch": 1.3417721518987342,
"grad_norm": 1.5648330450057983,
"learning_rate": 9.776326099233684e-07,
"loss": 0.7176555395126343,
"step": 636
},
{
"epoch": 1.3459915611814346,
"grad_norm": 21.417667388916016,
"learning_rate": 9.761071453436195e-07,
"loss": 0.9039233326911926,
"step": 638
},
{
"epoch": 1.350210970464135,
"grad_norm": 1.9340534210205078,
"learning_rate": 9.745778333741227e-07,
"loss": 1.2601927518844604,
"step": 640
},
{
"epoch": 1.3544303797468356,
"grad_norm": 2.36677885055542,
"learning_rate": 9.73044692628374e-07,
"loss": 0.9230378866195679,
"step": 642
},
{
"epoch": 1.358649789029536,
"grad_norm": 0.7526681423187256,
"learning_rate": 9.715077417664705e-07,
"loss": 1.3141403198242188,
"step": 644
},
{
"epoch": 1.3628691983122363,
"grad_norm": 1.5295689105987549,
"learning_rate": 9.699669994948829e-07,
"loss": 1.20694899559021,
"step": 646
},
{
"epoch": 1.3670886075949367,
"grad_norm": 8.918047904968262,
"learning_rate": 9.684224845662273e-07,
"loss": 0.9112899899482727,
"step": 648
},
{
"epoch": 1.371308016877637,
"grad_norm": 2.3975322246551514,
"learning_rate": 9.668742157790378e-07,
"loss": 1.4381672143936157,
"step": 650
},
{
"epoch": 1.3755274261603376,
"grad_norm": 1.3441905975341797,
"learning_rate": 9.653222119775373e-07,
"loss": 1.224355936050415,
"step": 652
},
{
"epoch": 1.379746835443038,
"grad_norm": 8.170360565185547,
"learning_rate": 9.637664920514075e-07,
"loss": 0.9496920108795166,
"step": 654
},
{
"epoch": 1.3839662447257384,
"grad_norm": 1.037866234779358,
"learning_rate": 9.622070749355605e-07,
"loss": 1.2685517072677612,
"step": 656
},
{
"epoch": 1.3881856540084387,
"grad_norm": 5.456931114196777,
"learning_rate": 9.60643979609907e-07,
"loss": 0.676283597946167,
"step": 658
},
{
"epoch": 1.3924050632911391,
"grad_norm": 1.2280727624893188,
"learning_rate": 9.59077225099126e-07,
"loss": 1.1961660385131836,
"step": 660
},
{
"epoch": 1.3966244725738397,
"grad_norm": 2.4184653759002686,
"learning_rate": 9.57506830472433e-07,
"loss": 0.7105515599250793,
"step": 662
},
{
"epoch": 1.40084388185654,
"grad_norm": 2.039471387863159,
"learning_rate": 9.559328148433473e-07,
"loss": 1.2236860990524292,
"step": 664
},
{
"epoch": 1.4050632911392404,
"grad_norm": 3.3632094860076904,
"learning_rate": 9.54355197369461e-07,
"loss": 0.8225454092025757,
"step": 666
},
{
"epoch": 1.409282700421941,
"grad_norm": 2.8079605102539062,
"learning_rate": 9.527739972522041e-07,
"loss": 1.224509835243225,
"step": 668
},
{
"epoch": 1.4135021097046414,
"grad_norm": 1.0237503051757812,
"learning_rate": 9.511892337366117e-07,
"loss": 1.2146466970443726,
"step": 670
},
{
"epoch": 1.4177215189873418,
"grad_norm": 2.5967676639556885,
"learning_rate": 9.496009261110901e-07,
"loss": 1.5150516033172607,
"step": 672
},
{
"epoch": 1.4219409282700421,
"grad_norm": 1.5330960750579834,
"learning_rate": 9.480090937071802e-07,
"loss": 0.8809629082679749,
"step": 674
},
{
"epoch": 1.4261603375527425,
"grad_norm": 1.5120795965194702,
"learning_rate": 9.464137558993251e-07,
"loss": 0.7257891893386841,
"step": 676
},
{
"epoch": 1.4303797468354431,
"grad_norm": 1.8336877822875977,
"learning_rate": 9.448149321046316e-07,
"loss": 1.0394529104232788,
"step": 678
},
{
"epoch": 1.4345991561181435,
"grad_norm": 3.8357579708099365,
"learning_rate": 9.432126417826358e-07,
"loss": 1.1556706428527832,
"step": 680
},
{
"epoch": 1.4388185654008439,
"grad_norm": 1.9536528587341309,
"learning_rate": 9.416069044350646e-07,
"loss": 0.9677222967147827,
"step": 682
},
{
"epoch": 1.4430379746835442,
"grad_norm": 3.1365737915039062,
"learning_rate": 9.399977396055995e-07,
"loss": 1.2571027278900146,
"step": 684
},
{
"epoch": 1.4472573839662446,
"grad_norm": 3.375725746154785,
"learning_rate": 9.383851668796392e-07,
"loss": 0.7981452345848083,
"step": 686
},
{
"epoch": 1.4514767932489452,
"grad_norm": 2.0981504917144775,
"learning_rate": 9.367692058840594e-07,
"loss": 0.9887269735336304,
"step": 688
},
{
"epoch": 1.4556962025316456,
"grad_norm": 5.0241265296936035,
"learning_rate": 9.351498762869752e-07,
"loss": 1.1597225666046143,
"step": 690
},
{
"epoch": 1.459915611814346,
"grad_norm": 3.389521598815918,
"learning_rate": 9.33527197797502e-07,
"loss": 0.7292091846466064,
"step": 692
},
{
"epoch": 1.4641350210970465,
"grad_norm": 2.6481471061706543,
"learning_rate": 9.319011901655145e-07,
"loss": 1.3359123468399048,
"step": 694
},
{
"epoch": 1.4683544303797469,
"grad_norm": 3.0631887912750244,
"learning_rate": 9.302718731814072e-07,
"loss": 0.7314563393592834,
"step": 696
},
{
"epoch": 1.4725738396624473,
"grad_norm": 1.1294174194335938,
"learning_rate": 9.286392666758532e-07,
"loss": 1.202915072441101,
"step": 698
},
{
"epoch": 1.4767932489451476,
"grad_norm": 0.9764413237571716,
"learning_rate": 9.270033905195628e-07,
"loss": 1.2414040565490723,
"step": 700
},
{
"epoch": 1.481012658227848,
"grad_norm": 2.000211000442505,
"learning_rate": 9.25364264623042e-07,
"loss": 1.1095331907272339,
"step": 702
},
{
"epoch": 1.4852320675105486,
"grad_norm": 2.9179534912109375,
"learning_rate": 9.237219089363494e-07,
"loss": 0.8434455990791321,
"step": 704
},
{
"epoch": 1.489451476793249,
"grad_norm": 1.4670263528823853,
"learning_rate": 9.220763434488545e-07,
"loss": 1.1951138973236084,
"step": 706
},
{
"epoch": 1.4936708860759493,
"grad_norm": 1.2732899188995361,
"learning_rate": 9.204275881889934e-07,
"loss": 1.2532763481140137,
"step": 708
},
{
"epoch": 1.49789029535865,
"grad_norm": 1.302393913269043,
"learning_rate": 9.187756632240253e-07,
"loss": 1.1061906814575195,
"step": 710
},
{
"epoch": 1.50210970464135,
"grad_norm": 3.0175118446350098,
"learning_rate": 9.171205886597887e-07,
"loss": 0.5435208082199097,
"step": 712
},
{
"epoch": 1.5063291139240507,
"grad_norm": 2.06999135017395,
"learning_rate": 9.154623846404564e-07,
"loss": 1.2072559595108032,
"step": 714
},
{
"epoch": 1.510548523206751,
"grad_norm": 0.7862725853919983,
"learning_rate": 9.138010713482899e-07,
"loss": 1.1671605110168457,
"step": 716
},
{
"epoch": 1.5147679324894514,
"grad_norm": 1.871321439743042,
"learning_rate": 9.121366690033944e-07,
"loss": 1.1794459819793701,
"step": 718
},
{
"epoch": 1.518987341772152,
"grad_norm": 2.5938880443573,
"learning_rate": 9.104691978634728e-07,
"loss": 1.0995539426803589,
"step": 720
},
{
"epoch": 1.5232067510548524,
"grad_norm": 3.1606552600860596,
"learning_rate": 9.08798678223578e-07,
"loss": 1.231619954109192,
"step": 722
},
{
"epoch": 1.5274261603375527,
"grad_norm": 0.9736570715904236,
"learning_rate": 9.071251304158672e-07,
"loss": 1.250243067741394,
"step": 724
},
{
"epoch": 1.5316455696202531,
"grad_norm": 3.1312761306762695,
"learning_rate": 9.054485748093538e-07,
"loss": 0.6082893013954163,
"step": 726
},
{
"epoch": 1.5358649789029535,
"grad_norm": 2.6216931343078613,
"learning_rate": 9.037690318096597e-07,
"loss": 0.4211277663707733,
"step": 728
},
{
"epoch": 1.540084388185654,
"grad_norm": 1.668655276298523,
"learning_rate": 9.020865218587668e-07,
"loss": 1.0038397312164307,
"step": 730
},
{
"epoch": 1.5443037974683544,
"grad_norm": 2.301889419555664,
"learning_rate": 9.004010654347677e-07,
"loss": 0.9896605610847473,
"step": 732
},
{
"epoch": 1.5485232067510548,
"grad_norm": 1.4354939460754395,
"learning_rate": 8.98712683051618e-07,
"loss": 1.235560417175293,
"step": 734
},
{
"epoch": 1.5527426160337554,
"grad_norm": 1.394313097000122,
"learning_rate": 8.970213952588844e-07,
"loss": 0.986316442489624,
"step": 736
},
{
"epoch": 1.5569620253164556,
"grad_norm": 2.5624778270721436,
"learning_rate": 8.953272226414971e-07,
"loss": 0.9202096462249756,
"step": 738
},
{
"epoch": 1.5611814345991561,
"grad_norm": 2.074122667312622,
"learning_rate": 8.936301858194968e-07,
"loss": 1.1290022134780884,
"step": 740
},
{
"epoch": 1.5654008438818565,
"grad_norm": 1.5154873132705688,
"learning_rate": 8.919303054477857e-07,
"loss": 0.8514289855957031,
"step": 742
},
{
"epoch": 1.5696202531645569,
"grad_norm": 6.750024318695068,
"learning_rate": 8.90227602215875e-07,
"loss": 0.5599585175514221,
"step": 744
},
{
"epoch": 1.5738396624472575,
"grad_norm": 1.8320426940917969,
"learning_rate": 8.885220968476331e-07,
"loss": 0.780780017375946,
"step": 746
},
{
"epoch": 1.5780590717299579,
"grad_norm": 3.8813395500183105,
"learning_rate": 8.868138101010339e-07,
"loss": 0.656001091003418,
"step": 748
},
{
"epoch": 1.5822784810126582,
"grad_norm": 1.3229504823684692,
"learning_rate": 8.85102762767904e-07,
"loss": 1.215933084487915,
"step": 750
},
{
"epoch": 1.5864978902953588,
"grad_norm": 1.5343960523605347,
"learning_rate": 8.833889756736696e-07,
"loss": 0.7347640991210938,
"step": 752
},
{
"epoch": 1.590717299578059,
"grad_norm": 1.3849875926971436,
"learning_rate": 8.816724696771023e-07,
"loss": 0.8356782793998718,
"step": 754
},
{
"epoch": 1.5949367088607596,
"grad_norm": 1.4311785697937012,
"learning_rate": 8.799532656700668e-07,
"loss": 0.6571628451347351,
"step": 756
},
{
"epoch": 1.59915611814346,
"grad_norm": 1.954759955406189,
"learning_rate": 8.78231384577265e-07,
"loss": 0.8940713405609131,
"step": 758
},
{
"epoch": 1.6033755274261603,
"grad_norm": 1.7239028215408325,
"learning_rate": 8.765068473559826e-07,
"loss": 1.1826146841049194,
"step": 760
},
{
"epoch": 1.6075949367088609,
"grad_norm": 0.5811662673950195,
"learning_rate": 8.747796749958329e-07,
"loss": 0.8342135548591614,
"step": 762
},
{
"epoch": 1.611814345991561,
"grad_norm": 2.9614205360412598,
"learning_rate": 8.730498885185022e-07,
"loss": 1.2261645793914795,
"step": 764
},
{
"epoch": 1.6160337552742616,
"grad_norm": 1.8469215631484985,
"learning_rate": 8.713175089774935e-07,
"loss": 1.0828239917755127,
"step": 766
},
{
"epoch": 1.620253164556962,
"grad_norm": 0.7462615966796875,
"learning_rate": 8.695825574578708e-07,
"loss": 1.08014976978302,
"step": 768
},
{
"epoch": 1.6244725738396624,
"grad_norm": 1.5869735479354858,
"learning_rate": 8.678450550760013e-07,
"loss": 1.2228014469146729,
"step": 770
},
{
"epoch": 1.628691983122363,
"grad_norm": 1.267874836921692,
"learning_rate": 8.661050229793e-07,
"loss": 1.2381342649459839,
"step": 772
},
{
"epoch": 1.6329113924050633,
"grad_norm": 1.8816311359405518,
"learning_rate": 8.643624823459705e-07,
"loss": 1.2392218112945557,
"step": 774
},
{
"epoch": 1.6371308016877637,
"grad_norm": 2.270045280456543,
"learning_rate": 8.626174543847494e-07,
"loss": 1.2957593202590942,
"step": 776
},
{
"epoch": 1.6413502109704643,
"grad_norm": 0.9629765152931213,
"learning_rate": 8.608699603346457e-07,
"loss": 0.8434277772903442,
"step": 778
},
{
"epoch": 1.6455696202531644,
"grad_norm": 6.275035381317139,
"learning_rate": 8.591200214646842e-07,
"loss": 0.3582332730293274,
"step": 780
},
{
"epoch": 1.649789029535865,
"grad_norm": 1.902158260345459,
"learning_rate": 8.573676590736464e-07,
"loss": 1.1803405284881592,
"step": 782
},
{
"epoch": 1.6540084388185654,
"grad_norm": 1.5325170755386353,
"learning_rate": 8.556128944898098e-07,
"loss": 0.9606213569641113,
"step": 784
},
{
"epoch": 1.6582278481012658,
"grad_norm": 1.5803859233856201,
"learning_rate": 8.538557490706904e-07,
"loss": 1.1115106344223022,
"step": 786
},
{
"epoch": 1.6624472573839664,
"grad_norm": 1.5015286207199097,
"learning_rate": 8.520962442027808e-07,
"loss": 0.5854233503341675,
"step": 788
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.0308812856674194,
"learning_rate": 8.503344013012916e-07,
"loss": 1.2016632556915283,
"step": 790
},
{
"epoch": 1.6708860759493671,
"grad_norm": 3.8510706424713135,
"learning_rate": 8.485702418098897e-07,
"loss": 0.648362398147583,
"step": 792
},
{
"epoch": 1.6751054852320675,
"grad_norm": 0.47625789046287537,
"learning_rate": 8.468037872004374e-07,
"loss": 1.0536069869995117,
"step": 794
},
{
"epoch": 1.6793248945147679,
"grad_norm": 1.4682717323303223,
"learning_rate": 8.450350589727312e-07,
"loss": 1.2215386629104614,
"step": 796
},
{
"epoch": 1.6835443037974684,
"grad_norm": 4.169456481933594,
"learning_rate": 8.432640786542407e-07,
"loss": 0.9762102961540222,
"step": 798
},
{
"epoch": 1.6877637130801688,
"grad_norm": 4.598972797393799,
"learning_rate": 8.414908677998456e-07,
"loss": 1.2525511980056763,
"step": 800
},
{
"epoch": 1.6919831223628692,
"grad_norm": 4.160123825073242,
"learning_rate": 8.39715447991574e-07,
"loss": 0.6331847906112671,
"step": 802
},
{
"epoch": 1.6962025316455698,
"grad_norm": 1.725115180015564,
"learning_rate": 8.379378408383392e-07,
"loss": 1.2866941690444946,
"step": 804
},
{
"epoch": 1.70042194092827,
"grad_norm": 1.8015742301940918,
"learning_rate": 8.361580679756771e-07,
"loss": 1.1813989877700806,
"step": 806
},
{
"epoch": 1.7046413502109705,
"grad_norm": 2.260415554046631,
"learning_rate": 8.343761510654834e-07,
"loss": 0.8856143355369568,
"step": 808
},
{
"epoch": 1.7088607594936709,
"grad_norm": 3.748908042907715,
"learning_rate": 8.325921117957487e-07,
"loss": 0.9216241240501404,
"step": 810
},
{
"epoch": 1.7130801687763713,
"grad_norm": 1.0273261070251465,
"learning_rate": 8.308059718802953e-07,
"loss": 1.1896474361419678,
"step": 812
},
{
"epoch": 1.7172995780590719,
"grad_norm": 5.24169921875,
"learning_rate": 8.290177530585126e-07,
"loss": 1.525089144706726,
"step": 814
},
{
"epoch": 1.721518987341772,
"grad_norm": 2.1483330726623535,
"learning_rate": 8.272274770950934e-07,
"loss": 1.2185280323028564,
"step": 816
},
{
"epoch": 1.7257383966244726,
"grad_norm": 3.7641489505767822,
"learning_rate": 8.254351657797674e-07,
"loss": 0.8334339261054993,
"step": 818
},
{
"epoch": 1.729957805907173,
"grad_norm": 1.9054092168807983,
"learning_rate": 8.236408409270376e-07,
"loss": 0.4915008842945099,
"step": 820
},
{
"epoch": 1.7341772151898733,
"grad_norm": 18.341960906982422,
"learning_rate": 8.218445243759137e-07,
"loss": 0.7150586843490601,
"step": 822
},
{
"epoch": 1.738396624472574,
"grad_norm": 1.671027421951294,
"learning_rate": 8.200462379896468e-07,
"loss": 0.7935347557067871,
"step": 824
},
{
"epoch": 1.7426160337552743,
"grad_norm": 1.1407239437103271,
"learning_rate": 8.182460036554631e-07,
"loss": 1.0441514253616333,
"step": 826
},
{
"epoch": 1.7468354430379747,
"grad_norm": 1.8266801834106445,
"learning_rate": 8.164438432842973e-07,
"loss": 1.0361227989196777,
"step": 828
},
{
"epoch": 1.7510548523206753,
"grad_norm": 1.3403441905975342,
"learning_rate": 8.146397788105272e-07,
"loss": 1.1865990161895752,
"step": 830
},
{
"epoch": 1.7552742616033754,
"grad_norm": 0.9253147840499878,
"learning_rate": 8.128338321917045e-07,
"loss": 1.1751179695129395,
"step": 832
},
{
"epoch": 1.759493670886076,
"grad_norm": 3.3475301265716553,
"learning_rate": 8.110260254082898e-07,
"loss": 0.9232848286628723,
"step": 834
},
{
"epoch": 1.7637130801687764,
"grad_norm": 1.080689549446106,
"learning_rate": 8.092163804633832e-07,
"loss": 1.2128963470458984,
"step": 836
},
{
"epoch": 1.7679324894514767,
"grad_norm": 1.2559332847595215,
"learning_rate": 8.074049193824579e-07,
"loss": 1.0571973323822021,
"step": 838
},
{
"epoch": 1.7721518987341773,
"grad_norm": 1.474366307258606,
"learning_rate": 8.055916642130914e-07,
"loss": 1.1260405778884888,
"step": 840
},
{
"epoch": 1.7763713080168775,
"grad_norm": 2.3742828369140625,
"learning_rate": 8.037766370246972e-07,
"loss": 1.0088326930999756,
"step": 842
},
{
"epoch": 1.780590717299578,
"grad_norm": 0.6461037397384644,
"learning_rate": 8.019598599082567e-07,
"loss": 0.5369378328323364,
"step": 844
},
{
"epoch": 1.7848101265822784,
"grad_norm": 0.5748505592346191,
"learning_rate": 8.001413549760496e-07,
"loss": 0.8393441438674927,
"step": 846
},
{
"epoch": 1.7890295358649788,
"grad_norm": 3.184478282928467,
"learning_rate": 7.983211443613853e-07,
"loss": 0.7285841107368469,
"step": 848
},
{
"epoch": 1.7932489451476794,
"grad_norm": 6.552399635314941,
"learning_rate": 7.964992502183333e-07,
"loss": 0.8242054581642151,
"step": 850
},
{
"epoch": 1.7974683544303798,
"grad_norm": 1.3520995378494263,
"learning_rate": 7.946756947214536e-07,
"loss": 1.210748314857483,
"step": 852
},
{
"epoch": 1.8016877637130801,
"grad_norm": 2.167078733444214,
"learning_rate": 7.928505000655264e-07,
"loss": 1.4572898149490356,
"step": 854
},
{
"epoch": 1.8059071729957807,
"grad_norm": 0.506519615650177,
"learning_rate": 7.910236884652833e-07,
"loss": 1.0607579946517944,
"step": 856
},
{
"epoch": 1.810126582278481,
"grad_norm": 2.641108512878418,
"learning_rate": 7.891952821551348e-07,
"loss": 1.0674760341644287,
"step": 858
},
{
"epoch": 1.8143459915611815,
"grad_norm": 2.2153725624084473,
"learning_rate": 7.87365303388902e-07,
"loss": 0.8174270987510681,
"step": 860
},
{
"epoch": 1.8185654008438819,
"grad_norm": 2.081165075302124,
"learning_rate": 7.855337744395437e-07,
"loss": 1.2201720476150513,
"step": 862
},
{
"epoch": 1.8227848101265822,
"grad_norm": 1.6072843074798584,
"learning_rate": 7.837007175988869e-07,
"loss": 1.0889828205108643,
"step": 864
},
{
"epoch": 1.8270042194092828,
"grad_norm": 5.830190181732178,
"learning_rate": 7.818661551773542e-07,
"loss": 1.2174073457717896,
"step": 866
},
{
"epoch": 1.831223628691983,
"grad_norm": 3.3919594287872314,
"learning_rate": 7.800301095036933e-07,
"loss": 0.9814926385879517,
"step": 868
},
{
"epoch": 1.8354430379746836,
"grad_norm": 3.2243967056274414,
"learning_rate": 7.781926029247048e-07,
"loss": 1.1042759418487549,
"step": 870
},
{
"epoch": 1.839662447257384,
"grad_norm": 3.5643749237060547,
"learning_rate": 7.763536578049699e-07,
"loss": 0.8058743476867676,
"step": 872
},
{
"epoch": 1.8438818565400843,
"grad_norm": 1.4269522428512573,
"learning_rate": 7.745132965265788e-07,
"loss": 0.987337052822113,
"step": 874
},
{
"epoch": 1.8481012658227849,
"grad_norm": 0.985817015171051,
"learning_rate": 7.726715414888577e-07,
"loss": 1.2107572555541992,
"step": 876
},
{
"epoch": 1.8523206751054853,
"grad_norm": 1.9935749769210815,
"learning_rate": 7.708284151080968e-07,
"loss": 0.9476048946380615,
"step": 878
},
{
"epoch": 1.8565400843881856,
"grad_norm": 3.2491185665130615,
"learning_rate": 7.689839398172767e-07,
"loss": 0.9019596576690674,
"step": 880
},
{
"epoch": 1.8607594936708862,
"grad_norm": 2.6126883029937744,
"learning_rate": 7.671381380657965e-07,
"loss": 1.1691335439682007,
"step": 882
},
{
"epoch": 1.8649789029535864,
"grad_norm": 1.3800222873687744,
"learning_rate": 7.65291032319199e-07,
"loss": 0.8417655229568481,
"step": 884
},
{
"epoch": 1.869198312236287,
"grad_norm": 2.0879945755004883,
"learning_rate": 7.634426450588988e-07,
"loss": 0.8084736466407776,
"step": 886
},
{
"epoch": 1.8734177215189873,
"grad_norm": 1.3853508234024048,
"learning_rate": 7.615929987819075e-07,
"loss": 1.136643648147583,
"step": 888
},
{
"epoch": 1.8776371308016877,
"grad_norm": 7.130331993103027,
"learning_rate": 7.597421160005612e-07,
"loss": 0.4776380956172943,
"step": 890
},
{
"epoch": 1.8818565400843883,
"grad_norm": 3.002958059310913,
"learning_rate": 7.578900192422443e-07,
"loss": 0.7818654179573059,
"step": 892
},
{
"epoch": 1.8860759493670884,
"grad_norm": 1.7899680137634277,
"learning_rate": 7.560367310491182e-07,
"loss": 1.1894859075546265,
"step": 894
},
{
"epoch": 1.890295358649789,
"grad_norm": 1.8424100875854492,
"learning_rate": 7.541822739778445e-07,
"loss": 1.3867307901382446,
"step": 896
},
{
"epoch": 1.8945147679324894,
"grad_norm": 11.084500312805176,
"learning_rate": 7.523266705993115e-07,
"loss": 0.8175121545791626,
"step": 898
},
{
"epoch": 1.8987341772151898,
"grad_norm": 2.078657388687134,
"learning_rate": 7.504699434983602e-07,
"loss": 1.1003247499465942,
"step": 900
},
{
"epoch": 1.9029535864978904,
"grad_norm": 1.9573427438735962,
"learning_rate": 7.486121152735074e-07,
"loss": 1.3067007064819336,
"step": 902
},
{
"epoch": 1.9071729957805907,
"grad_norm": 6.904273509979248,
"learning_rate": 7.467532085366726e-07,
"loss": 1.073278784751892,
"step": 904
},
{
"epoch": 1.9113924050632911,
"grad_norm": 0.9965894818305969,
"learning_rate": 7.448932459129016e-07,
"loss": 1.3775935173034668,
"step": 906
},
{
"epoch": 1.9156118143459917,
"grad_norm": 4.340345859527588,
"learning_rate": 7.430322500400924e-07,
"loss": 0.5346195697784424,
"step": 908
},
{
"epoch": 1.9198312236286919,
"grad_norm": 1.2633851766586304,
"learning_rate": 7.411702435687177e-07,
"loss": 1.1176321506500244,
"step": 910
},
{
"epoch": 1.9240506329113924,
"grad_norm": 2.212251901626587,
"learning_rate": 7.393072491615511e-07,
"loss": 0.8476999402046204,
"step": 912
},
{
"epoch": 1.9282700421940928,
"grad_norm": 1.6803202629089355,
"learning_rate": 7.374432894933905e-07,
"loss": 1.2019180059432983,
"step": 914
},
{
"epoch": 1.9324894514767932,
"grad_norm": 2.0837478637695312,
"learning_rate": 7.355783872507818e-07,
"loss": 0.9530687928199768,
"step": 916
},
{
"epoch": 1.9367088607594938,
"grad_norm": 2.765504837036133,
"learning_rate": 7.337125651317433e-07,
"loss": 1.0955183506011963,
"step": 918
},
{
"epoch": 1.9409282700421941,
"grad_norm": 2.821669101715088,
"learning_rate": 7.318458458454892e-07,
"loss": 0.5842803120613098,
"step": 920
},
{
"epoch": 1.9451476793248945,
"grad_norm": 1.241335153579712,
"learning_rate": 7.299782521121536e-07,
"loss": 1.1832818984985352,
"step": 922
},
{
"epoch": 1.9493670886075949,
"grad_norm": 1.0857776403427124,
"learning_rate": 7.281098066625129e-07,
"loss": 1.262142539024353,
"step": 924
},
{
"epoch": 1.9535864978902953,
"grad_norm": 1.392290472984314,
"learning_rate": 7.262405322377109e-07,
"loss": 0.9511996507644653,
"step": 926
},
{
"epoch": 1.9578059071729959,
"grad_norm": 1.1899210214614868,
"learning_rate": 7.243704515889799e-07,
"loss": 0.797012448310852,
"step": 928
},
{
"epoch": 1.9620253164556962,
"grad_norm": 3.176696300506592,
"learning_rate": 7.224995874773657e-07,
"loss": 1.2408126592636108,
"step": 930
},
{
"epoch": 1.9662447257383966,
"grad_norm": 3.178877592086792,
"learning_rate": 7.206279626734492e-07,
"loss": 0.9860198497772217,
"step": 932
},
{
"epoch": 1.9704641350210972,
"grad_norm": 1.3276498317718506,
"learning_rate": 7.187555999570705e-07,
"loss": 1.2460663318634033,
"step": 934
},
{
"epoch": 1.9746835443037973,
"grad_norm": 1.3889448642730713,
"learning_rate": 7.1688252211705e-07,
"loss": 1.2101694345474243,
"step": 936
},
{
"epoch": 1.978902953586498,
"grad_norm": 0.8887938261032104,
"learning_rate": 7.150087519509128e-07,
"loss": 0.8580332398414612,
"step": 938
},
{
"epoch": 1.9831223628691983,
"grad_norm": 1.4792304039001465,
"learning_rate": 7.131343122646098e-07,
"loss": 1.231054663658142,
"step": 940
},
{
"epoch": 1.9873417721518987,
"grad_norm": 2.457331418991089,
"learning_rate": 7.11259225872241e-07,
"loss": 1.006805658340454,
"step": 942
},
{
"epoch": 1.9915611814345993,
"grad_norm": 1.956710696220398,
"learning_rate": 7.093835155957782e-07,
"loss": 0.7936272025108337,
"step": 944
},
{
"epoch": 1.9957805907172996,
"grad_norm": 1.3758666515350342,
"learning_rate": 7.075072042647852e-07,
"loss": 1.1611456871032715,
"step": 946
},
{
"epoch": 2.0,
"grad_norm": 3.4326858520507812,
"learning_rate": 7.056303147161428e-07,
"loss": 0.5819499492645264,
"step": 948
},
{
"epoch": 2.0042194092827006,
"grad_norm": 6.350503921508789,
"learning_rate": 7.03752869793768e-07,
"loss": 0.9798819422721863,
"step": 950
},
{
"epoch": 2.0084388185654007,
"grad_norm": 3.770968437194824,
"learning_rate": 7.018748923483386e-07,
"loss": 0.6936891078948975,
"step": 952
},
{
"epoch": 2.0126582278481013,
"grad_norm": 3.1057989597320557,
"learning_rate": 6.99996405237013e-07,
"loss": 0.857315182685852,
"step": 954
},
{
"epoch": 2.0168776371308015,
"grad_norm": 1.2099494934082031,
"learning_rate": 6.98117431323153e-07,
"loss": 1.0093313455581665,
"step": 956
},
{
"epoch": 2.021097046413502,
"grad_norm": 2.805772542953491,
"learning_rate": 6.962379934760456e-07,
"loss": 0.7519159913063049,
"step": 958
},
{
"epoch": 2.0253164556962027,
"grad_norm": 2.69637131690979,
"learning_rate": 6.94358114570624e-07,
"loss": 0.8004332780838013,
"step": 960
},
{
"epoch": 2.029535864978903,
"grad_norm": 4.524166584014893,
"learning_rate": 6.924778174871901e-07,
"loss": 1.2693367004394531,
"step": 962
},
{
"epoch": 2.0337552742616034,
"grad_norm": 1.710188388824463,
"learning_rate": 6.905971251111349e-07,
"loss": 0.8327010869979858,
"step": 964
},
{
"epoch": 2.037974683544304,
"grad_norm": 1.4968762397766113,
"learning_rate": 6.887160603326612e-07,
"loss": 0.8057103753089905,
"step": 966
},
{
"epoch": 2.042194092827004,
"grad_norm": 2.4308996200561523,
"learning_rate": 6.868346460465038e-07,
"loss": 0.7996687889099121,
"step": 968
},
{
"epoch": 2.0464135021097047,
"grad_norm": 1.531032681465149,
"learning_rate": 6.849529051516521e-07,
"loss": 1.125715732574463,
"step": 970
},
{
"epoch": 2.050632911392405,
"grad_norm": 3.428903579711914,
"learning_rate": 6.830708605510697e-07,
"loss": 1.0384615659713745,
"step": 972
},
{
"epoch": 2.0548523206751055,
"grad_norm": 1.0824832916259766,
"learning_rate": 6.811885351514176e-07,
"loss": 0.9185305237770081,
"step": 974
},
{
"epoch": 2.059071729957806,
"grad_norm": 1.7839653491973877,
"learning_rate": 6.793059518627739e-07,
"loss": 0.8305885195732117,
"step": 976
},
{
"epoch": 2.0632911392405062,
"grad_norm": 0.7381780743598938,
"learning_rate": 6.77423133598356e-07,
"loss": 0.8384730815887451,
"step": 978
},
{
"epoch": 2.067510548523207,
"grad_norm": 1.6481800079345703,
"learning_rate": 6.755401032742407e-07,
"loss": 0.8727558255195618,
"step": 980
},
{
"epoch": 2.071729957805907,
"grad_norm": 5.477509021759033,
"learning_rate": 6.736568838090859e-07,
"loss": 1.1277180910110474,
"step": 982
},
{
"epoch": 2.0759493670886076,
"grad_norm": 2.758972644805908,
"learning_rate": 6.71773498123852e-07,
"loss": 1.0967183113098145,
"step": 984
},
{
"epoch": 2.080168776371308,
"grad_norm": 1.1603978872299194,
"learning_rate": 6.698899691415218e-07,
"loss": 1.1284269094467163,
"step": 986
},
{
"epoch": 2.0843881856540083,
"grad_norm": 1.3078337907791138,
"learning_rate": 6.680063197868228e-07,
"loss": 1.166777491569519,
"step": 988
},
{
"epoch": 2.088607594936709,
"grad_norm": 3.5238006114959717,
"learning_rate": 6.661225729859475e-07,
"loss": 0.5711318850517273,
"step": 990
},
{
"epoch": 2.0928270042194095,
"grad_norm": 2.0197713375091553,
"learning_rate": 6.64238751666274e-07,
"loss": 0.608964204788208,
"step": 992
},
{
"epoch": 2.0970464135021096,
"grad_norm": 1.3378883600234985,
"learning_rate": 6.623548787560878e-07,
"loss": 1.175323247909546,
"step": 994
},
{
"epoch": 2.1012658227848102,
"grad_norm": 1.223233938217163,
"learning_rate": 6.604709771843022e-07,
"loss": 1.1399847269058228,
"step": 996
},
{
"epoch": 2.1054852320675104,
"grad_norm": 0.5097165703773499,
"learning_rate": 6.585870698801791e-07,
"loss": 0.8538580536842346,
"step": 998
},
{
"epoch": 2.109704641350211,
"grad_norm": 1.8075917959213257,
"learning_rate": 6.567031797730507e-07,
"loss": 1.2541990280151367,
"step": 1000
},
{
"epoch": 2.1139240506329116,
"grad_norm": 1.6272530555725098,
"learning_rate": 6.548193297920393e-07,
"loss": 1.182500958442688,
"step": 1002
},
{
"epoch": 2.1181434599156117,
"grad_norm": 1.8821264505386353,
"learning_rate": 6.529355428657795e-07,
"loss": 1.1924080848693848,
"step": 1004
},
{
"epoch": 2.1223628691983123,
"grad_norm": 1.0999635457992554,
"learning_rate": 6.510518419221377e-07,
"loss": 0.6417333483695984,
"step": 1006
},
{
"epoch": 2.1265822784810124,
"grad_norm": 1.3833292722702026,
"learning_rate": 6.49168249887934e-07,
"loss": 0.7661027908325195,
"step": 1008
},
{
"epoch": 2.130801687763713,
"grad_norm": 1.4525195360183716,
"learning_rate": 6.472847896886636e-07,
"loss": 0.7349141240119934,
"step": 1010
},
{
"epoch": 2.1350210970464136,
"grad_norm": 3.5440096855163574,
"learning_rate": 6.454014842482162e-07,
"loss": 0.9432771801948547,
"step": 1012
},
{
"epoch": 2.1392405063291138,
"grad_norm": 4.978313446044922,
"learning_rate": 6.435183564885985e-07,
"loss": 1.375197172164917,
"step": 1014
},
{
"epoch": 2.1434599156118144,
"grad_norm": 1.7762482166290283,
"learning_rate": 6.416354293296542e-07,
"loss": 0.8380042910575867,
"step": 1016
},
{
"epoch": 2.147679324894515,
"grad_norm": 1.8821486234664917,
"learning_rate": 6.39752725688786e-07,
"loss": 0.9462857842445374,
"step": 1018
},
{
"epoch": 2.151898734177215,
"grad_norm": 1.470024585723877,
"learning_rate": 6.378702684806757e-07,
"loss": 0.8377196192741394,
"step": 1020
},
{
"epoch": 2.1561181434599157,
"grad_norm": 2.115182638168335,
"learning_rate": 6.359880806170058e-07,
"loss": 0.9362459182739258,
"step": 1022
},
{
"epoch": 2.160337552742616,
"grad_norm": 2.337805986404419,
"learning_rate": 6.341061850061807e-07,
"loss": 0.8514955639839172,
"step": 1024
},
{
"epoch": 2.1645569620253164,
"grad_norm": 9.63266372680664,
"learning_rate": 6.322246045530474e-07,
"loss": 1.1533026695251465,
"step": 1026
},
{
"epoch": 2.168776371308017,
"grad_norm": 1.6961092948913574,
"learning_rate": 6.303433621586177e-07,
"loss": 1.1458700895309448,
"step": 1028
},
{
"epoch": 2.172995780590717,
"grad_norm": 1.3575078248977661,
"learning_rate": 6.28462480719788e-07,
"loss": 1.1239484548568726,
"step": 1030
},
{
"epoch": 2.1772151898734178,
"grad_norm": 1.2787476778030396,
"learning_rate": 6.265819831290624e-07,
"loss": 1.1294289827346802,
"step": 1032
},
{
"epoch": 2.181434599156118,
"grad_norm": 4.088858604431152,
"learning_rate": 6.247018922742722e-07,
"loss": 1.1388219594955444,
"step": 1034
},
{
"epoch": 2.1856540084388185,
"grad_norm": 6.764144420623779,
"learning_rate": 6.228222310382992e-07,
"loss": 1.0533146858215332,
"step": 1036
},
{
"epoch": 2.189873417721519,
"grad_norm": 2.094905138015747,
"learning_rate": 6.209430222987952e-07,
"loss": 1.132552146911621,
"step": 1038
},
{
"epoch": 2.1940928270042193,
"grad_norm": 1.7523225545883179,
"learning_rate": 6.190642889279052e-07,
"loss": 1.2820512056350708,
"step": 1040
},
{
"epoch": 2.19831223628692,
"grad_norm": 4.281554222106934,
"learning_rate": 6.171860537919886e-07,
"loss": 0.39310938119888306,
"step": 1042
},
{
"epoch": 2.2025316455696204,
"grad_norm": 2.323817491531372,
"learning_rate": 6.153083397513404e-07,
"loss": 1.1017502546310425,
"step": 1044
},
{
"epoch": 2.2067510548523206,
"grad_norm": 4.524064064025879,
"learning_rate": 6.134311696599129e-07,
"loss": 0.6054593324661255,
"step": 1046
},
{
"epoch": 2.210970464135021,
"grad_norm": 2.6248085498809814,
"learning_rate": 6.115545663650389e-07,
"loss": 0.9862580299377441,
"step": 1048
},
{
"epoch": 2.2151898734177213,
"grad_norm": 1.9876245260238647,
"learning_rate": 6.096785527071516e-07,
"loss": 1.1376148462295532,
"step": 1050
},
{
"epoch": 2.219409282700422,
"grad_norm": 2.210066080093384,
"learning_rate": 6.078031515195085e-07,
"loss": 0.9529132843017578,
"step": 1052
},
{
"epoch": 2.2236286919831225,
"grad_norm": 3.2140283584594727,
"learning_rate": 6.059283856279118e-07,
"loss": 1.0213066339492798,
"step": 1054
},
{
"epoch": 2.2278481012658227,
"grad_norm": 6.621954917907715,
"learning_rate": 6.040542778504319e-07,
"loss": 0.9980672001838684,
"step": 1056
},
{
"epoch": 2.2320675105485233,
"grad_norm": 10.540366172790527,
"learning_rate": 6.021808509971293e-07,
"loss": 0.5453277826309204,
"step": 1058
},
{
"epoch": 2.2362869198312234,
"grad_norm": 1.3416770696640015,
"learning_rate": 6.003081278697764e-07,
"loss": 1.1391900777816772,
"step": 1060
},
{
"epoch": 2.240506329113924,
"grad_norm": 0.30088382959365845,
"learning_rate": 5.984361312615811e-07,
"loss": 0.9888620972633362,
"step": 1062
},
{
"epoch": 2.2447257383966246,
"grad_norm": 1.483581781387329,
"learning_rate": 5.96564883956908e-07,
"loss": 0.6946426033973694,
"step": 1064
},
{
"epoch": 2.2489451476793247,
"grad_norm": 2.5259406566619873,
"learning_rate": 5.946944087310022e-07,
"loss": 1.0866342782974243,
"step": 1066
},
{
"epoch": 2.2531645569620253,
"grad_norm": 2.395719528198242,
"learning_rate": 5.928247283497117e-07,
"loss": 1.3847568035125732,
"step": 1068
},
{
"epoch": 2.257383966244726,
"grad_norm": 2.874040126800537,
"learning_rate": 5.909558655692104e-07,
"loss": 1.1452842950820923,
"step": 1070
},
{
"epoch": 2.261603375527426,
"grad_norm": 2.1399810314178467,
"learning_rate": 5.890878431357208e-07,
"loss": 1.1274282932281494,
"step": 1072
},
{
"epoch": 2.2658227848101267,
"grad_norm": 3.358569383621216,
"learning_rate": 5.872206837852376e-07,
"loss": 1.3512498140335083,
"step": 1074
},
{
"epoch": 2.270042194092827,
"grad_norm": 1.4806420803070068,
"learning_rate": 5.853544102432505e-07,
"loss": 1.14762282371521,
"step": 1076
},
{
"epoch": 2.2742616033755274,
"grad_norm": 1.1972980499267578,
"learning_rate": 5.834890452244685e-07,
"loss": 0.9154924750328064,
"step": 1078
},
{
"epoch": 2.278481012658228,
"grad_norm": 10.489628791809082,
"learning_rate": 5.816246114325421e-07,
"loss": 0.9368666410446167,
"step": 1080
},
{
"epoch": 2.282700421940928,
"grad_norm": 5.601263046264648,
"learning_rate": 5.79761131559788e-07,
"loss": 0.6107386350631714,
"step": 1082
},
{
"epoch": 2.2869198312236287,
"grad_norm": 2.7437796592712402,
"learning_rate": 5.778986282869127e-07,
"loss": 0.7205576300621033,
"step": 1084
},
{
"epoch": 2.291139240506329,
"grad_norm": 0.8865097761154175,
"learning_rate": 5.760371242827363e-07,
"loss": 0.6305662393569946,
"step": 1086
},
{
"epoch": 2.2953586497890295,
"grad_norm": 2.2365691661834717,
"learning_rate": 5.741766422039167e-07,
"loss": 0.9999610781669617,
"step": 1088
},
{
"epoch": 2.29957805907173,
"grad_norm": 1.5019956827163696,
"learning_rate": 5.723172046946733e-07,
"loss": 0.589636504650116,
"step": 1090
},
{
"epoch": 2.3037974683544302,
"grad_norm": 2.1107327938079834,
"learning_rate": 5.704588343865127e-07,
"loss": 0.8981572389602661,
"step": 1092
},
{
"epoch": 2.308016877637131,
"grad_norm": 4.003733158111572,
"learning_rate": 5.686015538979518e-07,
"loss": 0.732837438583374,
"step": 1094
},
{
"epoch": 2.3122362869198314,
"grad_norm": 2.012057065963745,
"learning_rate": 5.667453858342434e-07,
"loss": 0.4853237271308899,
"step": 1096
},
{
"epoch": 2.3164556962025316,
"grad_norm": 2.796154260635376,
"learning_rate": 5.648903527871006e-07,
"loss": 1.1909679174423218,
"step": 1098
},
{
"epoch": 2.320675105485232,
"grad_norm": 1.6839478015899658,
"learning_rate": 5.630364773344224e-07,
"loss": 1.0224688053131104,
"step": 1100
},
{
"epoch": 2.3248945147679323,
"grad_norm": 1.592947006225586,
"learning_rate": 5.611837820400182e-07,
"loss": 1.1030757427215576,
"step": 1102
},
{
"epoch": 2.329113924050633,
"grad_norm": 1.691872239112854,
"learning_rate": 5.593322894533334e-07,
"loss": 1.2941904067993164,
"step": 1104
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.7891680002212524,
"learning_rate": 5.574820221091757e-07,
"loss": 0.8782735466957092,
"step": 1106
},
{
"epoch": 2.3375527426160336,
"grad_norm": 3.5078885555267334,
"learning_rate": 5.556330025274393e-07,
"loss": 0.5180922150611877,
"step": 1108
},
{
"epoch": 2.3417721518987342,
"grad_norm": 1.8680453300476074,
"learning_rate": 5.537852532128322e-07,
"loss": 1.1475764513015747,
"step": 1110
},
{
"epoch": 2.3459915611814344,
"grad_norm": 1.262511968612671,
"learning_rate": 5.519387966546021e-07,
"loss": 1.1460936069488525,
"step": 1112
},
{
"epoch": 2.350210970464135,
"grad_norm": 12.242781639099121,
"learning_rate": 5.500936553262616e-07,
"loss": 1.1747325658798218,
"step": 1114
},
{
"epoch": 2.3544303797468356,
"grad_norm": 0.7147314548492432,
"learning_rate": 5.48249851685316e-07,
"loss": 0.7451015114784241,
"step": 1116
},
{
"epoch": 2.3586497890295357,
"grad_norm": 4.066142559051514,
"learning_rate": 5.464074081729892e-07,
"loss": 1.0633448362350464,
"step": 1118
},
{
"epoch": 2.3628691983122363,
"grad_norm": 1.6116374731063843,
"learning_rate": 5.445663472139506e-07,
"loss": 0.8038894534111023,
"step": 1120
},
{
"epoch": 2.367088607594937,
"grad_norm": 2.5959835052490234,
"learning_rate": 5.427266912160427e-07,
"loss": 1.0548654794692993,
"step": 1122
},
{
"epoch": 2.371308016877637,
"grad_norm": 1.4511165618896484,
"learning_rate": 5.408884625700076e-07,
"loss": 0.744436502456665,
"step": 1124
},
{
"epoch": 2.3755274261603376,
"grad_norm": 2.0259265899658203,
"learning_rate": 5.390516836492152e-07,
"loss": 1.0626447200775146,
"step": 1126
},
{
"epoch": 2.379746835443038,
"grad_norm": 1.5352128744125366,
"learning_rate": 5.372163768093903e-07,
"loss": 1.1404402256011963,
"step": 1128
},
{
"epoch": 2.3839662447257384,
"grad_norm": 3.401780366897583,
"learning_rate": 5.35382564388341e-07,
"loss": 0.5039758086204529,
"step": 1130
},
{
"epoch": 2.388185654008439,
"grad_norm": 1.8972293138504028,
"learning_rate": 5.335502687056865e-07,
"loss": 0.345048725605011,
"step": 1132
},
{
"epoch": 2.392405063291139,
"grad_norm": 4.107486248016357,
"learning_rate": 5.317195120625855e-07,
"loss": 0.4859941303730011,
"step": 1134
},
{
"epoch": 2.3966244725738397,
"grad_norm": 2.5772571563720703,
"learning_rate": 5.298903167414648e-07,
"loss": 0.5732159614562988,
"step": 1136
},
{
"epoch": 2.40084388185654,
"grad_norm": 1.3114792108535767,
"learning_rate": 5.280627050057483e-07,
"loss": 1.1417685747146606,
"step": 1138
},
{
"epoch": 2.4050632911392404,
"grad_norm": 7.5032267570495605,
"learning_rate": 5.262366990995852e-07,
"loss": 0.8103894591331482,
"step": 1140
},
{
"epoch": 2.409282700421941,
"grad_norm": 3.7041962146759033,
"learning_rate": 5.244123212475811e-07,
"loss": 0.3755455017089844,
"step": 1142
},
{
"epoch": 2.413502109704641,
"grad_norm": 1.3423445224761963,
"learning_rate": 5.22589593654525e-07,
"loss": 0.8771740198135376,
"step": 1144
},
{
"epoch": 2.4177215189873418,
"grad_norm": 1.499751329421997,
"learning_rate": 5.207685385051213e-07,
"loss": 1.168401837348938,
"step": 1146
},
{
"epoch": 2.4219409282700424,
"grad_norm": 4.436310291290283,
"learning_rate": 5.189491779637181e-07,
"loss": 0.8418995141983032,
"step": 1148
},
{
"epoch": 2.4261603375527425,
"grad_norm": 1.6216802597045898,
"learning_rate": 5.171315341740387e-07,
"loss": 1.147579550743103,
"step": 1150
},
{
"epoch": 2.430379746835443,
"grad_norm": 43.39120864868164,
"learning_rate": 5.153156292589112e-07,
"loss": 0.8518908619880676,
"step": 1152
},
{
"epoch": 2.4345991561181437,
"grad_norm": 1.7255734205245972,
"learning_rate": 5.1350148532e-07,
"loss": 1.205424427986145,
"step": 1154
},
{
"epoch": 2.438818565400844,
"grad_norm": 6.3630475997924805,
"learning_rate": 5.116891244375358e-07,
"loss": 0.43493425846099854,
"step": 1156
},
{
"epoch": 2.4430379746835444,
"grad_norm": 2.129798412322998,
"learning_rate": 5.098785686700478e-07,
"loss": 0.9413697719573975,
"step": 1158
},
{
"epoch": 2.4472573839662446,
"grad_norm": 1.4703646898269653,
"learning_rate": 5.080698400540949e-07,
"loss": 1.1531509160995483,
"step": 1160
},
{
"epoch": 2.451476793248945,
"grad_norm": 1.771309494972229,
"learning_rate": 5.062629606039975e-07,
"loss": 0.7602155208587646,
"step": 1162
},
{
"epoch": 2.4556962025316453,
"grad_norm": 1.3786754608154297,
"learning_rate": 5.04457952311569e-07,
"loss": 1.1161296367645264,
"step": 1164
},
{
"epoch": 2.459915611814346,
"grad_norm": 2.190340280532837,
"learning_rate": 5.026548371458493e-07,
"loss": 1.1393266916275024,
"step": 1166
},
{
"epoch": 2.4641350210970465,
"grad_norm": 0.6447933316230774,
"learning_rate": 5.008536370528365e-07,
"loss": 0.728462815284729,
"step": 1168
},
{
"epoch": 2.4683544303797467,
"grad_norm": 1.4482827186584473,
"learning_rate": 4.990543739552197e-07,
"loss": 1.0875799655914307,
"step": 1170
},
{
"epoch": 2.4725738396624473,
"grad_norm": 1.4591885805130005,
"learning_rate": 4.972570697521133e-07,
"loss": 1.124202013015747,
"step": 1172
},
{
"epoch": 2.476793248945148,
"grad_norm": 1.3290364742279053,
"learning_rate": 4.954617463187888e-07,
"loss": 1.1189545392990112,
"step": 1174
},
{
"epoch": 2.481012658227848,
"grad_norm": 2.232417106628418,
"learning_rate": 4.936684255064102e-07,
"loss": 0.8213171362876892,
"step": 1176
},
{
"epoch": 2.4852320675105486,
"grad_norm": 1.8304226398468018,
"learning_rate": 4.918771291417669e-07,
"loss": 0.40340158343315125,
"step": 1178
},
{
"epoch": 2.489451476793249,
"grad_norm": 8.472685813903809,
"learning_rate": 4.900878790270084e-07,
"loss": 0.9105018973350525,
"step": 1180
},
{
"epoch": 2.4936708860759493,
"grad_norm": 4.292173385620117,
"learning_rate": 4.883006969393791e-07,
"loss": 1.0442423820495605,
"step": 1182
},
{
"epoch": 2.49789029535865,
"grad_norm": 4.11490535736084,
"learning_rate": 4.865156046309528e-07,
"loss": 0.5216444730758667,
"step": 1184
},
{
"epoch": 2.50210970464135,
"grad_norm": 0.9223915338516235,
"learning_rate": 4.847326238283692e-07,
"loss": 0.7885441780090332,
"step": 1186
},
{
"epoch": 2.5063291139240507,
"grad_norm": 1.5045500993728638,
"learning_rate": 4.829517762325671e-07,
"loss": 0.8654785752296448,
"step": 1188
},
{
"epoch": 2.510548523206751,
"grad_norm": 1.1637619733810425,
"learning_rate": 4.811730835185232e-07,
"loss": 1.1407520771026611,
"step": 1190
},
{
"epoch": 2.5147679324894514,
"grad_norm": 6.294982433319092,
"learning_rate": 4.793965673349857e-07,
"loss": 0.5034950971603394,
"step": 1192
},
{
"epoch": 2.518987341772152,
"grad_norm": 3.0815131664276123,
"learning_rate": 4.776222493042122e-07,
"loss": 1.443105697631836,
"step": 1194
},
{
"epoch": 2.523206751054852,
"grad_norm": 1.1364555358886719,
"learning_rate": 4.758501510217066e-07,
"loss": 1.1503655910491943,
"step": 1196
},
{
"epoch": 2.5274261603375527,
"grad_norm": 2.4966022968292236,
"learning_rate": 4.740802940559553e-07,
"loss": 1.0758484601974487,
"step": 1198
},
{
"epoch": 2.5316455696202533,
"grad_norm": 2.249464511871338,
"learning_rate": 4.7231269994816584e-07,
"loss": 0.6718664765357971,
"step": 1200
},
{
"epoch": 2.5358649789029535,
"grad_norm": 1.933441400527954,
"learning_rate": 4.705473902120039e-07,
"loss": 0.8221999406814575,
"step": 1202
},
{
"epoch": 2.540084388185654,
"grad_norm": 2.7778429985046387,
"learning_rate": 4.687843863333317e-07,
"loss": 0.7672927975654602,
"step": 1204
},
{
"epoch": 2.5443037974683547,
"grad_norm": 1.3384257555007935,
"learning_rate": 4.670237097699464e-07,
"loss": 1.0449153184890747,
"step": 1206
},
{
"epoch": 2.548523206751055,
"grad_norm": 4.268535137176514,
"learning_rate": 4.6526538195131944e-07,
"loss": 0.7585489749908447,
"step": 1208
},
{
"epoch": 2.5527426160337554,
"grad_norm": 1.7707507610321045,
"learning_rate": 4.6350942427833463e-07,
"loss": 1.191308617591858,
"step": 1210
},
{
"epoch": 2.5569620253164556,
"grad_norm": 1.2368897199630737,
"learning_rate": 4.6175585812302914e-07,
"loss": 1.115039348602295,
"step": 1212
},
{
"epoch": 2.561181434599156,
"grad_norm": 6.681468486785889,
"learning_rate": 4.600047048283323e-07,
"loss": 0.35992902517318726,
"step": 1214
},
{
"epoch": 2.5654008438818563,
"grad_norm": 2.588292360305786,
"learning_rate": 4.582559857078059e-07,
"loss": 0.831079363822937,
"step": 1216
},
{
"epoch": 2.569620253164557,
"grad_norm": 1.0166317224502563,
"learning_rate": 4.565097220453852e-07,
"loss": 1.160988211631775,
"step": 1218
},
{
"epoch": 2.5738396624472575,
"grad_norm": 4.55487060546875,
"learning_rate": 4.5476593509511975e-07,
"loss": 0.8059465289115906,
"step": 1220
},
{
"epoch": 2.5780590717299576,
"grad_norm": 1.7319614887237549,
"learning_rate": 4.5302464608091444e-07,
"loss": 0.9973964095115662,
"step": 1222
},
{
"epoch": 2.5822784810126582,
"grad_norm": 2.5319430828094482,
"learning_rate": 4.512858761962719e-07,
"loss": 0.8335304260253906,
"step": 1224
},
{
"epoch": 2.586497890295359,
"grad_norm": 2.232879161834717,
"learning_rate": 4.495496466040333e-07,
"loss": 0.7188448309898376,
"step": 1226
},
{
"epoch": 2.590717299578059,
"grad_norm": 1.1091829538345337,
"learning_rate": 4.478159784361222e-07,
"loss": 1.0995886325836182,
"step": 1228
},
{
"epoch": 2.5949367088607596,
"grad_norm": 1.6400138139724731,
"learning_rate": 4.4608489279328616e-07,
"loss": 1.197192907333374,
"step": 1230
},
{
"epoch": 2.59915611814346,
"grad_norm": 2.313340187072754,
"learning_rate": 4.443564107448406e-07,
"loss": 1.024308204650879,
"step": 1232
},
{
"epoch": 2.6033755274261603,
"grad_norm": 3.4521191120147705,
"learning_rate": 4.4263055332841223e-07,
"loss": 0.30793383717536926,
"step": 1234
},
{
"epoch": 2.607594936708861,
"grad_norm": 3.510732889175415,
"learning_rate": 4.409073415496829e-07,
"loss": 1.2074471712112427,
"step": 1236
},
{
"epoch": 2.611814345991561,
"grad_norm": 5.145284652709961,
"learning_rate": 4.391867963821341e-07,
"loss": 1.3546441793441772,
"step": 1238
},
{
"epoch": 2.6160337552742616,
"grad_norm": 1.3848285675048828,
"learning_rate": 4.374689387667913e-07,
"loss": 0.7564114332199097,
"step": 1240
},
{
"epoch": 2.620253164556962,
"grad_norm": 1.5747932195663452,
"learning_rate": 4.3575378961196987e-07,
"loss": 1.1020171642303467,
"step": 1242
},
{
"epoch": 2.6244725738396624,
"grad_norm": 7.985814094543457,
"learning_rate": 4.340413697930193e-07,
"loss": 0.6297235488891602,
"step": 1244
},
{
"epoch": 2.628691983122363,
"grad_norm": 1.5127233266830444,
"learning_rate": 4.3233170015207045e-07,
"loss": 0.7452877163887024,
"step": 1246
},
{
"epoch": 2.632911392405063,
"grad_norm": 2.9100558757781982,
"learning_rate": 4.306248014977816e-07,
"loss": 1.5952140092849731,
"step": 1248
},
{
"epoch": 2.6371308016877637,
"grad_norm": 2.4428465366363525,
"learning_rate": 4.2892069460508416e-07,
"loss": 1.142899990081787,
"step": 1250
},
{
"epoch": 2.6413502109704643,
"grad_norm": 1.664016604423523,
"learning_rate": 4.27219400214931e-07,
"loss": 1.067954182624817,
"step": 1252
},
{
"epoch": 2.6455696202531644,
"grad_norm": 14.406804084777832,
"learning_rate": 4.255209390340436e-07,
"loss": 0.608812689781189,
"step": 1254
},
{
"epoch": 2.649789029535865,
"grad_norm": 1.7003490924835205,
"learning_rate": 4.238253317346602e-07,
"loss": 0.5725827813148499,
"step": 1256
},
{
"epoch": 2.6540084388185656,
"grad_norm": 2.5306010246276855,
"learning_rate": 4.221325989542832e-07,
"loss": 0.9772995710372925,
"step": 1258
},
{
"epoch": 2.6582278481012658,
"grad_norm": 1.6626094579696655,
"learning_rate": 4.2044276129542956e-07,
"loss": 1.0970871448516846,
"step": 1260
},
{
"epoch": 2.6624472573839664,
"grad_norm": 1.178289771080017,
"learning_rate": 4.1875583932537926e-07,
"loss": 1.2285281419754028,
"step": 1262
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.0838637351989746,
"learning_rate": 4.1707185357592434e-07,
"loss": 0.6816955208778381,
"step": 1264
},
{
"epoch": 2.670886075949367,
"grad_norm": 2.826573133468628,
"learning_rate": 4.1539082454312016e-07,
"loss": 0.9266291856765747,
"step": 1266
},
{
"epoch": 2.6751054852320673,
"grad_norm": 2.5557596683502197,
"learning_rate": 4.1371277268703537e-07,
"loss": 0.7625723481178284,
"step": 1268
},
{
"epoch": 2.679324894514768,
"grad_norm": 1.970330834388733,
"learning_rate": 4.120377184315029e-07,
"loss": 0.6248302459716797,
"step": 1270
},
{
"epoch": 2.6835443037974684,
"grad_norm": 1.4884620904922485,
"learning_rate": 4.103656821638711e-07,
"loss": 1.0319654941558838,
"step": 1272
},
{
"epoch": 2.6877637130801686,
"grad_norm": 7.556288242340088,
"learning_rate": 4.086966842347563e-07,
"loss": 0.745881199836731,
"step": 1274
},
{
"epoch": 2.691983122362869,
"grad_norm": 9.8837890625,
"learning_rate": 4.0703074495779464e-07,
"loss": 0.8159171342849731,
"step": 1276
},
{
"epoch": 2.6962025316455698,
"grad_norm": 0.8211575746536255,
"learning_rate": 4.053678846093952e-07,
"loss": 0.4533369243144989,
"step": 1278
},
{
"epoch": 2.70042194092827,
"grad_norm": 2.684162139892578,
"learning_rate": 4.03708123428492e-07,
"loss": 0.9859198331832886,
"step": 1280
},
{
"epoch": 2.7046413502109705,
"grad_norm": 2.0273070335388184,
"learning_rate": 4.0205148161629964e-07,
"loss": 1.131312608718872,
"step": 1282
},
{
"epoch": 2.708860759493671,
"grad_norm": 1.63193678855896,
"learning_rate": 4.003979793360661e-07,
"loss": 0.5977147221565247,
"step": 1284
},
{
"epoch": 2.7130801687763713,
"grad_norm": 4.451190948486328,
"learning_rate": 3.987476367128271e-07,
"loss": 0.6326662302017212,
"step": 1286
},
{
"epoch": 2.717299578059072,
"grad_norm": 0.9102901816368103,
"learning_rate": 3.9710047383316225e-07,
"loss": 0.8215235471725464,
"step": 1288
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.5546138286590576,
"learning_rate": 3.954565107449499e-07,
"loss": 1.0023081302642822,
"step": 1290
},
{
"epoch": 2.7257383966244726,
"grad_norm": 1.5768028497695923,
"learning_rate": 3.9381576745712347e-07,
"loss": 1.1236157417297363,
"step": 1292
},
{
"epoch": 2.7299578059071727,
"grad_norm": 1.9878129959106445,
"learning_rate": 3.921782639394268e-07,
"loss": 0.7208356857299805,
"step": 1294
},
{
"epoch": 2.7341772151898733,
"grad_norm": 1.5068297386169434,
"learning_rate": 3.905440201221729e-07,
"loss": 1.1069350242614746,
"step": 1296
},
{
"epoch": 2.738396624472574,
"grad_norm": 1.8150042295455933,
"learning_rate": 3.8891305589600005e-07,
"loss": 1.1665513515472412,
"step": 1298
},
{
"epoch": 2.742616033755274,
"grad_norm": 1.199442744255066,
"learning_rate": 3.872853911116304e-07,
"loss": 0.8418156504631042,
"step": 1300
},
{
"epoch": 2.7468354430379747,
"grad_norm": 1.2970781326293945,
"learning_rate": 3.856610455796275e-07,
"loss": 1.1775513887405396,
"step": 1302
},
{
"epoch": 2.7510548523206753,
"grad_norm": 2.245298147201538,
"learning_rate": 3.840400390701562e-07,
"loss": 0.740407407283783,
"step": 1304
},
{
"epoch": 2.7552742616033754,
"grad_norm": 1.889854073524475,
"learning_rate": 3.824223913127419e-07,
"loss": 1.4258309602737427,
"step": 1306
},
{
"epoch": 2.759493670886076,
"grad_norm": 1.5117489099502563,
"learning_rate": 3.808081219960292e-07,
"loss": 1.0796724557876587,
"step": 1308
},
{
"epoch": 2.7637130801687766,
"grad_norm": 5.329049110412598,
"learning_rate": 3.791972507675438e-07,
"loss": 0.9499403834342957,
"step": 1310
},
{
"epoch": 2.7679324894514767,
"grad_norm": 1.1606426239013672,
"learning_rate": 3.775897972334526e-07,
"loss": 1.1145509481430054,
"step": 1312
},
{
"epoch": 2.7721518987341773,
"grad_norm": 5.624031066894531,
"learning_rate": 3.759857809583255e-07,
"loss": 1.1557338237762451,
"step": 1314
},
{
"epoch": 2.7763713080168775,
"grad_norm": 2.0686893463134766,
"learning_rate": 3.7438522146489624e-07,
"loss": 0.9982014894485474,
"step": 1316
},
{
"epoch": 2.780590717299578,
"grad_norm": 7.816877841949463,
"learning_rate": 3.727881382338262e-07,
"loss": 0.642890453338623,
"step": 1318
},
{
"epoch": 2.7848101265822782,
"grad_norm": 8.81386661529541,
"learning_rate": 3.711945507034663e-07,
"loss": 1.1752903461456299,
"step": 1320
},
{
"epoch": 2.789029535864979,
"grad_norm": 0.7249853014945984,
"learning_rate": 3.696044782696211e-07,
"loss": 0.7932354807853699,
"step": 1322
},
{
"epoch": 2.7932489451476794,
"grad_norm": 4.055787563323975,
"learning_rate": 3.680179402853118e-07,
"loss": 1.365350604057312,
"step": 1324
},
{
"epoch": 2.7974683544303796,
"grad_norm": 3.504054546356201,
"learning_rate": 3.6643495606054153e-07,
"loss": 0.9429040551185608,
"step": 1326
},
{
"epoch": 2.80168776371308,
"grad_norm": 1.6331150531768799,
"learning_rate": 3.6485554486206035e-07,
"loss": 0.8298648595809937,
"step": 1328
},
{
"epoch": 2.8059071729957807,
"grad_norm": 1.505171298980713,
"learning_rate": 3.632797259131301e-07,
"loss": 1.119720458984375,
"step": 1330
},
{
"epoch": 2.810126582278481,
"grad_norm": 1.824774980545044,
"learning_rate": 3.6170751839329087e-07,
"loss": 1.1578552722930908,
"step": 1332
},
{
"epoch": 2.8143459915611815,
"grad_norm": 1.4384500980377197,
"learning_rate": 3.601389414381272e-07,
"loss": 0.7492596507072449,
"step": 1334
},
{
"epoch": 2.818565400843882,
"grad_norm": 0.6595861911773682,
"learning_rate": 3.585740141390362e-07,
"loss": 1.0319997072219849,
"step": 1336
},
{
"epoch": 2.8227848101265822,
"grad_norm": 1.1182562112808228,
"learning_rate": 3.570127555429937e-07,
"loss": 0.8679478168487549,
"step": 1338
},
{
"epoch": 2.827004219409283,
"grad_norm": 2.751737117767334,
"learning_rate": 3.554551846523234e-07,
"loss": 0.992285430431366,
"step": 1340
},
{
"epoch": 2.831223628691983,
"grad_norm": 1.0553044080734253,
"learning_rate": 3.5390132042446593e-07,
"loss": 1.0697180032730103,
"step": 1342
},
{
"epoch": 2.8354430379746836,
"grad_norm": 6.8535590171813965,
"learning_rate": 3.5235118177174633e-07,
"loss": 1.3121901750564575,
"step": 1344
},
{
"epoch": 2.8396624472573837,
"grad_norm": 3.6509854793548584,
"learning_rate": 3.5080478756114603e-07,
"loss": 0.7838273048400879,
"step": 1346
},
{
"epoch": 2.8438818565400843,
"grad_norm": 1.319709062576294,
"learning_rate": 3.4926215661407224e-07,
"loss": 0.6845376491546631,
"step": 1348
},
{
"epoch": 2.848101265822785,
"grad_norm": 1.3884485960006714,
"learning_rate": 3.4772330770612856e-07,
"loss": 1.1699258089065552,
"step": 1350
},
{
"epoch": 2.852320675105485,
"grad_norm": 1.5885751247406006,
"learning_rate": 3.4618825956688674e-07,
"loss": 1.0469439029693604,
"step": 1352
},
{
"epoch": 2.8565400843881856,
"grad_norm": 3.556104898452759,
"learning_rate": 3.4465703087965895e-07,
"loss": 0.8466750383377075,
"step": 1354
},
{
"epoch": 2.8607594936708862,
"grad_norm": 4.589015960693359,
"learning_rate": 3.4312964028127036e-07,
"loss": 0.5300393104553223,
"step": 1356
},
{
"epoch": 2.8649789029535864,
"grad_norm": 1.3245023488998413,
"learning_rate": 3.416061063618321e-07,
"loss": 1.1446274518966675,
"step": 1358
},
{
"epoch": 2.869198312236287,
"grad_norm": 2.1873321533203125,
"learning_rate": 3.400864476645146e-07,
"loss": 0.9219729900360107,
"step": 1360
},
{
"epoch": 2.8734177215189876,
"grad_norm": 8.96743392944336,
"learning_rate": 3.3857068268532285e-07,
"loss": 0.7180023789405823,
"step": 1362
},
{
"epoch": 2.8776371308016877,
"grad_norm": 2.621079444885254,
"learning_rate": 3.3705882987287096e-07,
"loss": 1.0849711894989014,
"step": 1364
},
{
"epoch": 2.8818565400843883,
"grad_norm": 1.501388669013977,
"learning_rate": 3.355509076281567e-07,
"loss": 1.1244922876358032,
"step": 1366
},
{
"epoch": 2.8860759493670884,
"grad_norm": 2.502393960952759,
"learning_rate": 3.3404693430433883e-07,
"loss": 1.1720871925354004,
"step": 1368
},
{
"epoch": 2.890295358649789,
"grad_norm": 1.8280988931655884,
"learning_rate": 3.32546928206513e-07,
"loss": 1.0966358184814453,
"step": 1370
},
{
"epoch": 2.894514767932489,
"grad_norm": 19.55905532836914,
"learning_rate": 3.3105090759148967e-07,
"loss": 0.48501160740852356,
"step": 1372
},
{
"epoch": 2.8987341772151898,
"grad_norm": 1.4679443836212158,
"learning_rate": 3.2955889066757016e-07,
"loss": 0.8926799297332764,
"step": 1374
},
{
"epoch": 2.9029535864978904,
"grad_norm": 1.4338287115097046,
"learning_rate": 3.280708955943272e-07,
"loss": 1.1578876972198486,
"step": 1376
},
{
"epoch": 2.9071729957805905,
"grad_norm": 3.8778481483459473,
"learning_rate": 3.265869404823828e-07,
"loss": 0.9735660552978516,
"step": 1378
},
{
"epoch": 2.911392405063291,
"grad_norm": 2.506485939025879,
"learning_rate": 3.2510704339318803e-07,
"loss": 1.3276560306549072,
"step": 1380
},
{
"epoch": 2.9156118143459917,
"grad_norm": 2.2147409915924072,
"learning_rate": 3.2363122233880246e-07,
"loss": 0.7593087553977966,
"step": 1382
},
{
"epoch": 2.919831223628692,
"grad_norm": 5.710489749908447,
"learning_rate": 3.221594952816764e-07,
"loss": 0.7504158616065979,
"step": 1384
},
{
"epoch": 2.9240506329113924,
"grad_norm": 1.8435603380203247,
"learning_rate": 3.2069188013443137e-07,
"loss": 0.8508476614952087,
"step": 1386
},
{
"epoch": 2.928270042194093,
"grad_norm": 1.3359285593032837,
"learning_rate": 3.192283947596416e-07,
"loss": 1.1549383401870728,
"step": 1388
},
{
"epoch": 2.932489451476793,
"grad_norm": 1.435840129852295,
"learning_rate": 3.1776905696961776e-07,
"loss": 1.0318659543991089,
"step": 1390
},
{
"epoch": 2.9367088607594938,
"grad_norm": 1.5136826038360596,
"learning_rate": 3.163138845261895e-07,
"loss": 0.7768437266349792,
"step": 1392
},
{
"epoch": 2.9409282700421944,
"grad_norm": 2.1672825813293457,
"learning_rate": 3.148628951404894e-07,
"loss": 0.7318160533905029,
"step": 1394
},
{
"epoch": 2.9451476793248945,
"grad_norm": 2.231234550476074,
"learning_rate": 3.134161064727371e-07,
"loss": 1.1114449501037598,
"step": 1396
},
{
"epoch": 2.9493670886075947,
"grad_norm": 1.1347553730010986,
"learning_rate": 3.1197353613202493e-07,
"loss": 0.98956298828125,
"step": 1398
},
{
"epoch": 2.9535864978902953,
"grad_norm": 2.6513452529907227,
"learning_rate": 3.1053520167610327e-07,
"loss": 0.8672858476638794,
"step": 1400
},
{
"epoch": 2.957805907172996,
"grad_norm": 5.388697147369385,
"learning_rate": 3.0910112061116706e-07,
"loss": 0.8895928263664246,
"step": 1402
},
{
"epoch": 2.962025316455696,
"grad_norm": 1.2345776557922363,
"learning_rate": 3.07671310391642e-07,
"loss": 1.0770245790481567,
"step": 1404
},
{
"epoch": 2.9662447257383966,
"grad_norm": 2.581882953643799,
"learning_rate": 3.06245788419973e-07,
"loss": 0.9955227971076965,
"step": 1406
},
{
"epoch": 2.970464135021097,
"grad_norm": 1.2137199640274048,
"learning_rate": 3.0482457204641244e-07,
"loss": 0.6025493741035461,
"step": 1408
},
{
"epoch": 2.9746835443037973,
"grad_norm": 7.410897731781006,
"learning_rate": 3.0340767856880765e-07,
"loss": 0.9356251358985901,
"step": 1410
},
{
"epoch": 2.978902953586498,
"grad_norm": 1.3034472465515137,
"learning_rate": 3.019951252323922e-07,
"loss": 1.1050803661346436,
"step": 1412
},
{
"epoch": 2.9831223628691985,
"grad_norm": 4.278261184692383,
"learning_rate": 3.005869292295745e-07,
"loss": 0.9199661016464233,
"step": 1414
},
{
"epoch": 2.9873417721518987,
"grad_norm": 1.678770661354065,
"learning_rate": 2.9918310769972974e-07,
"loss": 1.006180763244629,
"step": 1416
},
{
"epoch": 2.9915611814345993,
"grad_norm": 1.6135848760604858,
"learning_rate": 2.9778367772899007e-07,
"loss": 1.0220967531204224,
"step": 1418
},
{
"epoch": 2.9957805907173,
"grad_norm": 4.610077857971191,
"learning_rate": 2.963886563500377e-07,
"loss": 1.10872220993042,
"step": 1420
},
{
"epoch": 3.0,
"grad_norm": 1.7171008586883545,
"learning_rate": 2.949980605418972e-07,
"loss": 0.4870656132698059,
"step": 1422
},
{
"epoch": 3.0042194092827006,
"grad_norm": 1.3645132780075073,
"learning_rate": 2.936119072297288e-07,
"loss": 0.8511791825294495,
"step": 1424
},
{
"epoch": 3.0084388185654007,
"grad_norm": 1.9679698944091797,
"learning_rate": 2.9223021328462197e-07,
"loss": 0.7651324272155762,
"step": 1426
},
{
"epoch": 3.0126582278481013,
"grad_norm": 3.9621288776397705,
"learning_rate": 2.908529955233911e-07,
"loss": 0.699533224105835,
"step": 1428
},
{
"epoch": 3.0168776371308015,
"grad_norm": 3.126701831817627,
"learning_rate": 2.8948027070836994e-07,
"loss": 0.4490070939064026,
"step": 1430
},
{
"epoch": 3.021097046413502,
"grad_norm": 2.446420431137085,
"learning_rate": 2.881120555472082e-07,
"loss": 1.0999044179916382,
"step": 1432
},
{
"epoch": 3.0253164556962027,
"grad_norm": 1.639694333076477,
"learning_rate": 2.867483666926673e-07,
"loss": 1.0761295557022095,
"step": 1434
},
{
"epoch": 3.029535864978903,
"grad_norm": 2.0383009910583496,
"learning_rate": 2.853892207424188e-07,
"loss": 1.2911527156829834,
"step": 1436
},
{
"epoch": 3.0337552742616034,
"grad_norm": 2.1497604846954346,
"learning_rate": 2.840346342388418e-07,
"loss": 0.7010747790336609,
"step": 1438
},
{
"epoch": 3.037974683544304,
"grad_norm": 1.3137015104293823,
"learning_rate": 2.8268462366882116e-07,
"loss": 1.0549767017364502,
"step": 1440
},
{
"epoch": 3.042194092827004,
"grad_norm": 2.2534055709838867,
"learning_rate": 2.81339205463548e-07,
"loss": 0.7904849052429199,
"step": 1442
},
{
"epoch": 3.0464135021097047,
"grad_norm": 1.8378784656524658,
"learning_rate": 2.7999839599831866e-07,
"loss": 0.9793230891227722,
"step": 1444
},
{
"epoch": 3.050632911392405,
"grad_norm": 1.6699494123458862,
"learning_rate": 2.786622115923361e-07,
"loss": 1.100398302078247,
"step": 1446
},
{
"epoch": 3.0548523206751055,
"grad_norm": 4.9398722648620605,
"learning_rate": 2.773306685085103e-07,
"loss": 0.7494297027587891,
"step": 1448
},
{
"epoch": 3.059071729957806,
"grad_norm": 2.751260757446289,
"learning_rate": 2.760037829532616e-07,
"loss": 0.9139360189437866,
"step": 1450
},
{
"epoch": 3.0632911392405062,
"grad_norm": 1.659805178642273,
"learning_rate": 2.746815710763228e-07,
"loss": 1.121703028678894,
"step": 1452
},
{
"epoch": 3.067510548523207,
"grad_norm": 17.309215545654297,
"learning_rate": 2.733640489705424e-07,
"loss": 0.8850579261779785,
"step": 1454
},
{
"epoch": 3.071729957805907,
"grad_norm": 1.963599443435669,
"learning_rate": 2.7205123267168884e-07,
"loss": 0.7342712879180908,
"step": 1456
},
{
"epoch": 3.0759493670886076,
"grad_norm": 1.344913125038147,
"learning_rate": 2.7074313815825577e-07,
"loss": 0.8235659003257751,
"step": 1458
},
{
"epoch": 3.080168776371308,
"grad_norm": 2.194878101348877,
"learning_rate": 2.694397813512672e-07,
"loss": 0.8748940229415894,
"step": 1460
},
{
"epoch": 3.0843881856540083,
"grad_norm": 1.611878514289856,
"learning_rate": 2.6814117811408343e-07,
"loss": 1.0315779447555542,
"step": 1462
},
{
"epoch": 3.088607594936709,
"grad_norm": 1.17129647731781,
"learning_rate": 2.668473442522087e-07,
"loss": 1.089264154434204,
"step": 1464
},
{
"epoch": 3.0928270042194095,
"grad_norm": 1.8487638235092163,
"learning_rate": 2.655582955130983e-07,
"loss": 0.9789541959762573,
"step": 1466
},
{
"epoch": 3.0970464135021096,
"grad_norm": 2.393946409225464,
"learning_rate": 2.6427404758596716e-07,
"loss": 0.7049380540847778,
"step": 1468
},
{
"epoch": 3.1012658227848102,
"grad_norm": 6.393697261810303,
"learning_rate": 2.6299461610159823e-07,
"loss": 0.2891662120819092,
"step": 1470
},
{
"epoch": 3.1054852320675104,
"grad_norm": 3.184678316116333,
"learning_rate": 2.617200166321536e-07,
"loss": 1.5170872211456299,
"step": 1472
},
{
"epoch": 3.109704641350211,
"grad_norm": 0.9037976264953613,
"learning_rate": 2.604502646909835e-07,
"loss": 0.6711030602455139,
"step": 1474
},
{
"epoch": 3.1139240506329116,
"grad_norm": 1.8876357078552246,
"learning_rate": 2.591853757324387e-07,
"loss": 1.0795202255249023,
"step": 1476
},
{
"epoch": 3.1181434599156117,
"grad_norm": 2.756838083267212,
"learning_rate": 2.579253651516811e-07,
"loss": 1.132811427116394,
"step": 1478
},
{
"epoch": 3.1223628691983123,
"grad_norm": 1.3386019468307495,
"learning_rate": 2.566702482844977e-07,
"loss": 1.08835768699646,
"step": 1480
},
{
"epoch": 3.1265822784810124,
"grad_norm": 2.480353593826294,
"learning_rate": 2.554200404071133e-07,
"loss": 1.070718765258789,
"step": 1482
},
{
"epoch": 3.130801687763713,
"grad_norm": 1.40932297706604,
"learning_rate": 2.541747567360042e-07,
"loss": 1.0528981685638428,
"step": 1484
},
{
"epoch": 3.1350210970464136,
"grad_norm": 1.5161710977554321,
"learning_rate": 2.529344124277137e-07,
"loss": 0.701133131980896,
"step": 1486
},
{
"epoch": 3.1392405063291138,
"grad_norm": 1.7773646116256714,
"learning_rate": 2.516990225786675e-07,
"loss": 0.714127242565155,
"step": 1488
},
{
"epoch": 3.1434599156118144,
"grad_norm": 1.856155276298523,
"learning_rate": 2.5046860222498974e-07,
"loss": 1.374661922454834,
"step": 1490
},
{
"epoch": 3.147679324894515,
"grad_norm": 1.7023481130599976,
"learning_rate": 2.492431663423195e-07,
"loss": 0.7714812159538269,
"step": 1492
},
{
"epoch": 3.151898734177215,
"grad_norm": 2.849262237548828,
"learning_rate": 2.480227298456298e-07,
"loss": 0.9089514017105103,
"step": 1494
},
{
"epoch": 3.1561181434599157,
"grad_norm": 1.426505208015442,
"learning_rate": 2.468073075890449e-07,
"loss": 0.885564386844635,
"step": 1496
},
{
"epoch": 3.160337552742616,
"grad_norm": 1.386016845703125,
"learning_rate": 2.455969143656604e-07,
"loss": 0.6194628477096558,
"step": 1498
},
{
"epoch": 3.1645569620253164,
"grad_norm": 3.545844316482544,
"learning_rate": 2.4439156490736206e-07,
"loss": 0.6920610070228577,
"step": 1500
},
{
"epoch": 3.168776371308017,
"grad_norm": 2.4662020206451416,
"learning_rate": 2.431912738846479e-07,
"loss": 1.0780019760131836,
"step": 1502
},
{
"epoch": 3.172995780590717,
"grad_norm": 1.5884943008422852,
"learning_rate": 2.4199605590644834e-07,
"loss": 0.987308144569397,
"step": 1504
},
{
"epoch": 3.1772151898734178,
"grad_norm": 1.7786238193511963,
"learning_rate": 2.4080592551994957e-07,
"loss": 1.1196187734603882,
"step": 1506
},
{
"epoch": 3.181434599156118,
"grad_norm": 1.3663359880447388,
"learning_rate": 2.396208972104153e-07,
"loss": 1.2225620746612549,
"step": 1508
},
{
"epoch": 3.1856540084388185,
"grad_norm": 2.5622196197509766,
"learning_rate": 2.384409854010114e-07,
"loss": 1.0651240348815918,
"step": 1510
},
{
"epoch": 3.189873417721519,
"grad_norm": 0.9567521214485168,
"learning_rate": 2.372662044526301e-07,
"loss": 0.3738023042678833,
"step": 1512
},
{
"epoch": 3.1940928270042193,
"grad_norm": 1.9998040199279785,
"learning_rate": 2.3609656866371468e-07,
"loss": 1.1397721767425537,
"step": 1514
},
{
"epoch": 3.19831223628692,
"grad_norm": 0.7790340781211853,
"learning_rate": 2.3493209227008635e-07,
"loss": 0.7803550958633423,
"step": 1516
},
{
"epoch": 3.2025316455696204,
"grad_norm": 1.4339203834533691,
"learning_rate": 2.3377278944477026e-07,
"loss": 1.136408805847168,
"step": 1518
},
{
"epoch": 3.2067510548523206,
"grad_norm": 2.4172418117523193,
"learning_rate": 2.3261867429782352e-07,
"loss": 1.0867120027542114,
"step": 1520
},
{
"epoch": 3.210970464135021,
"grad_norm": 5.30928373336792,
"learning_rate": 2.3146976087616251e-07,
"loss": 0.40863823890686035,
"step": 1522
},
{
"epoch": 3.2151898734177213,
"grad_norm": 1.3400903940200806,
"learning_rate": 2.3032606316339343e-07,
"loss": 0.9426780343055725,
"step": 1524
},
{
"epoch": 3.219409282700422,
"grad_norm": 2.4984984397888184,
"learning_rate": 2.2918759507964067e-07,
"loss": 1.065047025680542,
"step": 1526
},
{
"epoch": 3.2236286919831225,
"grad_norm": 1.2303318977355957,
"learning_rate": 2.280543704813786e-07,
"loss": 0.7552684545516968,
"step": 1528
},
{
"epoch": 3.2278481012658227,
"grad_norm": 8.25938606262207,
"learning_rate": 2.2692640316126142e-07,
"loss": 0.8803672790527344,
"step": 1530
},
{
"epoch": 3.2320675105485233,
"grad_norm": 0.49941709637641907,
"learning_rate": 2.258037068479569e-07,
"loss": 0.4145871102809906,
"step": 1532
},
{
"epoch": 3.2362869198312234,
"grad_norm": 4.213127613067627,
"learning_rate": 2.246862952059784e-07,
"loss": 0.8059659600257874,
"step": 1534
},
{
"epoch": 3.240506329113924,
"grad_norm": 1.6297084093093872,
"learning_rate": 2.2357418183551847e-07,
"loss": 1.0444282293319702,
"step": 1536
},
{
"epoch": 3.2447257383966246,
"grad_norm": 1.131995677947998,
"learning_rate": 2.2246738027228375e-07,
"loss": 1.0914216041564941,
"step": 1538
},
{
"epoch": 3.2489451476793247,
"grad_norm": 4.478993892669678,
"learning_rate": 2.2136590398733008e-07,
"loss": 0.9430460929870605,
"step": 1540
},
{
"epoch": 3.2531645569620253,
"grad_norm": 3.0573625564575195,
"learning_rate": 2.2026976638689858e-07,
"loss": 0.911579966545105,
"step": 1542
},
{
"epoch": 3.257383966244726,
"grad_norm": 3.4742343425750732,
"learning_rate": 2.1917898081225196e-07,
"loss": 0.7584477066993713,
"step": 1544
},
{
"epoch": 3.261603375527426,
"grad_norm": 8.812678337097168,
"learning_rate": 2.1809356053951312e-07,
"loss": 0.8638182878494263,
"step": 1546
},
{
"epoch": 3.2658227848101267,
"grad_norm": 2.5531651973724365,
"learning_rate": 2.1701351877950265e-07,
"loss": 0.9924852848052979,
"step": 1548
},
{
"epoch": 3.270042194092827,
"grad_norm": 2.971946954727173,
"learning_rate": 2.1593886867757877e-07,
"loss": 0.4322529435157776,
"step": 1550
},
{
"epoch": 3.2742616033755274,
"grad_norm": 1.717172384262085,
"learning_rate": 2.148696233134765e-07,
"loss": 0.550542414188385,
"step": 1552
},
{
"epoch": 3.278481012658228,
"grad_norm": 5.607646942138672,
"learning_rate": 2.1380579570114936e-07,
"loss": 0.5011199116706848,
"step": 1554
},
{
"epoch": 3.282700421940928,
"grad_norm": 1.612561821937561,
"learning_rate": 2.1274739878861052e-07,
"loss": 1.0595111846923828,
"step": 1556
},
{
"epoch": 3.2869198312236287,
"grad_norm": 0.5656753182411194,
"learning_rate": 2.1169444545777492e-07,
"loss": 0.9489805102348328,
"step": 1558
},
{
"epoch": 3.291139240506329,
"grad_norm": 1.542765736579895,
"learning_rate": 2.1064694852430298e-07,
"loss": 0.7409214377403259,
"step": 1560
},
{
"epoch": 3.2953586497890295,
"grad_norm": 4.1754326820373535,
"learning_rate": 2.0960492073744497e-07,
"loss": 0.6657558679580688,
"step": 1562
},
{
"epoch": 3.29957805907173,
"grad_norm": 2.3946285247802734,
"learning_rate": 2.0856837477988444e-07,
"loss": 1.0093276500701904,
"step": 1564
},
{
"epoch": 3.3037974683544302,
"grad_norm": 54.370628356933594,
"learning_rate": 2.075373232675853e-07,
"loss": 0.911258339881897,
"step": 1566
},
{
"epoch": 3.308016877637131,
"grad_norm": 0.5367670655250549,
"learning_rate": 2.0651177874963756e-07,
"loss": 0.5720005035400391,
"step": 1568
},
{
"epoch": 3.3122362869198314,
"grad_norm": 2.9743804931640625,
"learning_rate": 2.054917537081048e-07,
"loss": 0.7077758312225342,
"step": 1570
},
{
"epoch": 3.3164556962025316,
"grad_norm": 1.33404541015625,
"learning_rate": 2.0447726055787184e-07,
"loss": 0.7469961047172546,
"step": 1572
},
{
"epoch": 3.320675105485232,
"grad_norm": 5.848537445068359,
"learning_rate": 2.0346831164649456e-07,
"loss": 1.2882143259048462,
"step": 1574
},
{
"epoch": 3.3248945147679323,
"grad_norm": 2.0500552654266357,
"learning_rate": 2.024649192540486e-07,
"loss": 1.0107818841934204,
"step": 1576
},
{
"epoch": 3.329113924050633,
"grad_norm": 1.4133131504058838,
"learning_rate": 2.0146709559298057e-07,
"loss": 1.098578929901123,
"step": 1578
},
{
"epoch": 3.3333333333333335,
"grad_norm": 2.475172281265259,
"learning_rate": 2.004748528079589e-07,
"loss": 0.907584547996521,
"step": 1580
},
{
"epoch": 3.3375527426160336,
"grad_norm": 1.8427865505218506,
"learning_rate": 1.9948820297572654e-07,
"loss": 0.5680180191993713,
"step": 1582
},
{
"epoch": 3.3417721518987342,
"grad_norm": 2.7834925651550293,
"learning_rate": 1.9850715810495388e-07,
"loss": 0.8737412095069885,
"step": 1584
},
{
"epoch": 3.3459915611814344,
"grad_norm": 3.1142473220825195,
"learning_rate": 1.9753173013609188e-07,
"loss": 0.9088540077209473,
"step": 1586
},
{
"epoch": 3.350210970464135,
"grad_norm": 1.0896648168563843,
"learning_rate": 1.9656193094122788e-07,
"loss": 0.6729345917701721,
"step": 1588
},
{
"epoch": 3.3544303797468356,
"grad_norm": 0.7042174339294434,
"learning_rate": 1.955977723239402e-07,
"loss": 1.0873976945877075,
"step": 1590
},
{
"epoch": 3.3586497890295357,
"grad_norm": 2.3321895599365234,
"learning_rate": 1.946392660191551e-07,
"loss": 1.0663033723831177,
"step": 1592
},
{
"epoch": 3.3628691983122363,
"grad_norm": 0.490595281124115,
"learning_rate": 1.9368642369300324e-07,
"loss": 0.9354673624038696,
"step": 1594
},
{
"epoch": 3.367088607594937,
"grad_norm": 10.656190872192383,
"learning_rate": 1.927392569426783e-07,
"loss": 0.4992368817329407,
"step": 1596
},
{
"epoch": 3.371308016877637,
"grad_norm": 0.5064166784286499,
"learning_rate": 1.917977772962959e-07,
"loss": 0.528096616268158,
"step": 1598
},
{
"epoch": 3.3755274261603376,
"grad_norm": 5.858240604400635,
"learning_rate": 1.9086199621275264e-07,
"loss": 0.8440109491348267,
"step": 1600
},
{
"epoch": 3.379746835443038,
"grad_norm": 8.531730651855469,
"learning_rate": 1.899319250815872e-07,
"loss": 0.6302809119224548,
"step": 1602
},
{
"epoch": 3.3839662447257384,
"grad_norm": 0.5061826705932617,
"learning_rate": 1.8900757522284133e-07,
"loss": 0.8138654828071594,
"step": 1604
},
{
"epoch": 3.388185654008439,
"grad_norm": 2.710231065750122,
"learning_rate": 1.880889578869227e-07,
"loss": 1.1358734369277954,
"step": 1606
},
{
"epoch": 3.392405063291139,
"grad_norm": 2.9734416007995605,
"learning_rate": 1.8717608425446727e-07,
"loss": 0.7783518433570862,
"step": 1608
},
{
"epoch": 3.3966244725738397,
"grad_norm": 1.6831233501434326,
"learning_rate": 1.8626896543620322e-07,
"loss": 0.7331032156944275,
"step": 1610
},
{
"epoch": 3.40084388185654,
"grad_norm": 1.832513451576233,
"learning_rate": 1.853676124728165e-07,
"loss": 1.0596171617507935,
"step": 1612
},
{
"epoch": 3.4050632911392404,
"grad_norm": 1.777066946029663,
"learning_rate": 1.8447203633481567e-07,
"loss": 0.5832729935646057,
"step": 1614
},
{
"epoch": 3.409282700421941,
"grad_norm": 3.6729393005371094,
"learning_rate": 1.8358224792239858e-07,
"loss": 0.9451841115951538,
"step": 1616
},
{
"epoch": 3.413502109704641,
"grad_norm": 1.5150253772735596,
"learning_rate": 1.8269825806531981e-07,
"loss": 1.205118179321289,
"step": 1618
},
{
"epoch": 3.4177215189873418,
"grad_norm": 1.506641149520874,
"learning_rate": 1.8182007752275897e-07,
"loss": 1.1017844676971436,
"step": 1620
},
{
"epoch": 3.4219409282700424,
"grad_norm": 1.7625582218170166,
"learning_rate": 1.8094771698318949e-07,
"loss": 0.7701492309570312,
"step": 1622
},
{
"epoch": 3.4261603375527425,
"grad_norm": 3.7757952213287354,
"learning_rate": 1.8008118706424835e-07,
"loss": 0.47009673714637756,
"step": 1624
},
{
"epoch": 3.430379746835443,
"grad_norm": 86.28419494628906,
"learning_rate": 1.792204983126077e-07,
"loss": 0.3835935592651367,
"step": 1626
},
{
"epoch": 3.4345991561181437,
"grad_norm": 1.6593104600906372,
"learning_rate": 1.7836566120384535e-07,
"loss": 1.0729460716247559,
"step": 1628
},
{
"epoch": 3.438818565400844,
"grad_norm": 1.3321086168289185,
"learning_rate": 1.7751668614231838e-07,
"loss": 0.5311670303344727,
"step": 1630
},
{
"epoch": 3.4430379746835444,
"grad_norm": 1.7757083177566528,
"learning_rate": 1.7667358346103543e-07,
"loss": 1.0757611989974976,
"step": 1632
},
{
"epoch": 3.4472573839662446,
"grad_norm": 0.7050431370735168,
"learning_rate": 1.7583636342153186e-07,
"loss": 0.8372207283973694,
"step": 1634
},
{
"epoch": 3.451476793248945,
"grad_norm": 3.041806221008301,
"learning_rate": 1.7500503621374447e-07,
"loss": 1.3023487329483032,
"step": 1636
},
{
"epoch": 3.4556962025316453,
"grad_norm": 2.8929758071899414,
"learning_rate": 1.7417961195588712e-07,
"loss": 1.2805616855621338,
"step": 1638
},
{
"epoch": 3.459915611814346,
"grad_norm": 1.4591811895370483,
"learning_rate": 1.733601006943283e-07,
"loss": 1.0746394395828247,
"step": 1640
},
{
"epoch": 3.4641350210970465,
"grad_norm": 21.10038185119629,
"learning_rate": 1.7254651240346834e-07,
"loss": 1.2883800268173218,
"step": 1642
},
{
"epoch": 3.4683544303797467,
"grad_norm": 0.8419481515884399,
"learning_rate": 1.717388569856184e-07,
"loss": 0.4558939039707184,
"step": 1644
},
{
"epoch": 3.4725738396624473,
"grad_norm": 1.598176121711731,
"learning_rate": 1.7093714427087921e-07,
"loss": 1.1013548374176025,
"step": 1646
},
{
"epoch": 3.476793248945148,
"grad_norm": 1.9482252597808838,
"learning_rate": 1.7014138401702235e-07,
"loss": 1.064300537109375,
"step": 1648
},
{
"epoch": 3.481012658227848,
"grad_norm": 2.4247756004333496,
"learning_rate": 1.6935158590937102e-07,
"loss": 0.5595088005065918,
"step": 1650
},
{
"epoch": 3.4852320675105486,
"grad_norm": 1.5676363706588745,
"learning_rate": 1.685677595606821e-07,
"loss": 0.9377724528312683,
"step": 1652
},
{
"epoch": 3.489451476793249,
"grad_norm": 1.7761136293411255,
"learning_rate": 1.6778991451102917e-07,
"loss": 0.6129472255706787,
"step": 1654
},
{
"epoch": 3.4936708860759493,
"grad_norm": 1.6247411966323853,
"learning_rate": 1.6701806022768664e-07,
"loss": 0.9987605214118958,
"step": 1656
},
{
"epoch": 3.49789029535865,
"grad_norm": 7.621754169464111,
"learning_rate": 1.662522061050143e-07,
"loss": 0.7994301319122314,
"step": 1658
},
{
"epoch": 3.50210970464135,
"grad_norm": 1.6483778953552246,
"learning_rate": 1.6549236146434306e-07,
"loss": 1.0804067850112915,
"step": 1660
},
{
"epoch": 3.5063291139240507,
"grad_norm": 2.4437475204467773,
"learning_rate": 1.6473853555386138e-07,
"loss": 1.301591396331787,
"step": 1662
},
{
"epoch": 3.510548523206751,
"grad_norm": 6.270905017852783,
"learning_rate": 1.63990737548503e-07,
"loss": 0.5238262414932251,
"step": 1664
},
{
"epoch": 3.5147679324894514,
"grad_norm": 1.6719293594360352,
"learning_rate": 1.6324897654983497e-07,
"loss": 1.1141690015792847,
"step": 1666
},
{
"epoch": 3.518987341772152,
"grad_norm": 3.9029476642608643,
"learning_rate": 1.6251326158594697e-07,
"loss": 0.9623671770095825,
"step": 1668
},
{
"epoch": 3.523206751054852,
"grad_norm": 1.746028184890747,
"learning_rate": 1.617836016113414e-07,
"loss": 1.0135071277618408,
"step": 1670
},
{
"epoch": 3.5274261603375527,
"grad_norm": 3.1107168197631836,
"learning_rate": 1.610600055068245e-07,
"loss": 0.4389096200466156,
"step": 1672
},
{
"epoch": 3.5316455696202533,
"grad_norm": 1.7027398347854614,
"learning_rate": 1.603424820793983e-07,
"loss": 0.6981071829795837,
"step": 1674
},
{
"epoch": 3.5358649789029535,
"grad_norm": 2.8486416339874268,
"learning_rate": 1.5963104006215308e-07,
"loss": 0.7279437780380249,
"step": 1676
},
{
"epoch": 3.540084388185654,
"grad_norm": 3.396284341812134,
"learning_rate": 1.589256881141614e-07,
"loss": 0.9122246503829956,
"step": 1678
},
{
"epoch": 3.5443037974683547,
"grad_norm": 3.791874647140503,
"learning_rate": 1.5822643482037287e-07,
"loss": 1.1270490884780884,
"step": 1680
},
{
"epoch": 3.548523206751055,
"grad_norm": 1.448197364807129,
"learning_rate": 1.5753328869150915e-07,
"loss": 0.958101749420166,
"step": 1682
},
{
"epoch": 3.5527426160337554,
"grad_norm": 2.4740562438964844,
"learning_rate": 1.5684625816396065e-07,
"loss": 0.9169100522994995,
"step": 1684
},
{
"epoch": 3.5569620253164556,
"grad_norm": 4.803852081298828,
"learning_rate": 1.5616535159968395e-07,
"loss": 0.4023887515068054,
"step": 1686
},
{
"epoch": 3.561181434599156,
"grad_norm": 1.6774110794067383,
"learning_rate": 1.5549057728609994e-07,
"loss": 0.7174091935157776,
"step": 1688
},
{
"epoch": 3.5654008438818563,
"grad_norm": 2.055140972137451,
"learning_rate": 1.5482194343599262e-07,
"loss": 1.1519484519958496,
"step": 1690
},
{
"epoch": 3.569620253164557,
"grad_norm": 2.408010482788086,
"learning_rate": 1.5415945818740984e-07,
"loss": 0.2424314320087433,
"step": 1692
},
{
"epoch": 3.5738396624472575,
"grad_norm": 0.4710818827152252,
"learning_rate": 1.5350312960356366e-07,
"loss": 0.975223183631897,
"step": 1694
},
{
"epoch": 3.5780590717299576,
"grad_norm": 7.146688461303711,
"learning_rate": 1.5285296567273247e-07,
"loss": 0.2773347795009613,
"step": 1696
},
{
"epoch": 3.5822784810126582,
"grad_norm": 2.3770270347595215,
"learning_rate": 1.5220897430816355e-07,
"loss": 0.8169768452644348,
"step": 1698
},
{
"epoch": 3.586497890295359,
"grad_norm": 1.4037396907806396,
"learning_rate": 1.5157116334797708e-07,
"loss": 0.900860071182251,
"step": 1700
},
{
"epoch": 3.590717299578059,
"grad_norm": 1.6098082065582275,
"learning_rate": 1.5093954055507043e-07,
"loss": 0.6856269240379333,
"step": 1702
},
{
"epoch": 3.5949367088607596,
"grad_norm": 1.4536845684051514,
"learning_rate": 1.5031411361702408e-07,
"loss": 1.1157587766647339,
"step": 1704
},
{
"epoch": 3.59915611814346,
"grad_norm": 3.0524935722351074,
"learning_rate": 1.4969489014600732e-07,
"loss": 0.812619149684906,
"step": 1706
},
{
"epoch": 3.6033755274261603,
"grad_norm": 4.811793804168701,
"learning_rate": 1.4908187767868651e-07,
"loss": 0.7652060389518738,
"step": 1708
},
{
"epoch": 3.607594936708861,
"grad_norm": 0.5443377494812012,
"learning_rate": 1.484750836761328e-07,
"loss": 0.677264392375946,
"step": 1710
},
{
"epoch": 3.611814345991561,
"grad_norm": 6.806301593780518,
"learning_rate": 1.4787451552373115e-07,
"loss": 1.052730679512024,
"step": 1712
},
{
"epoch": 3.6160337552742616,
"grad_norm": 4.058206081390381,
"learning_rate": 1.4728018053109103e-07,
"loss": 1.285649299621582,
"step": 1714
},
{
"epoch": 3.620253164556962,
"grad_norm": 3.216102361679077,
"learning_rate": 1.4669208593195704e-07,
"loss": 0.6992135047912598,
"step": 1716
},
{
"epoch": 3.6244725738396624,
"grad_norm": 2.728694438934326,
"learning_rate": 1.4611023888412115e-07,
"loss": 0.8372994065284729,
"step": 1718
},
{
"epoch": 3.628691983122363,
"grad_norm": 8.481232643127441,
"learning_rate": 1.4553464646933492e-07,
"loss": 0.5174750685691833,
"step": 1720
},
{
"epoch": 3.632911392405063,
"grad_norm": 3.1336352825164795,
"learning_rate": 1.4496531569322426e-07,
"loss": 1.101250410079956,
"step": 1722
},
{
"epoch": 3.6371308016877637,
"grad_norm": 3.442155122756958,
"learning_rate": 1.4440225348520354e-07,
"loss": 0.6749483346939087,
"step": 1724
},
{
"epoch": 3.6413502109704643,
"grad_norm": 3.023040771484375,
"learning_rate": 1.4384546669839147e-07,
"loss": 0.48659658432006836,
"step": 1726
},
{
"epoch": 3.6455696202531644,
"grad_norm": 6.006860733032227,
"learning_rate": 1.432949621095273e-07,
"loss": 1.0057132244110107,
"step": 1728
},
{
"epoch": 3.649789029535865,
"grad_norm": 5.072360992431641,
"learning_rate": 1.4275074641888904e-07,
"loss": 0.29357773065567017,
"step": 1730
},
{
"epoch": 3.6540084388185656,
"grad_norm": 18.242097854614258,
"learning_rate": 1.4221282625021142e-07,
"loss": 1.019067406654358,
"step": 1732
},
{
"epoch": 3.6582278481012658,
"grad_norm": 2.2106029987335205,
"learning_rate": 1.4168120815060542e-07,
"loss": 0.5755662322044373,
"step": 1734
},
{
"epoch": 3.6624472573839664,
"grad_norm": 2.0836057662963867,
"learning_rate": 1.4115589859047829e-07,
"loss": 0.5893323421478271,
"step": 1736
},
{
"epoch": 3.6666666666666665,
"grad_norm": 1.689981460571289,
"learning_rate": 1.4063690396345539e-07,
"loss": 0.8215257525444031,
"step": 1738
},
{
"epoch": 3.670886075949367,
"grad_norm": 2.466362714767456,
"learning_rate": 1.401242305863019e-07,
"loss": 0.5873066782951355,
"step": 1740
},
{
"epoch": 3.6751054852320673,
"grad_norm": 2.1418519020080566,
"learning_rate": 1.3961788469884597e-07,
"loss": 1.2188622951507568,
"step": 1742
},
{
"epoch": 3.679324894514768,
"grad_norm": 2.1476902961730957,
"learning_rate": 1.39117872463903e-07,
"loss": 0.6782402396202087,
"step": 1744
},
{
"epoch": 3.6835443037974684,
"grad_norm": 2.313478946685791,
"learning_rate": 1.3862419996720055e-07,
"loss": 0.6638330817222595,
"step": 1746
},
{
"epoch": 3.6877637130801686,
"grad_norm": 1.2573710680007935,
"learning_rate": 1.381368732173042e-07,
"loss": 1.1310936212539673,
"step": 1748
},
{
"epoch": 3.691983122362869,
"grad_norm": 4.773893356323242,
"learning_rate": 1.376558981455443e-07,
"loss": 0.9830767512321472,
"step": 1750
},
{
"epoch": 3.6962025316455698,
"grad_norm": 1.9760856628417969,
"learning_rate": 1.371812806059441e-07,
"loss": 1.0266754627227783,
"step": 1752
},
{
"epoch": 3.70042194092827,
"grad_norm": 1.8001806735992432,
"learning_rate": 1.3671302637514825e-07,
"loss": 1.1445378065109253,
"step": 1754
},
{
"epoch": 3.7046413502109705,
"grad_norm": 2.3651130199432373,
"learning_rate": 1.3625114115235267e-07,
"loss": 0.8746024370193481,
"step": 1756
},
{
"epoch": 3.708860759493671,
"grad_norm": 2.966754913330078,
"learning_rate": 1.357956305592349e-07,
"loss": 0.8632293343544006,
"step": 1758
},
{
"epoch": 3.7130801687763713,
"grad_norm": 2.7932474613189697,
"learning_rate": 1.35346500139886e-07,
"loss": 0.8797197937965393,
"step": 1760
},
{
"epoch": 3.717299578059072,
"grad_norm": 3.4520580768585205,
"learning_rate": 1.3490375536074293e-07,
"loss": 0.4202856123447418,
"step": 1762
},
{
"epoch": 3.721518987341772,
"grad_norm": 5.053709506988525,
"learning_rate": 1.3446740161052182e-07,
"loss": 0.7906475067138672,
"step": 1764
},
{
"epoch": 3.7257383966244726,
"grad_norm": 1.492531418800354,
"learning_rate": 1.3403744420015293e-07,
"loss": 1.0731313228607178,
"step": 1766
},
{
"epoch": 3.7299578059071727,
"grad_norm": 4.506521701812744,
"learning_rate": 1.3361388836271545e-07,
"loss": 0.6830440163612366,
"step": 1768
},
{
"epoch": 3.7341772151898733,
"grad_norm": 2.127143383026123,
"learning_rate": 1.33196739253374e-07,
"loss": 0.7407412528991699,
"step": 1770
},
{
"epoch": 3.738396624472574,
"grad_norm": 2.162644147872925,
"learning_rate": 1.3278600194931595e-07,
"loss": 1.099405288696289,
"step": 1772
},
{
"epoch": 3.742616033755274,
"grad_norm": 0.9268086552619934,
"learning_rate": 1.323816814496896e-07,
"loss": 0.7270370721817017,
"step": 1774
},
{
"epoch": 3.7468354430379747,
"grad_norm": 0.7520632743835449,
"learning_rate": 1.3198378267554327e-07,
"loss": 0.7462360262870789,
"step": 1776
},
{
"epoch": 3.7510548523206753,
"grad_norm": 1.411445140838623,
"learning_rate": 1.3159231046976552e-07,
"loss": 1.026281476020813,
"step": 1778
},
{
"epoch": 3.7552742616033754,
"grad_norm": 2.437485456466675,
"learning_rate": 1.3120726959702608e-07,
"loss": 1.0296030044555664,
"step": 1780
},
{
"epoch": 3.759493670886076,
"grad_norm": 2.7060513496398926,
"learning_rate": 1.308286647437179e-07,
"loss": 0.9808471202850342,
"step": 1782
},
{
"epoch": 3.7637130801687766,
"grad_norm": 2.146833658218384,
"learning_rate": 1.3045650051790027e-07,
"loss": 0.9502108097076416,
"step": 1784
},
{
"epoch": 3.7679324894514767,
"grad_norm": 1.3278952836990356,
"learning_rate": 1.300907814492422e-07,
"loss": 1.123317003250122,
"step": 1786
},
{
"epoch": 3.7721518987341773,
"grad_norm": 14.552665710449219,
"learning_rate": 1.2973151198896823e-07,
"loss": 0.525389552116394,
"step": 1788
},
{
"epoch": 3.7763713080168775,
"grad_norm": 1.6734447479248047,
"learning_rate": 1.2937869650980342e-07,
"loss": 0.7029292583465576,
"step": 1790
},
{
"epoch": 3.780590717299578,
"grad_norm": 1.3970534801483154,
"learning_rate": 1.2903233930592022e-07,
"loss": 1.0671159029006958,
"step": 1792
},
{
"epoch": 3.7848101265822782,
"grad_norm": 2.8452141284942627,
"learning_rate": 1.2869244459288677e-07,
"loss": 0.7484707832336426,
"step": 1794
},
{
"epoch": 3.789029535864979,
"grad_norm": 2.7676146030426025,
"learning_rate": 1.2835901650761496e-07,
"loss": 1.1054531335830688,
"step": 1796
},
{
"epoch": 3.7932489451476794,
"grad_norm": 2.6690499782562256,
"learning_rate": 1.2803205910831044e-07,
"loss": 1.1910511255264282,
"step": 1798
},
{
"epoch": 3.7974683544303796,
"grad_norm": 2.3067097663879395,
"learning_rate": 1.2771157637442308e-07,
"loss": 1.0350401401519775,
"step": 1800
},
{
"epoch": 3.80168776371308,
"grad_norm": 2.0456929206848145,
"learning_rate": 1.273975722065986e-07,
"loss": 1.1489591598510742,
"step": 1802
},
{
"epoch": 3.8059071729957807,
"grad_norm": 1.7378591299057007,
"learning_rate": 1.2709005042663118e-07,
"loss": 0.6581465005874634,
"step": 1804
},
{
"epoch": 3.810126582278481,
"grad_norm": 6.99116849899292,
"learning_rate": 1.267890147774167e-07,
"loss": 0.29897159337997437,
"step": 1806
},
{
"epoch": 3.8143459915611815,
"grad_norm": 4.381340026855469,
"learning_rate": 1.264944689229072e-07,
"loss": 1.0360081195831299,
"step": 1808
},
{
"epoch": 3.818565400843882,
"grad_norm": 2.1093826293945312,
"learning_rate": 1.2620641644806678e-07,
"loss": 1.0628427267074585,
"step": 1810
},
{
"epoch": 3.8227848101265822,
"grad_norm": 9.374409675598145,
"learning_rate": 1.2592486085882725e-07,
"loss": 0.7481462955474854,
"step": 1812
},
{
"epoch": 3.827004219409283,
"grad_norm": 1.5822006464004517,
"learning_rate": 1.25649805582046e-07,
"loss": 1.0469331741333008,
"step": 1814
},
{
"epoch": 3.831223628691983,
"grad_norm": 2.6007158756256104,
"learning_rate": 1.25381253965464e-07,
"loss": 0.9370917081832886,
"step": 1816
},
{
"epoch": 3.8354430379746836,
"grad_norm": 3.8402206897735596,
"learning_rate": 1.2511920927766525e-07,
"loss": 0.9214923977851868,
"step": 1818
},
{
"epoch": 3.8396624472573837,
"grad_norm": 11.853067398071289,
"learning_rate": 1.2486367470803673e-07,
"loss": 0.8060356378555298,
"step": 1820
},
{
"epoch": 3.8438818565400843,
"grad_norm": 1.407483696937561,
"learning_rate": 1.246146533667299e-07,
"loss": 1.076265573501587,
"step": 1822
},
{
"epoch": 3.848101265822785,
"grad_norm": 3.0918633937835693,
"learning_rate": 1.243721482846227e-07,
"loss": 0.9416312575340271,
"step": 1824
},
{
"epoch": 3.852320675105485,
"grad_norm": 2.7018940448760986,
"learning_rate": 1.2413616241328252e-07,
"loss": 1.026483416557312,
"step": 1826
},
{
"epoch": 3.8565400843881856,
"grad_norm": 2.9005277156829834,
"learning_rate": 1.2390669862493044e-07,
"loss": 1.033530354499817,
"step": 1828
},
{
"epoch": 3.8607594936708862,
"grad_norm": 1.5697400569915771,
"learning_rate": 1.2368375971240647e-07,
"loss": 1.0893433094024658,
"step": 1830
},
{
"epoch": 3.8649789029535864,
"grad_norm": 7.101255893707275,
"learning_rate": 1.2346734838913498e-07,
"loss": 0.4264039397239685,
"step": 1832
},
{
"epoch": 3.869198312236287,
"grad_norm": 1.942752718925476,
"learning_rate": 1.2325746728909227e-07,
"loss": 0.6822599172592163,
"step": 1834
},
{
"epoch": 3.8734177215189876,
"grad_norm": 2.711249351501465,
"learning_rate": 1.2305411896677423e-07,
"loss": 0.8705965280532837,
"step": 1836
},
{
"epoch": 3.8776371308016877,
"grad_norm": 3.3902530670166016,
"learning_rate": 1.228573058971652e-07,
"loss": 0.7575594186782837,
"step": 1838
},
{
"epoch": 3.8818565400843883,
"grad_norm": 5.287688732147217,
"learning_rate": 1.2266703047570794e-07,
"loss": 0.8974352478981018,
"step": 1840
},
{
"epoch": 3.8860759493670884,
"grad_norm": 2.1966428756713867,
"learning_rate": 1.2248329501827461e-07,
"loss": 0.7821562886238098,
"step": 1842
},
{
"epoch": 3.890295358649789,
"grad_norm": 2.125584125518799,
"learning_rate": 1.2230610176113828e-07,
"loss": 0.7629109621047974,
"step": 1844
},
{
"epoch": 3.894514767932489,
"grad_norm": 1.5011521577835083,
"learning_rate": 1.2213545286094602e-07,
"loss": 1.0465257167816162,
"step": 1846
},
{
"epoch": 3.8987341772151898,
"grad_norm": 3.0355629920959473,
"learning_rate": 1.219713503946922e-07,
"loss": 0.5780481100082397,
"step": 1848
},
{
"epoch": 3.9029535864978904,
"grad_norm": 2.1277599334716797,
"learning_rate": 1.21813796359694e-07,
"loss": 1.0891631841659546,
"step": 1850
},
{
"epoch": 3.9071729957805905,
"grad_norm": 1.4144175052642822,
"learning_rate": 1.2166279267356617e-07,
"loss": 1.0926233530044556,
"step": 1852
},
{
"epoch": 3.911392405063291,
"grad_norm": 1.8264589309692383,
"learning_rate": 1.2151834117419832e-07,
"loss": 1.0842887163162231,
"step": 1854
},
{
"epoch": 3.9156118143459917,
"grad_norm": 1.3971328735351562,
"learning_rate": 1.2138044361973238e-07,
"loss": 1.1029634475708008,
"step": 1856
},
{
"epoch": 3.919831223628692,
"grad_norm": 1.3931989669799805,
"learning_rate": 1.2124910168854125e-07,
"loss": 1.092046856880188,
"step": 1858
},
{
"epoch": 3.9240506329113924,
"grad_norm": 1.2768291234970093,
"learning_rate": 1.21124316979208e-07,
"loss": 1.0661836862564087,
"step": 1860
},
{
"epoch": 3.928270042194093,
"grad_norm": 5.809596538543701,
"learning_rate": 1.210060910105071e-07,
"loss": 0.9497167468070984,
"step": 1862
},
{
"epoch": 3.932489451476793,
"grad_norm": 1.809336543083191,
"learning_rate": 1.208944252213854e-07,
"loss": 0.7419611811637878,
"step": 1864
},
{
"epoch": 3.9367088607594938,
"grad_norm": 3.3719143867492676,
"learning_rate": 1.2078932097094474e-07,
"loss": 1.3616517782211304,
"step": 1866
},
{
"epoch": 3.9409282700421944,
"grad_norm": 1.4581533670425415,
"learning_rate": 1.2069077953842544e-07,
"loss": 1.0452879667282104,
"step": 1868
},
{
"epoch": 3.9451476793248945,
"grad_norm": 3.9840292930603027,
"learning_rate": 1.2059880212319078e-07,
"loss": 0.7806097269058228,
"step": 1870
},
{
"epoch": 3.9493670886075947,
"grad_norm": 0.3003561794757843,
"learning_rate": 1.2051338984471242e-07,
"loss": 0.568496584892273,
"step": 1872
},
{
"epoch": 3.9535864978902953,
"grad_norm": 3.0003912448883057,
"learning_rate": 1.2043454374255645e-07,
"loss": 0.5840458273887634,
"step": 1874
},
{
"epoch": 3.957805907172996,
"grad_norm": 17.255149841308594,
"learning_rate": 1.203622647763713e-07,
"loss": 0.9891324639320374,
"step": 1876
},
{
"epoch": 3.962025316455696,
"grad_norm": 4.442596435546875,
"learning_rate": 1.2029655382587557e-07,
"loss": 0.937990665435791,
"step": 1878
},
{
"epoch": 3.9662447257383966,
"grad_norm": 1.3784996271133423,
"learning_rate": 1.2023741169084767e-07,
"loss": 0.6944407224655151,
"step": 1880
},
{
"epoch": 3.970464135021097,
"grad_norm": 1.8049193620681763,
"learning_rate": 1.2018483909111572e-07,
"loss": 1.0277503728866577,
"step": 1882
},
{
"epoch": 3.9746835443037973,
"grad_norm": 6.727908611297607,
"learning_rate": 1.2013883666654907e-07,
"loss": 0.509749174118042,
"step": 1884
},
{
"epoch": 3.978902953586498,
"grad_norm": 2.5991525650024414,
"learning_rate": 1.2009940497705058e-07,
"loss": 1.0679656267166138,
"step": 1886
},
{
"epoch": 3.9831223628691985,
"grad_norm": 2.2473011016845703,
"learning_rate": 1.2006654450254938e-07,
"loss": 0.7142981290817261,
"step": 1888
},
{
"epoch": 3.9873417721518987,
"grad_norm": 3.0477726459503174,
"learning_rate": 1.2004025564299563e-07,
"loss": 1.0713993310928345,
"step": 1890
},
{
"epoch": 3.9915611814345993,
"grad_norm": 8.713078498840332,
"learning_rate": 1.2002053871835507e-07,
"loss": 0.6879635453224182,
"step": 1892
},
{
"epoch": 3.9957805907173,
"grad_norm": 1.979125738143921,
"learning_rate": 1.2000739396860554e-07,
"loss": 1.0905542373657227,
"step": 1894
},
{
"epoch": 4.0,
"grad_norm": 1.923147439956665,
"learning_rate": 1.2000082155373382e-07,
"loss": 1.1579601764678955,
"step": 1896
},
{
"epoch": 4.0,
"step": 1896,
"total_flos": 3.5948540672197263e+18,
"train_loss": 1.0285371271618309,
"train_runtime": 8697.1879,
"train_samples_per_second": 6.54,
"train_steps_per_second": 0.218
}
],
"logging_steps": 2,
"max_steps": 1896,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.5948540672197263e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}