9b-109 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
bb51eb0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 2844,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004219409282700422,
"grad_norm": 14.014748573303223,
"learning_rate": 2.797202797202797e-08,
"loss": 2.1982650756835938,
"step": 2
},
{
"epoch": 0.008438818565400843,
"grad_norm": 12.766752243041992,
"learning_rate": 8.391608391608391e-08,
"loss": 1.7798584699630737,
"step": 4
},
{
"epoch": 0.012658227848101266,
"grad_norm": 2.7050275802612305,
"learning_rate": 1.3986013986013987e-07,
"loss": 1.9378855228424072,
"step": 6
},
{
"epoch": 0.016877637130801686,
"grad_norm": 6.62382173538208,
"learning_rate": 1.958041958041958e-07,
"loss": 1.9494550228118896,
"step": 8
},
{
"epoch": 0.02109704641350211,
"grad_norm": 6.674213886260986,
"learning_rate": 2.517482517482518e-07,
"loss": 1.8559846878051758,
"step": 10
},
{
"epoch": 0.02531645569620253,
"grad_norm": 2.3814892768859863,
"learning_rate": 3.076923076923077e-07,
"loss": 1.3210176229476929,
"step": 12
},
{
"epoch": 0.029535864978902954,
"grad_norm": 10.13178825378418,
"learning_rate": 3.636363636363636e-07,
"loss": 1.6618098020553589,
"step": 14
},
{
"epoch": 0.03375527426160337,
"grad_norm": 10.594144821166992,
"learning_rate": 4.1958041958041957e-07,
"loss": 2.217015266418457,
"step": 16
},
{
"epoch": 0.0379746835443038,
"grad_norm": 2.019131898880005,
"learning_rate": 4.755244755244755e-07,
"loss": 1.8345531225204468,
"step": 18
},
{
"epoch": 0.04219409282700422,
"grad_norm": 2.9380502700805664,
"learning_rate": 5.314685314685314e-07,
"loss": 1.9022338390350342,
"step": 20
},
{
"epoch": 0.046413502109704644,
"grad_norm": 2.7503366470336914,
"learning_rate": 5.874125874125873e-07,
"loss": 1.5993010997772217,
"step": 22
},
{
"epoch": 0.05063291139240506,
"grad_norm": 9.334077835083008,
"learning_rate": 6.433566433566433e-07,
"loss": 1.841583013534546,
"step": 24
},
{
"epoch": 0.05485232067510549,
"grad_norm": 4.110686302185059,
"learning_rate": 6.993006993006993e-07,
"loss": 1.7132441997528076,
"step": 26
},
{
"epoch": 0.05907172995780591,
"grad_norm": 5.482414245605469,
"learning_rate": 7.552447552447552e-07,
"loss": 1.9471144676208496,
"step": 28
},
{
"epoch": 0.06329113924050633,
"grad_norm": 7.981461524963379,
"learning_rate": 8.111888111888111e-07,
"loss": 1.8706644773483276,
"step": 30
},
{
"epoch": 0.06751054852320675,
"grad_norm": 2.825488567352295,
"learning_rate": 8.67132867132867e-07,
"loss": 1.7588139772415161,
"step": 32
},
{
"epoch": 0.07172995780590717,
"grad_norm": 27.41690444946289,
"learning_rate": 9.230769230769231e-07,
"loss": 1.775272011756897,
"step": 34
},
{
"epoch": 0.0759493670886076,
"grad_norm": 3.1006486415863037,
"learning_rate": 9.79020979020979e-07,
"loss": 1.784650444984436,
"step": 36
},
{
"epoch": 0.08016877637130802,
"grad_norm": 7.034008026123047,
"learning_rate": 1.034965034965035e-06,
"loss": 1.5920319557189941,
"step": 38
},
{
"epoch": 0.08438818565400844,
"grad_norm": 2.675238847732544,
"learning_rate": 1.0909090909090908e-06,
"loss": 1.7519234418869019,
"step": 40
},
{
"epoch": 0.08860759493670886,
"grad_norm": 10.898767471313477,
"learning_rate": 1.1468531468531469e-06,
"loss": 1.3292922973632812,
"step": 42
},
{
"epoch": 0.09282700421940929,
"grad_norm": 5.946654796600342,
"learning_rate": 1.2027972027972026e-06,
"loss": 1.9151337146759033,
"step": 44
},
{
"epoch": 0.0970464135021097,
"grad_norm": 4.006372451782227,
"learning_rate": 1.2587412587412587e-06,
"loss": 1.734480619430542,
"step": 46
},
{
"epoch": 0.10126582278481013,
"grad_norm": 1.6106147766113281,
"learning_rate": 1.3146853146853144e-06,
"loss": 1.6714043617248535,
"step": 48
},
{
"epoch": 0.10548523206751055,
"grad_norm": 3.555082321166992,
"learning_rate": 1.3706293706293705e-06,
"loss": 0.9601479172706604,
"step": 50
},
{
"epoch": 0.10970464135021098,
"grad_norm": 18.376201629638672,
"learning_rate": 1.4265734265734267e-06,
"loss": 0.9682204723358154,
"step": 52
},
{
"epoch": 0.11392405063291139,
"grad_norm": 3.829577684402466,
"learning_rate": 1.4825174825174824e-06,
"loss": 0.9149891138076782,
"step": 54
},
{
"epoch": 0.11814345991561181,
"grad_norm": 8.751733779907227,
"learning_rate": 1.5384615384615385e-06,
"loss": 1.5466492176055908,
"step": 56
},
{
"epoch": 0.12236286919831224,
"grad_norm": 43.25166702270508,
"learning_rate": 1.5944055944055942e-06,
"loss": 0.8738414645195007,
"step": 58
},
{
"epoch": 0.12658227848101267,
"grad_norm": 2.858604669570923,
"learning_rate": 1.6503496503496503e-06,
"loss": 1.5882339477539062,
"step": 60
},
{
"epoch": 0.1308016877637131,
"grad_norm": 2.080610990524292,
"learning_rate": 1.7062937062937063e-06,
"loss": 1.6133513450622559,
"step": 62
},
{
"epoch": 0.1350210970464135,
"grad_norm": 1.6210132837295532,
"learning_rate": 1.7622377622377622e-06,
"loss": 1.1352812051773071,
"step": 64
},
{
"epoch": 0.13924050632911392,
"grad_norm": 4.165830135345459,
"learning_rate": 1.818181818181818e-06,
"loss": 0.8928266763687134,
"step": 66
},
{
"epoch": 0.14345991561181434,
"grad_norm": 2.4804110527038574,
"learning_rate": 1.874125874125874e-06,
"loss": 1.182489275932312,
"step": 68
},
{
"epoch": 0.14767932489451477,
"grad_norm": 11.683263778686523,
"learning_rate": 1.9300699300699297e-06,
"loss": 1.0528309345245361,
"step": 70
},
{
"epoch": 0.1518987341772152,
"grad_norm": 5.113679885864258,
"learning_rate": 1.986013986013986e-06,
"loss": 1.3555092811584473,
"step": 72
},
{
"epoch": 0.15611814345991562,
"grad_norm": 3.419110059738159,
"learning_rate": 2.041958041958042e-06,
"loss": 1.1131813526153564,
"step": 74
},
{
"epoch": 0.16033755274261605,
"grad_norm": 5.5904622077941895,
"learning_rate": 2.097902097902098e-06,
"loss": 0.9376708269119263,
"step": 76
},
{
"epoch": 0.16455696202531644,
"grad_norm": 4.4593892097473145,
"learning_rate": 2.1538461538461538e-06,
"loss": 1.4518260955810547,
"step": 78
},
{
"epoch": 0.16877637130801687,
"grad_norm": 1.9147013425827026,
"learning_rate": 2.2097902097902093e-06,
"loss": 1.4421272277832031,
"step": 80
},
{
"epoch": 0.1729957805907173,
"grad_norm": 4.915895462036133,
"learning_rate": 2.2657342657342656e-06,
"loss": 1.4590272903442383,
"step": 82
},
{
"epoch": 0.17721518987341772,
"grad_norm": 6.905501842498779,
"learning_rate": 2.3216783216783215e-06,
"loss": 0.9708279371261597,
"step": 84
},
{
"epoch": 0.18143459915611815,
"grad_norm": 7.524752140045166,
"learning_rate": 2.3776223776223774e-06,
"loss": 1.141646385192871,
"step": 86
},
{
"epoch": 0.18565400843881857,
"grad_norm": 1.9856427907943726,
"learning_rate": 2.4335664335664338e-06,
"loss": 1.3669147491455078,
"step": 88
},
{
"epoch": 0.189873417721519,
"grad_norm": 5.223474025726318,
"learning_rate": 2.4895104895104893e-06,
"loss": 0.6930243968963623,
"step": 90
},
{
"epoch": 0.1940928270042194,
"grad_norm": 3.9480249881744385,
"learning_rate": 2.545454545454545e-06,
"loss": 1.7789967060089111,
"step": 92
},
{
"epoch": 0.19831223628691982,
"grad_norm": 6.213054180145264,
"learning_rate": 2.601398601398601e-06,
"loss": 0.9946894645690918,
"step": 94
},
{
"epoch": 0.20253164556962025,
"grad_norm": 2.132254123687744,
"learning_rate": 2.6573426573426574e-06,
"loss": 1.4530797004699707,
"step": 96
},
{
"epoch": 0.20675105485232068,
"grad_norm": 1.8356496095657349,
"learning_rate": 2.7132867132867134e-06,
"loss": 1.5200846195220947,
"step": 98
},
{
"epoch": 0.2109704641350211,
"grad_norm": 14.19537353515625,
"learning_rate": 2.769230769230769e-06,
"loss": 1.292062759399414,
"step": 100
},
{
"epoch": 0.21518987341772153,
"grad_norm": 2.111111640930176,
"learning_rate": 2.8251748251748248e-06,
"loss": 1.1042956113815308,
"step": 102
},
{
"epoch": 0.21940928270042195,
"grad_norm": 1.8971158266067505,
"learning_rate": 2.881118881118881e-06,
"loss": 1.0220731496810913,
"step": 104
},
{
"epoch": 0.22362869198312235,
"grad_norm": 5.727835178375244,
"learning_rate": 2.937062937062937e-06,
"loss": 1.0205355882644653,
"step": 106
},
{
"epoch": 0.22784810126582278,
"grad_norm": 3.1581368446350098,
"learning_rate": 2.993006993006993e-06,
"loss": 1.0161347389221191,
"step": 108
},
{
"epoch": 0.2320675105485232,
"grad_norm": 2.3190581798553467,
"learning_rate": 3.0489510489510484e-06,
"loss": 1.0544636249542236,
"step": 110
},
{
"epoch": 0.23628691983122363,
"grad_norm": 5.929664611816406,
"learning_rate": 3.1048951048951048e-06,
"loss": 1.4253602027893066,
"step": 112
},
{
"epoch": 0.24050632911392406,
"grad_norm": 2.6725683212280273,
"learning_rate": 3.1608391608391607e-06,
"loss": 1.318920612335205,
"step": 114
},
{
"epoch": 0.24472573839662448,
"grad_norm": 7.776963710784912,
"learning_rate": 3.2167832167832166e-06,
"loss": 1.6443480253219604,
"step": 116
},
{
"epoch": 0.2489451476793249,
"grad_norm": 2.3923261165618896,
"learning_rate": 3.272727272727273e-06,
"loss": 1.3153703212738037,
"step": 118
},
{
"epoch": 0.25316455696202533,
"grad_norm": 3.2848472595214844,
"learning_rate": 3.3286713286713284e-06,
"loss": 1.0184035301208496,
"step": 120
},
{
"epoch": 0.25738396624472576,
"grad_norm": 4.440483093261719,
"learning_rate": 3.3846153846153843e-06,
"loss": 1.312201976776123,
"step": 122
},
{
"epoch": 0.2616033755274262,
"grad_norm": 4.970678806304932,
"learning_rate": 3.4405594405594402e-06,
"loss": 1.3157330751419067,
"step": 124
},
{
"epoch": 0.26582278481012656,
"grad_norm": 3.659862995147705,
"learning_rate": 3.4965034965034966e-06,
"loss": 1.4062931537628174,
"step": 126
},
{
"epoch": 0.270042194092827,
"grad_norm": 4.357997894287109,
"learning_rate": 3.5524475524475525e-06,
"loss": 0.9154614210128784,
"step": 128
},
{
"epoch": 0.2742616033755274,
"grad_norm": 4.5792341232299805,
"learning_rate": 3.608391608391608e-06,
"loss": 1.1704046726226807,
"step": 130
},
{
"epoch": 0.27848101265822783,
"grad_norm": 5.039772033691406,
"learning_rate": 3.664335664335664e-06,
"loss": 1.2377243041992188,
"step": 132
},
{
"epoch": 0.28270042194092826,
"grad_norm": 6.672406196594238,
"learning_rate": 3.7202797202797202e-06,
"loss": 0.7351927757263184,
"step": 134
},
{
"epoch": 0.2869198312236287,
"grad_norm": 2.329267740249634,
"learning_rate": 3.776223776223776e-06,
"loss": 0.9117053151130676,
"step": 136
},
{
"epoch": 0.2911392405063291,
"grad_norm": 4.902188777923584,
"learning_rate": 3.832167832167832e-06,
"loss": 1.4102413654327393,
"step": 138
},
{
"epoch": 0.29535864978902954,
"grad_norm": 7.462285041809082,
"learning_rate": 3.888111888111888e-06,
"loss": 0.9595804214477539,
"step": 140
},
{
"epoch": 0.29957805907172996,
"grad_norm": 4.3409953117370605,
"learning_rate": 3.944055944055944e-06,
"loss": 1.2982699871063232,
"step": 142
},
{
"epoch": 0.3037974683544304,
"grad_norm": 5.797815799713135,
"learning_rate": 4e-06,
"loss": 1.0992412567138672,
"step": 144
},
{
"epoch": 0.3080168776371308,
"grad_norm": 3.4705042839050293,
"learning_rate": 3.999995129731755e-06,
"loss": 1.4175902605056763,
"step": 146
},
{
"epoch": 0.31223628691983124,
"grad_norm": 3.2805113792419434,
"learning_rate": 3.999980518953377e-06,
"loss": 1.3948296308517456,
"step": 148
},
{
"epoch": 0.31645569620253167,
"grad_norm": 2.5500190258026123,
"learning_rate": 3.9999561677439284e-06,
"loss": 1.2504572868347168,
"step": 150
},
{
"epoch": 0.3206751054852321,
"grad_norm": 2.943164825439453,
"learning_rate": 3.999922076235186e-06,
"loss": 1.3152413368225098,
"step": 152
},
{
"epoch": 0.32489451476793246,
"grad_norm": 1.8291728496551514,
"learning_rate": 3.999878244611632e-06,
"loss": 1.4914839267730713,
"step": 154
},
{
"epoch": 0.3291139240506329,
"grad_norm": 3.691744327545166,
"learning_rate": 3.999824673110458e-06,
"loss": 1.2806551456451416,
"step": 156
},
{
"epoch": 0.3333333333333333,
"grad_norm": 3.6490440368652344,
"learning_rate": 3.999761362021559e-06,
"loss": 1.3481640815734863,
"step": 158
},
{
"epoch": 0.33755274261603374,
"grad_norm": 2.0211308002471924,
"learning_rate": 3.999688311687539e-06,
"loss": 1.3426798582077026,
"step": 160
},
{
"epoch": 0.34177215189873417,
"grad_norm": 3.4758718013763428,
"learning_rate": 3.9996055225037035e-06,
"loss": 0.8756759762763977,
"step": 162
},
{
"epoch": 0.3459915611814346,
"grad_norm": 3.027031660079956,
"learning_rate": 3.999512994918057e-06,
"loss": 1.2513983249664307,
"step": 164
},
{
"epoch": 0.350210970464135,
"grad_norm": 4.0340094566345215,
"learning_rate": 3.999410729431306e-06,
"loss": 0.83528733253479,
"step": 166
},
{
"epoch": 0.35443037974683544,
"grad_norm": 4.2334747314453125,
"learning_rate": 3.9992987265968506e-06,
"loss": 1.2495150566101074,
"step": 168
},
{
"epoch": 0.35864978902953587,
"grad_norm": 2.4250214099884033,
"learning_rate": 3.999176987020782e-06,
"loss": 1.3424336910247803,
"step": 170
},
{
"epoch": 0.3628691983122363,
"grad_norm": 2.0446016788482666,
"learning_rate": 3.999045511361886e-06,
"loss": 1.2304866313934326,
"step": 172
},
{
"epoch": 0.3670886075949367,
"grad_norm": 2.2647955417633057,
"learning_rate": 3.998904300331629e-06,
"loss": 1.0302658081054688,
"step": 174
},
{
"epoch": 0.37130801687763715,
"grad_norm": 4.148885250091553,
"learning_rate": 3.998753354694162e-06,
"loss": 1.3435766696929932,
"step": 176
},
{
"epoch": 0.3755274261603376,
"grad_norm": 2.1456167697906494,
"learning_rate": 3.998592675266313e-06,
"loss": 1.3384077548980713,
"step": 178
},
{
"epoch": 0.379746835443038,
"grad_norm": 1.8021888732910156,
"learning_rate": 3.998422262917586e-06,
"loss": 1.0130809545516968,
"step": 180
},
{
"epoch": 0.38396624472573837,
"grad_norm": 1.8628857135772705,
"learning_rate": 3.99824211857015e-06,
"loss": 1.3068010807037354,
"step": 182
},
{
"epoch": 0.3881856540084388,
"grad_norm": 2.337610960006714,
"learning_rate": 3.998052243198841e-06,
"loss": 1.3072583675384521,
"step": 184
},
{
"epoch": 0.3924050632911392,
"grad_norm": 4.762563228607178,
"learning_rate": 3.997852637831152e-06,
"loss": 0.5184736847877502,
"step": 186
},
{
"epoch": 0.39662447257383965,
"grad_norm": 5.280208110809326,
"learning_rate": 3.9976433035472296e-06,
"loss": 0.9710695743560791,
"step": 188
},
{
"epoch": 0.4008438818565401,
"grad_norm": 2.887589693069458,
"learning_rate": 3.997424241479867e-06,
"loss": 1.0692715644836426,
"step": 190
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.577860951423645,
"learning_rate": 3.997195452814498e-06,
"loss": 1.315537452697754,
"step": 192
},
{
"epoch": 0.4092827004219409,
"grad_norm": 3.5055530071258545,
"learning_rate": 3.996956938789193e-06,
"loss": 1.0743625164031982,
"step": 194
},
{
"epoch": 0.41350210970464135,
"grad_norm": 2.70391583442688,
"learning_rate": 3.996708700694647e-06,
"loss": 1.2994472980499268,
"step": 196
},
{
"epoch": 0.4177215189873418,
"grad_norm": 2.665532112121582,
"learning_rate": 3.99645073987418e-06,
"loss": 1.0376091003417969,
"step": 198
},
{
"epoch": 0.4219409282700422,
"grad_norm": 3.4091718196868896,
"learning_rate": 3.9961830577237225e-06,
"loss": 1.1265370845794678,
"step": 200
},
{
"epoch": 0.42616033755274263,
"grad_norm": 3.360374689102173,
"learning_rate": 3.9959056556918125e-06,
"loss": 1.1382226943969727,
"step": 202
},
{
"epoch": 0.43037974683544306,
"grad_norm": 3.247422218322754,
"learning_rate": 3.9956185352795864e-06,
"loss": 0.9122767448425293,
"step": 204
},
{
"epoch": 0.4345991561181435,
"grad_norm": 3.775322198867798,
"learning_rate": 3.995321698040768e-06,
"loss": 1.5471869707107544,
"step": 206
},
{
"epoch": 0.4388185654008439,
"grad_norm": 11.316990852355957,
"learning_rate": 3.995015145581668e-06,
"loss": 0.7269084453582764,
"step": 208
},
{
"epoch": 0.4430379746835443,
"grad_norm": 1.767858862876892,
"learning_rate": 3.994698879561165e-06,
"loss": 1.2886333465576172,
"step": 210
},
{
"epoch": 0.4472573839662447,
"grad_norm": 3.727637767791748,
"learning_rate": 3.994372901690705e-06,
"loss": 0.8034701943397522,
"step": 212
},
{
"epoch": 0.45147679324894513,
"grad_norm": 2.0933773517608643,
"learning_rate": 3.994037213734287e-06,
"loss": 1.209691047668457,
"step": 214
},
{
"epoch": 0.45569620253164556,
"grad_norm": 2.345202684402466,
"learning_rate": 3.993691817508457e-06,
"loss": 1.2683181762695312,
"step": 216
},
{
"epoch": 0.459915611814346,
"grad_norm": 6.4172868728637695,
"learning_rate": 3.993336714882294e-06,
"loss": 1.3031342029571533,
"step": 218
},
{
"epoch": 0.4641350210970464,
"grad_norm": 4.881870269775391,
"learning_rate": 3.992971907777404e-06,
"loss": 1.259873390197754,
"step": 220
},
{
"epoch": 0.46835443037974683,
"grad_norm": 4.619325637817383,
"learning_rate": 3.992597398167907e-06,
"loss": 1.2213921546936035,
"step": 222
},
{
"epoch": 0.47257383966244726,
"grad_norm": 2.6401724815368652,
"learning_rate": 3.99221318808043e-06,
"loss": 1.2425501346588135,
"step": 224
},
{
"epoch": 0.4767932489451477,
"grad_norm": 2.318206548690796,
"learning_rate": 3.9918192795940875e-06,
"loss": 1.2931036949157715,
"step": 226
},
{
"epoch": 0.4810126582278481,
"grad_norm": 3.360222339630127,
"learning_rate": 3.991415674840482e-06,
"loss": 0.7865722179412842,
"step": 228
},
{
"epoch": 0.48523206751054854,
"grad_norm": 7.906117916107178,
"learning_rate": 3.9910023760036835e-06,
"loss": 0.920839250087738,
"step": 230
},
{
"epoch": 0.48945147679324896,
"grad_norm": 4.246833324432373,
"learning_rate": 3.99057938532022e-06,
"loss": 0.8984707593917847,
"step": 232
},
{
"epoch": 0.4936708860759494,
"grad_norm": 1.9855449199676514,
"learning_rate": 3.990146705079069e-06,
"loss": 1.2834184169769287,
"step": 234
},
{
"epoch": 0.4978902953586498,
"grad_norm": 2.732619285583496,
"learning_rate": 3.989704337621639e-06,
"loss": 1.3313374519348145,
"step": 236
},
{
"epoch": 0.5021097046413502,
"grad_norm": 2.2487165927886963,
"learning_rate": 3.989252285341761e-06,
"loss": 0.9914782047271729,
"step": 238
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2.918333053588867,
"learning_rate": 3.988790550685677e-06,
"loss": 0.4503798186779022,
"step": 240
},
{
"epoch": 0.510548523206751,
"grad_norm": 5.367257118225098,
"learning_rate": 3.98831913615202e-06,
"loss": 1.4287300109863281,
"step": 242
},
{
"epoch": 0.5147679324894515,
"grad_norm": 4.372511863708496,
"learning_rate": 3.987838044291807e-06,
"loss": 0.8704193830490112,
"step": 244
},
{
"epoch": 0.5189873417721519,
"grad_norm": 2.685379981994629,
"learning_rate": 3.987347277708424e-06,
"loss": 1.4937043190002441,
"step": 246
},
{
"epoch": 0.5232067510548524,
"grad_norm": 2.241354465484619,
"learning_rate": 3.986846839057609e-06,
"loss": 1.2054930925369263,
"step": 248
},
{
"epoch": 0.5274261603375527,
"grad_norm": 2.666008472442627,
"learning_rate": 3.98633673104744e-06,
"loss": 1.322192907333374,
"step": 250
},
{
"epoch": 0.5316455696202531,
"grad_norm": 3.0313169956207275,
"learning_rate": 3.985816956438322e-06,
"loss": 1.1353508234024048,
"step": 252
},
{
"epoch": 0.5358649789029536,
"grad_norm": 1.7615196704864502,
"learning_rate": 3.985287518042965e-06,
"loss": 1.2446702718734741,
"step": 254
},
{
"epoch": 0.540084388185654,
"grad_norm": 2.7614693641662598,
"learning_rate": 3.984748418726381e-06,
"loss": 1.2152833938598633,
"step": 256
},
{
"epoch": 0.5443037974683544,
"grad_norm": 1.3947678804397583,
"learning_rate": 3.9841996614058536e-06,
"loss": 1.0362350940704346,
"step": 258
},
{
"epoch": 0.5485232067510548,
"grad_norm": 3.6117563247680664,
"learning_rate": 3.983641249050933e-06,
"loss": 0.9856378436088562,
"step": 260
},
{
"epoch": 0.5527426160337553,
"grad_norm": 2.348914861679077,
"learning_rate": 3.983073184683419e-06,
"loss": 1.2900649309158325,
"step": 262
},
{
"epoch": 0.5569620253164557,
"grad_norm": 2.4478940963745117,
"learning_rate": 3.98249547137734e-06,
"loss": 1.30060613155365,
"step": 264
},
{
"epoch": 0.5611814345991561,
"grad_norm": 1.8957366943359375,
"learning_rate": 3.981908112258938e-06,
"loss": 1.2571529150009155,
"step": 266
},
{
"epoch": 0.5654008438818565,
"grad_norm": 2.468729257583618,
"learning_rate": 3.981311110506654e-06,
"loss": 1.522542119026184,
"step": 268
},
{
"epoch": 0.569620253164557,
"grad_norm": 5.101961612701416,
"learning_rate": 3.9807044693511086e-06,
"loss": 1.0608189105987549,
"step": 270
},
{
"epoch": 0.5738396624472574,
"grad_norm": 3.0331854820251465,
"learning_rate": 3.980088192075085e-06,
"loss": 1.3017442226409912,
"step": 272
},
{
"epoch": 0.5780590717299579,
"grad_norm": 2.463477373123169,
"learning_rate": 3.979462282013513e-06,
"loss": 1.099843144416809,
"step": 274
},
{
"epoch": 0.5822784810126582,
"grad_norm": 1.7117162942886353,
"learning_rate": 3.978826742553447e-06,
"loss": 1.2798070907592773,
"step": 276
},
{
"epoch": 0.5864978902953587,
"grad_norm": 3.3944342136383057,
"learning_rate": 3.978181577134051e-06,
"loss": 1.4166996479034424,
"step": 278
},
{
"epoch": 0.5907172995780591,
"grad_norm": 2.0399510860443115,
"learning_rate": 3.97752678924658e-06,
"loss": 0.9708434343338013,
"step": 280
},
{
"epoch": 0.5949367088607594,
"grad_norm": 5.146090984344482,
"learning_rate": 3.976862382434358e-06,
"loss": 1.3494899272918701,
"step": 282
},
{
"epoch": 0.5991561181434599,
"grad_norm": 2.0854623317718506,
"learning_rate": 3.976188360292762e-06,
"loss": 1.551278829574585,
"step": 284
},
{
"epoch": 0.6033755274261603,
"grad_norm": 0.7903197407722473,
"learning_rate": 3.975504726469204e-06,
"loss": 1.1335902214050293,
"step": 286
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.5145395994186401,
"learning_rate": 3.9748114846631025e-06,
"loss": 1.2714455127716064,
"step": 288
},
{
"epoch": 0.6118143459915611,
"grad_norm": 2.4970903396606445,
"learning_rate": 3.974108638625875e-06,
"loss": 0.8297945857048035,
"step": 290
},
{
"epoch": 0.6160337552742616,
"grad_norm": 1.9116922616958618,
"learning_rate": 3.973396192160909e-06,
"loss": 0.6557431221008301,
"step": 292
},
{
"epoch": 0.620253164556962,
"grad_norm": 1.597800374031067,
"learning_rate": 3.972674149123543e-06,
"loss": 1.251997709274292,
"step": 294
},
{
"epoch": 0.6244725738396625,
"grad_norm": 5.221956253051758,
"learning_rate": 3.971942513421049e-06,
"loss": 0.7073361873626709,
"step": 296
},
{
"epoch": 0.6286919831223629,
"grad_norm": 8.381784439086914,
"learning_rate": 3.971201289012605e-06,
"loss": 0.6594762802124023,
"step": 298
},
{
"epoch": 0.6329113924050633,
"grad_norm": 4.704819202423096,
"learning_rate": 3.97045047990928e-06,
"loss": 1.7869096994400024,
"step": 300
},
{
"epoch": 0.6371308016877637,
"grad_norm": 1.746824026107788,
"learning_rate": 3.969690090174009e-06,
"loss": 1.2827584743499756,
"step": 302
},
{
"epoch": 0.6413502109704642,
"grad_norm": 2.3811588287353516,
"learning_rate": 3.968920123921574e-06,
"loss": 0.8861095905303955,
"step": 304
},
{
"epoch": 0.6455696202531646,
"grad_norm": 2.874070644378662,
"learning_rate": 3.968140585318575e-06,
"loss": 1.0074717998504639,
"step": 306
},
{
"epoch": 0.6497890295358649,
"grad_norm": 1.4178441762924194,
"learning_rate": 3.967351478583417e-06,
"loss": 1.271646499633789,
"step": 308
},
{
"epoch": 0.6540084388185654,
"grad_norm": 2.7072203159332275,
"learning_rate": 3.9665528079862766e-06,
"loss": 1.2094981670379639,
"step": 310
},
{
"epoch": 0.6582278481012658,
"grad_norm": 2.434222936630249,
"learning_rate": 3.965744577849089e-06,
"loss": 1.016772747039795,
"step": 312
},
{
"epoch": 0.6624472573839663,
"grad_norm": 1.4761089086532593,
"learning_rate": 3.964926792545517e-06,
"loss": 1.2257163524627686,
"step": 314
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.9905054569244385,
"learning_rate": 3.964099456500932e-06,
"loss": 1.1116795539855957,
"step": 316
},
{
"epoch": 0.6708860759493671,
"grad_norm": 2.5824759006500244,
"learning_rate": 3.963262574192388e-06,
"loss": 1.0979809761047363,
"step": 318
},
{
"epoch": 0.6751054852320675,
"grad_norm": 2.126721143722534,
"learning_rate": 3.962416150148598e-06,
"loss": 1.0931775569915771,
"step": 320
},
{
"epoch": 0.679324894514768,
"grad_norm": 2.355828046798706,
"learning_rate": 3.961560188949909e-06,
"loss": 0.8760429620742798,
"step": 322
},
{
"epoch": 0.6835443037974683,
"grad_norm": 2.159811496734619,
"learning_rate": 3.9606946952282745e-06,
"loss": 0.8803830146789551,
"step": 324
},
{
"epoch": 0.6877637130801688,
"grad_norm": 2.930659294128418,
"learning_rate": 3.959819673667239e-06,
"loss": 0.8701751232147217,
"step": 326
},
{
"epoch": 0.6919831223628692,
"grad_norm": 1.615522027015686,
"learning_rate": 3.958935129001899e-06,
"loss": 0.8708148002624512,
"step": 328
},
{
"epoch": 0.6962025316455697,
"grad_norm": 5.590799331665039,
"learning_rate": 3.958041066018891e-06,
"loss": 1.591496229171753,
"step": 330
},
{
"epoch": 0.70042194092827,
"grad_norm": 3.333008050918579,
"learning_rate": 3.957137489556352e-06,
"loss": 1.1004414558410645,
"step": 332
},
{
"epoch": 0.7046413502109705,
"grad_norm": 2.3116865158081055,
"learning_rate": 3.956224404503906e-06,
"loss": 1.4001518487930298,
"step": 334
},
{
"epoch": 0.7088607594936709,
"grad_norm": 4.017354965209961,
"learning_rate": 3.955301815802629e-06,
"loss": 1.2720857858657837,
"step": 336
},
{
"epoch": 0.7130801687763713,
"grad_norm": 2.1855573654174805,
"learning_rate": 3.954369728445028e-06,
"loss": 1.2939956188201904,
"step": 338
},
{
"epoch": 0.7172995780590717,
"grad_norm": 2.4703359603881836,
"learning_rate": 3.953428147475006e-06,
"loss": 1.2735445499420166,
"step": 340
},
{
"epoch": 0.7215189873417721,
"grad_norm": 2.1738462448120117,
"learning_rate": 3.952477077987845e-06,
"loss": 1.2617197036743164,
"step": 342
},
{
"epoch": 0.7257383966244726,
"grad_norm": 2.704313278198242,
"learning_rate": 3.95151652513017e-06,
"loss": 1.0853008031845093,
"step": 344
},
{
"epoch": 0.729957805907173,
"grad_norm": 10.06601333618164,
"learning_rate": 3.950546494099926e-06,
"loss": 0.8921165466308594,
"step": 346
},
{
"epoch": 0.7341772151898734,
"grad_norm": 1.9999581575393677,
"learning_rate": 3.949566990146349e-06,
"loss": 1.256639003753662,
"step": 348
},
{
"epoch": 0.7383966244725738,
"grad_norm": 5.633319854736328,
"learning_rate": 3.948578018569932e-06,
"loss": 1.1841363906860352,
"step": 350
},
{
"epoch": 0.7426160337552743,
"grad_norm": 7.676711559295654,
"learning_rate": 3.94757958472241e-06,
"loss": 1.0944801568984985,
"step": 352
},
{
"epoch": 0.7468354430379747,
"grad_norm": 4.892640590667725,
"learning_rate": 3.946571694006712e-06,
"loss": 0.6508228182792664,
"step": 354
},
{
"epoch": 0.7510548523206751,
"grad_norm": 2.564443349838257,
"learning_rate": 3.945554351876951e-06,
"loss": 1.0562660694122314,
"step": 356
},
{
"epoch": 0.7552742616033755,
"grad_norm": 4.787500858306885,
"learning_rate": 3.94452756383838e-06,
"loss": 0.998831033706665,
"step": 358
},
{
"epoch": 0.759493670886076,
"grad_norm": 1.8746553659439087,
"learning_rate": 3.943491335447368e-06,
"loss": 1.2303812503814697,
"step": 360
},
{
"epoch": 0.7637130801687764,
"grad_norm": 2.7792534828186035,
"learning_rate": 3.942445672311373e-06,
"loss": 0.9920629858970642,
"step": 362
},
{
"epoch": 0.7679324894514767,
"grad_norm": 5.023082733154297,
"learning_rate": 3.941390580088905e-06,
"loss": 1.5890564918518066,
"step": 364
},
{
"epoch": 0.7721518987341772,
"grad_norm": 3.2253143787384033,
"learning_rate": 3.940326064489499e-06,
"loss": 0.7020189166069031,
"step": 366
},
{
"epoch": 0.7763713080168776,
"grad_norm": 4.751897811889648,
"learning_rate": 3.939252131273686e-06,
"loss": 1.1662057638168335,
"step": 368
},
{
"epoch": 0.7805907172995781,
"grad_norm": 4.707884788513184,
"learning_rate": 3.938168786252957e-06,
"loss": 1.490715742111206,
"step": 370
},
{
"epoch": 0.7848101265822784,
"grad_norm": 4.896017074584961,
"learning_rate": 3.937076035289735e-06,
"loss": 0.9990431070327759,
"step": 372
},
{
"epoch": 0.7890295358649789,
"grad_norm": 5.3917059898376465,
"learning_rate": 3.935973884297344e-06,
"loss": 1.06167471408844,
"step": 374
},
{
"epoch": 0.7932489451476793,
"grad_norm": 1.6713993549346924,
"learning_rate": 3.934862339239972e-06,
"loss": 1.1578385829925537,
"step": 376
},
{
"epoch": 0.7974683544303798,
"grad_norm": 1.267899990081787,
"learning_rate": 3.933741406132645e-06,
"loss": 1.1280488967895508,
"step": 378
},
{
"epoch": 0.8016877637130801,
"grad_norm": 2.5772478580474854,
"learning_rate": 3.932611091041192e-06,
"loss": 0.7228022217750549,
"step": 380
},
{
"epoch": 0.8059071729957806,
"grad_norm": 2.877981185913086,
"learning_rate": 3.931471400082208e-06,
"loss": 1.275989294052124,
"step": 382
},
{
"epoch": 0.810126582278481,
"grad_norm": 4.086211204528809,
"learning_rate": 3.930322339423029e-06,
"loss": 1.0468356609344482,
"step": 384
},
{
"epoch": 0.8143459915611815,
"grad_norm": 3.2680745124816895,
"learning_rate": 3.929163915281692e-06,
"loss": 1.2617956399917603,
"step": 386
},
{
"epoch": 0.8185654008438819,
"grad_norm": 2.128434181213379,
"learning_rate": 3.927996133926903e-06,
"loss": 0.9376715421676636,
"step": 388
},
{
"epoch": 0.8227848101265823,
"grad_norm": 1.895815372467041,
"learning_rate": 3.926819001678005e-06,
"loss": 1.2338812351226807,
"step": 390
},
{
"epoch": 0.8270042194092827,
"grad_norm": 2.774864435195923,
"learning_rate": 3.925632524904943e-06,
"loss": 0.9890301823616028,
"step": 392
},
{
"epoch": 0.8312236286919831,
"grad_norm": 3.1884961128234863,
"learning_rate": 3.924436710028228e-06,
"loss": 0.9189957976341248,
"step": 394
},
{
"epoch": 0.8354430379746836,
"grad_norm": 2.6990184783935547,
"learning_rate": 3.923231563518904e-06,
"loss": 1.2466810941696167,
"step": 396
},
{
"epoch": 0.8396624472573839,
"grad_norm": 0.5147901773452759,
"learning_rate": 3.922017091898511e-06,
"loss": 1.0888053178787231,
"step": 398
},
{
"epoch": 0.8438818565400844,
"grad_norm": 2.1018054485321045,
"learning_rate": 3.920793301739052e-06,
"loss": 0.9585396647453308,
"step": 400
},
{
"epoch": 0.8481012658227848,
"grad_norm": 4.2678046226501465,
"learning_rate": 3.9195601996629564e-06,
"loss": 0.702578067779541,
"step": 402
},
{
"epoch": 0.8523206751054853,
"grad_norm": 3.8625717163085938,
"learning_rate": 3.9183177923430445e-06,
"loss": 1.2020361423492432,
"step": 404
},
{
"epoch": 0.8565400843881856,
"grad_norm": 2.158465623855591,
"learning_rate": 3.917066086502491e-06,
"loss": 0.9442514181137085,
"step": 406
},
{
"epoch": 0.8607594936708861,
"grad_norm": 2.1818642616271973,
"learning_rate": 3.915805088914787e-06,
"loss": 0.9750051498413086,
"step": 408
},
{
"epoch": 0.8649789029535865,
"grad_norm": 1.9446742534637451,
"learning_rate": 3.914534806403707e-06,
"loss": 1.247662901878357,
"step": 410
},
{
"epoch": 0.869198312236287,
"grad_norm": 2.9858086109161377,
"learning_rate": 3.913255245843269e-06,
"loss": 0.8505547642707825,
"step": 412
},
{
"epoch": 0.8734177215189873,
"grad_norm": 3.3264975547790527,
"learning_rate": 3.911966414157699e-06,
"loss": 1.222496509552002,
"step": 414
},
{
"epoch": 0.8776371308016878,
"grad_norm": 1.9070676565170288,
"learning_rate": 3.910668318321395e-06,
"loss": 1.0650990009307861,
"step": 416
},
{
"epoch": 0.8818565400843882,
"grad_norm": 49.54351806640625,
"learning_rate": 3.90936096535888e-06,
"loss": 1.170435905456543,
"step": 418
},
{
"epoch": 0.8860759493670886,
"grad_norm": 3.324521064758301,
"learning_rate": 3.90804436234478e-06,
"loss": 0.9483715295791626,
"step": 420
},
{
"epoch": 0.890295358649789,
"grad_norm": 2.001574754714966,
"learning_rate": 3.9067185164037705e-06,
"loss": 1.3522322177886963,
"step": 422
},
{
"epoch": 0.8945147679324894,
"grad_norm": 13.360381126403809,
"learning_rate": 3.905383434710546e-06,
"loss": 0.980687141418457,
"step": 424
},
{
"epoch": 0.8987341772151899,
"grad_norm": 2.832037925720215,
"learning_rate": 3.904039124489782e-06,
"loss": 1.1890883445739746,
"step": 426
},
{
"epoch": 0.9029535864978903,
"grad_norm": 3.036261796951294,
"learning_rate": 3.902685593016088e-06,
"loss": 1.0536837577819824,
"step": 428
},
{
"epoch": 0.9071729957805907,
"grad_norm": 3.503538131713867,
"learning_rate": 3.90132284761398e-06,
"loss": 1.016420602798462,
"step": 430
},
{
"epoch": 0.9113924050632911,
"grad_norm": 2.102992534637451,
"learning_rate": 3.899950895657829e-06,
"loss": 1.0863244533538818,
"step": 432
},
{
"epoch": 0.9156118143459916,
"grad_norm": 2.5443339347839355,
"learning_rate": 3.8985697445718275e-06,
"loss": 1.2617383003234863,
"step": 434
},
{
"epoch": 0.919831223628692,
"grad_norm": 13.239272117614746,
"learning_rate": 3.8971794018299515e-06,
"loss": 0.8763201832771301,
"step": 436
},
{
"epoch": 0.9240506329113924,
"grad_norm": 1.944677710533142,
"learning_rate": 3.895779874955913e-06,
"loss": 1.2141039371490479,
"step": 438
},
{
"epoch": 0.9282700421940928,
"grad_norm": 1.6930376291275024,
"learning_rate": 3.894371171523124e-06,
"loss": 0.9925521016120911,
"step": 440
},
{
"epoch": 0.9324894514767933,
"grad_norm": 2.417435646057129,
"learning_rate": 3.892953299154657e-06,
"loss": 0.9523521661758423,
"step": 442
},
{
"epoch": 0.9367088607594937,
"grad_norm": 4.125819683074951,
"learning_rate": 3.8915262655231985e-06,
"loss": 1.1057894229888916,
"step": 444
},
{
"epoch": 0.9409282700421941,
"grad_norm": 5.843780517578125,
"learning_rate": 3.890090078351011e-06,
"loss": 1.3123371601104736,
"step": 446
},
{
"epoch": 0.9451476793248945,
"grad_norm": 1.6658388376235962,
"learning_rate": 3.8886447454098914e-06,
"loss": 1.013564109802246,
"step": 448
},
{
"epoch": 0.9493670886075949,
"grad_norm": 12.320473670959473,
"learning_rate": 3.887190274521128e-06,
"loss": 0.9290477633476257,
"step": 450
},
{
"epoch": 0.9535864978902954,
"grad_norm": 1.9203139543533325,
"learning_rate": 3.885726673555457e-06,
"loss": 1.2885007858276367,
"step": 452
},
{
"epoch": 0.9578059071729957,
"grad_norm": 1.4699382781982422,
"learning_rate": 3.884253950433022e-06,
"loss": 1.0010005235671997,
"step": 454
},
{
"epoch": 0.9620253164556962,
"grad_norm": 2.8639562129974365,
"learning_rate": 3.882772113123332e-06,
"loss": 1.1654586791992188,
"step": 456
},
{
"epoch": 0.9662447257383966,
"grad_norm": 3.0785443782806396,
"learning_rate": 3.881281169645212e-06,
"loss": 0.8937104940414429,
"step": 458
},
{
"epoch": 0.9704641350210971,
"grad_norm": 4.43109655380249,
"learning_rate": 3.879781128066771e-06,
"loss": 0.7110123634338379,
"step": 460
},
{
"epoch": 0.9746835443037974,
"grad_norm": 2.914869785308838,
"learning_rate": 3.878271996505345e-06,
"loss": 0.8978859186172485,
"step": 462
},
{
"epoch": 0.9789029535864979,
"grad_norm": 2.207864999771118,
"learning_rate": 3.876753783127464e-06,
"loss": 1.1789137125015259,
"step": 464
},
{
"epoch": 0.9831223628691983,
"grad_norm": 3.6072731018066406,
"learning_rate": 3.875226496148799e-06,
"loss": 0.7317770719528198,
"step": 466
},
{
"epoch": 0.9873417721518988,
"grad_norm": 6.211583614349365,
"learning_rate": 3.873690143834129e-06,
"loss": 1.3279008865356445,
"step": 468
},
{
"epoch": 0.9915611814345991,
"grad_norm": 8.5204496383667,
"learning_rate": 3.872144734497281e-06,
"loss": 0.8850146532058716,
"step": 470
},
{
"epoch": 0.9957805907172996,
"grad_norm": 2.666997194290161,
"learning_rate": 3.870590276501099e-06,
"loss": 1.220442295074463,
"step": 472
},
{
"epoch": 1.0,
"grad_norm": 3.6749136447906494,
"learning_rate": 3.869026778257392e-06,
"loss": 1.2717642784118652,
"step": 474
},
{
"epoch": 1.0042194092827004,
"grad_norm": 5.584980010986328,
"learning_rate": 3.867454248226887e-06,
"loss": 1.0543200969696045,
"step": 476
},
{
"epoch": 1.0084388185654007,
"grad_norm": 4.494807243347168,
"learning_rate": 3.86587269491919e-06,
"loss": 1.1978576183319092,
"step": 478
},
{
"epoch": 1.0126582278481013,
"grad_norm": 2.680006504058838,
"learning_rate": 3.86428212689273e-06,
"loss": 1.1809672117233276,
"step": 480
},
{
"epoch": 1.0168776371308017,
"grad_norm": 3.9369754791259766,
"learning_rate": 3.862682552754722e-06,
"loss": 0.9172142744064331,
"step": 482
},
{
"epoch": 1.021097046413502,
"grad_norm": 7.680518627166748,
"learning_rate": 3.861073981161118e-06,
"loss": 1.1449049711227417,
"step": 484
},
{
"epoch": 1.0253164556962024,
"grad_norm": 2.746133804321289,
"learning_rate": 3.859456420816556e-06,
"loss": 0.5115264654159546,
"step": 486
},
{
"epoch": 1.029535864978903,
"grad_norm": 2.72514271736145,
"learning_rate": 3.857829880474316e-06,
"loss": 0.9918684363365173,
"step": 488
},
{
"epoch": 1.0337552742616034,
"grad_norm": 2.223912000656128,
"learning_rate": 3.856194368936275e-06,
"loss": 0.8463398814201355,
"step": 490
},
{
"epoch": 1.0379746835443038,
"grad_norm": 2.9955148696899414,
"learning_rate": 3.8545498950528535e-06,
"loss": 1.173925757408142,
"step": 492
},
{
"epoch": 1.0421940928270041,
"grad_norm": 4.594770431518555,
"learning_rate": 3.852896467722974e-06,
"loss": 0.8562051057815552,
"step": 494
},
{
"epoch": 1.0464135021097047,
"grad_norm": 9.129888534545898,
"learning_rate": 3.851234095894007e-06,
"loss": 0.9281083345413208,
"step": 496
},
{
"epoch": 1.0506329113924051,
"grad_norm": 2.604607105255127,
"learning_rate": 3.849562788561727e-06,
"loss": 1.2945480346679688,
"step": 498
},
{
"epoch": 1.0548523206751055,
"grad_norm": 2.3840718269348145,
"learning_rate": 3.847882554770263e-06,
"loss": 1.1486706733703613,
"step": 500
},
{
"epoch": 1.0590717299578059,
"grad_norm": 1.9679715633392334,
"learning_rate": 3.846193403612046e-06,
"loss": 1.1716930866241455,
"step": 502
},
{
"epoch": 1.0632911392405062,
"grad_norm": 1.7950235605239868,
"learning_rate": 3.844495344227765e-06,
"loss": 1.2809019088745117,
"step": 504
},
{
"epoch": 1.0675105485232068,
"grad_norm": 2.0246713161468506,
"learning_rate": 3.842788385806312e-06,
"loss": 0.7856377363204956,
"step": 506
},
{
"epoch": 1.0717299578059072,
"grad_norm": 2.0895354747772217,
"learning_rate": 3.841072537584741e-06,
"loss": 1.1074151992797852,
"step": 508
},
{
"epoch": 1.0759493670886076,
"grad_norm": 2.316358804702759,
"learning_rate": 3.8393478088482065e-06,
"loss": 1.1439809799194336,
"step": 510
},
{
"epoch": 1.080168776371308,
"grad_norm": 4.703127384185791,
"learning_rate": 3.837614208929921e-06,
"loss": 1.035994291305542,
"step": 512
},
{
"epoch": 1.0843881856540085,
"grad_norm": 7.031744003295898,
"learning_rate": 3.835871747211105e-06,
"loss": 1.151397705078125,
"step": 514
},
{
"epoch": 1.0886075949367089,
"grad_norm": 2.653866767883301,
"learning_rate": 3.83412043312093e-06,
"loss": 1.16837739944458,
"step": 516
},
{
"epoch": 1.0928270042194093,
"grad_norm": 2.976186752319336,
"learning_rate": 3.832360276136474e-06,
"loss": 0.9901262521743774,
"step": 518
},
{
"epoch": 1.0970464135021096,
"grad_norm": 4.738975524902344,
"learning_rate": 3.830591285782666e-06,
"loss": 0.9500905871391296,
"step": 520
},
{
"epoch": 1.1012658227848102,
"grad_norm": 8.483416557312012,
"learning_rate": 3.828813471632237e-06,
"loss": 0.8555248975753784,
"step": 522
},
{
"epoch": 1.1054852320675106,
"grad_norm": 4.0885467529296875,
"learning_rate": 3.827026843305667e-06,
"loss": 1.0695732831954956,
"step": 524
},
{
"epoch": 1.109704641350211,
"grad_norm": 2.929239273071289,
"learning_rate": 3.825231410471132e-06,
"loss": 0.868694543838501,
"step": 526
},
{
"epoch": 1.1139240506329113,
"grad_norm": 2.6514179706573486,
"learning_rate": 3.823427182844455e-06,
"loss": 1.3674180507659912,
"step": 528
},
{
"epoch": 1.1181434599156117,
"grad_norm": 3.984480142593384,
"learning_rate": 3.821614170189049e-06,
"loss": 1.2144532203674316,
"step": 530
},
{
"epoch": 1.1223628691983123,
"grad_norm": 7.298747539520264,
"learning_rate": 3.819792382315868e-06,
"loss": 0.6592221260070801,
"step": 532
},
{
"epoch": 1.1265822784810127,
"grad_norm": 5.481675624847412,
"learning_rate": 3.81796182908335e-06,
"loss": 1.1008317470550537,
"step": 534
},
{
"epoch": 1.130801687763713,
"grad_norm": 2.6566853523254395,
"learning_rate": 3.816122520397369e-06,
"loss": 1.1687147617340088,
"step": 536
},
{
"epoch": 1.1350210970464134,
"grad_norm": 2.098435163497925,
"learning_rate": 3.8142744662111767e-06,
"loss": 0.8460148572921753,
"step": 538
},
{
"epoch": 1.139240506329114,
"grad_norm": 2.0900216102600098,
"learning_rate": 3.81241767652535e-06,
"loss": 0.7578733563423157,
"step": 540
},
{
"epoch": 1.1434599156118144,
"grad_norm": 2.375847578048706,
"learning_rate": 3.8105521613877386e-06,
"loss": 0.8102576732635498,
"step": 542
},
{
"epoch": 1.1476793248945147,
"grad_norm": 3.2528064250946045,
"learning_rate": 3.8086779308934066e-06,
"loss": 0.8352131247520447,
"step": 544
},
{
"epoch": 1.1518987341772151,
"grad_norm": 2.7880918979644775,
"learning_rate": 3.8067949951845836e-06,
"loss": 1.108149766921997,
"step": 546
},
{
"epoch": 1.1561181434599157,
"grad_norm": 2.515939712524414,
"learning_rate": 3.8049033644506043e-06,
"loss": 1.1225923299789429,
"step": 548
},
{
"epoch": 1.160337552742616,
"grad_norm": 7.062304973602295,
"learning_rate": 3.8030030489278563e-06,
"loss": 0.9247970581054688,
"step": 550
},
{
"epoch": 1.1645569620253164,
"grad_norm": 4.359371662139893,
"learning_rate": 3.8010940588997253e-06,
"loss": 1.4258188009262085,
"step": 552
},
{
"epoch": 1.1687763713080168,
"grad_norm": 2.2747061252593994,
"learning_rate": 3.799176404696537e-06,
"loss": 1.1855448484420776,
"step": 554
},
{
"epoch": 1.1729957805907172,
"grad_norm": 4.772888660430908,
"learning_rate": 3.797250096695503e-06,
"loss": 0.6528091430664062,
"step": 556
},
{
"epoch": 1.1772151898734178,
"grad_norm": 6.059512138366699,
"learning_rate": 3.7953151453206635e-06,
"loss": 1.0413281917572021,
"step": 558
},
{
"epoch": 1.1814345991561181,
"grad_norm": 3.8079075813293457,
"learning_rate": 3.793371561042833e-06,
"loss": 0.6656049489974976,
"step": 560
},
{
"epoch": 1.1856540084388185,
"grad_norm": 3.2168707847595215,
"learning_rate": 3.791419354379541e-06,
"loss": 0.8556336164474487,
"step": 562
},
{
"epoch": 1.189873417721519,
"grad_norm": 6.392472267150879,
"learning_rate": 3.7894585358949758e-06,
"loss": 1.3849632740020752,
"step": 564
},
{
"epoch": 1.1940928270042195,
"grad_norm": 6.333314418792725,
"learning_rate": 3.78748911619993e-06,
"loss": 1.1986020803451538,
"step": 566
},
{
"epoch": 1.1983122362869199,
"grad_norm": 3.8843421936035156,
"learning_rate": 3.7855111059517376e-06,
"loss": 0.834921658039093,
"step": 568
},
{
"epoch": 1.2025316455696202,
"grad_norm": 2.22169828414917,
"learning_rate": 3.7835245158542225e-06,
"loss": 1.1095911264419556,
"step": 570
},
{
"epoch": 1.2067510548523206,
"grad_norm": 2.5398857593536377,
"learning_rate": 3.7815293566576367e-06,
"loss": 1.06223464012146,
"step": 572
},
{
"epoch": 1.2109704641350212,
"grad_norm": 1.9426056146621704,
"learning_rate": 3.779525639158602e-06,
"loss": 1.1437506675720215,
"step": 574
},
{
"epoch": 1.2151898734177216,
"grad_norm": 3.523289203643799,
"learning_rate": 3.7775133742000542e-06,
"loss": 0.9638210535049438,
"step": 576
},
{
"epoch": 1.219409282700422,
"grad_norm": 2.9455223083496094,
"learning_rate": 3.7754925726711832e-06,
"loss": 0.6213325262069702,
"step": 578
},
{
"epoch": 1.2236286919831223,
"grad_norm": 1.926129698753357,
"learning_rate": 3.773463245507371e-06,
"loss": 0.9760810732841492,
"step": 580
},
{
"epoch": 1.2278481012658227,
"grad_norm": 5.75839900970459,
"learning_rate": 3.7714254036901382e-06,
"loss": 0.8893729448318481,
"step": 582
},
{
"epoch": 1.2320675105485233,
"grad_norm": 2.0400707721710205,
"learning_rate": 3.7693790582470815e-06,
"loss": 0.7321144342422485,
"step": 584
},
{
"epoch": 1.2362869198312236,
"grad_norm": 9.54411792755127,
"learning_rate": 3.767324220251812e-06,
"loss": 0.938395082950592,
"step": 586
},
{
"epoch": 1.240506329113924,
"grad_norm": 3.1993234157562256,
"learning_rate": 3.7652609008238994e-06,
"loss": 0.8318843841552734,
"step": 588
},
{
"epoch": 1.2447257383966246,
"grad_norm": 2.4239490032196045,
"learning_rate": 3.76318911112881e-06,
"loss": 1.1875081062316895,
"step": 590
},
{
"epoch": 1.248945147679325,
"grad_norm": 7.202500820159912,
"learning_rate": 3.761108862377844e-06,
"loss": 0.6182510852813721,
"step": 592
},
{
"epoch": 1.2531645569620253,
"grad_norm": 1.383612871170044,
"learning_rate": 3.75902016582808e-06,
"loss": 0.8994504809379578,
"step": 594
},
{
"epoch": 1.2573839662447257,
"grad_norm": 4.613704204559326,
"learning_rate": 3.756923032782309e-06,
"loss": 0.7695854902267456,
"step": 596
},
{
"epoch": 1.261603375527426,
"grad_norm": 3.9212303161621094,
"learning_rate": 3.754817474588976e-06,
"loss": 0.6324819922447205,
"step": 598
},
{
"epoch": 1.2658227848101267,
"grad_norm": 2.7459237575531006,
"learning_rate": 3.752703502642118e-06,
"loss": 1.0705938339233398,
"step": 600
},
{
"epoch": 1.270042194092827,
"grad_norm": 6.447327613830566,
"learning_rate": 3.7505811283813028e-06,
"loss": 1.4245244264602661,
"step": 602
},
{
"epoch": 1.2742616033755274,
"grad_norm": 1.7515556812286377,
"learning_rate": 3.7484503632915642e-06,
"loss": 1.0706822872161865,
"step": 604
},
{
"epoch": 1.2784810126582278,
"grad_norm": 4.614502429962158,
"learning_rate": 3.7463112189033452e-06,
"loss": 0.9431329965591431,
"step": 606
},
{
"epoch": 1.2827004219409281,
"grad_norm": 8.263338088989258,
"learning_rate": 3.7441637067924314e-06,
"loss": 0.8352319598197937,
"step": 608
},
{
"epoch": 1.2869198312236287,
"grad_norm": 3.6502585411071777,
"learning_rate": 3.7420078385798895e-06,
"loss": 0.9339005351066589,
"step": 610
},
{
"epoch": 1.2911392405063291,
"grad_norm": 8.820695877075195,
"learning_rate": 3.739843625932004e-06,
"loss": 0.6273094415664673,
"step": 612
},
{
"epoch": 1.2953586497890295,
"grad_norm": 2.1156527996063232,
"learning_rate": 3.737671080560215e-06,
"loss": 0.6872820854187012,
"step": 614
},
{
"epoch": 1.29957805907173,
"grad_norm": 2.442565679550171,
"learning_rate": 3.7354902142210548e-06,
"loss": 1.1194093227386475,
"step": 616
},
{
"epoch": 1.3037974683544304,
"grad_norm": 1.8104244470596313,
"learning_rate": 3.7333010387160834e-06,
"loss": 1.1286826133728027,
"step": 618
},
{
"epoch": 1.3080168776371308,
"grad_norm": 2.462080955505371,
"learning_rate": 3.7311035658918248e-06,
"loss": 0.7162832617759705,
"step": 620
},
{
"epoch": 1.3122362869198312,
"grad_norm": 3.075747013092041,
"learning_rate": 3.728897807639705e-06,
"loss": 0.9384140968322754,
"step": 622
},
{
"epoch": 1.3164556962025316,
"grad_norm": 30.50847053527832,
"learning_rate": 3.7266837758959825e-06,
"loss": 0.8814220428466797,
"step": 624
},
{
"epoch": 1.3206751054852321,
"grad_norm": 2.7363264560699463,
"learning_rate": 3.7244614826416896e-06,
"loss": 1.1194790601730347,
"step": 626
},
{
"epoch": 1.3248945147679325,
"grad_norm": 11.446985244750977,
"learning_rate": 3.722230939902565e-06,
"loss": 1.6146903038024902,
"step": 628
},
{
"epoch": 1.3291139240506329,
"grad_norm": 1.5937474966049194,
"learning_rate": 3.7199921597489876e-06,
"loss": 0.8981386423110962,
"step": 630
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.8236477375030518,
"learning_rate": 3.717745154295913e-06,
"loss": 1.0962973833084106,
"step": 632
},
{
"epoch": 1.3375527426160336,
"grad_norm": 1.031929850578308,
"learning_rate": 3.7154899357028072e-06,
"loss": 0.8632595539093018,
"step": 634
},
{
"epoch": 1.3417721518987342,
"grad_norm": 6.748950958251953,
"learning_rate": 3.7132265161735803e-06,
"loss": 0.6589536666870117,
"step": 636
},
{
"epoch": 1.3459915611814346,
"grad_norm": 9.24288558959961,
"learning_rate": 3.710954907956522e-06,
"loss": 0.8823557496070862,
"step": 638
},
{
"epoch": 1.350210970464135,
"grad_norm": 5.132577419281006,
"learning_rate": 3.7086751233442327e-06,
"loss": 1.2359545230865479,
"step": 640
},
{
"epoch": 1.3544303797468356,
"grad_norm": 2.1931583881378174,
"learning_rate": 3.7063871746735615e-06,
"loss": 0.839038610458374,
"step": 642
},
{
"epoch": 1.358649789029536,
"grad_norm": 1.920567512512207,
"learning_rate": 3.704091074325534e-06,
"loss": 1.2603816986083984,
"step": 644
},
{
"epoch": 1.3628691983122363,
"grad_norm": 1.3721178770065308,
"learning_rate": 3.7017868347252882e-06,
"loss": 1.1347554922103882,
"step": 646
},
{
"epoch": 1.3670886075949367,
"grad_norm": 6.712429523468018,
"learning_rate": 3.699474468342008e-06,
"loss": 0.8782555460929871,
"step": 648
},
{
"epoch": 1.371308016877637,
"grad_norm": 3.626140594482422,
"learning_rate": 3.6971539876888525e-06,
"loss": 1.3546593189239502,
"step": 650
},
{
"epoch": 1.3755274261603376,
"grad_norm": 2.531872034072876,
"learning_rate": 3.694825405322894e-06,
"loss": 1.1074378490447998,
"step": 652
},
{
"epoch": 1.379746835443038,
"grad_norm": 1.418874740600586,
"learning_rate": 3.692488733845044e-06,
"loss": 0.8609563112258911,
"step": 654
},
{
"epoch": 1.3839662447257384,
"grad_norm": 1.9295591115951538,
"learning_rate": 3.690143985899987e-06,
"loss": 1.2149752378463745,
"step": 656
},
{
"epoch": 1.3881856540084387,
"grad_norm": 9.573609352111816,
"learning_rate": 3.687791174176115e-06,
"loss": 0.6435118317604065,
"step": 658
},
{
"epoch": 1.3924050632911391,
"grad_norm": 2.0520520210266113,
"learning_rate": 3.685430311405453e-06,
"loss": 1.1482752561569214,
"step": 660
},
{
"epoch": 1.3966244725738397,
"grad_norm": 5.835472583770752,
"learning_rate": 3.6830614103635977e-06,
"loss": 0.6969774961471558,
"step": 662
},
{
"epoch": 1.40084388185654,
"grad_norm": 1.448106288909912,
"learning_rate": 3.6806844838696397e-06,
"loss": 1.1494622230529785,
"step": 664
},
{
"epoch": 1.4050632911392404,
"grad_norm": 2.3839871883392334,
"learning_rate": 3.6782995447861017e-06,
"loss": 0.7210063934326172,
"step": 666
},
{
"epoch": 1.409282700421941,
"grad_norm": 3.103909492492676,
"learning_rate": 3.675906606018865e-06,
"loss": 1.1002976894378662,
"step": 668
},
{
"epoch": 1.4135021097046414,
"grad_norm": 1.7114917039871216,
"learning_rate": 3.6735056805171012e-06,
"loss": 1.154873013496399,
"step": 670
},
{
"epoch": 1.4177215189873418,
"grad_norm": 3.427095651626587,
"learning_rate": 3.6710967812731994e-06,
"loss": 1.3804283142089844,
"step": 672
},
{
"epoch": 1.4219409282700421,
"grad_norm": 2.9029994010925293,
"learning_rate": 3.6686799213226984e-06,
"loss": 0.7311358451843262,
"step": 674
},
{
"epoch": 1.4261603375527425,
"grad_norm": 2.845263719558716,
"learning_rate": 3.666255113744218e-06,
"loss": 0.6623574495315552,
"step": 676
},
{
"epoch": 1.4303797468354431,
"grad_norm": 5.403914451599121,
"learning_rate": 3.663822371659383e-06,
"loss": 0.9805995225906372,
"step": 678
},
{
"epoch": 1.4345991561181435,
"grad_norm": 3.444819927215576,
"learning_rate": 3.6613817082327565e-06,
"loss": 1.088465690612793,
"step": 680
},
{
"epoch": 1.4388185654008439,
"grad_norm": 4.646100997924805,
"learning_rate": 3.658933136671767e-06,
"loss": 0.8819342851638794,
"step": 682
},
{
"epoch": 1.4430379746835442,
"grad_norm": 3.1290183067321777,
"learning_rate": 3.656476670226637e-06,
"loss": 1.2142698764801025,
"step": 684
},
{
"epoch": 1.4472573839662446,
"grad_norm": 4.68398904800415,
"learning_rate": 3.6540123221903123e-06,
"loss": 0.7775373458862305,
"step": 686
},
{
"epoch": 1.4514767932489452,
"grad_norm": 3.9637718200683594,
"learning_rate": 3.651540105898387e-06,
"loss": 0.9440705180168152,
"step": 688
},
{
"epoch": 1.4556962025316456,
"grad_norm": 6.741257190704346,
"learning_rate": 3.6490600347290353e-06,
"loss": 1.0546112060546875,
"step": 690
},
{
"epoch": 1.459915611814346,
"grad_norm": 4.779881000518799,
"learning_rate": 3.6465721221029376e-06,
"loss": 0.7046493887901306,
"step": 692
},
{
"epoch": 1.4641350210970465,
"grad_norm": 5.674314498901367,
"learning_rate": 3.6440763814832075e-06,
"loss": 1.2944858074188232,
"step": 694
},
{
"epoch": 1.4683544303797469,
"grad_norm": 2.4671552181243896,
"learning_rate": 3.6415728263753176e-06,
"loss": 0.6650893688201904,
"step": 696
},
{
"epoch": 1.4725738396624473,
"grad_norm": 3.0560495853424072,
"learning_rate": 3.63906147032703e-06,
"loss": 1.177491545677185,
"step": 698
},
{
"epoch": 1.4767932489451476,
"grad_norm": 2.7282063961029053,
"learning_rate": 3.6365423269283187e-06,
"loss": 1.2095248699188232,
"step": 700
},
{
"epoch": 1.481012658227848,
"grad_norm": 5.56691837310791,
"learning_rate": 3.6340154098113e-06,
"loss": 1.0211296081542969,
"step": 702
},
{
"epoch": 1.4852320675105486,
"grad_norm": 11.867128372192383,
"learning_rate": 3.631480732650156e-06,
"loss": 0.8005210161209106,
"step": 704
},
{
"epoch": 1.489451476793249,
"grad_norm": 1.5090935230255127,
"learning_rate": 3.6289383091610625e-06,
"loss": 1.1544265747070312,
"step": 706
},
{
"epoch": 1.4936708860759493,
"grad_norm": 1.969177484512329,
"learning_rate": 3.626388153102113e-06,
"loss": 1.180321455001831,
"step": 708
},
{
"epoch": 1.49789029535865,
"grad_norm": 1.4724305868148804,
"learning_rate": 3.6238302782732446e-06,
"loss": 1.0343523025512695,
"step": 710
},
{
"epoch": 1.50210970464135,
"grad_norm": 4.455009937286377,
"learning_rate": 3.621264698516166e-06,
"loss": 0.48465144634246826,
"step": 712
},
{
"epoch": 1.5063291139240507,
"grad_norm": 2.1380884647369385,
"learning_rate": 3.6186914277142776e-06,
"loss": 1.1161589622497559,
"step": 714
},
{
"epoch": 1.510548523206751,
"grad_norm": 3.7489266395568848,
"learning_rate": 3.6161104797926013e-06,
"loss": 1.091984510421753,
"step": 716
},
{
"epoch": 1.5147679324894514,
"grad_norm": 2.2989237308502197,
"learning_rate": 3.613521868717703e-06,
"loss": 1.1017979383468628,
"step": 718
},
{
"epoch": 1.518987341772152,
"grad_norm": 4.086328506469727,
"learning_rate": 3.6109256084976147e-06,
"loss": 1.0278382301330566,
"step": 720
},
{
"epoch": 1.5232067510548524,
"grad_norm": 4.82416296005249,
"learning_rate": 3.608321713181764e-06,
"loss": 1.198899745941162,
"step": 722
},
{
"epoch": 1.5274261603375527,
"grad_norm": 2.247619867324829,
"learning_rate": 3.6057101968608936e-06,
"loss": 1.2113308906555176,
"step": 724
},
{
"epoch": 1.5316455696202531,
"grad_norm": 5.557096004486084,
"learning_rate": 3.603091073666987e-06,
"loss": 0.5562316179275513,
"step": 726
},
{
"epoch": 1.5358649789029535,
"grad_norm": 8.159991264343262,
"learning_rate": 3.600464357773191e-06,
"loss": 0.414279580116272,
"step": 728
},
{
"epoch": 1.540084388185654,
"grad_norm": 2.0832576751708984,
"learning_rate": 3.5978300633937403e-06,
"loss": 0.9449454545974731,
"step": 730
},
{
"epoch": 1.5443037974683544,
"grad_norm": 2.1067464351654053,
"learning_rate": 3.5951882047838798e-06,
"loss": 0.9659292101860046,
"step": 732
},
{
"epoch": 1.5485232067510548,
"grad_norm": 1.711477518081665,
"learning_rate": 3.5925387962397866e-06,
"loss": 1.1613965034484863,
"step": 734
},
{
"epoch": 1.5527426160337554,
"grad_norm": 3.1845133304595947,
"learning_rate": 3.589881852098495e-06,
"loss": 0.864007830619812,
"step": 736
},
{
"epoch": 1.5569620253164556,
"grad_norm": 3.9110360145568848,
"learning_rate": 3.5872173867378177e-06,
"loss": 0.902462363243103,
"step": 738
},
{
"epoch": 1.5611814345991561,
"grad_norm": 3.437896490097046,
"learning_rate": 3.5845454145762657e-06,
"loss": 1.0834063291549683,
"step": 740
},
{
"epoch": 1.5654008438818565,
"grad_norm": 1.5851118564605713,
"learning_rate": 3.5818659500729735e-06,
"loss": 0.7697902917861938,
"step": 742
},
{
"epoch": 1.5696202531645569,
"grad_norm": 7.4633588790893555,
"learning_rate": 3.5791790077276214e-06,
"loss": 0.5523649454116821,
"step": 744
},
{
"epoch": 1.5738396624472575,
"grad_norm": 1.9582291841506958,
"learning_rate": 3.576484602080352e-06,
"loss": 0.6860834360122681,
"step": 746
},
{
"epoch": 1.5780590717299579,
"grad_norm": 3.9132864475250244,
"learning_rate": 3.573782747711697e-06,
"loss": 0.6468961834907532,
"step": 748
},
{
"epoch": 1.5822784810126582,
"grad_norm": 2.304565906524658,
"learning_rate": 3.571073459242498e-06,
"loss": 1.1524250507354736,
"step": 750
},
{
"epoch": 1.5864978902953588,
"grad_norm": 2.1101715564727783,
"learning_rate": 3.56835675133382e-06,
"loss": 0.7160176038742065,
"step": 752
},
{
"epoch": 1.590717299578059,
"grad_norm": 2.8462789058685303,
"learning_rate": 3.565632638686884e-06,
"loss": 0.7810688018798828,
"step": 754
},
{
"epoch": 1.5949367088607596,
"grad_norm": 2.3834588527679443,
"learning_rate": 3.562901136042977e-06,
"loss": 0.6207853555679321,
"step": 756
},
{
"epoch": 1.59915611814346,
"grad_norm": 3.6158013343811035,
"learning_rate": 3.560162258183377e-06,
"loss": 0.8702360987663269,
"step": 758
},
{
"epoch": 1.6033755274261603,
"grad_norm": 2.5689971446990967,
"learning_rate": 3.5574160199292737e-06,
"loss": 1.1135127544403076,
"step": 760
},
{
"epoch": 1.6075949367088609,
"grad_norm": 1.0458358526229858,
"learning_rate": 3.5546624361416855e-06,
"loss": 0.7249690294265747,
"step": 762
},
{
"epoch": 1.611814345991561,
"grad_norm": 1.9451916217803955,
"learning_rate": 3.55190152172138e-06,
"loss": 1.1511328220367432,
"step": 764
},
{
"epoch": 1.6160337552742616,
"grad_norm": 3.351893901824951,
"learning_rate": 3.549133291608796e-06,
"loss": 1.0460021495819092,
"step": 766
},
{
"epoch": 1.620253164556962,
"grad_norm": 4.358265399932861,
"learning_rate": 3.5463577607839588e-06,
"loss": 0.9370321035385132,
"step": 768
},
{
"epoch": 1.6244725738396624,
"grad_norm": 3.3822832107543945,
"learning_rate": 3.5435749442664016e-06,
"loss": 1.1469030380249023,
"step": 770
},
{
"epoch": 1.628691983122363,
"grad_norm": 2.77669358253479,
"learning_rate": 3.540784857115084e-06,
"loss": 1.1965186595916748,
"step": 772
},
{
"epoch": 1.6329113924050633,
"grad_norm": 2.8289971351623535,
"learning_rate": 3.537987514428307e-06,
"loss": 1.1629645824432373,
"step": 774
},
{
"epoch": 1.6371308016877637,
"grad_norm": 2.216648817062378,
"learning_rate": 3.535182931343638e-06,
"loss": 1.1647021770477295,
"step": 776
},
{
"epoch": 1.6413502109704643,
"grad_norm": 8.33935546875,
"learning_rate": 3.5323711230378236e-06,
"loss": 0.8370733261108398,
"step": 778
},
{
"epoch": 1.6455696202531644,
"grad_norm": 11.60359001159668,
"learning_rate": 3.5295521047267085e-06,
"loss": 0.3443516492843628,
"step": 780
},
{
"epoch": 1.649789029535865,
"grad_norm": 2.730212688446045,
"learning_rate": 3.5267258916651543e-06,
"loss": 1.091811180114746,
"step": 782
},
{
"epoch": 1.6540084388185654,
"grad_norm": 4.5352888107299805,
"learning_rate": 3.5238924991469567e-06,
"loss": 0.916614830493927,
"step": 784
},
{
"epoch": 1.6582278481012658,
"grad_norm": 11.15390682220459,
"learning_rate": 3.5210519425047618e-06,
"loss": 1.0898263454437256,
"step": 786
},
{
"epoch": 1.6624472573839664,
"grad_norm": 4.555877208709717,
"learning_rate": 3.518204237109983e-06,
"loss": 0.5768306255340576,
"step": 788
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.4780551195144653,
"learning_rate": 3.51534939837272e-06,
"loss": 1.1375391483306885,
"step": 790
},
{
"epoch": 1.6708860759493671,
"grad_norm": 6.170176982879639,
"learning_rate": 3.5124874417416734e-06,
"loss": 0.6376422643661499,
"step": 792
},
{
"epoch": 1.6751054852320675,
"grad_norm": 0.6833944916725159,
"learning_rate": 3.509618382704061e-06,
"loss": 0.900169849395752,
"step": 794
},
{
"epoch": 1.6793248945147679,
"grad_norm": 1.5629899501800537,
"learning_rate": 3.5067422367855364e-06,
"loss": 1.1173095703125,
"step": 796
},
{
"epoch": 1.6835443037974684,
"grad_norm": 3.2922439575195312,
"learning_rate": 3.5038590195501006e-06,
"loss": 0.8964512348175049,
"step": 798
},
{
"epoch": 1.6877637130801688,
"grad_norm": 4.654068470001221,
"learning_rate": 3.5009687466000224e-06,
"loss": 1.2155747413635254,
"step": 800
},
{
"epoch": 1.6919831223628692,
"grad_norm": 7.5080437660217285,
"learning_rate": 3.498071433575751e-06,
"loss": 0.5988451242446899,
"step": 802
},
{
"epoch": 1.6962025316455698,
"grad_norm": 1.9102202653884888,
"learning_rate": 3.495167096155834e-06,
"loss": 1.2323973178863525,
"step": 804
},
{
"epoch": 1.70042194092827,
"grad_norm": 3.7390973567962646,
"learning_rate": 3.4922557500568272e-06,
"loss": 1.1244511604309082,
"step": 806
},
{
"epoch": 1.7046413502109705,
"grad_norm": 32.09159851074219,
"learning_rate": 3.489337411033217e-06,
"loss": 0.8318772912025452,
"step": 808
},
{
"epoch": 1.7088607594936709,
"grad_norm": 3.8693466186523438,
"learning_rate": 3.48641209487733e-06,
"loss": 0.8628280758857727,
"step": 810
},
{
"epoch": 1.7130801687763713,
"grad_norm": 2.1568024158477783,
"learning_rate": 3.4834798174192476e-06,
"loss": 1.1509721279144287,
"step": 812
},
{
"epoch": 1.7172995780590719,
"grad_norm": 6.118010997772217,
"learning_rate": 3.4805405945267245e-06,
"loss": 1.4755480289459229,
"step": 814
},
{
"epoch": 1.721518987341772,
"grad_norm": 1.7534123659133911,
"learning_rate": 3.4775944421050976e-06,
"loss": 1.1487780809402466,
"step": 816
},
{
"epoch": 1.7257383966244726,
"grad_norm": 12.342169761657715,
"learning_rate": 3.4746413760972033e-06,
"loss": 0.8102009296417236,
"step": 818
},
{
"epoch": 1.729957805907173,
"grad_norm": 7.229720115661621,
"learning_rate": 3.4716814124832895e-06,
"loss": 0.38379400968551636,
"step": 820
},
{
"epoch": 1.7341772151898733,
"grad_norm": 5.0790886878967285,
"learning_rate": 3.468714567280931e-06,
"loss": 0.6532369256019592,
"step": 822
},
{
"epoch": 1.738396624472574,
"grad_norm": 9.148484230041504,
"learning_rate": 3.4657408565449413e-06,
"loss": 0.7519415616989136,
"step": 824
},
{
"epoch": 1.7426160337552743,
"grad_norm": 3.0881879329681396,
"learning_rate": 3.4627602963672854e-06,
"loss": 0.9758714437484741,
"step": 826
},
{
"epoch": 1.7468354430379747,
"grad_norm": 4.036842346191406,
"learning_rate": 3.459772902876994e-06,
"loss": 0.9723775386810303,
"step": 828
},
{
"epoch": 1.7510548523206753,
"grad_norm": 2.8862991333007812,
"learning_rate": 3.4567786922400757e-06,
"loss": 1.1287617683410645,
"step": 830
},
{
"epoch": 1.7552742616033754,
"grad_norm": 2.3025224208831787,
"learning_rate": 3.4537776806594293e-06,
"loss": 1.1016814708709717,
"step": 832
},
{
"epoch": 1.759493670886076,
"grad_norm": 2.3911185264587402,
"learning_rate": 3.4507698843747567e-06,
"loss": 0.8698973655700684,
"step": 834
},
{
"epoch": 1.7637130801687764,
"grad_norm": 2.9084486961364746,
"learning_rate": 3.4477553196624734e-06,
"loss": 1.1183581352233887,
"step": 836
},
{
"epoch": 1.7679324894514767,
"grad_norm": 2.349198579788208,
"learning_rate": 3.444734002835624e-06,
"loss": 1.0136666297912598,
"step": 838
},
{
"epoch": 1.7721518987341773,
"grad_norm": 3.0843915939331055,
"learning_rate": 3.441705950243789e-06,
"loss": 0.8606936931610107,
"step": 840
},
{
"epoch": 1.7763713080168775,
"grad_norm": 3.7463226318359375,
"learning_rate": 3.4386711782729996e-06,
"loss": 0.9574577808380127,
"step": 842
},
{
"epoch": 1.780590717299578,
"grad_norm": 1.5658468008041382,
"learning_rate": 3.4356297033456496e-06,
"loss": 0.46850845217704773,
"step": 844
},
{
"epoch": 1.7848101265822784,
"grad_norm": 1.2881762981414795,
"learning_rate": 3.432581541920404e-06,
"loss": 0.7656896114349365,
"step": 846
},
{
"epoch": 1.7890295358649788,
"grad_norm": 4.494737148284912,
"learning_rate": 3.429526710492111e-06,
"loss": 0.6177375912666321,
"step": 848
},
{
"epoch": 1.7932489451476794,
"grad_norm": 2.9015707969665527,
"learning_rate": 3.426465225591713e-06,
"loss": 0.8043622374534607,
"step": 850
},
{
"epoch": 1.7974683544303798,
"grad_norm": 2.33308482170105,
"learning_rate": 3.4233971037861587e-06,
"loss": 1.1262691020965576,
"step": 852
},
{
"epoch": 1.8016877637130801,
"grad_norm": 3.47825026512146,
"learning_rate": 3.4203223616783097e-06,
"loss": 1.4144643545150757,
"step": 854
},
{
"epoch": 1.8059071729957807,
"grad_norm": 0.613185703754425,
"learning_rate": 3.4172410159068545e-06,
"loss": 0.9285470247268677,
"step": 856
},
{
"epoch": 1.810126582278481,
"grad_norm": 4.0782623291015625,
"learning_rate": 3.414153083146215e-06,
"loss": 1.0604450702667236,
"step": 858
},
{
"epoch": 1.8143459915611815,
"grad_norm": 4.575808525085449,
"learning_rate": 3.411058580106458e-06,
"loss": 0.7167332172393799,
"step": 860
},
{
"epoch": 1.8185654008438819,
"grad_norm": 2.6179590225219727,
"learning_rate": 3.4079575235332077e-06,
"loss": 1.1503570079803467,
"step": 862
},
{
"epoch": 1.8227848101265822,
"grad_norm": 1.5741914510726929,
"learning_rate": 3.4048499302075485e-06,
"loss": 1.0776422023773193,
"step": 864
},
{
"epoch": 1.8270042194092828,
"grad_norm": 2.200496196746826,
"learning_rate": 3.40173581694594e-06,
"loss": 1.0765013694763184,
"step": 866
},
{
"epoch": 1.831223628691983,
"grad_norm": 7.11644172668457,
"learning_rate": 3.3986152006001233e-06,
"loss": 0.9683362245559692,
"step": 868
},
{
"epoch": 1.8354430379746836,
"grad_norm": 2.3128275871276855,
"learning_rate": 3.3954880980570296e-06,
"loss": 1.044558048248291,
"step": 870
},
{
"epoch": 1.839662447257384,
"grad_norm": 10.811915397644043,
"learning_rate": 3.392354526238691e-06,
"loss": 0.8069396615028381,
"step": 872
},
{
"epoch": 1.8438818565400843,
"grad_norm": 2.664677858352661,
"learning_rate": 3.3892145021021462e-06,
"loss": 0.9714232683181763,
"step": 874
},
{
"epoch": 1.8481012658227849,
"grad_norm": 2.777123212814331,
"learning_rate": 3.3860680426393515e-06,
"loss": 1.1506626605987549,
"step": 876
},
{
"epoch": 1.8523206751054853,
"grad_norm": 4.2269368171691895,
"learning_rate": 3.3829151648770855e-06,
"loss": 0.8257066011428833,
"step": 878
},
{
"epoch": 1.8565400843881856,
"grad_norm": 3.8701000213623047,
"learning_rate": 3.3797558858768593e-06,
"loss": 0.7449560761451721,
"step": 880
},
{
"epoch": 1.8607594936708862,
"grad_norm": 3.4201698303222656,
"learning_rate": 3.3765902227348255e-06,
"loss": 1.0331380367279053,
"step": 882
},
{
"epoch": 1.8649789029535864,
"grad_norm": 3.0394904613494873,
"learning_rate": 3.3734181925816826e-06,
"loss": 0.7403502464294434,
"step": 884
},
{
"epoch": 1.869198312236287,
"grad_norm": 2.232851266860962,
"learning_rate": 3.370239812582583e-06,
"loss": 0.7928322553634644,
"step": 886
},
{
"epoch": 1.8734177215189873,
"grad_norm": 1.918642282485962,
"learning_rate": 3.367055099937041e-06,
"loss": 1.0973682403564453,
"step": 888
},
{
"epoch": 1.8776371308016877,
"grad_norm": 4.839916229248047,
"learning_rate": 3.3638640718788406e-06,
"loss": 0.5104875564575195,
"step": 890
},
{
"epoch": 1.8818565400843883,
"grad_norm": 7.4713239669799805,
"learning_rate": 3.3606667456759397e-06,
"loss": 0.7245833873748779,
"step": 892
},
{
"epoch": 1.8860759493670884,
"grad_norm": 2.0137648582458496,
"learning_rate": 3.3574631386303797e-06,
"loss": 1.1190528869628906,
"step": 894
},
{
"epoch": 1.890295358649789,
"grad_norm": 1.844823956489563,
"learning_rate": 3.3542532680781876e-06,
"loss": 1.3033103942871094,
"step": 896
},
{
"epoch": 1.8945147679324894,
"grad_norm": 9.570866584777832,
"learning_rate": 3.351037151389287e-06,
"loss": 0.8090759515762329,
"step": 898
},
{
"epoch": 1.8987341772151898,
"grad_norm": 5.827152252197266,
"learning_rate": 3.3478148059674016e-06,
"loss": 1.06083083152771,
"step": 900
},
{
"epoch": 1.9029535864978904,
"grad_norm": 4.6404595375061035,
"learning_rate": 3.3445862492499595e-06,
"loss": 1.226179838180542,
"step": 902
},
{
"epoch": 1.9071729957805907,
"grad_norm": 4.473128318786621,
"learning_rate": 3.3413514987080043e-06,
"loss": 1.0048933029174805,
"step": 904
},
{
"epoch": 1.9113924050632911,
"grad_norm": 2.088918924331665,
"learning_rate": 3.338110571846093e-06,
"loss": 1.325439214706421,
"step": 906
},
{
"epoch": 1.9156118143459917,
"grad_norm": 7.492137432098389,
"learning_rate": 3.3348634862022074e-06,
"loss": 0.5317611694335938,
"step": 908
},
{
"epoch": 1.9198312236286919,
"grad_norm": 5.070749759674072,
"learning_rate": 3.331610259347657e-06,
"loss": 1.0684950351715088,
"step": 910
},
{
"epoch": 1.9240506329113924,
"grad_norm": 4.511446952819824,
"learning_rate": 3.328350908886983e-06,
"loss": 0.8111604452133179,
"step": 912
},
{
"epoch": 1.9282700421940928,
"grad_norm": 9.428959846496582,
"learning_rate": 3.3250854524578636e-06,
"loss": 1.1320171356201172,
"step": 914
},
{
"epoch": 1.9324894514767932,
"grad_norm": 13.564945220947266,
"learning_rate": 3.3218139077310206e-06,
"loss": 0.8104444742202759,
"step": 916
},
{
"epoch": 1.9367088607594938,
"grad_norm": 2.054192543029785,
"learning_rate": 3.3185362924101207e-06,
"loss": 1.0631756782531738,
"step": 918
},
{
"epoch": 1.9409282700421941,
"grad_norm": 3.2311954498291016,
"learning_rate": 3.315252624231682e-06,
"loss": 0.5999157428741455,
"step": 920
},
{
"epoch": 1.9451476793248945,
"grad_norm": 1.8943932056427002,
"learning_rate": 3.3119629209649763e-06,
"loss": 1.0982520580291748,
"step": 922
},
{
"epoch": 1.9493670886075949,
"grad_norm": 1.940902590751648,
"learning_rate": 3.3086672004119335e-06,
"loss": 1.226811408996582,
"step": 924
},
{
"epoch": 1.9535864978902953,
"grad_norm": 3.3977231979370117,
"learning_rate": 3.305365480407046e-06,
"loss": 0.9012327194213867,
"step": 926
},
{
"epoch": 1.9578059071729959,
"grad_norm": 3.1414709091186523,
"learning_rate": 3.3020577788172725e-06,
"loss": 0.7510135173797607,
"step": 928
},
{
"epoch": 1.9620253164556962,
"grad_norm": 2.9762823581695557,
"learning_rate": 3.2987441135419394e-06,
"loss": 1.1897534132003784,
"step": 930
},
{
"epoch": 1.9662447257383966,
"grad_norm": 3.8375062942504883,
"learning_rate": 3.2954245025126446e-06,
"loss": 0.9271247982978821,
"step": 932
},
{
"epoch": 1.9704641350210972,
"grad_norm": 1.9467542171478271,
"learning_rate": 3.292098963693163e-06,
"loss": 1.2084356546401978,
"step": 934
},
{
"epoch": 1.9746835443037973,
"grad_norm": 2.952320098876953,
"learning_rate": 3.2887675150793443e-06,
"loss": 1.1498595476150513,
"step": 936
},
{
"epoch": 1.978902953586498,
"grad_norm": 1.555445909500122,
"learning_rate": 3.2854301746990206e-06,
"loss": 0.8107820749282837,
"step": 938
},
{
"epoch": 1.9831223628691983,
"grad_norm": 1.9152470827102661,
"learning_rate": 3.2820869606119068e-06,
"loss": 1.1318726539611816,
"step": 940
},
{
"epoch": 1.9873417721518987,
"grad_norm": 3.219928026199341,
"learning_rate": 3.278737890909502e-06,
"loss": 0.9334742426872253,
"step": 942
},
{
"epoch": 1.9915611814345993,
"grad_norm": 1.992208480834961,
"learning_rate": 3.275382983714992e-06,
"loss": 0.7602829933166504,
"step": 944
},
{
"epoch": 1.9957805907172996,
"grad_norm": 2.6617956161499023,
"learning_rate": 3.272022257183153e-06,
"loss": 1.0931661128997803,
"step": 946
},
{
"epoch": 2.0,
"grad_norm": 12.275853157043457,
"learning_rate": 3.268655729500251e-06,
"loss": 0.5812578797340393,
"step": 948
},
{
"epoch": 2.0042194092827006,
"grad_norm": 3.4581050872802734,
"learning_rate": 3.265283418883945e-06,
"loss": 0.8604273200035095,
"step": 950
},
{
"epoch": 2.0084388185654007,
"grad_norm": 5.053099155426025,
"learning_rate": 3.2619053435831878e-06,
"loss": 0.6394712924957275,
"step": 952
},
{
"epoch": 2.0126582278481013,
"grad_norm": 2.949049711227417,
"learning_rate": 3.258521521878126e-06,
"loss": 0.8134095072746277,
"step": 954
},
{
"epoch": 2.0168776371308015,
"grad_norm": 3.0072250366210938,
"learning_rate": 3.2551319720800043e-06,
"loss": 0.9163396954536438,
"step": 956
},
{
"epoch": 2.021097046413502,
"grad_norm": 4.474330902099609,
"learning_rate": 3.251736712531063e-06,
"loss": 0.7234617471694946,
"step": 958
},
{
"epoch": 2.0253164556962027,
"grad_norm": 3.9642207622528076,
"learning_rate": 3.2483357616044418e-06,
"loss": 0.7650543451309204,
"step": 960
},
{
"epoch": 2.029535864978903,
"grad_norm": 4.6968793869018555,
"learning_rate": 3.244929137704076e-06,
"loss": 1.1930127143859863,
"step": 962
},
{
"epoch": 2.0337552742616034,
"grad_norm": 1.5408298969268799,
"learning_rate": 3.241516859264602e-06,
"loss": 0.7401737570762634,
"step": 964
},
{
"epoch": 2.037974683544304,
"grad_norm": 4.210058689117432,
"learning_rate": 3.238098944751256e-06,
"loss": 0.756514310836792,
"step": 966
},
{
"epoch": 2.042194092827004,
"grad_norm": 3.6998515129089355,
"learning_rate": 3.23467541265977e-06,
"loss": 0.750130295753479,
"step": 968
},
{
"epoch": 2.0464135021097047,
"grad_norm": 2.7548975944519043,
"learning_rate": 3.2312462815162777e-06,
"loss": 1.0819189548492432,
"step": 970
},
{
"epoch": 2.050632911392405,
"grad_norm": 4.967726707458496,
"learning_rate": 3.2278115698772116e-06,
"loss": 0.923316240310669,
"step": 972
},
{
"epoch": 2.0548523206751055,
"grad_norm": 2.2812294960021973,
"learning_rate": 3.2243712963292003e-06,
"loss": 0.8755730390548706,
"step": 974
},
{
"epoch": 2.059071729957806,
"grad_norm": 3.7565250396728516,
"learning_rate": 3.2209254794889724e-06,
"loss": 0.6916130781173706,
"step": 976
},
{
"epoch": 2.0632911392405062,
"grad_norm": 2.0674679279327393,
"learning_rate": 3.2174741380032523e-06,
"loss": 0.6281135082244873,
"step": 978
},
{
"epoch": 2.067510548523207,
"grad_norm": 3.7574315071105957,
"learning_rate": 3.2140172905486612e-06,
"loss": 0.7170443534851074,
"step": 980
},
{
"epoch": 2.071729957805907,
"grad_norm": 3.4279699325561523,
"learning_rate": 3.210554955831615e-06,
"loss": 1.0432848930358887,
"step": 982
},
{
"epoch": 2.0759493670886076,
"grad_norm": 2.687915802001953,
"learning_rate": 3.207087152588224e-06,
"loss": 0.9696755409240723,
"step": 984
},
{
"epoch": 2.080168776371308,
"grad_norm": 2.2797346115112305,
"learning_rate": 3.203613899584189e-06,
"loss": 1.0136628150939941,
"step": 986
},
{
"epoch": 2.0843881856540083,
"grad_norm": 2.3300132751464844,
"learning_rate": 3.2001352156147045e-06,
"loss": 1.0422950983047485,
"step": 988
},
{
"epoch": 2.088607594936709,
"grad_norm": 6.217328071594238,
"learning_rate": 3.1966511195043527e-06,
"loss": 0.5632253289222717,
"step": 990
},
{
"epoch": 2.0928270042194095,
"grad_norm": 2.278618335723877,
"learning_rate": 3.193161630107003e-06,
"loss": 0.5706143379211426,
"step": 992
},
{
"epoch": 2.0970464135021096,
"grad_norm": 2.097888946533203,
"learning_rate": 3.18966676630571e-06,
"loss": 1.1316472291946411,
"step": 994
},
{
"epoch": 2.1012658227848102,
"grad_norm": 4.8473286628723145,
"learning_rate": 3.186166547012612e-06,
"loss": 1.068217158317566,
"step": 996
},
{
"epoch": 2.1054852320675104,
"grad_norm": 1.3159743547439575,
"learning_rate": 3.1826609911688273e-06,
"loss": 0.643653154373169,
"step": 998
},
{
"epoch": 2.109704641350211,
"grad_norm": 2.744520425796509,
"learning_rate": 3.1791501177443533e-06,
"loss": 1.1834640502929688,
"step": 1000
},
{
"epoch": 2.1139240506329116,
"grad_norm": 5.579896926879883,
"learning_rate": 3.1756339457379626e-06,
"loss": 1.023376703262329,
"step": 1002
},
{
"epoch": 2.1181434599156117,
"grad_norm": 2.515099048614502,
"learning_rate": 3.1721124941771005e-06,
"loss": 1.092795491218567,
"step": 1004
},
{
"epoch": 2.1223628691983123,
"grad_norm": 1.9233348369598389,
"learning_rate": 3.1685857821177832e-06,
"loss": 0.6104440689086914,
"step": 1006
},
{
"epoch": 2.1265822784810124,
"grad_norm": 1.7998379468917847,
"learning_rate": 3.1650538286444902e-06,
"loss": 0.7144567966461182,
"step": 1008
},
{
"epoch": 2.130801687763713,
"grad_norm": 1.6687654256820679,
"learning_rate": 3.16151665287007e-06,
"loss": 0.6989231109619141,
"step": 1010
},
{
"epoch": 2.1350210970464136,
"grad_norm": 3.730558156967163,
"learning_rate": 3.1579742739356252e-06,
"loss": 0.8780606985092163,
"step": 1012
},
{
"epoch": 2.1392405063291138,
"grad_norm": 3.9646623134613037,
"learning_rate": 3.154426711010419e-06,
"loss": 1.304856300354004,
"step": 1014
},
{
"epoch": 2.1434599156118144,
"grad_norm": 4.966225624084473,
"learning_rate": 3.1508739832917664e-06,
"loss": 0.5962163209915161,
"step": 1016
},
{
"epoch": 2.147679324894515,
"grad_norm": 3.8472814559936523,
"learning_rate": 3.147316110004929e-06,
"loss": 0.8961644768714905,
"step": 1018
},
{
"epoch": 2.151898734177215,
"grad_norm": 16.210412979125977,
"learning_rate": 3.1437531104030172e-06,
"loss": 0.7574584484100342,
"step": 1020
},
{
"epoch": 2.1561181434599157,
"grad_norm": 6.170048713684082,
"learning_rate": 3.1401850037668773e-06,
"loss": 0.8245753049850464,
"step": 1022
},
{
"epoch": 2.160337552742616,
"grad_norm": 7.539897918701172,
"learning_rate": 3.1366118094049962e-06,
"loss": 0.8227906227111816,
"step": 1024
},
{
"epoch": 2.1645569620253164,
"grad_norm": 2.6890225410461426,
"learning_rate": 3.133033546653389e-06,
"loss": 1.0590184926986694,
"step": 1026
},
{
"epoch": 2.168776371308017,
"grad_norm": 2.2687880992889404,
"learning_rate": 3.129450234875501e-06,
"loss": 1.1196215152740479,
"step": 1028
},
{
"epoch": 2.172995780590717,
"grad_norm": 1.5322057008743286,
"learning_rate": 3.1258618934620977e-06,
"loss": 1.0350878238677979,
"step": 1030
},
{
"epoch": 2.1772151898734178,
"grad_norm": 1.7896466255187988,
"learning_rate": 3.1222685418311624e-06,
"loss": 1.0621168613433838,
"step": 1032
},
{
"epoch": 2.181434599156118,
"grad_norm": 1.9826382398605347,
"learning_rate": 3.1186701994277913e-06,
"loss": 1.0807254314422607,
"step": 1034
},
{
"epoch": 2.1856540084388185,
"grad_norm": 2.4833922386169434,
"learning_rate": 3.115066885724087e-06,
"loss": 1.0103787183761597,
"step": 1036
},
{
"epoch": 2.189873417721519,
"grad_norm": 4.183322429656982,
"learning_rate": 3.111458620219056e-06,
"loss": 1.0446069240570068,
"step": 1038
},
{
"epoch": 2.1940928270042193,
"grad_norm": 5.676382064819336,
"learning_rate": 3.107845422438497e-06,
"loss": 1.1852116584777832,
"step": 1040
},
{
"epoch": 2.19831223628692,
"grad_norm": 12.257451057434082,
"learning_rate": 3.1042273119349024e-06,
"loss": 0.3302527964115143,
"step": 1042
},
{
"epoch": 2.2025316455696204,
"grad_norm": 1.8637685775756836,
"learning_rate": 3.10060430828735e-06,
"loss": 1.0095632076263428,
"step": 1044
},
{
"epoch": 2.2067510548523206,
"grad_norm": 6.286106109619141,
"learning_rate": 3.0969764311013927e-06,
"loss": 0.6037812232971191,
"step": 1046
},
{
"epoch": 2.210970464135021,
"grad_norm": 2.026481866836548,
"learning_rate": 3.09334370000896e-06,
"loss": 0.8940553665161133,
"step": 1048
},
{
"epoch": 2.2151898734177213,
"grad_norm": 2.958310604095459,
"learning_rate": 3.089706134668245e-06,
"loss": 1.070237636566162,
"step": 1050
},
{
"epoch": 2.219409282700422,
"grad_norm": 5.202909469604492,
"learning_rate": 3.0860637547636023e-06,
"loss": 0.9080023765563965,
"step": 1052
},
{
"epoch": 2.2236286919831225,
"grad_norm": 4.214676856994629,
"learning_rate": 3.082416580005441e-06,
"loss": 0.9310380220413208,
"step": 1054
},
{
"epoch": 2.2278481012658227,
"grad_norm": 4.913782119750977,
"learning_rate": 3.0787646301301143e-06,
"loss": 0.8610812425613403,
"step": 1056
},
{
"epoch": 2.2320675105485233,
"grad_norm": 11.496319770812988,
"learning_rate": 3.0751079248998183e-06,
"loss": 0.5102381706237793,
"step": 1058
},
{
"epoch": 2.2362869198312234,
"grad_norm": 2.501431703567505,
"learning_rate": 3.0714464841024817e-06,
"loss": 1.026395559310913,
"step": 1060
},
{
"epoch": 2.240506329113924,
"grad_norm": 1.0209457874298096,
"learning_rate": 3.067780327551658e-06,
"loss": 0.7514087557792664,
"step": 1062
},
{
"epoch": 2.2447257383966246,
"grad_norm": 10.08558464050293,
"learning_rate": 3.06410947508642e-06,
"loss": 0.4998623728752136,
"step": 1064
},
{
"epoch": 2.2489451476793247,
"grad_norm": 2.017042875289917,
"learning_rate": 3.060433946571253e-06,
"loss": 0.9955783486366272,
"step": 1066
},
{
"epoch": 2.2531645569620253,
"grad_norm": 3.0692787170410156,
"learning_rate": 3.0567537618959453e-06,
"loss": 1.24436616897583,
"step": 1068
},
{
"epoch": 2.257383966244726,
"grad_norm": 2.2183597087860107,
"learning_rate": 3.0530689409754826e-06,
"loss": 1.1389007568359375,
"step": 1070
},
{
"epoch": 2.261603375527426,
"grad_norm": 3.1245839595794678,
"learning_rate": 3.0493795037499374e-06,
"loss": 1.1064579486846924,
"step": 1072
},
{
"epoch": 2.2658227848101267,
"grad_norm": 5.401794910430908,
"learning_rate": 3.0456854701843647e-06,
"loss": 1.280016303062439,
"step": 1074
},
{
"epoch": 2.270042194092827,
"grad_norm": 2.5527584552764893,
"learning_rate": 3.041986860268693e-06,
"loss": 1.0337902307510376,
"step": 1076
},
{
"epoch": 2.2742616033755274,
"grad_norm": 1.6811496019363403,
"learning_rate": 3.0382836940176112e-06,
"loss": 0.7087812423706055,
"step": 1078
},
{
"epoch": 2.278481012658228,
"grad_norm": 4.886277675628662,
"learning_rate": 3.034575991470468e-06,
"loss": 0.8468987941741943,
"step": 1080
},
{
"epoch": 2.282700421940928,
"grad_norm": 10.467023849487305,
"learning_rate": 3.03086377269116e-06,
"loss": 0.46134668588638306,
"step": 1082
},
{
"epoch": 2.2869198312236287,
"grad_norm": 4.281970500946045,
"learning_rate": 3.027147057768022e-06,
"loss": 0.6730149984359741,
"step": 1084
},
{
"epoch": 2.291139240506329,
"grad_norm": 1.6377662420272827,
"learning_rate": 3.023425866813718e-06,
"loss": 0.5801299810409546,
"step": 1086
},
{
"epoch": 2.2953586497890295,
"grad_norm": 4.013052940368652,
"learning_rate": 3.0197002199651353e-06,
"loss": 0.900696873664856,
"step": 1088
},
{
"epoch": 2.29957805907173,
"grad_norm": 1.2075470685958862,
"learning_rate": 3.015970137383273e-06,
"loss": 0.557762861251831,
"step": 1090
},
{
"epoch": 2.3037974683544302,
"grad_norm": 6.79136848449707,
"learning_rate": 3.0122356392531345e-06,
"loss": 0.8252531290054321,
"step": 1092
},
{
"epoch": 2.308016877637131,
"grad_norm": 1.973429560661316,
"learning_rate": 3.008496745783617e-06,
"loss": 0.6639243364334106,
"step": 1094
},
{
"epoch": 2.3122362869198314,
"grad_norm": 5.644299507141113,
"learning_rate": 3.0047534772074038e-06,
"loss": 0.41757094860076904,
"step": 1096
},
{
"epoch": 2.3164556962025316,
"grad_norm": 4.321779727935791,
"learning_rate": 3.001005853780852e-06,
"loss": 1.101494550704956,
"step": 1098
},
{
"epoch": 2.320675105485232,
"grad_norm": 2.1912591457366943,
"learning_rate": 2.9972538957838848e-06,
"loss": 0.9152376055717468,
"step": 1100
},
{
"epoch": 2.3248945147679323,
"grad_norm": 2.042452335357666,
"learning_rate": 2.9934976235198827e-06,
"loss": 1.0394017696380615,
"step": 1102
},
{
"epoch": 2.329113924050633,
"grad_norm": 1.53744637966156,
"learning_rate": 2.989737057315572e-06,
"loss": 1.2090572118759155,
"step": 1104
},
{
"epoch": 2.3333333333333335,
"grad_norm": 2.0143048763275146,
"learning_rate": 2.9859722175209153e-06,
"loss": 0.7863491773605347,
"step": 1106
},
{
"epoch": 2.3375527426160336,
"grad_norm": 10.555294036865234,
"learning_rate": 2.9822031245090002e-06,
"loss": 0.5064557790756226,
"step": 1108
},
{
"epoch": 2.3417721518987342,
"grad_norm": 3.0460026264190674,
"learning_rate": 2.978429798675931e-06,
"loss": 1.0185744762420654,
"step": 1110
},
{
"epoch": 2.3459915611814344,
"grad_norm": 1.6025739908218384,
"learning_rate": 2.97465226044072e-06,
"loss": 1.0687915086746216,
"step": 1112
},
{
"epoch": 2.350210970464135,
"grad_norm": 2.336373805999756,
"learning_rate": 2.9708705302451697e-06,
"loss": 1.1018157005310059,
"step": 1114
},
{
"epoch": 2.3544303797468356,
"grad_norm": 1.2120983600616455,
"learning_rate": 2.96708462855377e-06,
"loss": 0.6393563747406006,
"step": 1116
},
{
"epoch": 2.3586497890295357,
"grad_norm": 5.554210186004639,
"learning_rate": 2.9632945758535847e-06,
"loss": 0.9500521421432495,
"step": 1118
},
{
"epoch": 2.3628691983122363,
"grad_norm": 13.489524841308594,
"learning_rate": 2.9595003926541398e-06,
"loss": 0.6889848709106445,
"step": 1120
},
{
"epoch": 2.367088607594937,
"grad_norm": 6.1560187339782715,
"learning_rate": 2.9557020994873125e-06,
"loss": 0.9626091718673706,
"step": 1122
},
{
"epoch": 2.371308016877637,
"grad_norm": 1.836715579032898,
"learning_rate": 2.951899716907221e-06,
"loss": 0.5855181217193604,
"step": 1124
},
{
"epoch": 2.3755274261603376,
"grad_norm": 1.9696272611618042,
"learning_rate": 2.9480932654901142e-06,
"loss": 0.8846515417098999,
"step": 1126
},
{
"epoch": 2.379746835443038,
"grad_norm": 2.0595052242279053,
"learning_rate": 2.944282765834257e-06,
"loss": 1.0026812553405762,
"step": 1128
},
{
"epoch": 2.3839662447257384,
"grad_norm": 8.984773635864258,
"learning_rate": 2.9404682385598225e-06,
"loss": 0.4564356803894043,
"step": 1130
},
{
"epoch": 2.388185654008439,
"grad_norm": 9.524094581604004,
"learning_rate": 2.9366497043087794e-06,
"loss": 0.3366748094558716,
"step": 1132
},
{
"epoch": 2.392405063291139,
"grad_norm": 2.6163482666015625,
"learning_rate": 2.932827183744778e-06,
"loss": 0.46002885699272156,
"step": 1134
},
{
"epoch": 2.3966244725738397,
"grad_norm": 7.858697414398193,
"learning_rate": 2.929000697553041e-06,
"loss": 0.5188404321670532,
"step": 1136
},
{
"epoch": 2.40084388185654,
"grad_norm": 2.04315447807312,
"learning_rate": 2.925170266440252e-06,
"loss": 1.063408613204956,
"step": 1138
},
{
"epoch": 2.4050632911392404,
"grad_norm": 3.0201163291931152,
"learning_rate": 2.921335911134439e-06,
"loss": 0.7606229186058044,
"step": 1140
},
{
"epoch": 2.409282700421941,
"grad_norm": 5.318437576293945,
"learning_rate": 2.91749765238487e-06,
"loss": 0.2792668044567108,
"step": 1142
},
{
"epoch": 2.413502109704641,
"grad_norm": 1.64540433883667,
"learning_rate": 2.9136555109619316e-06,
"loss": 0.7836066484451294,
"step": 1144
},
{
"epoch": 2.4177215189873418,
"grad_norm": 7.265844821929932,
"learning_rate": 2.9098095076570235e-06,
"loss": 1.0778812170028687,
"step": 1146
},
{
"epoch": 2.4219409282700424,
"grad_norm": 4.908560752868652,
"learning_rate": 2.9059596632824432e-06,
"loss": 0.8231828212738037,
"step": 1148
},
{
"epoch": 2.4261603375527425,
"grad_norm": 3.473619222640991,
"learning_rate": 2.902105998671275e-06,
"loss": 1.0785859823226929,
"step": 1150
},
{
"epoch": 2.430379746835443,
"grad_norm": 5.009274959564209,
"learning_rate": 2.8982485346772733e-06,
"loss": 0.6990054845809937,
"step": 1152
},
{
"epoch": 2.4345991561181437,
"grad_norm": 1.6592916250228882,
"learning_rate": 2.894387292174754e-06,
"loss": 1.1584959030151367,
"step": 1154
},
{
"epoch": 2.438818565400844,
"grad_norm": 1.9908864498138428,
"learning_rate": 2.8905222920584814e-06,
"loss": 0.3479560613632202,
"step": 1156
},
{
"epoch": 2.4430379746835444,
"grad_norm": 2.59413743019104,
"learning_rate": 2.886653555243553e-06,
"loss": 0.7740304470062256,
"step": 1158
},
{
"epoch": 2.4472573839662446,
"grad_norm": 3.607126235961914,
"learning_rate": 2.882781102665284e-06,
"loss": 1.0350849628448486,
"step": 1160
},
{
"epoch": 2.451476793248945,
"grad_norm": 2.7151076793670654,
"learning_rate": 2.8789049552791024e-06,
"loss": 0.6460145711898804,
"step": 1162
},
{
"epoch": 2.4556962025316453,
"grad_norm": 1.7807066440582275,
"learning_rate": 2.8750251340604255e-06,
"loss": 1.0453755855560303,
"step": 1164
},
{
"epoch": 2.459915611814346,
"grad_norm": 2.944485664367676,
"learning_rate": 2.8711416600045556e-06,
"loss": 1.079903483390808,
"step": 1166
},
{
"epoch": 2.4641350210970465,
"grad_norm": 0.9675163626670837,
"learning_rate": 2.8672545541265583e-06,
"loss": 0.5578194856643677,
"step": 1168
},
{
"epoch": 2.4683544303797467,
"grad_norm": 1.795234203338623,
"learning_rate": 2.8633638374611544e-06,
"loss": 1.0072107315063477,
"step": 1170
},
{
"epoch": 2.4725738396624473,
"grad_norm": 3.3494741916656494,
"learning_rate": 2.8594695310626034e-06,
"loss": 1.0281925201416016,
"step": 1172
},
{
"epoch": 2.476793248945148,
"grad_norm": 2.088599920272827,
"learning_rate": 2.8555716560045917e-06,
"loss": 1.0314571857452393,
"step": 1174
},
{
"epoch": 2.481012658227848,
"grad_norm": 2.605670213699341,
"learning_rate": 2.851670233380114e-06,
"loss": 0.7644580602645874,
"step": 1176
},
{
"epoch": 2.4852320675105486,
"grad_norm": 13.257305145263672,
"learning_rate": 2.8477652843013666e-06,
"loss": 0.42062222957611084,
"step": 1178
},
{
"epoch": 2.489451476793249,
"grad_norm": 7.103763103485107,
"learning_rate": 2.8438568298996265e-06,
"loss": 0.7796779274940491,
"step": 1180
},
{
"epoch": 2.4936708860759493,
"grad_norm": 3.013402223587036,
"learning_rate": 2.8399448913251374e-06,
"loss": 0.9339659214019775,
"step": 1182
},
{
"epoch": 2.49789029535865,
"grad_norm": 7.224562644958496,
"learning_rate": 2.836029489747002e-06,
"loss": 0.49434345960617065,
"step": 1184
},
{
"epoch": 2.50210970464135,
"grad_norm": 17.112947463989258,
"learning_rate": 2.8321106463530592e-06,
"loss": 0.6316568851470947,
"step": 1186
},
{
"epoch": 2.5063291139240507,
"grad_norm": 5.573176383972168,
"learning_rate": 2.8281883823497745e-06,
"loss": 0.7511799335479736,
"step": 1188
},
{
"epoch": 2.510548523206751,
"grad_norm": 2.383787155151367,
"learning_rate": 2.824262718962122e-06,
"loss": 1.03713858127594,
"step": 1190
},
{
"epoch": 2.5147679324894514,
"grad_norm": 5.0437116622924805,
"learning_rate": 2.820333677433474e-06,
"loss": 0.510556697845459,
"step": 1192
},
{
"epoch": 2.518987341772152,
"grad_norm": 6.297809600830078,
"learning_rate": 2.816401279025482e-06,
"loss": 1.3623912334442139,
"step": 1194
},
{
"epoch": 2.523206751054852,
"grad_norm": 2.4292147159576416,
"learning_rate": 2.8124655450179618e-06,
"loss": 1.1327567100524902,
"step": 1196
},
{
"epoch": 2.5274261603375527,
"grad_norm": 2.8005106449127197,
"learning_rate": 2.808526496708781e-06,
"loss": 0.980167031288147,
"step": 1198
},
{
"epoch": 2.5316455696202533,
"grad_norm": 6.94888162612915,
"learning_rate": 2.804584155413741e-06,
"loss": 0.6094427704811096,
"step": 1200
},
{
"epoch": 2.5358649789029535,
"grad_norm": 2.302324056625366,
"learning_rate": 2.8006385424664638e-06,
"loss": 0.7884533405303955,
"step": 1202
},
{
"epoch": 2.540084388185654,
"grad_norm": 7.919814586639404,
"learning_rate": 2.7966896792182755e-06,
"loss": 0.6705489754676819,
"step": 1204
},
{
"epoch": 2.5443037974683547,
"grad_norm": 2.791510581970215,
"learning_rate": 2.792737587038092e-06,
"loss": 0.9616777300834656,
"step": 1206
},
{
"epoch": 2.548523206751055,
"grad_norm": 5.007606029510498,
"learning_rate": 2.7887822873122995e-06,
"loss": 0.7277128100395203,
"step": 1208
},
{
"epoch": 2.5527426160337554,
"grad_norm": 2.232788562774658,
"learning_rate": 2.7848238014446447e-06,
"loss": 1.1262240409851074,
"step": 1210
},
{
"epoch": 2.5569620253164556,
"grad_norm": 3.4404702186584473,
"learning_rate": 2.7808621508561123e-06,
"loss": 1.0465441942214966,
"step": 1212
},
{
"epoch": 2.561181434599156,
"grad_norm": 8.573604583740234,
"learning_rate": 2.776897356984816e-06,
"loss": 0.30951395630836487,
"step": 1214
},
{
"epoch": 2.5654008438818563,
"grad_norm": 3.45868182182312,
"learning_rate": 2.7729294412858776e-06,
"loss": 0.7883036136627197,
"step": 1216
},
{
"epoch": 2.569620253164557,
"grad_norm": 1.7647202014923096,
"learning_rate": 2.7689584252313128e-06,
"loss": 1.0650732517242432,
"step": 1218
},
{
"epoch": 2.5738396624472575,
"grad_norm": 8.709357261657715,
"learning_rate": 2.7649843303099127e-06,
"loss": 0.6637066602706909,
"step": 1220
},
{
"epoch": 2.5780590717299576,
"grad_norm": 4.496120929718018,
"learning_rate": 2.761007178027132e-06,
"loss": 0.9158288240432739,
"step": 1222
},
{
"epoch": 2.5822784810126582,
"grad_norm": 11.006595611572266,
"learning_rate": 2.75702698990497e-06,
"loss": 0.7496324777603149,
"step": 1224
},
{
"epoch": 2.586497890295359,
"grad_norm": 4.899750232696533,
"learning_rate": 2.7530437874818515e-06,
"loss": 0.6235587000846863,
"step": 1226
},
{
"epoch": 2.590717299578059,
"grad_norm": 1.8441094160079956,
"learning_rate": 2.749057592312515e-06,
"loss": 1.0314083099365234,
"step": 1228
},
{
"epoch": 2.5949367088607596,
"grad_norm": 2.7288100719451904,
"learning_rate": 2.7450684259678943e-06,
"loss": 1.0736459493637085,
"step": 1230
},
{
"epoch": 2.59915611814346,
"grad_norm": 3.8577749729156494,
"learning_rate": 2.7410763100350004e-06,
"loss": 0.9584764838218689,
"step": 1232
},
{
"epoch": 2.6033755274261603,
"grad_norm": 9.928874969482422,
"learning_rate": 2.7370812661168046e-06,
"loss": 0.2811320722103119,
"step": 1234
},
{
"epoch": 2.607594936708861,
"grad_norm": 3.457975387573242,
"learning_rate": 2.7330833158321267e-06,
"loss": 1.1292645931243896,
"step": 1236
},
{
"epoch": 2.611814345991561,
"grad_norm": 6.1282172203063965,
"learning_rate": 2.7290824808155096e-06,
"loss": 1.2942759990692139,
"step": 1238
},
{
"epoch": 2.6160337552742616,
"grad_norm": 6.050518035888672,
"learning_rate": 2.7250787827171085e-06,
"loss": 0.7845382690429688,
"step": 1240
},
{
"epoch": 2.620253164556962,
"grad_norm": 2.2712647914886475,
"learning_rate": 2.721072243202573e-06,
"loss": 0.9927393794059753,
"step": 1242
},
{
"epoch": 2.6244725738396624,
"grad_norm": 12.99117660522461,
"learning_rate": 2.7170628839529277e-06,
"loss": 0.4361240863800049,
"step": 1244
},
{
"epoch": 2.628691983122363,
"grad_norm": 2.062415599822998,
"learning_rate": 2.7130507266644555e-06,
"loss": 0.7296593189239502,
"step": 1246
},
{
"epoch": 2.632911392405063,
"grad_norm": 6.197027206420898,
"learning_rate": 2.709035793048581e-06,
"loss": 1.5014359951019287,
"step": 1248
},
{
"epoch": 2.6371308016877637,
"grad_norm": 1.7749969959259033,
"learning_rate": 2.705018104831753e-06,
"loss": 1.0191712379455566,
"step": 1250
},
{
"epoch": 2.6413502109704643,
"grad_norm": 3.7179009914398193,
"learning_rate": 2.700997683755326e-06,
"loss": 0.9707983732223511,
"step": 1252
},
{
"epoch": 2.6455696202531644,
"grad_norm": 7.614749431610107,
"learning_rate": 2.6969745515754444e-06,
"loss": 0.47567054629325867,
"step": 1254
},
{
"epoch": 2.649789029535865,
"grad_norm": 3.8538355827331543,
"learning_rate": 2.6929487300629206e-06,
"loss": 0.5580261945724487,
"step": 1256
},
{
"epoch": 2.6540084388185656,
"grad_norm": 3.0637574195861816,
"learning_rate": 2.6889202410031237e-06,
"loss": 0.9232720136642456,
"step": 1258
},
{
"epoch": 2.6582278481012658,
"grad_norm": 1.9953484535217285,
"learning_rate": 2.6848891061958565e-06,
"loss": 1.007423996925354,
"step": 1260
},
{
"epoch": 2.6624472573839664,
"grad_norm": 10.962545394897461,
"learning_rate": 2.680855347455238e-06,
"loss": 1.0483016967773438,
"step": 1262
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.6327028274536133,
"learning_rate": 2.6768189866095867e-06,
"loss": 0.5767178535461426,
"step": 1264
},
{
"epoch": 2.670886075949367,
"grad_norm": 5.506629943847656,
"learning_rate": 2.6727800455013037e-06,
"loss": 0.8919286727905273,
"step": 1266
},
{
"epoch": 2.6751054852320673,
"grad_norm": 1.8910753726959229,
"learning_rate": 2.6687385459867514e-06,
"loss": 0.7154239416122437,
"step": 1268
},
{
"epoch": 2.679324894514768,
"grad_norm": 4.416780948638916,
"learning_rate": 2.6646945099361382e-06,
"loss": 0.4701068103313446,
"step": 1270
},
{
"epoch": 2.6835443037974684,
"grad_norm": 1.5386635065078735,
"learning_rate": 2.6606479592333965e-06,
"loss": 0.9448637962341309,
"step": 1272
},
{
"epoch": 2.6877637130801686,
"grad_norm": 6.68757963180542,
"learning_rate": 2.6565989157760678e-06,
"loss": 0.735755443572998,
"step": 1274
},
{
"epoch": 2.691983122362869,
"grad_norm": 23.566585540771484,
"learning_rate": 2.652547401475184e-06,
"loss": 0.8000218868255615,
"step": 1276
},
{
"epoch": 2.6962025316455698,
"grad_norm": 1.7401084899902344,
"learning_rate": 2.6484934382551465e-06,
"loss": 0.35548001527786255,
"step": 1278
},
{
"epoch": 2.70042194092827,
"grad_norm": 10.348366737365723,
"learning_rate": 2.644437048053609e-06,
"loss": 0.8879528641700745,
"step": 1280
},
{
"epoch": 2.7046413502109705,
"grad_norm": 2.0043532848358154,
"learning_rate": 2.6403782528213577e-06,
"loss": 1.076289415359497,
"step": 1282
},
{
"epoch": 2.708860759493671,
"grad_norm": 14.326828956604004,
"learning_rate": 2.6363170745221958e-06,
"loss": 0.5147005915641785,
"step": 1284
},
{
"epoch": 2.7130801687763713,
"grad_norm": 2.707928419113159,
"learning_rate": 2.6322535351328193e-06,
"loss": 0.502042293548584,
"step": 1286
},
{
"epoch": 2.717299578059072,
"grad_norm": 1.4950000047683716,
"learning_rate": 2.6281876566427034e-06,
"loss": 0.6342880129814148,
"step": 1288
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.5780206918716431,
"learning_rate": 2.624119461053979e-06,
"loss": 0.7421303391456604,
"step": 1290
},
{
"epoch": 2.7257383966244726,
"grad_norm": 1.3298128843307495,
"learning_rate": 2.620048970381319e-06,
"loss": 0.9955764412879944,
"step": 1292
},
{
"epoch": 2.7299578059071727,
"grad_norm": 2.542677879333496,
"learning_rate": 2.6159762066518117e-06,
"loss": 0.5678607821464539,
"step": 1294
},
{
"epoch": 2.7341772151898733,
"grad_norm": 2.9699714183807373,
"learning_rate": 2.61190119190485e-06,
"loss": 1.0441884994506836,
"step": 1296
},
{
"epoch": 2.738396624472574,
"grad_norm": 1.9846669435501099,
"learning_rate": 2.607823948192005e-06,
"loss": 1.0227396488189697,
"step": 1298
},
{
"epoch": 2.742616033755274,
"grad_norm": 3.1612093448638916,
"learning_rate": 2.6037444975769104e-06,
"loss": 0.7024236917495728,
"step": 1300
},
{
"epoch": 2.7468354430379747,
"grad_norm": 1.8448959589004517,
"learning_rate": 2.5996628621351437e-06,
"loss": 1.156023621559143,
"step": 1302
},
{
"epoch": 2.7510548523206753,
"grad_norm": 4.011197566986084,
"learning_rate": 2.5955790639541036e-06,
"loss": 0.6238597631454468,
"step": 1304
},
{
"epoch": 2.7552742616033754,
"grad_norm": 3.856045961380005,
"learning_rate": 2.591493125132893e-06,
"loss": 1.281459093093872,
"step": 1306
},
{
"epoch": 2.759493670886076,
"grad_norm": 2.341705083847046,
"learning_rate": 2.5874050677821984e-06,
"loss": 0.9869955778121948,
"step": 1308
},
{
"epoch": 2.7637130801687766,
"grad_norm": 10.147032737731934,
"learning_rate": 2.5833149140241718e-06,
"loss": 0.8909780979156494,
"step": 1310
},
{
"epoch": 2.7679324894514767,
"grad_norm": 1.7961941957473755,
"learning_rate": 2.579222685992307e-06,
"loss": 1.0535545349121094,
"step": 1312
},
{
"epoch": 2.7721518987341773,
"grad_norm": 3.12715482711792,
"learning_rate": 2.5751284058313266e-06,
"loss": 1.1261003017425537,
"step": 1314
},
{
"epoch": 2.7763713080168775,
"grad_norm": 3.8387131690979004,
"learning_rate": 2.5710320956970536e-06,
"loss": 0.7698974609375,
"step": 1316
},
{
"epoch": 2.780590717299578,
"grad_norm": 1.3000264167785645,
"learning_rate": 2.5669337777562996e-06,
"loss": 0.5697190761566162,
"step": 1318
},
{
"epoch": 2.7848101265822782,
"grad_norm": 1.9856594800949097,
"learning_rate": 2.5628334741867385e-06,
"loss": 1.1043368577957153,
"step": 1320
},
{
"epoch": 2.789029535864979,
"grad_norm": 3.5784945487976074,
"learning_rate": 2.5587312071767923e-06,
"loss": 0.6595450639724731,
"step": 1322
},
{
"epoch": 2.7932489451476794,
"grad_norm": 5.370586395263672,
"learning_rate": 2.554626998925505e-06,
"loss": 1.2037230730056763,
"step": 1324
},
{
"epoch": 2.7974683544303796,
"grad_norm": 6.791380882263184,
"learning_rate": 2.5505208716424275e-06,
"loss": 0.899883508682251,
"step": 1326
},
{
"epoch": 2.80168776371308,
"grad_norm": 1.783818006515503,
"learning_rate": 2.5464128475474937e-06,
"loss": 0.7012801170349121,
"step": 1328
},
{
"epoch": 2.8059071729957807,
"grad_norm": 1.9667185544967651,
"learning_rate": 2.542302948870904e-06,
"loss": 1.041996955871582,
"step": 1330
},
{
"epoch": 2.810126582278481,
"grad_norm": 13.571832656860352,
"learning_rate": 2.5381911978530006e-06,
"loss": 0.9141802787780762,
"step": 1332
},
{
"epoch": 2.8143459915611815,
"grad_norm": 2.473447799682617,
"learning_rate": 2.5340776167441508e-06,
"loss": 0.5973923206329346,
"step": 1334
},
{
"epoch": 2.818565400843882,
"grad_norm": 1.2413594722747803,
"learning_rate": 2.529962227804626e-06,
"loss": 0.8588274717330933,
"step": 1336
},
{
"epoch": 2.8227848101265822,
"grad_norm": 5.830739498138428,
"learning_rate": 2.525845053304479e-06,
"loss": 0.7775506973266602,
"step": 1338
},
{
"epoch": 2.827004219409283,
"grad_norm": 5.612140655517578,
"learning_rate": 2.521726115523425e-06,
"loss": 0.9469473361968994,
"step": 1340
},
{
"epoch": 2.831223628691983,
"grad_norm": 2.9371390342712402,
"learning_rate": 2.517605436750723e-06,
"loss": 1.0295050144195557,
"step": 1342
},
{
"epoch": 2.8354430379746836,
"grad_norm": 2.6451170444488525,
"learning_rate": 2.513483039285051e-06,
"loss": 1.1780718564987183,
"step": 1344
},
{
"epoch": 2.8396624472573837,
"grad_norm": 12.214982032775879,
"learning_rate": 2.5093589454343883e-06,
"loss": 0.7536942362785339,
"step": 1346
},
{
"epoch": 2.8438818565400843,
"grad_norm": 2.7933950424194336,
"learning_rate": 2.505233177515894e-06,
"loss": 0.607318639755249,
"step": 1348
},
{
"epoch": 2.848101265822785,
"grad_norm": 2.1484858989715576,
"learning_rate": 2.501105757855787e-06,
"loss": 1.0892062187194824,
"step": 1350
},
{
"epoch": 2.852320675105485,
"grad_norm": 3.0315003395080566,
"learning_rate": 2.4969767087892236e-06,
"loss": 0.7782174348831177,
"step": 1352
},
{
"epoch": 2.8565400843881856,
"grad_norm": 2.569249153137207,
"learning_rate": 2.492846052660178e-06,
"loss": 0.8103134632110596,
"step": 1354
},
{
"epoch": 2.8607594936708862,
"grad_norm": 8.901324272155762,
"learning_rate": 2.4887138118213206e-06,
"loss": 0.5044631361961365,
"step": 1356
},
{
"epoch": 2.8649789029535864,
"grad_norm": 2.725210428237915,
"learning_rate": 2.4845800086338972e-06,
"loss": 1.0778303146362305,
"step": 1358
},
{
"epoch": 2.869198312236287,
"grad_norm": 3.4764597415924072,
"learning_rate": 2.4804446654676076e-06,
"loss": 0.8491913080215454,
"step": 1360
},
{
"epoch": 2.8734177215189876,
"grad_norm": 1.586370587348938,
"learning_rate": 2.4763078047004863e-06,
"loss": 0.6659104824066162,
"step": 1362
},
{
"epoch": 2.8776371308016877,
"grad_norm": 6.09430456161499,
"learning_rate": 2.47216944871878e-06,
"loss": 0.950684130191803,
"step": 1364
},
{
"epoch": 2.8818565400843883,
"grad_norm": 2.1875460147857666,
"learning_rate": 2.468029619916825e-06,
"loss": 0.9997307062149048,
"step": 1366
},
{
"epoch": 2.8860759493670884,
"grad_norm": 3.9469892978668213,
"learning_rate": 2.46388834069693e-06,
"loss": 1.1051433086395264,
"step": 1368
},
{
"epoch": 2.890295358649789,
"grad_norm": 1.705639123916626,
"learning_rate": 2.4597456334692505e-06,
"loss": 1.03743577003479,
"step": 1370
},
{
"epoch": 2.894514767932489,
"grad_norm": 22.948728561401367,
"learning_rate": 2.455601520651671e-06,
"loss": 0.4580141305923462,
"step": 1372
},
{
"epoch": 2.8987341772151898,
"grad_norm": 1.9022364616394043,
"learning_rate": 2.451456024669681e-06,
"loss": 0.92431640625,
"step": 1374
},
{
"epoch": 2.9029535864978904,
"grad_norm": 1.598383903503418,
"learning_rate": 2.4473091679562555e-06,
"loss": 1.1237053871154785,
"step": 1376
},
{
"epoch": 2.9071729957805905,
"grad_norm": 4.576679706573486,
"learning_rate": 2.443160972951733e-06,
"loss": 0.8321917653083801,
"step": 1378
},
{
"epoch": 2.911392405063291,
"grad_norm": 3.267960786819458,
"learning_rate": 2.4390114621036948e-06,
"loss": 1.2134051322937012,
"step": 1380
},
{
"epoch": 2.9156118143459917,
"grad_norm": 9.497183799743652,
"learning_rate": 2.43486065786684e-06,
"loss": 0.6116930842399597,
"step": 1382
},
{
"epoch": 2.919831223628692,
"grad_norm": 9.528655052185059,
"learning_rate": 2.43070858270287e-06,
"loss": 0.7370846271514893,
"step": 1384
},
{
"epoch": 2.9240506329113924,
"grad_norm": 2.335017204284668,
"learning_rate": 2.4265552590803616e-06,
"loss": 0.6520988941192627,
"step": 1386
},
{
"epoch": 2.928270042194093,
"grad_norm": 3.7409374713897705,
"learning_rate": 2.4224007094746495e-06,
"loss": 1.0449352264404297,
"step": 1388
},
{
"epoch": 2.932489451476793,
"grad_norm": 2.975673198699951,
"learning_rate": 2.418244956367701e-06,
"loss": 0.9698547124862671,
"step": 1390
},
{
"epoch": 2.9367088607594938,
"grad_norm": 2.086550712585449,
"learning_rate": 2.4140880222479963e-06,
"loss": 0.6123561859130859,
"step": 1392
},
{
"epoch": 2.9409282700421944,
"grad_norm": 2.2701752185821533,
"learning_rate": 2.4099299296104063e-06,
"loss": 0.6262718439102173,
"step": 1394
},
{
"epoch": 2.9451476793248945,
"grad_norm": 4.327895164489746,
"learning_rate": 2.405770700956073e-06,
"loss": 1.0023303031921387,
"step": 1396
},
{
"epoch": 2.9493670886075947,
"grad_norm": 1.0873901844024658,
"learning_rate": 2.401610358792283e-06,
"loss": 0.8893314599990845,
"step": 1398
},
{
"epoch": 2.9535864978902953,
"grad_norm": 3.0334839820861816,
"learning_rate": 2.3974489256323508e-06,
"loss": 0.8417981266975403,
"step": 1400
},
{
"epoch": 2.957805907172996,
"grad_norm": 5.334658622741699,
"learning_rate": 2.3932864239954937e-06,
"loss": 0.7297941446304321,
"step": 1402
},
{
"epoch": 2.962025316455696,
"grad_norm": 2.946950674057007,
"learning_rate": 2.3891228764067106e-06,
"loss": 1.0070791244506836,
"step": 1404
},
{
"epoch": 2.9662447257383966,
"grad_norm": 3.0521016120910645,
"learning_rate": 2.384958305396662e-06,
"loss": 0.8960994482040405,
"step": 1406
},
{
"epoch": 2.970464135021097,
"grad_norm": 4.832094192504883,
"learning_rate": 2.380792733501545e-06,
"loss": 0.577763557434082,
"step": 1408
},
{
"epoch": 2.9746835443037973,
"grad_norm": 3.717233419418335,
"learning_rate": 2.376626183262975e-06,
"loss": 0.8571799993515015,
"step": 1410
},
{
"epoch": 2.978902953586498,
"grad_norm": 5.3040547370910645,
"learning_rate": 2.3724586772278574e-06,
"loss": 1.0527344942092896,
"step": 1412
},
{
"epoch": 2.9831223628691985,
"grad_norm": 5.977110385894775,
"learning_rate": 2.368290237948275e-06,
"loss": 0.8416517972946167,
"step": 1414
},
{
"epoch": 2.9873417721518987,
"grad_norm": 10.9218111038208,
"learning_rate": 2.3641208879813567e-06,
"loss": 0.8895251750946045,
"step": 1416
},
{
"epoch": 2.9915611814345993,
"grad_norm": 2.910202741622925,
"learning_rate": 2.3599506498891625e-06,
"loss": 0.9375064373016357,
"step": 1418
},
{
"epoch": 2.9957805907173,
"grad_norm": 4.632609844207764,
"learning_rate": 2.355779546238555e-06,
"loss": 1.054133415222168,
"step": 1420
},
{
"epoch": 3.0,
"grad_norm": 2.0572307109832764,
"learning_rate": 2.3516075996010844e-06,
"loss": 0.47653502225875854,
"step": 1422
},
{
"epoch": 3.0042194092827006,
"grad_norm": 6.811531066894531,
"learning_rate": 2.3474348325528613e-06,
"loss": 0.7990585565567017,
"step": 1424
},
{
"epoch": 3.0084388185654007,
"grad_norm": 4.26148796081543,
"learning_rate": 2.3432612676744338e-06,
"loss": 0.6641910672187805,
"step": 1426
},
{
"epoch": 3.0126582278481013,
"grad_norm": 14.421019554138184,
"learning_rate": 2.3390869275506704e-06,
"loss": 0.6507161855697632,
"step": 1428
},
{
"epoch": 3.0168776371308015,
"grad_norm": 3.2018351554870605,
"learning_rate": 2.334911834770633e-06,
"loss": 0.3902518153190613,
"step": 1430
},
{
"epoch": 3.021097046413502,
"grad_norm": 1.717463731765747,
"learning_rate": 2.330736011927458e-06,
"loss": 1.0567653179168701,
"step": 1432
},
{
"epoch": 3.0253164556962027,
"grad_norm": 1.948553442955017,
"learning_rate": 2.326559481618229e-06,
"loss": 0.9750782251358032,
"step": 1434
},
{
"epoch": 3.029535864978903,
"grad_norm": 3.6881439685821533,
"learning_rate": 2.322382266443863e-06,
"loss": 1.128783106803894,
"step": 1436
},
{
"epoch": 3.0337552742616034,
"grad_norm": 3.5191917419433594,
"learning_rate": 2.3182043890089784e-06,
"loss": 0.5267306566238403,
"step": 1438
},
{
"epoch": 3.037974683544304,
"grad_norm": 2.0311169624328613,
"learning_rate": 2.3140258719217808e-06,
"loss": 0.9317551851272583,
"step": 1440
},
{
"epoch": 3.042194092827004,
"grad_norm": 5.587665557861328,
"learning_rate": 2.309846737793935e-06,
"loss": 0.6537089943885803,
"step": 1442
},
{
"epoch": 3.0464135021097047,
"grad_norm": 2.518566131591797,
"learning_rate": 2.3056670092404463e-06,
"loss": 0.8329222202301025,
"step": 1444
},
{
"epoch": 3.050632911392405,
"grad_norm": 2.06829833984375,
"learning_rate": 2.3014867088795357e-06,
"loss": 1.0246927738189697,
"step": 1446
},
{
"epoch": 3.0548523206751055,
"grad_norm": 9.406811714172363,
"learning_rate": 2.297305859332519e-06,
"loss": 0.6608364582061768,
"step": 1448
},
{
"epoch": 3.059071729957806,
"grad_norm": 18.555009841918945,
"learning_rate": 2.2931244832236837e-06,
"loss": 0.8099187612533569,
"step": 1450
},
{
"epoch": 3.0632911392405062,
"grad_norm": 2.87371563911438,
"learning_rate": 2.288942603180167e-06,
"loss": 1.048098087310791,
"step": 1452
},
{
"epoch": 3.067510548523207,
"grad_norm": 5.4316205978393555,
"learning_rate": 2.2847602418318327e-06,
"loss": 0.7442044019699097,
"step": 1454
},
{
"epoch": 3.071729957805907,
"grad_norm": 6.081297397613525,
"learning_rate": 2.2805774218111496e-06,
"loss": 0.6251615285873413,
"step": 1456
},
{
"epoch": 3.0759493670886076,
"grad_norm": 10.227375984191895,
"learning_rate": 2.276394165753067e-06,
"loss": 0.6871986389160156,
"step": 1458
},
{
"epoch": 3.080168776371308,
"grad_norm": 7.270413398742676,
"learning_rate": 2.272210496294896e-06,
"loss": 0.7179367542266846,
"step": 1460
},
{
"epoch": 3.0843881856540083,
"grad_norm": 2.082552194595337,
"learning_rate": 2.268026436076185e-06,
"loss": 0.9696202278137207,
"step": 1462
},
{
"epoch": 3.088607594936709,
"grad_norm": 2.518341302871704,
"learning_rate": 2.263842007738594e-06,
"loss": 0.9051344394683838,
"step": 1464
},
{
"epoch": 3.0928270042194095,
"grad_norm": 2.340363025665283,
"learning_rate": 2.2596572339257777e-06,
"loss": 0.8250648975372314,
"step": 1466
},
{
"epoch": 3.0970464135021096,
"grad_norm": 11.077507019042969,
"learning_rate": 2.255472137283259e-06,
"loss": 0.6344802975654602,
"step": 1468
},
{
"epoch": 3.1012658227848102,
"grad_norm": 7.140725612640381,
"learning_rate": 2.2512867404583085e-06,
"loss": 0.1541098654270172,
"step": 1470
},
{
"epoch": 3.1054852320675104,
"grad_norm": 4.636476993560791,
"learning_rate": 2.2471010660998215e-06,
"loss": 1.4155219793319702,
"step": 1472
},
{
"epoch": 3.109704641350211,
"grad_norm": 3.027451276779175,
"learning_rate": 2.242915136858193e-06,
"loss": 0.49524158239364624,
"step": 1474
},
{
"epoch": 3.1139240506329116,
"grad_norm": 3.410243034362793,
"learning_rate": 2.2387289753852e-06,
"loss": 1.0359880924224854,
"step": 1476
},
{
"epoch": 3.1181434599156117,
"grad_norm": 2.0419440269470215,
"learning_rate": 2.234542604333875e-06,
"loss": 1.03524911403656,
"step": 1478
},
{
"epoch": 3.1223628691983123,
"grad_norm": 2.8948912620544434,
"learning_rate": 2.230356046358384e-06,
"loss": 0.9543738961219788,
"step": 1480
},
{
"epoch": 3.1265822784810124,
"grad_norm": 2.4057018756866455,
"learning_rate": 2.2261693241139065e-06,
"loss": 0.9722020030021667,
"step": 1482
},
{
"epoch": 3.130801687763713,
"grad_norm": 1.9602731466293335,
"learning_rate": 2.2219824602565087e-06,
"loss": 0.9750865697860718,
"step": 1484
},
{
"epoch": 3.1350210970464136,
"grad_norm": 2.10933780670166,
"learning_rate": 2.2177954774430234e-06,
"loss": 0.6285134553909302,
"step": 1486
},
{
"epoch": 3.1392405063291138,
"grad_norm": 1.8953523635864258,
"learning_rate": 2.2136083983309286e-06,
"loss": 0.6080442667007446,
"step": 1488
},
{
"epoch": 3.1434599156118144,
"grad_norm": 10.058272361755371,
"learning_rate": 2.2094212455782227e-06,
"loss": 1.1448235511779785,
"step": 1490
},
{
"epoch": 3.147679324894515,
"grad_norm": 3.1135060787200928,
"learning_rate": 2.2052340418433024e-06,
"loss": 0.6743577718734741,
"step": 1492
},
{
"epoch": 3.151898734177215,
"grad_norm": 6.616735458374023,
"learning_rate": 2.2010468097848396e-06,
"loss": 0.737909197807312,
"step": 1494
},
{
"epoch": 3.1561181434599157,
"grad_norm": 3.229160785675049,
"learning_rate": 2.1968595720616606e-06,
"loss": 0.8287728428840637,
"step": 1496
},
{
"epoch": 3.160337552742616,
"grad_norm": 0.8820177316665649,
"learning_rate": 2.192672351332623e-06,
"loss": 0.4992554783821106,
"step": 1498
},
{
"epoch": 3.1645569620253164,
"grad_norm": 2.330535411834717,
"learning_rate": 2.1884851702564897e-06,
"loss": 0.5810240507125854,
"step": 1500
},
{
"epoch": 3.168776371308017,
"grad_norm": 1.8228909969329834,
"learning_rate": 2.1842980514918117e-06,
"loss": 0.9471129179000854,
"step": 1502
},
{
"epoch": 3.172995780590717,
"grad_norm": 2.3642683029174805,
"learning_rate": 2.1801110176968016e-06,
"loss": 0.8418397903442383,
"step": 1504
},
{
"epoch": 3.1772151898734178,
"grad_norm": 1.8827167749404907,
"learning_rate": 2.1759240915292135e-06,
"loss": 0.9700140357017517,
"step": 1506
},
{
"epoch": 3.181434599156118,
"grad_norm": 3.775982618331909,
"learning_rate": 2.171737295646216e-06,
"loss": 1.1170215606689453,
"step": 1508
},
{
"epoch": 3.1856540084388185,
"grad_norm": 4.465606212615967,
"learning_rate": 2.167550652704276e-06,
"loss": 0.9244706630706787,
"step": 1510
},
{
"epoch": 3.189873417721519,
"grad_norm": 2.6871254444122314,
"learning_rate": 2.1633641853590318e-06,
"loss": 0.25759080052375793,
"step": 1512
},
{
"epoch": 3.1940928270042193,
"grad_norm": 3.345410108566284,
"learning_rate": 2.15917791626517e-06,
"loss": 0.9588069319725037,
"step": 1514
},
{
"epoch": 3.19831223628692,
"grad_norm": 10.61077880859375,
"learning_rate": 2.154991868076306e-06,
"loss": 0.5874932408332825,
"step": 1516
},
{
"epoch": 3.2025316455696204,
"grad_norm": 2.544962167739868,
"learning_rate": 2.1508060634448595e-06,
"loss": 0.991689920425415,
"step": 1518
},
{
"epoch": 3.2067510548523206,
"grad_norm": 3.6874847412109375,
"learning_rate": 2.1466205250219315e-06,
"loss": 0.9816372990608215,
"step": 1520
},
{
"epoch": 3.210970464135021,
"grad_norm": 5.628320217132568,
"learning_rate": 2.142435275457184e-06,
"loss": 0.30518054962158203,
"step": 1522
},
{
"epoch": 3.2151898734177213,
"grad_norm": 3.656771659851074,
"learning_rate": 2.1382503373987133e-06,
"loss": 0.7900766134262085,
"step": 1524
},
{
"epoch": 3.219409282700422,
"grad_norm": 2.2453036308288574,
"learning_rate": 2.1340657334929335e-06,
"loss": 0.8744317293167114,
"step": 1526
},
{
"epoch": 3.2236286919831225,
"grad_norm": 0.9895398616790771,
"learning_rate": 2.1298814863844476e-06,
"loss": 0.47598880529403687,
"step": 1528
},
{
"epoch": 3.2278481012658227,
"grad_norm": 5.763035297393799,
"learning_rate": 2.1256976187159278e-06,
"loss": 0.7799667119979858,
"step": 1530
},
{
"epoch": 3.2320675105485233,
"grad_norm": 0.7087782621383667,
"learning_rate": 2.121514153127995e-06,
"loss": 0.2722686529159546,
"step": 1532
},
{
"epoch": 3.2362869198312234,
"grad_norm": 3.2583420276641846,
"learning_rate": 2.1173311122590932e-06,
"loss": 0.7357510328292847,
"step": 1534
},
{
"epoch": 3.240506329113924,
"grad_norm": 3.8085386753082275,
"learning_rate": 2.1131485187453676e-06,
"loss": 0.9901435375213623,
"step": 1536
},
{
"epoch": 3.2447257383966246,
"grad_norm": 2.8548874855041504,
"learning_rate": 2.1089663952205435e-06,
"loss": 0.9335240721702576,
"step": 1538
},
{
"epoch": 3.2489451476793247,
"grad_norm": 9.909287452697754,
"learning_rate": 2.104784764315802e-06,
"loss": 0.752236008644104,
"step": 1540
},
{
"epoch": 3.2531645569620253,
"grad_norm": 9.005875587463379,
"learning_rate": 2.100603648659659e-06,
"loss": 0.741628885269165,
"step": 1542
},
{
"epoch": 3.257383966244726,
"grad_norm": 8.307135581970215,
"learning_rate": 2.096423070877843e-06,
"loss": 0.6267164945602417,
"step": 1544
},
{
"epoch": 3.261603375527426,
"grad_norm": 6.679696559906006,
"learning_rate": 2.092243053593169e-06,
"loss": 0.5680997371673584,
"step": 1546
},
{
"epoch": 3.2658227848101267,
"grad_norm": 3.8873493671417236,
"learning_rate": 2.0880636194254225e-06,
"loss": 0.874029278755188,
"step": 1548
},
{
"epoch": 3.270042194092827,
"grad_norm": 6.6328301429748535,
"learning_rate": 2.0838847909912307e-06,
"loss": 0.4085759222507477,
"step": 1550
},
{
"epoch": 3.2742616033755274,
"grad_norm": 2.145261526107788,
"learning_rate": 2.0797065909039457e-06,
"loss": 0.36501544713974,
"step": 1552
},
{
"epoch": 3.278481012658228,
"grad_norm": 7.986878395080566,
"learning_rate": 2.0755290417735156e-06,
"loss": 0.4557437002658844,
"step": 1554
},
{
"epoch": 3.282700421940928,
"grad_norm": 2.874678373336792,
"learning_rate": 2.071352166206369e-06,
"loss": 0.962173581123352,
"step": 1556
},
{
"epoch": 3.2869198312236287,
"grad_norm": 1.1194981336593628,
"learning_rate": 2.0671759868052893e-06,
"loss": 0.7566915154457092,
"step": 1558
},
{
"epoch": 3.291139240506329,
"grad_norm": 2.609232187271118,
"learning_rate": 2.0630005261692905e-06,
"loss": 0.6619813442230225,
"step": 1560
},
{
"epoch": 3.2953586497890295,
"grad_norm": 9.03283977508545,
"learning_rate": 2.0588258068935002e-06,
"loss": 0.5809231400489807,
"step": 1562
},
{
"epoch": 3.29957805907173,
"grad_norm": 3.785902500152588,
"learning_rate": 2.0546518515690316e-06,
"loss": 0.8656713366508484,
"step": 1564
},
{
"epoch": 3.3037974683544302,
"grad_norm": 11.584537506103516,
"learning_rate": 2.0504786827828648e-06,
"loss": 0.7611091136932373,
"step": 1566
},
{
"epoch": 3.308016877637131,
"grad_norm": 8.480819702148438,
"learning_rate": 2.0463063231177236e-06,
"loss": 0.5610800981521606,
"step": 1568
},
{
"epoch": 3.3122362869198314,
"grad_norm": 5.454217910766602,
"learning_rate": 2.0421347951519535e-06,
"loss": 0.5264372229576111,
"step": 1570
},
{
"epoch": 3.3164556962025316,
"grad_norm": 3.4205212593078613,
"learning_rate": 2.037964121459399e-06,
"loss": 0.5730254650115967,
"step": 1572
},
{
"epoch": 3.320675105485232,
"grad_norm": 3.719339609146118,
"learning_rate": 2.033794324609282e-06,
"loss": 1.091575026512146,
"step": 1574
},
{
"epoch": 3.3248945147679323,
"grad_norm": 2.0314159393310547,
"learning_rate": 2.0296254271660795e-06,
"loss": 0.8482744693756104,
"step": 1576
},
{
"epoch": 3.329113924050633,
"grad_norm": 2.4221394062042236,
"learning_rate": 2.025457451689401e-06,
"loss": 0.9338847398757935,
"step": 1578
},
{
"epoch": 3.3333333333333335,
"grad_norm": 3.0065743923187256,
"learning_rate": 2.0212904207338672e-06,
"loss": 0.7377324104309082,
"step": 1580
},
{
"epoch": 3.3375527426160336,
"grad_norm": 1.4527244567871094,
"learning_rate": 2.0171243568489883e-06,
"loss": 0.48970168828964233,
"step": 1582
},
{
"epoch": 3.3417721518987342,
"grad_norm": 5.6556010246276855,
"learning_rate": 2.0129592825790397e-06,
"loss": 0.7742688655853271,
"step": 1584
},
{
"epoch": 3.3459915611814344,
"grad_norm": 5.596024990081787,
"learning_rate": 2.0087952204629422e-06,
"loss": 0.641385555267334,
"step": 1586
},
{
"epoch": 3.350210970464135,
"grad_norm": 2.5374205112457275,
"learning_rate": 2.0046321930341405e-06,
"loss": 0.5972579717636108,
"step": 1588
},
{
"epoch": 3.3544303797468356,
"grad_norm": 1.8447959423065186,
"learning_rate": 2.0004702228204797e-06,
"loss": 0.8615912199020386,
"step": 1590
},
{
"epoch": 3.3586497890295357,
"grad_norm": 6.396413326263428,
"learning_rate": 1.9963093323440824e-06,
"loss": 0.9015900492668152,
"step": 1592
},
{
"epoch": 3.3628691983122363,
"grad_norm": 0.825650155544281,
"learning_rate": 1.99214954412123e-06,
"loss": 0.6481198072433472,
"step": 1594
},
{
"epoch": 3.367088607594937,
"grad_norm": 6.811497211456299,
"learning_rate": 1.9879908806622385e-06,
"loss": 0.4374066889286041,
"step": 1596
},
{
"epoch": 3.371308016877637,
"grad_norm": 0.7065004706382751,
"learning_rate": 1.9838333644713377e-06,
"loss": 0.4804467558860779,
"step": 1598
},
{
"epoch": 3.3755274261603376,
"grad_norm": 10.63037109375,
"learning_rate": 1.9796770180465484e-06,
"loss": 0.6881888508796692,
"step": 1600
},
{
"epoch": 3.379746835443038,
"grad_norm": 5.440734386444092,
"learning_rate": 1.9755218638795626e-06,
"loss": 0.547875165939331,
"step": 1602
},
{
"epoch": 3.3839662447257384,
"grad_norm": 1.4273031949996948,
"learning_rate": 1.971367924455618e-06,
"loss": 0.5285290479660034,
"step": 1604
},
{
"epoch": 3.388185654008439,
"grad_norm": 3.3389201164245605,
"learning_rate": 1.9672152222533822e-06,
"loss": 1.0279819965362549,
"step": 1606
},
{
"epoch": 3.392405063291139,
"grad_norm": 2.3950865268707275,
"learning_rate": 1.9630637797448248e-06,
"loss": 0.6111994981765747,
"step": 1608
},
{
"epoch": 3.3966244725738397,
"grad_norm": 1.5823328495025635,
"learning_rate": 1.9589136193951e-06,
"loss": 0.5231560468673706,
"step": 1610
},
{
"epoch": 3.40084388185654,
"grad_norm": 2.5763416290283203,
"learning_rate": 1.9547647636624243e-06,
"loss": 0.916947603225708,
"step": 1612
},
{
"epoch": 3.4050632911392404,
"grad_norm": 2.1279733180999756,
"learning_rate": 1.9506172349979523e-06,
"loss": 0.39490947127342224,
"step": 1614
},
{
"epoch": 3.409282700421941,
"grad_norm": 13.062804222106934,
"learning_rate": 1.9464710558456595e-06,
"loss": 0.8276299238204956,
"step": 1616
},
{
"epoch": 3.413502109704641,
"grad_norm": 2.3977434635162354,
"learning_rate": 1.942326248642218e-06,
"loss": 1.0900508165359497,
"step": 1618
},
{
"epoch": 3.4177215189873418,
"grad_norm": 2.819269895553589,
"learning_rate": 1.9381828358168748e-06,
"loss": 0.9528172016143799,
"step": 1620
},
{
"epoch": 3.4219409282700424,
"grad_norm": 23.19445037841797,
"learning_rate": 1.934040839791332e-06,
"loss": 0.5396543145179749,
"step": 1622
},
{
"epoch": 3.4261603375527425,
"grad_norm": 7.595386028289795,
"learning_rate": 1.9299002829796253e-06,
"loss": 0.3888126611709595,
"step": 1624
},
{
"epoch": 3.430379746835443,
"grad_norm": 5.8151960372924805,
"learning_rate": 1.925761187788002e-06,
"loss": 0.3526824712753296,
"step": 1626
},
{
"epoch": 3.4345991561181437,
"grad_norm": 5.015478134155273,
"learning_rate": 1.921623576614799e-06,
"loss": 1.0127757787704468,
"step": 1628
},
{
"epoch": 3.438818565400844,
"grad_norm": 3.884026050567627,
"learning_rate": 1.917487471850323e-06,
"loss": 0.3786028325557709,
"step": 1630
},
{
"epoch": 3.4430379746835444,
"grad_norm": 4.3548784255981445,
"learning_rate": 1.91335289587673e-06,
"loss": 1.0020424127578735,
"step": 1632
},
{
"epoch": 3.4472573839662446,
"grad_norm": 1.1631532907485962,
"learning_rate": 1.909219871067902e-06,
"loss": 0.5979082584381104,
"step": 1634
},
{
"epoch": 3.451476793248945,
"grad_norm": 5.268650531768799,
"learning_rate": 1.9050884197893278e-06,
"loss": 1.1838793754577637,
"step": 1636
},
{
"epoch": 3.4556962025316453,
"grad_norm": 4.640054702758789,
"learning_rate": 1.90095856439798e-06,
"loss": 1.1896966695785522,
"step": 1638
},
{
"epoch": 3.459915611814346,
"grad_norm": 3.0583033561706543,
"learning_rate": 1.8968303272421968e-06,
"loss": 0.9648596048355103,
"step": 1640
},
{
"epoch": 3.4641350210970465,
"grad_norm": 30.096092224121094,
"learning_rate": 1.8927037306615578e-06,
"loss": 1.0935192108154297,
"step": 1642
},
{
"epoch": 3.4683544303797467,
"grad_norm": 2.4771618843078613,
"learning_rate": 1.8885787969867656e-06,
"loss": 0.35215988755226135,
"step": 1644
},
{
"epoch": 3.4725738396624473,
"grad_norm": 10.702546119689941,
"learning_rate": 1.884455548539524e-06,
"loss": 0.839633584022522,
"step": 1646
},
{
"epoch": 3.476793248945148,
"grad_norm": 1.648725152015686,
"learning_rate": 1.8803340076324181e-06,
"loss": 0.9294931888580322,
"step": 1648
},
{
"epoch": 3.481012658227848,
"grad_norm": 2.6175386905670166,
"learning_rate": 1.876214196568791e-06,
"loss": 0.5126534104347229,
"step": 1650
},
{
"epoch": 3.4852320675105486,
"grad_norm": 2.2899160385131836,
"learning_rate": 1.872096137642627e-06,
"loss": 0.8264724612236023,
"step": 1652
},
{
"epoch": 3.489451476793249,
"grad_norm": 1.0500420331954956,
"learning_rate": 1.8679798531384274e-06,
"loss": 0.4854082465171814,
"step": 1654
},
{
"epoch": 3.4936708860759493,
"grad_norm": 1.5645257234573364,
"learning_rate": 1.8638653653310926e-06,
"loss": 0.7242560386657715,
"step": 1656
},
{
"epoch": 3.49789029535865,
"grad_norm": 3.557481050491333,
"learning_rate": 1.8597526964857985e-06,
"loss": 0.7009620666503906,
"step": 1658
},
{
"epoch": 3.50210970464135,
"grad_norm": 2.4170994758605957,
"learning_rate": 1.8556418688578797e-06,
"loss": 1.0089216232299805,
"step": 1660
},
{
"epoch": 3.5063291139240507,
"grad_norm": 5.906785488128662,
"learning_rate": 1.8515329046927058e-06,
"loss": 1.111635446548462,
"step": 1662
},
{
"epoch": 3.510548523206751,
"grad_norm": 21.11191749572754,
"learning_rate": 1.8474258262255642e-06,
"loss": 0.4738878309726715,
"step": 1664
},
{
"epoch": 3.5147679324894514,
"grad_norm": 6.232138633728027,
"learning_rate": 1.843320655681536e-06,
"loss": 1.019901990890503,
"step": 1666
},
{
"epoch": 3.518987341772152,
"grad_norm": 7.000395774841309,
"learning_rate": 1.839217415275379e-06,
"loss": 0.6458152532577515,
"step": 1668
},
{
"epoch": 3.523206751054852,
"grad_norm": 2.109321355819702,
"learning_rate": 1.835116127211406e-06,
"loss": 0.9234386086463928,
"step": 1670
},
{
"epoch": 3.5274261603375527,
"grad_norm": 8.41999340057373,
"learning_rate": 1.8310168136833646e-06,
"loss": 0.382904052734375,
"step": 1672
},
{
"epoch": 3.5316455696202533,
"grad_norm": 2.0964558124542236,
"learning_rate": 1.8269194968743178e-06,
"loss": 0.585561990737915,
"step": 1674
},
{
"epoch": 3.5358649789029535,
"grad_norm": 10.49689769744873,
"learning_rate": 1.8228241989565239e-06,
"loss": 0.6187952160835266,
"step": 1676
},
{
"epoch": 3.540084388185654,
"grad_norm": 7.462824821472168,
"learning_rate": 1.8187309420913142e-06,
"loss": 0.7788501977920532,
"step": 1678
},
{
"epoch": 3.5443037974683547,
"grad_norm": 3.3341939449310303,
"learning_rate": 1.8146397484289774e-06,
"loss": 0.9248118996620178,
"step": 1680
},
{
"epoch": 3.548523206751055,
"grad_norm": 2.9744744300842285,
"learning_rate": 1.810550640108636e-06,
"loss": 0.7860240936279297,
"step": 1682
},
{
"epoch": 3.5527426160337554,
"grad_norm": 16.682893753051758,
"learning_rate": 1.8064636392581285e-06,
"loss": 0.7947289347648621,
"step": 1684
},
{
"epoch": 3.5569620253164556,
"grad_norm": 11.304174423217773,
"learning_rate": 1.8023787679938884e-06,
"loss": 0.32021385431289673,
"step": 1686
},
{
"epoch": 3.561181434599156,
"grad_norm": 3.4476826190948486,
"learning_rate": 1.7982960484208255e-06,
"loss": 0.5928635597229004,
"step": 1688
},
{
"epoch": 3.5654008438818563,
"grad_norm": 4.565676689147949,
"learning_rate": 1.7942155026322064e-06,
"loss": 1.007154941558838,
"step": 1690
},
{
"epoch": 3.569620253164557,
"grad_norm": 54.17780685424805,
"learning_rate": 1.7901371527095336e-06,
"loss": 0.20298929512500763,
"step": 1692
},
{
"epoch": 3.5738396624472575,
"grad_norm": 0.7691044807434082,
"learning_rate": 1.7860610207224266e-06,
"loss": 0.610919713973999,
"step": 1694
},
{
"epoch": 3.5780590717299576,
"grad_norm": 7.206573486328125,
"learning_rate": 1.7819871287285042e-06,
"loss": 0.2613908350467682,
"step": 1696
},
{
"epoch": 3.5822784810126582,
"grad_norm": 2.2238030433654785,
"learning_rate": 1.7779154987732628e-06,
"loss": 0.7429696321487427,
"step": 1698
},
{
"epoch": 3.586497890295359,
"grad_norm": 2.2671563625335693,
"learning_rate": 1.7738461528899582e-06,
"loss": 0.6627340912818909,
"step": 1700
},
{
"epoch": 3.590717299578059,
"grad_norm": 1.9748802185058594,
"learning_rate": 1.769779113099485e-06,
"loss": 0.5637974739074707,
"step": 1702
},
{
"epoch": 3.5949367088607596,
"grad_norm": 2.075197696685791,
"learning_rate": 1.7657144014102605e-06,
"loss": 1.022030234336853,
"step": 1704
},
{
"epoch": 3.59915611814346,
"grad_norm": 2.7764699459075928,
"learning_rate": 1.7616520398181019e-06,
"loss": 0.6542642116546631,
"step": 1706
},
{
"epoch": 3.6033755274261603,
"grad_norm": 5.018822193145752,
"learning_rate": 1.757592050306111e-06,
"loss": 0.7118390202522278,
"step": 1708
},
{
"epoch": 3.607594936708861,
"grad_norm": 1.829730749130249,
"learning_rate": 1.7535344548445523e-06,
"loss": 0.5238461494445801,
"step": 1710
},
{
"epoch": 3.611814345991561,
"grad_norm": 2.2571935653686523,
"learning_rate": 1.7494792753907342e-06,
"loss": 0.9762560129165649,
"step": 1712
},
{
"epoch": 3.6160337552742616,
"grad_norm": 11.215494155883789,
"learning_rate": 1.7454265338888923e-06,
"loss": 1.1840991973876953,
"step": 1714
},
{
"epoch": 3.620253164556962,
"grad_norm": 5.1113972663879395,
"learning_rate": 1.741376252270069e-06,
"loss": 0.5932983160018921,
"step": 1716
},
{
"epoch": 3.6244725738396624,
"grad_norm": 3.276780843734741,
"learning_rate": 1.7373284524519956e-06,
"loss": 0.654528021812439,
"step": 1718
},
{
"epoch": 3.628691983122363,
"grad_norm": 4.502676486968994,
"learning_rate": 1.733283156338973e-06,
"loss": 0.329173743724823,
"step": 1720
},
{
"epoch": 3.632911392405063,
"grad_norm": 4.122840404510498,
"learning_rate": 1.7292403858217534e-06,
"loss": 1.0182509422302246,
"step": 1722
},
{
"epoch": 3.6371308016877637,
"grad_norm": 8.013359069824219,
"learning_rate": 1.7252001627774227e-06,
"loss": 0.5020068287849426,
"step": 1724
},
{
"epoch": 3.6413502109704643,
"grad_norm": 7.430994987487793,
"learning_rate": 1.72116250906928e-06,
"loss": 0.45291832089424133,
"step": 1726
},
{
"epoch": 3.6455696202531644,
"grad_norm": 5.890309810638428,
"learning_rate": 1.7171274465467224e-06,
"loss": 0.8754688501358032,
"step": 1728
},
{
"epoch": 3.649789029535865,
"grad_norm": 9.963774681091309,
"learning_rate": 1.7130949970451245e-06,
"loss": 0.2187124788761139,
"step": 1730
},
{
"epoch": 3.6540084388185656,
"grad_norm": 6.262022972106934,
"learning_rate": 1.709065182385719e-06,
"loss": 0.886106014251709,
"step": 1732
},
{
"epoch": 3.6582278481012658,
"grad_norm": 9.15018367767334,
"learning_rate": 1.7050380243754838e-06,
"loss": 0.3278903365135193,
"step": 1734
},
{
"epoch": 3.6624472573839664,
"grad_norm": 30.086578369140625,
"learning_rate": 1.7010135448070169e-06,
"loss": 0.3603389263153076,
"step": 1736
},
{
"epoch": 3.6666666666666665,
"grad_norm": 24.306060791015625,
"learning_rate": 1.6969917654584247e-06,
"loss": 0.6651766300201416,
"step": 1738
},
{
"epoch": 3.670886075949367,
"grad_norm": 4.77196741104126,
"learning_rate": 1.692972708093201e-06,
"loss": 0.33792465925216675,
"step": 1740
},
{
"epoch": 3.6751054852320673,
"grad_norm": 1.7918250560760498,
"learning_rate": 1.688956394460109e-06,
"loss": 1.0997920036315918,
"step": 1742
},
{
"epoch": 3.679324894514768,
"grad_norm": 19.624130249023438,
"learning_rate": 1.6849428462930653e-06,
"loss": 0.5909217596054077,
"step": 1744
},
{
"epoch": 3.6835443037974684,
"grad_norm": 7.293959140777588,
"learning_rate": 1.6809320853110215e-06,
"loss": 0.563459038734436,
"step": 1746
},
{
"epoch": 3.6877637130801686,
"grad_norm": 2.4896528720855713,
"learning_rate": 1.6769241332178469e-06,
"loss": 1.0555415153503418,
"step": 1748
},
{
"epoch": 3.691983122362869,
"grad_norm": 2.973538398742676,
"learning_rate": 1.6729190117022095e-06,
"loss": 0.8185904026031494,
"step": 1750
},
{
"epoch": 3.6962025316455698,
"grad_norm": 3.3849141597747803,
"learning_rate": 1.6689167424374597e-06,
"loss": 0.8749343752861023,
"step": 1752
},
{
"epoch": 3.70042194092827,
"grad_norm": 2.0385217666625977,
"learning_rate": 1.664917347081516e-06,
"loss": 1.026354432106018,
"step": 1754
},
{
"epoch": 3.7046413502109705,
"grad_norm": 5.828520774841309,
"learning_rate": 1.660920847276741e-06,
"loss": 0.8060284852981567,
"step": 1756
},
{
"epoch": 3.708860759493671,
"grad_norm": 5.976668357849121,
"learning_rate": 1.6569272646498318e-06,
"loss": 0.7234772443771362,
"step": 1758
},
{
"epoch": 3.7130801687763713,
"grad_norm": 9.543655395507812,
"learning_rate": 1.6529366208116974e-06,
"loss": 0.7528952360153198,
"step": 1760
},
{
"epoch": 3.717299578059072,
"grad_norm": 4.140414237976074,
"learning_rate": 1.6489489373573443e-06,
"loss": 0.26903659105300903,
"step": 1762
},
{
"epoch": 3.721518987341772,
"grad_norm": 12.051411628723145,
"learning_rate": 1.64496423586576e-06,
"loss": 0.5374072790145874,
"step": 1764
},
{
"epoch": 3.7257383966244726,
"grad_norm": 2.326197624206543,
"learning_rate": 1.6409825378997941e-06,
"loss": 0.9479004740715027,
"step": 1766
},
{
"epoch": 3.7299578059071727,
"grad_norm": 4.621135234832764,
"learning_rate": 1.6370038650060437e-06,
"loss": 0.5748968124389648,
"step": 1768
},
{
"epoch": 3.7341772151898733,
"grad_norm": 2.885585069656372,
"learning_rate": 1.6330282387147349e-06,
"loss": 0.5932916402816772,
"step": 1770
},
{
"epoch": 3.738396624472574,
"grad_norm": 1.9321597814559937,
"learning_rate": 1.6290556805396093e-06,
"loss": 0.9674075245857239,
"step": 1772
},
{
"epoch": 3.742616033755274,
"grad_norm": 3.254708766937256,
"learning_rate": 1.6250862119778046e-06,
"loss": 0.4991704523563385,
"step": 1774
},
{
"epoch": 3.7468354430379747,
"grad_norm": 1.1030203104019165,
"learning_rate": 1.6211198545097381e-06,
"loss": 0.5824090242385864,
"step": 1776
},
{
"epoch": 3.7510548523206753,
"grad_norm": 2.4272022247314453,
"learning_rate": 1.6171566295989947e-06,
"loss": 0.8916751146316528,
"step": 1778
},
{
"epoch": 3.7552742616033754,
"grad_norm": 2.7834560871124268,
"learning_rate": 1.6131965586922039e-06,
"loss": 0.9039870500564575,
"step": 1780
},
{
"epoch": 3.759493670886076,
"grad_norm": 3.2108805179595947,
"learning_rate": 1.6092396632189317e-06,
"loss": 0.8393138647079468,
"step": 1782
},
{
"epoch": 3.7637130801687766,
"grad_norm": 8.731537818908691,
"learning_rate": 1.6052859645915575e-06,
"loss": 0.8530555963516235,
"step": 1784
},
{
"epoch": 3.7679324894514767,
"grad_norm": 2.2591445446014404,
"learning_rate": 1.6013354842051624e-06,
"loss": 1.0453441143035889,
"step": 1786
},
{
"epoch": 3.7721518987341773,
"grad_norm": 18.5029296875,
"learning_rate": 1.5973882434374124e-06,
"loss": 0.2866585850715637,
"step": 1788
},
{
"epoch": 3.7763713080168775,
"grad_norm": 2.598447561264038,
"learning_rate": 1.5934442636484425e-06,
"loss": 0.5377147197723389,
"step": 1790
},
{
"epoch": 3.780590717299578,
"grad_norm": 2.245370864868164,
"learning_rate": 1.5895035661807397e-06,
"loss": 0.9374682903289795,
"step": 1792
},
{
"epoch": 3.7848101265822782,
"grad_norm": 10.506272315979004,
"learning_rate": 1.5855661723590319e-06,
"loss": 0.7131825685501099,
"step": 1794
},
{
"epoch": 3.789029535864979,
"grad_norm": 5.187559127807617,
"learning_rate": 1.581632103490168e-06,
"loss": 0.9631250500679016,
"step": 1796
},
{
"epoch": 3.7932489451476794,
"grad_norm": 5.299999713897705,
"learning_rate": 1.577701380863003e-06,
"loss": 1.1112829446792603,
"step": 1798
},
{
"epoch": 3.7974683544303796,
"grad_norm": 2.1457207202911377,
"learning_rate": 1.5737740257482867e-06,
"loss": 0.8928860425949097,
"step": 1800
},
{
"epoch": 3.80168776371308,
"grad_norm": 2.5547454357147217,
"learning_rate": 1.569850059398544e-06,
"loss": 1.004746675491333,
"step": 1802
},
{
"epoch": 3.8059071729957807,
"grad_norm": 3.674745798110962,
"learning_rate": 1.565929503047963e-06,
"loss": 0.49736908078193665,
"step": 1804
},
{
"epoch": 3.810126582278481,
"grad_norm": 7.80587100982666,
"learning_rate": 1.562012377912277e-06,
"loss": 0.23617342114448547,
"step": 1806
},
{
"epoch": 3.8143459915611815,
"grad_norm": 5.4438958168029785,
"learning_rate": 1.5580987051886533e-06,
"loss": 0.8461598753929138,
"step": 1808
},
{
"epoch": 3.818565400843882,
"grad_norm": 2.466731071472168,
"learning_rate": 1.554188506055577e-06,
"loss": 0.9447206258773804,
"step": 1810
},
{
"epoch": 3.8227848101265822,
"grad_norm": 5.592019081115723,
"learning_rate": 1.550281801672735e-06,
"loss": 0.47888684272766113,
"step": 1812
},
{
"epoch": 3.827004219409283,
"grad_norm": 2.1095151901245117,
"learning_rate": 1.5463786131809031e-06,
"loss": 0.9347876310348511,
"step": 1814
},
{
"epoch": 3.831223628691983,
"grad_norm": 4.567122936248779,
"learning_rate": 1.542478961701831e-06,
"loss": 0.8219131231307983,
"step": 1816
},
{
"epoch": 3.8354430379746836,
"grad_norm": 3.2872185707092285,
"learning_rate": 1.5385828683381293e-06,
"loss": 0.7965229749679565,
"step": 1818
},
{
"epoch": 3.8396624472573837,
"grad_norm": 4.746089935302734,
"learning_rate": 1.5346903541731524e-06,
"loss": 0.6401727199554443,
"step": 1820
},
{
"epoch": 3.8438818565400843,
"grad_norm": 3.5851891040802,
"learning_rate": 1.530801440270888e-06,
"loss": 0.9646581411361694,
"step": 1822
},
{
"epoch": 3.848101265822785,
"grad_norm": 7.018674373626709,
"learning_rate": 1.5269161476758404e-06,
"loss": 0.7993499636650085,
"step": 1824
},
{
"epoch": 3.852320675105485,
"grad_norm": 3.83168888092041,
"learning_rate": 1.523034497412916e-06,
"loss": 0.9415961503982544,
"step": 1826
},
{
"epoch": 3.8565400843881856,
"grad_norm": 3.7820115089416504,
"learning_rate": 1.5191565104873144e-06,
"loss": 0.9054951667785645,
"step": 1828
},
{
"epoch": 3.8607594936708862,
"grad_norm": 5.366248607635498,
"learning_rate": 1.5152822078844088e-06,
"loss": 0.9999287128448486,
"step": 1830
},
{
"epoch": 3.8649789029535864,
"grad_norm": 5.807839393615723,
"learning_rate": 1.511411610569636e-06,
"loss": 0.3293692171573639,
"step": 1832
},
{
"epoch": 3.869198312236287,
"grad_norm": 3.83225679397583,
"learning_rate": 1.5075447394883814e-06,
"loss": 0.6949493885040283,
"step": 1834
},
{
"epoch": 3.8734177215189876,
"grad_norm": 10.349047660827637,
"learning_rate": 1.5036816155658665e-06,
"loss": 0.7142183184623718,
"step": 1836
},
{
"epoch": 3.8776371308016877,
"grad_norm": 4.179904460906982,
"learning_rate": 1.4998222597070362e-06,
"loss": 0.6529619097709656,
"step": 1838
},
{
"epoch": 3.8818565400843883,
"grad_norm": 11.569310188293457,
"learning_rate": 1.4959666927964437e-06,
"loss": 0.8389513492584229,
"step": 1840
},
{
"epoch": 3.8860759493670884,
"grad_norm": 4.005336761474609,
"learning_rate": 1.4921149356981397e-06,
"loss": 0.5777831077575684,
"step": 1842
},
{
"epoch": 3.890295358649789,
"grad_norm": 5.133764266967773,
"learning_rate": 1.4882670092555567e-06,
"loss": 0.5414679050445557,
"step": 1844
},
{
"epoch": 3.894514767932489,
"grad_norm": 2.863504409790039,
"learning_rate": 1.4844229342913996e-06,
"loss": 0.9309226870536804,
"step": 1846
},
{
"epoch": 3.8987341772151898,
"grad_norm": 22.195985794067383,
"learning_rate": 1.480582731607531e-06,
"loss": 0.4635329842567444,
"step": 1848
},
{
"epoch": 3.9029535864978904,
"grad_norm": 2.475642204284668,
"learning_rate": 1.4767464219848593e-06,
"loss": 0.9393260478973389,
"step": 1850
},
{
"epoch": 3.9071729957805905,
"grad_norm": 3.141064405441284,
"learning_rate": 1.4729140261832246e-06,
"loss": 0.9542742967605591,
"step": 1852
},
{
"epoch": 3.911392405063291,
"grad_norm": 2.667790174484253,
"learning_rate": 1.4690855649412895e-06,
"loss": 0.9756711721420288,
"step": 1854
},
{
"epoch": 3.9156118143459917,
"grad_norm": 2.641533374786377,
"learning_rate": 1.4652610589764235e-06,
"loss": 0.9634566903114319,
"step": 1856
},
{
"epoch": 3.919831223628692,
"grad_norm": 2.9647128582000732,
"learning_rate": 1.461440528984594e-06,
"loss": 0.9994820356369019,
"step": 1858
},
{
"epoch": 3.9240506329113924,
"grad_norm": 5.323459625244141,
"learning_rate": 1.4576239956402514e-06,
"loss": 0.9943286180496216,
"step": 1860
},
{
"epoch": 3.928270042194093,
"grad_norm": 2.4466195106506348,
"learning_rate": 1.4538114795962195e-06,
"loss": 0.6168838143348694,
"step": 1862
},
{
"epoch": 3.932489451476793,
"grad_norm": 3.8990132808685303,
"learning_rate": 1.4500030014835822e-06,
"loss": 0.6228926777839661,
"step": 1864
},
{
"epoch": 3.9367088607594938,
"grad_norm": 6.640925407409668,
"learning_rate": 1.4461985819115733e-06,
"loss": 1.230762243270874,
"step": 1866
},
{
"epoch": 3.9409282700421944,
"grad_norm": 1.7788114547729492,
"learning_rate": 1.4423982414674635e-06,
"loss": 0.9199753999710083,
"step": 1868
},
{
"epoch": 3.9451476793248945,
"grad_norm": 11.634161949157715,
"learning_rate": 1.4386020007164494e-06,
"loss": 0.702942967414856,
"step": 1870
},
{
"epoch": 3.9493670886075947,
"grad_norm": 0.652026355266571,
"learning_rate": 1.4348098802015446e-06,
"loss": 0.5037093162536621,
"step": 1872
},
{
"epoch": 3.9535864978902953,
"grad_norm": 10.706385612487793,
"learning_rate": 1.4310219004434632e-06,
"loss": 0.45475533604621887,
"step": 1874
},
{
"epoch": 3.957805907172996,
"grad_norm": 7.073146820068359,
"learning_rate": 1.4272380819405139e-06,
"loss": 0.8023735284805298,
"step": 1876
},
{
"epoch": 3.962025316455696,
"grad_norm": 3.564532518386841,
"learning_rate": 1.4234584451684866e-06,
"loss": 0.716842770576477,
"step": 1878
},
{
"epoch": 3.9662447257383966,
"grad_norm": 1.7148876190185547,
"learning_rate": 1.4196830105805432e-06,
"loss": 0.5358736515045166,
"step": 1880
},
{
"epoch": 3.970464135021097,
"grad_norm": 3.4616918563842773,
"learning_rate": 1.4159117986071038e-06,
"loss": 0.9063611030578613,
"step": 1882
},
{
"epoch": 3.9746835443037973,
"grad_norm": 5.480584144592285,
"learning_rate": 1.4121448296557406e-06,
"loss": 0.40525734424591064,
"step": 1884
},
{
"epoch": 3.978902953586498,
"grad_norm": 4.338303565979004,
"learning_rate": 1.4083821241110637e-06,
"loss": 0.9141275882720947,
"step": 1886
},
{
"epoch": 3.9831223628691985,
"grad_norm": 7.042728900909424,
"learning_rate": 1.4046237023346113e-06,
"loss": 0.6083638668060303,
"step": 1888
},
{
"epoch": 3.9873417721518987,
"grad_norm": 7.335713863372803,
"learning_rate": 1.400869584664743e-06,
"loss": 0.9237312078475952,
"step": 1890
},
{
"epoch": 3.9915611814345993,
"grad_norm": 7.168555736541748,
"learning_rate": 1.3971197914165238e-06,
"loss": 0.6043530702590942,
"step": 1892
},
{
"epoch": 3.9957805907173,
"grad_norm": 2.8935647010803223,
"learning_rate": 1.3933743428816209e-06,
"loss": 0.9517507553100586,
"step": 1894
},
{
"epoch": 4.0,
"grad_norm": 2.358701705932617,
"learning_rate": 1.3896332593281876e-06,
"loss": 0.9641570448875427,
"step": 1896
},
{
"epoch": 4.0042194092827,
"grad_norm": 4.007087230682373,
"learning_rate": 1.385896561000759e-06,
"loss": 0.8658764362335205,
"step": 1898
},
{
"epoch": 4.008438818565401,
"grad_norm": 6.783811092376709,
"learning_rate": 1.382164268120137e-06,
"loss": 0.7082722187042236,
"step": 1900
},
{
"epoch": 4.012658227848101,
"grad_norm": 2.4722962379455566,
"learning_rate": 1.3784364008832867e-06,
"loss": 0.7488058805465698,
"step": 1902
},
{
"epoch": 4.0168776371308015,
"grad_norm": 7.7128705978393555,
"learning_rate": 1.3747129794632236e-06,
"loss": 0.5546174049377441,
"step": 1904
},
{
"epoch": 4.0210970464135025,
"grad_norm": 1.1015756130218506,
"learning_rate": 1.3709940240089027e-06,
"loss": 0.5142375826835632,
"step": 1906
},
{
"epoch": 4.025316455696203,
"grad_norm": 5.702658653259277,
"learning_rate": 1.3672795546451144e-06,
"loss": 0.9443526268005371,
"step": 1908
},
{
"epoch": 4.029535864978903,
"grad_norm": 6.516256809234619,
"learning_rate": 1.3635695914723724e-06,
"loss": 0.11540517210960388,
"step": 1910
},
{
"epoch": 4.033755274261603,
"grad_norm": 3.0924103260040283,
"learning_rate": 1.359864154566805e-06,
"loss": 0.7493268251419067,
"step": 1912
},
{
"epoch": 4.037974683544304,
"grad_norm": 5.080263614654541,
"learning_rate": 1.356163263980048e-06,
"loss": 0.793247401714325,
"step": 1914
},
{
"epoch": 4.042194092827004,
"grad_norm": 0.8498378396034241,
"learning_rate": 1.352466939739134e-06,
"loss": 0.4381150007247925,
"step": 1916
},
{
"epoch": 4.046413502109704,
"grad_norm": 5.049806594848633,
"learning_rate": 1.3487752018463865e-06,
"loss": 0.23625794053077698,
"step": 1918
},
{
"epoch": 4.050632911392405,
"grad_norm": 6.505473613739014,
"learning_rate": 1.34508807027931e-06,
"loss": 0.8553643226623535,
"step": 1920
},
{
"epoch": 4.0548523206751055,
"grad_norm": 2.442864418029785,
"learning_rate": 1.341405564990481e-06,
"loss": 0.9089441895484924,
"step": 1922
},
{
"epoch": 4.059071729957806,
"grad_norm": 3.782691717147827,
"learning_rate": 1.3377277059074428e-06,
"loss": 0.6086368560791016,
"step": 1924
},
{
"epoch": 4.063291139240507,
"grad_norm": 7.667325019836426,
"learning_rate": 1.3340545129325956e-06,
"loss": 0.5529667139053345,
"step": 1926
},
{
"epoch": 4.067510548523207,
"grad_norm": 4.649930953979492,
"learning_rate": 1.330386005943089e-06,
"loss": 0.7499093413352966,
"step": 1928
},
{
"epoch": 4.071729957805907,
"grad_norm": 6.8586602210998535,
"learning_rate": 1.3267222047907167e-06,
"loss": 0.2909429967403412,
"step": 1930
},
{
"epoch": 4.075949367088608,
"grad_norm": 5.850220680236816,
"learning_rate": 1.323063129301806e-06,
"loss": 0.5432990789413452,
"step": 1932
},
{
"epoch": 4.080168776371308,
"grad_norm": 5.898839473724365,
"learning_rate": 1.3194087992771097e-06,
"loss": 0.6550246477127075,
"step": 1934
},
{
"epoch": 4.084388185654008,
"grad_norm": 3.0061066150665283,
"learning_rate": 1.3157592344917036e-06,
"loss": 0.7705998420715332,
"step": 1936
},
{
"epoch": 4.0886075949367084,
"grad_norm": 2.5635762214660645,
"learning_rate": 1.3121144546948766e-06,
"loss": 0.44453972578048706,
"step": 1938
},
{
"epoch": 4.0928270042194095,
"grad_norm": 4.0387773513793945,
"learning_rate": 1.3084744796100229e-06,
"loss": 0.5306001901626587,
"step": 1940
},
{
"epoch": 4.09704641350211,
"grad_norm": 4.215574264526367,
"learning_rate": 1.3048393289345369e-06,
"loss": 0.5609068870544434,
"step": 1942
},
{
"epoch": 4.10126582278481,
"grad_norm": 2.5985476970672607,
"learning_rate": 1.3012090223397066e-06,
"loss": 0.503987193107605,
"step": 1944
},
{
"epoch": 4.105485232067511,
"grad_norm": 2.6729464530944824,
"learning_rate": 1.2975835794706063e-06,
"loss": 0.8981311321258545,
"step": 1946
},
{
"epoch": 4.109704641350211,
"grad_norm": 8.088824272155762,
"learning_rate": 1.2939630199459914e-06,
"loss": 0.502710223197937,
"step": 1948
},
{
"epoch": 4.113924050632911,
"grad_norm": 0.9990053772926331,
"learning_rate": 1.2903473633581894e-06,
"loss": 0.5058774948120117,
"step": 1950
},
{
"epoch": 4.118143459915612,
"grad_norm": 2.116455554962158,
"learning_rate": 1.2867366292729984e-06,
"loss": 0.8362418413162231,
"step": 1952
},
{
"epoch": 4.122362869198312,
"grad_norm": 4.284731388092041,
"learning_rate": 1.283130837229578e-06,
"loss": 0.9526023864746094,
"step": 1954
},
{
"epoch": 4.1265822784810124,
"grad_norm": 27.23639678955078,
"learning_rate": 1.2795300067403436e-06,
"loss": 0.16982686519622803,
"step": 1956
},
{
"epoch": 4.1308016877637135,
"grad_norm": 9.439923286437988,
"learning_rate": 1.275934157290863e-06,
"loss": 0.844666600227356,
"step": 1958
},
{
"epoch": 4.135021097046414,
"grad_norm": 52.91316604614258,
"learning_rate": 1.2723433083397486e-06,
"loss": 0.8215901255607605,
"step": 1960
},
{
"epoch": 4.139240506329114,
"grad_norm": 12.898977279663086,
"learning_rate": 1.2687574793185535e-06,
"loss": 0.5214605331420898,
"step": 1962
},
{
"epoch": 4.143459915611814,
"grad_norm": 1.9493759870529175,
"learning_rate": 1.2651766896316653e-06,
"loss": 0.8226008415222168,
"step": 1964
},
{
"epoch": 4.147679324894515,
"grad_norm": 0.4556528925895691,
"learning_rate": 1.2616009586562021e-06,
"loss": 0.43690699338912964,
"step": 1966
},
{
"epoch": 4.151898734177215,
"grad_norm": 3.7246546745300293,
"learning_rate": 1.2580303057419079e-06,
"loss": 0.871078610420227,
"step": 1968
},
{
"epoch": 4.156118143459915,
"grad_norm": 3.5394413471221924,
"learning_rate": 1.2544647502110464e-06,
"loss": 0.9380326271057129,
"step": 1970
},
{
"epoch": 4.160337552742616,
"grad_norm": 4.74537467956543,
"learning_rate": 1.2509043113582969e-06,
"loss": 1.0427074432373047,
"step": 1972
},
{
"epoch": 4.1645569620253164,
"grad_norm": 5.703405380249023,
"learning_rate": 1.247349008450651e-06,
"loss": 0.17169350385665894,
"step": 1974
},
{
"epoch": 4.168776371308017,
"grad_norm": 8.463484764099121,
"learning_rate": 1.243798860727308e-06,
"loss": 0.5819951891899109,
"step": 1976
},
{
"epoch": 4.172995780590718,
"grad_norm": 5.530209541320801,
"learning_rate": 1.2402538873995701e-06,
"loss": 0.40900328755378723,
"step": 1978
},
{
"epoch": 4.177215189873418,
"grad_norm": 6.495384216308594,
"learning_rate": 1.236714107650737e-06,
"loss": 0.42087459564208984,
"step": 1980
},
{
"epoch": 4.181434599156118,
"grad_norm": 3.931180953979492,
"learning_rate": 1.233179540636006e-06,
"loss": 0.7898563742637634,
"step": 1982
},
{
"epoch": 4.185654008438819,
"grad_norm": 5.3524322509765625,
"learning_rate": 1.2296502054823655e-06,
"loss": 0.335269570350647,
"step": 1984
},
{
"epoch": 4.189873417721519,
"grad_norm": 6.85384464263916,
"learning_rate": 1.226126121288492e-06,
"loss": 0.220280259847641,
"step": 1986
},
{
"epoch": 4.194092827004219,
"grad_norm": 17.49827003479004,
"learning_rate": 1.222607307124647e-06,
"loss": 0.5092884302139282,
"step": 1988
},
{
"epoch": 4.198312236286919,
"grad_norm": 5.663785934448242,
"learning_rate": 1.2190937820325733e-06,
"loss": 0.4246003031730652,
"step": 1990
},
{
"epoch": 4.2025316455696204,
"grad_norm": 4.241413116455078,
"learning_rate": 1.215585565025394e-06,
"loss": 0.8379718065261841,
"step": 1992
},
{
"epoch": 4.206751054852321,
"grad_norm": 6.015312194824219,
"learning_rate": 1.2120826750875059e-06,
"loss": 0.5074017643928528,
"step": 1994
},
{
"epoch": 4.210970464135021,
"grad_norm": 3.0558958053588867,
"learning_rate": 1.2085851311744794e-06,
"loss": 0.8118472099304199,
"step": 1996
},
{
"epoch": 4.215189873417722,
"grad_norm": 6.353532314300537,
"learning_rate": 1.205092952212956e-06,
"loss": 1.135847568511963,
"step": 1998
},
{
"epoch": 4.219409282700422,
"grad_norm": 2.435732126235962,
"learning_rate": 1.201606157100544e-06,
"loss": 0.9003854990005493,
"step": 2000
},
{
"epoch": 4.223628691983122,
"grad_norm": 8.28079891204834,
"learning_rate": 1.1981247647057202e-06,
"loss": 0.6943663358688354,
"step": 2002
},
{
"epoch": 4.227848101265823,
"grad_norm": 7.8127264976501465,
"learning_rate": 1.1946487938677226e-06,
"loss": 0.16587281227111816,
"step": 2004
},
{
"epoch": 4.232067510548523,
"grad_norm": 1.957531213760376,
"learning_rate": 1.1911782633964518e-06,
"loss": 0.9451367855072021,
"step": 2006
},
{
"epoch": 4.236286919831223,
"grad_norm": 2.2095224857330322,
"learning_rate": 1.1877131920723674e-06,
"loss": 0.4541362524032593,
"step": 2008
},
{
"epoch": 4.2405063291139244,
"grad_norm": 2.0317702293395996,
"learning_rate": 1.1842535986463885e-06,
"loss": 0.9444383382797241,
"step": 2010
},
{
"epoch": 4.244725738396625,
"grad_norm": 8.539976119995117,
"learning_rate": 1.180799501839791e-06,
"loss": 0.19654181599617004,
"step": 2012
},
{
"epoch": 4.248945147679325,
"grad_norm": 3.8399620056152344,
"learning_rate": 1.1773509203441052e-06,
"loss": 0.5152616500854492,
"step": 2014
},
{
"epoch": 4.253164556962025,
"grad_norm": 2.4427969455718994,
"learning_rate": 1.1739078728210175e-06,
"loss": 0.89030921459198,
"step": 2016
},
{
"epoch": 4.257383966244726,
"grad_norm": 3.564229726791382,
"learning_rate": 1.170470377902266e-06,
"loss": 0.9515880346298218,
"step": 2018
},
{
"epoch": 4.261603375527426,
"grad_norm": 17.13824462890625,
"learning_rate": 1.167038454189543e-06,
"loss": 0.0852670818567276,
"step": 2020
},
{
"epoch": 4.265822784810126,
"grad_norm": 1.1132172346115112,
"learning_rate": 1.163612120254392e-06,
"loss": 0.3325420618057251,
"step": 2022
},
{
"epoch": 4.270042194092827,
"grad_norm": 2.2386295795440674,
"learning_rate": 1.1601913946381068e-06,
"loss": 0.8490246534347534,
"step": 2024
},
{
"epoch": 4.274261603375527,
"grad_norm": 4.5493927001953125,
"learning_rate": 1.1567762958516336e-06,
"loss": 0.30698156356811523,
"step": 2026
},
{
"epoch": 4.2784810126582276,
"grad_norm": 2.7599310874938965,
"learning_rate": 1.1533668423754703e-06,
"loss": 0.3949320912361145,
"step": 2028
},
{
"epoch": 4.282700421940929,
"grad_norm": 0.7302427291870117,
"learning_rate": 1.1499630526595632e-06,
"loss": 0.4672113060951233,
"step": 2030
},
{
"epoch": 4.286919831223629,
"grad_norm": 6.222799777984619,
"learning_rate": 1.1465649451232121e-06,
"loss": 1.0849535465240479,
"step": 2032
},
{
"epoch": 4.291139240506329,
"grad_norm": 2.6900506019592285,
"learning_rate": 1.1431725381549675e-06,
"loss": 0.12843787670135498,
"step": 2034
},
{
"epoch": 4.29535864978903,
"grad_norm": 7.403899669647217,
"learning_rate": 1.1397858501125304e-06,
"loss": 0.3389854431152344,
"step": 2036
},
{
"epoch": 4.29957805907173,
"grad_norm": 5.636825084686279,
"learning_rate": 1.1364048993226566e-06,
"loss": 0.6659049391746521,
"step": 2038
},
{
"epoch": 4.30379746835443,
"grad_norm": 10.65471363067627,
"learning_rate": 1.1330297040810534e-06,
"loss": 1.0959115028381348,
"step": 2040
},
{
"epoch": 4.308016877637131,
"grad_norm": 6.164623737335205,
"learning_rate": 1.129660282652284e-06,
"loss": 0.8495713472366333,
"step": 2042
},
{
"epoch": 4.312236286919831,
"grad_norm": 21.337953567504883,
"learning_rate": 1.1262966532696658e-06,
"loss": 0.4679602384567261,
"step": 2044
},
{
"epoch": 4.3164556962025316,
"grad_norm": 3.735825300216675,
"learning_rate": 1.1229388341351739e-06,
"loss": 1.0504865646362305,
"step": 2046
},
{
"epoch": 4.320675105485232,
"grad_norm": 2.874302387237549,
"learning_rate": 1.1195868434193413e-06,
"loss": 0.9641183614730835,
"step": 2048
},
{
"epoch": 4.324894514767933,
"grad_norm": 2.697021722793579,
"learning_rate": 1.1162406992611618e-06,
"loss": 0.24490822851657867,
"step": 2050
},
{
"epoch": 4.329113924050633,
"grad_norm": 8.153789520263672,
"learning_rate": 1.1129004197679907e-06,
"loss": 0.43832969665527344,
"step": 2052
},
{
"epoch": 4.333333333333333,
"grad_norm": 2.9143199920654297,
"learning_rate": 1.1095660230154457e-06,
"loss": 0.7494316101074219,
"step": 2054
},
{
"epoch": 4.337552742616034,
"grad_norm": 22.089580535888672,
"learning_rate": 1.1062375270473129e-06,
"loss": 0.4954107403755188,
"step": 2056
},
{
"epoch": 4.341772151898734,
"grad_norm": 5.983814716339111,
"learning_rate": 1.1029149498754458e-06,
"loss": 0.39451485872268677,
"step": 2058
},
{
"epoch": 4.345991561181434,
"grad_norm": 3.4319894313812256,
"learning_rate": 1.0995983094796688e-06,
"loss": 0.816379189491272,
"step": 2060
},
{
"epoch": 4.350210970464135,
"grad_norm": 2.19193172454834,
"learning_rate": 1.0962876238076799e-06,
"loss": 0.9197038412094116,
"step": 2062
},
{
"epoch": 4.3544303797468356,
"grad_norm": 8.006820678710938,
"learning_rate": 1.0929829107749547e-06,
"loss": 0.8574424982070923,
"step": 2064
},
{
"epoch": 4.358649789029536,
"grad_norm": 3.324010133743286,
"learning_rate": 1.0896841882646471e-06,
"loss": 0.9916654825210571,
"step": 2066
},
{
"epoch": 4.362869198312236,
"grad_norm": 3.6000797748565674,
"learning_rate": 1.0863914741274944e-06,
"loss": 0.4570949077606201,
"step": 2068
},
{
"epoch": 4.367088607594937,
"grad_norm": 1.9650532007217407,
"learning_rate": 1.0831047861817193e-06,
"loss": 0.9559861421585083,
"step": 2070
},
{
"epoch": 4.371308016877637,
"grad_norm": 2.6903204917907715,
"learning_rate": 1.079824142212936e-06,
"loss": 0.9988477230072021,
"step": 2072
},
{
"epoch": 4.375527426160337,
"grad_norm": 3.71533203125,
"learning_rate": 1.07654955997405e-06,
"loss": 0.8142194747924805,
"step": 2074
},
{
"epoch": 4.379746835443038,
"grad_norm": 6.335799694061279,
"learning_rate": 1.0732810571851677e-06,
"loss": 0.6120598316192627,
"step": 2076
},
{
"epoch": 4.383966244725738,
"grad_norm": 2.123081684112549,
"learning_rate": 1.0700186515334939e-06,
"loss": 0.4905482232570648,
"step": 2078
},
{
"epoch": 4.3881856540084385,
"grad_norm": 14.160784721374512,
"learning_rate": 1.0667623606732408e-06,
"loss": 0.9914622902870178,
"step": 2080
},
{
"epoch": 4.3924050632911396,
"grad_norm": 1.714659571647644,
"learning_rate": 1.0635122022255298e-06,
"loss": 0.6109655499458313,
"step": 2082
},
{
"epoch": 4.39662447257384,
"grad_norm": 4.592569351196289,
"learning_rate": 1.0602681937782985e-06,
"loss": 0.7499299049377441,
"step": 2084
},
{
"epoch": 4.40084388185654,
"grad_norm": 3.3476827144622803,
"learning_rate": 1.0570303528862044e-06,
"loss": 0.9557301998138428,
"step": 2086
},
{
"epoch": 4.405063291139241,
"grad_norm": 3.5681612491607666,
"learning_rate": 1.0537986970705284e-06,
"loss": 0.9052315354347229,
"step": 2088
},
{
"epoch": 4.409282700421941,
"grad_norm": 4.310785293579102,
"learning_rate": 1.0505732438190832e-06,
"loss": 0.6285467147827148,
"step": 2090
},
{
"epoch": 4.413502109704641,
"grad_norm": 8.120601654052734,
"learning_rate": 1.0473540105861158e-06,
"loss": 0.8778185844421387,
"step": 2092
},
{
"epoch": 4.417721518987342,
"grad_norm": 2.5801761150360107,
"learning_rate": 1.0441410147922142e-06,
"loss": 0.8876914381980896,
"step": 2094
},
{
"epoch": 4.421940928270042,
"grad_norm": 22.094350814819336,
"learning_rate": 1.0409342738242145e-06,
"loss": 0.5706854462623596,
"step": 2096
},
{
"epoch": 4.4261603375527425,
"grad_norm": 2.56339955329895,
"learning_rate": 1.0377338050351023e-06,
"loss": 0.8818637132644653,
"step": 2098
},
{
"epoch": 4.430379746835443,
"grad_norm": 15.67695140838623,
"learning_rate": 1.0345396257439248e-06,
"loss": 0.6227443814277649,
"step": 2100
},
{
"epoch": 4.434599156118144,
"grad_norm": 2.2217981815338135,
"learning_rate": 1.0313517532356928e-06,
"loss": 0.7605068683624268,
"step": 2102
},
{
"epoch": 4.438818565400844,
"grad_norm": 2.441141128540039,
"learning_rate": 1.0281702047612885e-06,
"loss": 0.7203768491744995,
"step": 2104
},
{
"epoch": 4.443037974683544,
"grad_norm": 4.200733184814453,
"learning_rate": 1.024994997537373e-06,
"loss": 0.8852105736732483,
"step": 2106
},
{
"epoch": 4.447257383966245,
"grad_norm": 9.651650428771973,
"learning_rate": 1.0218261487462916e-06,
"loss": 0.5270538330078125,
"step": 2108
},
{
"epoch": 4.451476793248945,
"grad_norm": 2.550156593322754,
"learning_rate": 1.0186636755359814e-06,
"loss": 0.8197285532951355,
"step": 2110
},
{
"epoch": 4.455696202531645,
"grad_norm": 2.093350887298584,
"learning_rate": 1.0155075950198794e-06,
"loss": 0.8607369065284729,
"step": 2112
},
{
"epoch": 4.459915611814346,
"grad_norm": 1.2756742238998413,
"learning_rate": 1.0123579242768282e-06,
"loss": 0.6345518827438354,
"step": 2114
},
{
"epoch": 4.4641350210970465,
"grad_norm": 3.17000675201416,
"learning_rate": 1.0092146803509854e-06,
"loss": 0.48864442110061646,
"step": 2116
},
{
"epoch": 4.468354430379747,
"grad_norm": 2.0671489238739014,
"learning_rate": 1.006077880251729e-06,
"loss": 0.862575888633728,
"step": 2118
},
{
"epoch": 4.472573839662447,
"grad_norm": 2.851736307144165,
"learning_rate": 1.0029475409535692e-06,
"loss": 0.4032348692417145,
"step": 2120
},
{
"epoch": 4.476793248945148,
"grad_norm": 4.477703094482422,
"learning_rate": 9.998236793960514e-07,
"loss": 0.36202433705329895,
"step": 2122
},
{
"epoch": 4.481012658227848,
"grad_norm": 8.475764274597168,
"learning_rate": 9.967063124836695e-07,
"loss": 0.21301576495170593,
"step": 2124
},
{
"epoch": 4.485232067510548,
"grad_norm": 3.3703811168670654,
"learning_rate": 9.935954570857717e-07,
"loss": 0.39527398347854614,
"step": 2126
},
{
"epoch": 4.489451476793249,
"grad_norm": 2.7759153842926025,
"learning_rate": 9.90491130036468e-07,
"loss": 0.6493411064147949,
"step": 2128
},
{
"epoch": 4.493670886075949,
"grad_norm": 9.41816520690918,
"learning_rate": 9.873933481345432e-07,
"loss": 0.484800785779953,
"step": 2130
},
{
"epoch": 4.4978902953586495,
"grad_norm": 2.5206875801086426,
"learning_rate": 9.843021281433624e-07,
"loss": 1.0602920055389404,
"step": 2132
},
{
"epoch": 4.5021097046413505,
"grad_norm": 4.042180061340332,
"learning_rate": 9.81217486790782e-07,
"loss": 0.7310470342636108,
"step": 2134
},
{
"epoch": 4.506329113924051,
"grad_norm": 4.009156703948975,
"learning_rate": 9.781394407690582e-07,
"loss": 0.12923167645931244,
"step": 2136
},
{
"epoch": 4.510548523206751,
"grad_norm": 3.74722957611084,
"learning_rate": 9.750680067347574e-07,
"loss": 0.3252981901168823,
"step": 2138
},
{
"epoch": 4.514767932489452,
"grad_norm": 39.80788040161133,
"learning_rate": 9.720032013086665e-07,
"loss": 0.25149163603782654,
"step": 2140
},
{
"epoch": 4.518987341772152,
"grad_norm": 1.6690428256988525,
"learning_rate": 9.689450410757014e-07,
"loss": 0.6628930568695068,
"step": 2142
},
{
"epoch": 4.523206751054852,
"grad_norm": 12.622380256652832,
"learning_rate": 9.658935425848178e-07,
"loss": 0.17167873680591583,
"step": 2144
},
{
"epoch": 4.527426160337553,
"grad_norm": 3.543349504470825,
"learning_rate": 9.628487223489232e-07,
"loss": 0.5717638731002808,
"step": 2146
},
{
"epoch": 4.531645569620253,
"grad_norm": 3.6029629707336426,
"learning_rate": 9.598105968447845e-07,
"loss": 0.5759022831916809,
"step": 2148
},
{
"epoch": 4.5358649789029535,
"grad_norm": 0.09577035158872604,
"learning_rate": 9.567791825129436e-07,
"loss": 0.45371395349502563,
"step": 2150
},
{
"epoch": 4.540084388185654,
"grad_norm": 2.7558352947235107,
"learning_rate": 9.537544957576232e-07,
"loss": 0.5172098875045776,
"step": 2152
},
{
"epoch": 4.544303797468355,
"grad_norm": 6.613936424255371,
"learning_rate": 9.507365529466414e-07,
"loss": 1.0241069793701172,
"step": 2154
},
{
"epoch": 4.548523206751055,
"grad_norm": 3.1837728023529053,
"learning_rate": 9.477253704113204e-07,
"loss": 0.9064798355102539,
"step": 2156
},
{
"epoch": 4.552742616033755,
"grad_norm": 2.8910419940948486,
"learning_rate": 9.447209644464014e-07,
"loss": 0.8971297740936279,
"step": 2158
},
{
"epoch": 4.556962025316456,
"grad_norm": 3.6541380882263184,
"learning_rate": 9.417233513099545e-07,
"loss": 0.7274525165557861,
"step": 2160
},
{
"epoch": 4.561181434599156,
"grad_norm": 2.7287378311157227,
"learning_rate": 9.387325472232908e-07,
"loss": 0.7473336458206177,
"step": 2162
},
{
"epoch": 4.565400843881856,
"grad_norm": 5.9793500900268555,
"learning_rate": 9.357485683708752e-07,
"loss": 0.6158387660980225,
"step": 2164
},
{
"epoch": 4.569620253164557,
"grad_norm": 2.9492175579071045,
"learning_rate": 9.327714309002378e-07,
"loss": 0.8946245312690735,
"step": 2166
},
{
"epoch": 4.5738396624472575,
"grad_norm": 2.516920566558838,
"learning_rate": 9.298011509218878e-07,
"loss": 0.7441626787185669,
"step": 2168
},
{
"epoch": 4.578059071729958,
"grad_norm": 9.32639217376709,
"learning_rate": 9.268377445092257e-07,
"loss": 0.18001851439476013,
"step": 2170
},
{
"epoch": 4.582278481012658,
"grad_norm": 2.5811736583709717,
"learning_rate": 9.238812276984563e-07,
"loss": 0.6168837547302246,
"step": 2172
},
{
"epoch": 4.586497890295359,
"grad_norm": 79.09625244140625,
"learning_rate": 9.209316164885007e-07,
"loss": 0.7156883478164673,
"step": 2174
},
{
"epoch": 4.590717299578059,
"grad_norm": 3.27329683303833,
"learning_rate": 9.179889268409126e-07,
"loss": 0.9324935078620911,
"step": 2176
},
{
"epoch": 4.594936708860759,
"grad_norm": 10.160416603088379,
"learning_rate": 9.150531746797897e-07,
"loss": 0.6166714429855347,
"step": 2178
},
{
"epoch": 4.59915611814346,
"grad_norm": 3.8143131732940674,
"learning_rate": 9.121243758916885e-07,
"loss": 0.5362197756767273,
"step": 2180
},
{
"epoch": 4.60337552742616,
"grad_norm": 2.629331111907959,
"learning_rate": 9.092025463255371e-07,
"loss": 0.9286479949951172,
"step": 2182
},
{
"epoch": 4.6075949367088604,
"grad_norm": 3.1048662662506104,
"learning_rate": 9.062877017925509e-07,
"loss": 0.23398178815841675,
"step": 2184
},
{
"epoch": 4.6118143459915615,
"grad_norm": 2.799243211746216,
"learning_rate": 9.033798580661465e-07,
"loss": 0.9572643041610718,
"step": 2186
},
{
"epoch": 4.616033755274262,
"grad_norm": 1.9954626560211182,
"learning_rate": 9.00479030881856e-07,
"loss": 0.5257174372673035,
"step": 2188
},
{
"epoch": 4.620253164556962,
"grad_norm": 2.1326327323913574,
"learning_rate": 8.975852359372421e-07,
"loss": 0.907311737537384,
"step": 2190
},
{
"epoch": 4.624472573839663,
"grad_norm": 2.5456783771514893,
"learning_rate": 8.946984888918133e-07,
"loss": 0.48332294821739197,
"step": 2192
},
{
"epoch": 4.628691983122363,
"grad_norm": 4.807265758514404,
"learning_rate": 8.918188053669391e-07,
"loss": 0.8633521199226379,
"step": 2194
},
{
"epoch": 4.632911392405063,
"grad_norm": 3.1572585105895996,
"learning_rate": 8.889462009457651e-07,
"loss": 0.4701206088066101,
"step": 2196
},
{
"epoch": 4.637130801687764,
"grad_norm": 2.305100440979004,
"learning_rate": 8.860806911731295e-07,
"loss": 0.4662626385688782,
"step": 2198
},
{
"epoch": 4.641350210970464,
"grad_norm": 2.7214598655700684,
"learning_rate": 8.832222915554783e-07,
"loss": 0.8649228811264038,
"step": 2200
},
{
"epoch": 4.6455696202531644,
"grad_norm": 4.30544900894165,
"learning_rate": 8.803710175607808e-07,
"loss": 0.8740881085395813,
"step": 2202
},
{
"epoch": 4.649789029535865,
"grad_norm": 1.4659613370895386,
"learning_rate": 8.775268846184471e-07,
"loss": 0.5230797529220581,
"step": 2204
},
{
"epoch": 4.654008438818566,
"grad_norm": 3.350233316421509,
"learning_rate": 8.74689908119245e-07,
"loss": 0.5945952534675598,
"step": 2206
},
{
"epoch": 4.658227848101266,
"grad_norm": 1.1876442432403564,
"learning_rate": 8.718601034152144e-07,
"loss": 0.5520751476287842,
"step": 2208
},
{
"epoch": 4.662447257383966,
"grad_norm": 2.593919277191162,
"learning_rate": 8.690374858195868e-07,
"loss": 0.8659783601760864,
"step": 2210
},
{
"epoch": 4.666666666666667,
"grad_norm": 2.051456928253174,
"learning_rate": 8.662220706067007e-07,
"loss": 0.7441516518592834,
"step": 2212
},
{
"epoch": 4.670886075949367,
"grad_norm": 9.695352554321289,
"learning_rate": 8.634138730119199e-07,
"loss": 0.6046957969665527,
"step": 2214
},
{
"epoch": 4.675105485232067,
"grad_norm": 3.217013359069824,
"learning_rate": 8.606129082315514e-07,
"loss": 0.8700679540634155,
"step": 2216
},
{
"epoch": 4.679324894514768,
"grad_norm": 2.5168628692626953,
"learning_rate": 8.578191914227602e-07,
"loss": 0.5581780076026917,
"step": 2218
},
{
"epoch": 4.6835443037974684,
"grad_norm": 3.7480080127716064,
"learning_rate": 8.550327377034915e-07,
"loss": 0.7154510617256165,
"step": 2220
},
{
"epoch": 4.687763713080169,
"grad_norm": 2.187389373779297,
"learning_rate": 8.522535621523864e-07,
"loss": 0.17346470057964325,
"step": 2222
},
{
"epoch": 4.691983122362869,
"grad_norm": 2.2572085857391357,
"learning_rate": 8.494816798087014e-07,
"loss": 0.8721593618392944,
"step": 2224
},
{
"epoch": 4.69620253164557,
"grad_norm": 8.44543170928955,
"learning_rate": 8.467171056722262e-07,
"loss": 0.5838876962661743,
"step": 2226
},
{
"epoch": 4.70042194092827,
"grad_norm": 10.777728080749512,
"learning_rate": 8.439598547032021e-07,
"loss": 0.15432819724082947,
"step": 2228
},
{
"epoch": 4.70464135021097,
"grad_norm": 2.5708587169647217,
"learning_rate": 8.412099418222429e-07,
"loss": 0.8907821178436279,
"step": 2230
},
{
"epoch": 4.708860759493671,
"grad_norm": 1.8050702810287476,
"learning_rate": 8.384673819102515e-07,
"loss": 0.8190984725952148,
"step": 2232
},
{
"epoch": 4.713080168776371,
"grad_norm": 13.505372047424316,
"learning_rate": 8.357321898083417e-07,
"loss": 0.5908716917037964,
"step": 2234
},
{
"epoch": 4.717299578059071,
"grad_norm": 4.608894348144531,
"learning_rate": 8.330043803177576e-07,
"loss": 0.43208563327789307,
"step": 2236
},
{
"epoch": 4.7215189873417724,
"grad_norm": 6.133680820465088,
"learning_rate": 8.302839681997924e-07,
"loss": 0.7111215591430664,
"step": 2238
},
{
"epoch": 4.725738396624473,
"grad_norm": 1.9396830797195435,
"learning_rate": 8.275709681757091e-07,
"loss": 0.8701183795928955,
"step": 2240
},
{
"epoch": 4.729957805907173,
"grad_norm": 2.7942826747894287,
"learning_rate": 8.248653949266609e-07,
"loss": 0.9508087635040283,
"step": 2242
},
{
"epoch": 4.734177215189874,
"grad_norm": 1.77509343624115,
"learning_rate": 8.221672630936114e-07,
"loss": 0.14094747602939606,
"step": 2244
},
{
"epoch": 4.738396624472574,
"grad_norm": 9.949209213256836,
"learning_rate": 8.194765872772569e-07,
"loss": 0.7157829999923706,
"step": 2246
},
{
"epoch": 4.742616033755274,
"grad_norm": 3.089747667312622,
"learning_rate": 8.167933820379438e-07,
"loss": 0.9330974817276001,
"step": 2248
},
{
"epoch": 4.746835443037975,
"grad_norm": 8.302231788635254,
"learning_rate": 8.141176618955941e-07,
"loss": 0.18974465131759644,
"step": 2250
},
{
"epoch": 4.751054852320675,
"grad_norm": 17.27684211730957,
"learning_rate": 8.114494413296242e-07,
"loss": 0.6534916162490845,
"step": 2252
},
{
"epoch": 4.755274261603375,
"grad_norm": 13.299623489379883,
"learning_rate": 8.087887347788675e-07,
"loss": 0.5243600606918335,
"step": 2254
},
{
"epoch": 4.759493670886076,
"grad_norm": 1.3798922300338745,
"learning_rate": 8.061355566414959e-07,
"loss": 0.46594005823135376,
"step": 2256
},
{
"epoch": 4.763713080168777,
"grad_norm": 6.182672023773193,
"learning_rate": 8.034899212749415e-07,
"loss": 0.22735753655433655,
"step": 2258
},
{
"epoch": 4.767932489451477,
"grad_norm": 4.455085277557373,
"learning_rate": 8.0085184299582e-07,
"loss": 0.22588486969470978,
"step": 2260
},
{
"epoch": 4.772151898734177,
"grad_norm": 2.398963212966919,
"learning_rate": 7.982213360798524e-07,
"loss": 0.5842011570930481,
"step": 2262
},
{
"epoch": 4.776371308016878,
"grad_norm": 3.986417055130005,
"learning_rate": 7.955984147617878e-07,
"loss": 0.8581550121307373,
"step": 2264
},
{
"epoch": 4.780590717299578,
"grad_norm": 2.5186336040496826,
"learning_rate": 7.929830932353267e-07,
"loss": 0.9678604602813721,
"step": 2266
},
{
"epoch": 4.784810126582278,
"grad_norm": 17.917510986328125,
"learning_rate": 7.903753856530439e-07,
"loss": 0.776985764503479,
"step": 2268
},
{
"epoch": 4.789029535864979,
"grad_norm": 4.219602108001709,
"learning_rate": 7.877753061263124e-07,
"loss": 0.49661773443222046,
"step": 2270
},
{
"epoch": 4.793248945147679,
"grad_norm": 2.524501323699951,
"learning_rate": 7.851828687252258e-07,
"loss": 0.9214498996734619,
"step": 2272
},
{
"epoch": 4.7974683544303796,
"grad_norm": 24.021936416625977,
"learning_rate": 7.825980874785245e-07,
"loss": 0.2861242890357971,
"step": 2274
},
{
"epoch": 4.80168776371308,
"grad_norm": 1.766944169998169,
"learning_rate": 7.800209763735166e-07,
"loss": 0.2682395279407501,
"step": 2276
},
{
"epoch": 4.805907172995781,
"grad_norm": 3.6635119915008545,
"learning_rate": 7.774515493560047e-07,
"loss": 0.5065731406211853,
"step": 2278
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.9169036746025085,
"learning_rate": 7.748898203302101e-07,
"loss": 0.4213840365409851,
"step": 2280
},
{
"epoch": 4.814345991561181,
"grad_norm": 2.111497402191162,
"learning_rate": 7.723358031586968e-07,
"loss": 0.8279630541801453,
"step": 2282
},
{
"epoch": 4.818565400843882,
"grad_norm": 3.6885154247283936,
"learning_rate": 7.697895116622962e-07,
"loss": 0.721439003944397,
"step": 2284
},
{
"epoch": 4.822784810126582,
"grad_norm": 4.22064733505249,
"learning_rate": 7.672509596200339e-07,
"loss": 0.8761791586875916,
"step": 2286
},
{
"epoch": 4.827004219409282,
"grad_norm": 2.2504615783691406,
"learning_rate": 7.647201607690535e-07,
"loss": 0.43095457553863525,
"step": 2288
},
{
"epoch": 4.831223628691983,
"grad_norm": 2.19746470451355,
"learning_rate": 7.621971288045436e-07,
"loss": 0.7216506004333496,
"step": 2290
},
{
"epoch": 4.8354430379746836,
"grad_norm": 2.588840961456299,
"learning_rate": 7.596818773796616e-07,
"loss": 0.8444218039512634,
"step": 2292
},
{
"epoch": 4.839662447257384,
"grad_norm": 2.1437089443206787,
"learning_rate": 7.571744201054619e-07,
"loss": 0.9132941961288452,
"step": 2294
},
{
"epoch": 4.843881856540085,
"grad_norm": 2.2970213890075684,
"learning_rate": 7.54674770550823e-07,
"loss": 0.8675155639648438,
"step": 2296
},
{
"epoch": 4.848101265822785,
"grad_norm": 2.469003438949585,
"learning_rate": 7.521829422423707e-07,
"loss": 0.8924763202667236,
"step": 2298
},
{
"epoch": 4.852320675105485,
"grad_norm": 5.6491169929504395,
"learning_rate": 7.496989486644074e-07,
"loss": 1.2289131879806519,
"step": 2300
},
{
"epoch": 4.856540084388186,
"grad_norm": 0.6651078462600708,
"learning_rate": 7.472228032588392e-07,
"loss": 0.5435088872909546,
"step": 2302
},
{
"epoch": 4.860759493670886,
"grad_norm": 1.8895771503448486,
"learning_rate": 7.447545194251021e-07,
"loss": 0.4832010865211487,
"step": 2304
},
{
"epoch": 4.864978902953586,
"grad_norm": 4.667498588562012,
"learning_rate": 7.422941105200888e-07,
"loss": 0.7593515515327454,
"step": 2306
},
{
"epoch": 4.869198312236287,
"grad_norm": 2.6413588523864746,
"learning_rate": 7.398415898580795e-07,
"loss": 0.5025730729103088,
"step": 2308
},
{
"epoch": 4.8734177215189876,
"grad_norm": 2.2257080078125,
"learning_rate": 7.373969707106667e-07,
"loss": 0.5178145170211792,
"step": 2310
},
{
"epoch": 4.877637130801688,
"grad_norm": 4.63566255569458,
"learning_rate": 7.349602663066848e-07,
"loss": 0.8785790801048279,
"step": 2312
},
{
"epoch": 4.881856540084388,
"grad_norm": 11.207052230834961,
"learning_rate": 7.325314898321387e-07,
"loss": 0.6604704260826111,
"step": 2314
},
{
"epoch": 4.886075949367089,
"grad_norm": 2.7186286449432373,
"learning_rate": 7.30110654430131e-07,
"loss": 0.8655844330787659,
"step": 2316
},
{
"epoch": 4.890295358649789,
"grad_norm": 9.436038970947266,
"learning_rate": 7.276977732007934e-07,
"loss": 0.6372033357620239,
"step": 2318
},
{
"epoch": 4.894514767932489,
"grad_norm": 9.619095802307129,
"learning_rate": 7.252928592012131e-07,
"loss": 0.5399308204650879,
"step": 2320
},
{
"epoch": 4.89873417721519,
"grad_norm": 3.560415267944336,
"learning_rate": 7.228959254453634e-07,
"loss": 0.5512664318084717,
"step": 2322
},
{
"epoch": 4.90295358649789,
"grad_norm": 2.261822462081909,
"learning_rate": 7.20506984904034e-07,
"loss": 0.965155839920044,
"step": 2324
},
{
"epoch": 4.9071729957805905,
"grad_norm": 5.737890243530273,
"learning_rate": 7.181260505047593e-07,
"loss": 0.5091350078582764,
"step": 2326
},
{
"epoch": 4.911392405063291,
"grad_norm": 2.460875988006592,
"learning_rate": 7.157531351317499e-07,
"loss": 0.6960829496383667,
"step": 2328
},
{
"epoch": 4.915611814345992,
"grad_norm": 2.570103883743286,
"learning_rate": 7.133882516258215e-07,
"loss": 1.0476431846618652,
"step": 2330
},
{
"epoch": 4.919831223628692,
"grad_norm": 0.8946544528007507,
"learning_rate": 7.110314127843266e-07,
"loss": 0.5339324474334717,
"step": 2332
},
{
"epoch": 4.924050632911392,
"grad_norm": 6.283257007598877,
"learning_rate": 7.086826313610843e-07,
"loss": 0.6191664934158325,
"step": 2334
},
{
"epoch": 4.928270042194093,
"grad_norm": 2.9751780033111572,
"learning_rate": 7.063419200663121e-07,
"loss": 0.9971131086349487,
"step": 2336
},
{
"epoch": 4.932489451476793,
"grad_norm": 30.684070587158203,
"learning_rate": 7.040092915665563e-07,
"loss": 0.5671279430389404,
"step": 2338
},
{
"epoch": 4.936708860759493,
"grad_norm": 3.855710506439209,
"learning_rate": 7.016847584846243e-07,
"loss": 0.5699124336242676,
"step": 2340
},
{
"epoch": 4.940928270042194,
"grad_norm": 5.847226142883301,
"learning_rate": 6.993683333995155e-07,
"loss": 0.8012879490852356,
"step": 2342
},
{
"epoch": 4.9451476793248945,
"grad_norm": 6.018973350524902,
"learning_rate": 6.970600288463544e-07,
"loss": 0.5165205597877502,
"step": 2344
},
{
"epoch": 4.949367088607595,
"grad_norm": 2.1352529525756836,
"learning_rate": 6.947598573163207e-07,
"loss": 0.9921296834945679,
"step": 2346
},
{
"epoch": 4.953586497890296,
"grad_norm": 2.2737410068511963,
"learning_rate": 6.924678312565846e-07,
"loss": 0.5466551780700684,
"step": 2348
},
{
"epoch": 4.957805907172996,
"grad_norm": 2.052476167678833,
"learning_rate": 6.901839630702358e-07,
"loss": 0.7028835415840149,
"step": 2350
},
{
"epoch": 4.962025316455696,
"grad_norm": 5.379444599151611,
"learning_rate": 6.879082651162198e-07,
"loss": 0.4037717580795288,
"step": 2352
},
{
"epoch": 4.966244725738397,
"grad_norm": 2.764251470565796,
"learning_rate": 6.856407497092698e-07,
"loss": 0.8569744825363159,
"step": 2354
},
{
"epoch": 4.970464135021097,
"grad_norm": 11.355871200561523,
"learning_rate": 6.833814291198395e-07,
"loss": 0.5073586106300354,
"step": 2356
},
{
"epoch": 4.974683544303797,
"grad_norm": 3.204270124435425,
"learning_rate": 6.811303155740364e-07,
"loss": 0.8562701344490051,
"step": 2358
},
{
"epoch": 4.978902953586498,
"grad_norm": 0.8829920887947083,
"learning_rate": 6.788874212535576e-07,
"loss": 0.5263558626174927,
"step": 2360
},
{
"epoch": 4.9831223628691985,
"grad_norm": 4.092447757720947,
"learning_rate": 6.766527582956217e-07,
"loss": 0.8253353238105774,
"step": 2362
},
{
"epoch": 4.987341772151899,
"grad_norm": 5.725378513336182,
"learning_rate": 6.744263387929044e-07,
"loss": 1.0841920375823975,
"step": 2364
},
{
"epoch": 4.991561181434599,
"grad_norm": 2.338805913925171,
"learning_rate": 6.722081747934722e-07,
"loss": 0.9890093803405762,
"step": 2366
},
{
"epoch": 4.9957805907173,
"grad_norm": 6.7110137939453125,
"learning_rate": 6.699982783007181e-07,
"loss": 0.6056183576583862,
"step": 2368
},
{
"epoch": 5.0,
"grad_norm": 8.881539344787598,
"learning_rate": 6.677966612732969e-07,
"loss": 0.19234615564346313,
"step": 2370
},
{
"epoch": 5.0042194092827,
"grad_norm": 2.8525843620300293,
"learning_rate": 6.656033356250588e-07,
"loss": 0.5725005865097046,
"step": 2372
},
{
"epoch": 5.008438818565401,
"grad_norm": 2.8133490085601807,
"learning_rate": 6.634183132249862e-07,
"loss": 0.930966317653656,
"step": 2374
},
{
"epoch": 5.012658227848101,
"grad_norm": 6.63195276260376,
"learning_rate": 6.612416058971295e-07,
"loss": 0.6420108079910278,
"step": 2376
},
{
"epoch": 5.0168776371308015,
"grad_norm": 3.1725778579711914,
"learning_rate": 6.590732254205429e-07,
"loss": 0.8284573554992676,
"step": 2378
},
{
"epoch": 5.0210970464135025,
"grad_norm": 4.44754695892334,
"learning_rate": 6.569131835292196e-07,
"loss": 0.48461687564849854,
"step": 2380
},
{
"epoch": 5.025316455696203,
"grad_norm": 3.432302474975586,
"learning_rate": 6.547614919120305e-07,
"loss": 0.7024222016334534,
"step": 2382
},
{
"epoch": 5.029535864978903,
"grad_norm": 2.8031928539276123,
"learning_rate": 6.526181622126594e-07,
"loss": 0.6068092584609985,
"step": 2384
},
{
"epoch": 5.033755274261603,
"grad_norm": 6.872043132781982,
"learning_rate": 6.504832060295403e-07,
"loss": 0.5841951370239258,
"step": 2386
},
{
"epoch": 5.037974683544304,
"grad_norm": 2.907249927520752,
"learning_rate": 6.483566349157945e-07,
"loss": 0.6709692478179932,
"step": 2388
},
{
"epoch": 5.042194092827004,
"grad_norm": 0.09023797512054443,
"learning_rate": 6.462384603791684e-07,
"loss": 0.5061817765235901,
"step": 2390
},
{
"epoch": 5.046413502109704,
"grad_norm": 11.977813720703125,
"learning_rate": 6.441286938819714e-07,
"loss": 0.31504881381988525,
"step": 2392
},
{
"epoch": 5.050632911392405,
"grad_norm": 5.734729766845703,
"learning_rate": 6.420273468410131e-07,
"loss": 0.48150938749313354,
"step": 2394
},
{
"epoch": 5.0548523206751055,
"grad_norm": 1.924585223197937,
"learning_rate": 6.399344306275419e-07,
"loss": 0.3540734052658081,
"step": 2396
},
{
"epoch": 5.059071729957806,
"grad_norm": 3.9746241569519043,
"learning_rate": 6.378499565671839e-07,
"loss": 0.8421119451522827,
"step": 2398
},
{
"epoch": 5.063291139240507,
"grad_norm": 7.219764232635498,
"learning_rate": 6.35773935939881e-07,
"loss": 1.0435487031936646,
"step": 2400
},
{
"epoch": 5.067510548523207,
"grad_norm": 5.04371976852417,
"learning_rate": 6.337063799798305e-07,
"loss": 0.9782629013061523,
"step": 2402
},
{
"epoch": 5.071729957805907,
"grad_norm": 16.103092193603516,
"learning_rate": 6.316472998754234e-07,
"loss": 0.033330727368593216,
"step": 2404
},
{
"epoch": 5.075949367088608,
"grad_norm": 4.6494526863098145,
"learning_rate": 6.29596706769185e-07,
"loss": 0.7846492528915405,
"step": 2406
},
{
"epoch": 5.080168776371308,
"grad_norm": 2.9837498664855957,
"learning_rate": 6.275546117577132e-07,
"loss": 0.48354560136795044,
"step": 2408
},
{
"epoch": 5.084388185654008,
"grad_norm": 5.440659999847412,
"learning_rate": 6.255210258916199e-07,
"loss": 0.5124998688697815,
"step": 2410
},
{
"epoch": 5.0886075949367084,
"grad_norm": 2.850675106048584,
"learning_rate": 6.234959601754703e-07,
"loss": 0.7655423879623413,
"step": 2412
},
{
"epoch": 5.0928270042194095,
"grad_norm": 2.1231069564819336,
"learning_rate": 6.214794255677234e-07,
"loss": 0.7977665662765503,
"step": 2414
},
{
"epoch": 5.09704641350211,
"grad_norm": 17.62323760986328,
"learning_rate": 6.194714329806732e-07,
"loss": 0.34903600811958313,
"step": 2416
},
{
"epoch": 5.10126582278481,
"grad_norm": 2.3470253944396973,
"learning_rate": 6.174719932803891e-07,
"loss": 0.5935072898864746,
"step": 2418
},
{
"epoch": 5.105485232067511,
"grad_norm": 2.2750778198242188,
"learning_rate": 6.154811172866576e-07,
"loss": 1.007997751235962,
"step": 2420
},
{
"epoch": 5.109704641350211,
"grad_norm": 2.6111321449279785,
"learning_rate": 6.13498815772923e-07,
"loss": 0.7840423583984375,
"step": 2422
},
{
"epoch": 5.113924050632911,
"grad_norm": 1.3846306800842285,
"learning_rate": 6.115250994662303e-07,
"loss": 0.5133131742477417,
"step": 2424
},
{
"epoch": 5.118143459915612,
"grad_norm": 2.471632480621338,
"learning_rate": 6.095599790471655e-07,
"loss": 0.5239850282669067,
"step": 2426
},
{
"epoch": 5.122362869198312,
"grad_norm": 6.427463054656982,
"learning_rate": 6.076034651497995e-07,
"loss": 0.46869874000549316,
"step": 2428
},
{
"epoch": 5.1265822784810124,
"grad_norm": 2.414717674255371,
"learning_rate": 6.056555683616291e-07,
"loss": 0.5103088617324829,
"step": 2430
},
{
"epoch": 5.1308016877637135,
"grad_norm": 2.512148380279541,
"learning_rate": 6.037162992235214e-07,
"loss": 0.8223515152931213,
"step": 2432
},
{
"epoch": 5.135021097046414,
"grad_norm": 2.9548940658569336,
"learning_rate": 6.017856682296551e-07,
"loss": 0.917111873626709,
"step": 2434
},
{
"epoch": 5.139240506329114,
"grad_norm": 3.2818551063537598,
"learning_rate": 5.998636858274642e-07,
"loss": 0.4495956301689148,
"step": 2436
},
{
"epoch": 5.143459915611814,
"grad_norm": 7.414487838745117,
"learning_rate": 5.97950362417582e-07,
"loss": 0.10738074779510498,
"step": 2438
},
{
"epoch": 5.147679324894515,
"grad_norm": 3.5307252407073975,
"learning_rate": 5.960457083537848e-07,
"loss": 0.6862280368804932,
"step": 2440
},
{
"epoch": 5.151898734177215,
"grad_norm": 5.234355449676514,
"learning_rate": 5.941497339429337e-07,
"loss": 0.790778636932373,
"step": 2442
},
{
"epoch": 5.156118143459915,
"grad_norm": 9.668578147888184,
"learning_rate": 5.922624494449232e-07,
"loss": 0.44245994091033936,
"step": 2444
},
{
"epoch": 5.160337552742616,
"grad_norm": 2.4678266048431396,
"learning_rate": 5.903838650726219e-07,
"loss": 0.9481706023216248,
"step": 2446
},
{
"epoch": 5.1645569620253164,
"grad_norm": 6.675557613372803,
"learning_rate": 5.885139909918178e-07,
"loss": 0.5106003284454346,
"step": 2448
},
{
"epoch": 5.168776371308017,
"grad_norm": 2.8948278427124023,
"learning_rate": 5.866528373211652e-07,
"loss": 0.818520188331604,
"step": 2450
},
{
"epoch": 5.172995780590718,
"grad_norm": 0.031267765909433365,
"learning_rate": 5.848004141321279e-07,
"loss": 0.4252956509590149,
"step": 2452
},
{
"epoch": 5.177215189873418,
"grad_norm": 5.288304805755615,
"learning_rate": 5.82956731448926e-07,
"loss": 0.17302009463310242,
"step": 2454
},
{
"epoch": 5.181434599156118,
"grad_norm": 2.205019950866699,
"learning_rate": 5.811217992484801e-07,
"loss": 0.44998836517333984,
"step": 2456
},
{
"epoch": 5.185654008438819,
"grad_norm": 2.3904027938842773,
"learning_rate": 5.792956274603598e-07,
"loss": 0.5072075128555298,
"step": 2458
},
{
"epoch": 5.189873417721519,
"grad_norm": 10.959815979003906,
"learning_rate": 5.774782259667278e-07,
"loss": 0.5302789807319641,
"step": 2460
},
{
"epoch": 5.194092827004219,
"grad_norm": 3.0402653217315674,
"learning_rate": 5.756696046022868e-07,
"loss": 0.8277729749679565,
"step": 2462
},
{
"epoch": 5.198312236286919,
"grad_norm": 1.877632737159729,
"learning_rate": 5.738697731542275e-07,
"loss": 0.8515483736991882,
"step": 2464
},
{
"epoch": 5.2025316455696204,
"grad_norm": 3.8875975608825684,
"learning_rate": 5.720787413621739e-07,
"loss": 0.3267098069190979,
"step": 2466
},
{
"epoch": 5.206751054852321,
"grad_norm": 2.2627320289611816,
"learning_rate": 5.702965189181324e-07,
"loss": 0.786805272102356,
"step": 2468
},
{
"epoch": 5.210970464135021,
"grad_norm": 6.892368793487549,
"learning_rate": 5.685231154664372e-07,
"loss": 0.6648309826850891,
"step": 2470
},
{
"epoch": 5.215189873417722,
"grad_norm": 3.592425584793091,
"learning_rate": 5.667585406036999e-07,
"loss": 0.6738979816436768,
"step": 2472
},
{
"epoch": 5.219409282700422,
"grad_norm": 4.459148406982422,
"learning_rate": 5.650028038787577e-07,
"loss": 0.7590001821517944,
"step": 2474
},
{
"epoch": 5.223628691983122,
"grad_norm": 2.538756847381592,
"learning_rate": 5.632559147926202e-07,
"loss": 0.42987027764320374,
"step": 2476
},
{
"epoch": 5.227848101265823,
"grad_norm": 2.191638946533203,
"learning_rate": 5.615178827984186e-07,
"loss": 0.0880412608385086,
"step": 2478
},
{
"epoch": 5.232067510548523,
"grad_norm": 0.6364640593528748,
"learning_rate": 5.597887173013555e-07,
"loss": 0.48929768800735474,
"step": 2480
},
{
"epoch": 5.236286919831223,
"grad_norm": 3.1823930740356445,
"learning_rate": 5.580684276586535e-07,
"loss": 0.7606073617935181,
"step": 2482
},
{
"epoch": 5.2405063291139244,
"grad_norm": 3.0193521976470947,
"learning_rate": 5.563570231795027e-07,
"loss": 0.4337414503097534,
"step": 2484
},
{
"epoch": 5.244725738396625,
"grad_norm": 5.9851298332214355,
"learning_rate": 5.546545131250133e-07,
"loss": 1.1480921506881714,
"step": 2486
},
{
"epoch": 5.248945147679325,
"grad_norm": 2.274847984313965,
"learning_rate": 5.52960906708164e-07,
"loss": 0.8605833053588867,
"step": 2488
},
{
"epoch": 5.253164556962025,
"grad_norm": 4.390803813934326,
"learning_rate": 5.512762130937521e-07,
"loss": 0.891315221786499,
"step": 2490
},
{
"epoch": 5.257383966244726,
"grad_norm": 3.777196168899536,
"learning_rate": 5.496004413983437e-07,
"loss": 0.9285299777984619,
"step": 2492
},
{
"epoch": 5.261603375527426,
"grad_norm": 2.7875325679779053,
"learning_rate": 5.479336006902255e-07,
"loss": 0.6960370540618896,
"step": 2494
},
{
"epoch": 5.265822784810126,
"grad_norm": 5.017436981201172,
"learning_rate": 5.462756999893543e-07,
"loss": 0.42756134271621704,
"step": 2496
},
{
"epoch": 5.270042194092827,
"grad_norm": 4.370994567871094,
"learning_rate": 5.446267482673096e-07,
"loss": 0.9004020690917969,
"step": 2498
},
{
"epoch": 5.274261603375527,
"grad_norm": 8.342371940612793,
"learning_rate": 5.429867544472434e-07,
"loss": 0.49218082427978516,
"step": 2500
},
{
"epoch": 5.2784810126582276,
"grad_norm": 4.241844654083252,
"learning_rate": 5.413557274038332e-07,
"loss": 0.6770671606063843,
"step": 2502
},
{
"epoch": 5.282700421940929,
"grad_norm": 2.6205804347991943,
"learning_rate": 5.397336759632338e-07,
"loss": 0.660459041595459,
"step": 2504
},
{
"epoch": 5.286919831223629,
"grad_norm": 8.33484935760498,
"learning_rate": 5.381206089030293e-07,
"loss": 0.9731260538101196,
"step": 2506
},
{
"epoch": 5.291139240506329,
"grad_norm": 2.1884765625,
"learning_rate": 5.365165349521859e-07,
"loss": 0.9394969940185547,
"step": 2508
},
{
"epoch": 5.29535864978903,
"grad_norm": 13.115405082702637,
"learning_rate": 5.349214627910034e-07,
"loss": 0.3471090793609619,
"step": 2510
},
{
"epoch": 5.29957805907173,
"grad_norm": 4.7298970222473145,
"learning_rate": 5.333354010510703e-07,
"loss": 0.49661415815353394,
"step": 2512
},
{
"epoch": 5.30379746835443,
"grad_norm": 8.904556274414062,
"learning_rate": 5.31758358315216e-07,
"loss": 0.9580909609794617,
"step": 2514
},
{
"epoch": 5.308016877637131,
"grad_norm": 4.521732807159424,
"learning_rate": 5.301903431174628e-07,
"loss": 0.6797637939453125,
"step": 2516
},
{
"epoch": 5.312236286919831,
"grad_norm": 3.3015851974487305,
"learning_rate": 5.286313639429837e-07,
"loss": 0.8633707761764526,
"step": 2518
},
{
"epoch": 5.3164556962025316,
"grad_norm": 6.265556812286377,
"learning_rate": 5.270814292280526e-07,
"loss": 0.9207254648208618,
"step": 2520
},
{
"epoch": 5.320675105485232,
"grad_norm": 2.132657051086426,
"learning_rate": 5.255405473600001e-07,
"loss": 0.8656923174858093,
"step": 2522
},
{
"epoch": 5.324894514767933,
"grad_norm": 3.075263738632202,
"learning_rate": 5.240087266771686e-07,
"loss": 0.8665053844451904,
"step": 2524
},
{
"epoch": 5.329113924050633,
"grad_norm": 25.491024017333984,
"learning_rate": 5.22485975468867e-07,
"loss": 0.9272741675376892,
"step": 2526
},
{
"epoch": 5.333333333333333,
"grad_norm": 5.792654514312744,
"learning_rate": 5.209723019753245e-07,
"loss": 0.6649227142333984,
"step": 2528
},
{
"epoch": 5.337552742616034,
"grad_norm": 3.785661220550537,
"learning_rate": 5.19467714387648e-07,
"loss": 0.7637553215026855,
"step": 2530
},
{
"epoch": 5.341772151898734,
"grad_norm": 3.9980387687683105,
"learning_rate": 5.179722208477764e-07,
"loss": 0.8297359347343445,
"step": 2532
},
{
"epoch": 5.345991561181434,
"grad_norm": 3.7611520290374756,
"learning_rate": 5.164858294484372e-07,
"loss": 0.5959780216217041,
"step": 2534
},
{
"epoch": 5.350210970464135,
"grad_norm": 3.5796737670898438,
"learning_rate": 5.150085482331025e-07,
"loss": 0.8286501169204712,
"step": 2536
},
{
"epoch": 5.3544303797468356,
"grad_norm": 3.5265612602233887,
"learning_rate": 5.135403851959455e-07,
"loss": 0.7233340740203857,
"step": 2538
},
{
"epoch": 5.358649789029536,
"grad_norm": 3.3139536380767822,
"learning_rate": 5.120813482817971e-07,
"loss": 0.5095676183700562,
"step": 2540
},
{
"epoch": 5.362869198312236,
"grad_norm": 4.816203594207764,
"learning_rate": 5.106314453861031e-07,
"loss": 0.10940656065940857,
"step": 2542
},
{
"epoch": 5.367088607594937,
"grad_norm": 0.789928138256073,
"learning_rate": 5.091906843548809e-07,
"loss": 0.4012370705604553,
"step": 2544
},
{
"epoch": 5.371308016877637,
"grad_norm": 6.561746120452881,
"learning_rate": 5.077590729846782e-07,
"loss": 0.6537183523178101,
"step": 2546
},
{
"epoch": 5.375527426160337,
"grad_norm": 2.8327221870422363,
"learning_rate": 5.063366190225298e-07,
"loss": 0.8231172561645508,
"step": 2548
},
{
"epoch": 5.379746835443038,
"grad_norm": 8.452791213989258,
"learning_rate": 5.049233301659161e-07,
"loss": 0.5680804252624512,
"step": 2550
},
{
"epoch": 5.383966244725738,
"grad_norm": 3.7673988342285156,
"learning_rate": 5.035192140627213e-07,
"loss": 0.1833023726940155,
"step": 2552
},
{
"epoch": 5.3881856540084385,
"grad_norm": 2.091782569885254,
"learning_rate": 5.021242783111924e-07,
"loss": 0.7948375344276428,
"step": 2554
},
{
"epoch": 5.3924050632911396,
"grad_norm": 14.723713874816895,
"learning_rate": 5.007385304598978e-07,
"loss": 0.6941039562225342,
"step": 2556
},
{
"epoch": 5.39662447257384,
"grad_norm": 14.07388973236084,
"learning_rate": 4.993619780076855e-07,
"loss": 0.43440479040145874,
"step": 2558
},
{
"epoch": 5.40084388185654,
"grad_norm": 7.218398094177246,
"learning_rate": 4.979946284036441e-07,
"loss": 0.21915487945079803,
"step": 2560
},
{
"epoch": 5.405063291139241,
"grad_norm": 4.034780979156494,
"learning_rate": 4.966364890470618e-07,
"loss": 0.547726571559906,
"step": 2562
},
{
"epoch": 5.409282700421941,
"grad_norm": 2.1914002895355225,
"learning_rate": 4.952875672873867e-07,
"loss": 0.9137965440750122,
"step": 2564
},
{
"epoch": 5.413502109704641,
"grad_norm": 2.9248530864715576,
"learning_rate": 4.939478704241859e-07,
"loss": 0.4639781713485718,
"step": 2566
},
{
"epoch": 5.417721518987342,
"grad_norm": 3.954902172088623,
"learning_rate": 4.926174057071077e-07,
"loss": 0.7315584421157837,
"step": 2568
},
{
"epoch": 5.421940928270042,
"grad_norm": 3.5057809352874756,
"learning_rate": 4.912961803358409e-07,
"loss": 0.17236268520355225,
"step": 2570
},
{
"epoch": 5.4261603375527425,
"grad_norm": 7.570180416107178,
"learning_rate": 4.899842014600768e-07,
"loss": 0.542130708694458,
"step": 2572
},
{
"epoch": 5.430379746835443,
"grad_norm": 0.8830806612968445,
"learning_rate": 4.886814761794694e-07,
"loss": 0.08617094159126282,
"step": 2574
},
{
"epoch": 5.434599156118144,
"grad_norm": 2.9515278339385986,
"learning_rate": 4.873880115435982e-07,
"loss": 0.6731958389282227,
"step": 2576
},
{
"epoch": 5.438818565400844,
"grad_norm": 14.474751472473145,
"learning_rate": 4.861038145519302e-07,
"loss": 0.8146198987960815,
"step": 2578
},
{
"epoch": 5.443037974683544,
"grad_norm": 4.252223968505859,
"learning_rate": 4.848288921537804e-07,
"loss": 0.7910962104797363,
"step": 2580
},
{
"epoch": 5.447257383966245,
"grad_norm": 3.409487009048462,
"learning_rate": 4.835632512482754e-07,
"loss": 0.4601414203643799,
"step": 2582
},
{
"epoch": 5.451476793248945,
"grad_norm": 2.4231197834014893,
"learning_rate": 4.823068986843162e-07,
"loss": 0.5326846837997437,
"step": 2584
},
{
"epoch": 5.455696202531645,
"grad_norm": 11.150148391723633,
"learning_rate": 4.810598412605407e-07,
"loss": 0.6682008504867554,
"step": 2586
},
{
"epoch": 5.459915611814346,
"grad_norm": 1.4135065078735352,
"learning_rate": 4.798220857252866e-07,
"loss": 0.30620691180229187,
"step": 2588
},
{
"epoch": 5.4641350210970465,
"grad_norm": 6.7066755294799805,
"learning_rate": 4.785936387765555e-07,
"loss": 0.7434167861938477,
"step": 2590
},
{
"epoch": 5.468354430379747,
"grad_norm": 1.7110621929168701,
"learning_rate": 4.773745070619767e-07,
"loss": 0.5532716512680054,
"step": 2592
},
{
"epoch": 5.472573839662447,
"grad_norm": 3.8936519622802734,
"learning_rate": 4.761646971787707e-07,
"loss": 0.47537532448768616,
"step": 2594
},
{
"epoch": 5.476793248945148,
"grad_norm": 4.289298057556152,
"learning_rate": 4.749642156737138e-07,
"loss": 0.34944185614585876,
"step": 2596
},
{
"epoch": 5.481012658227848,
"grad_norm": 2.747558116912842,
"learning_rate": 4.7377306904310233e-07,
"loss": 0.16377092897891998,
"step": 2598
},
{
"epoch": 5.485232067510548,
"grad_norm": 0.3106602132320404,
"learning_rate": 4.7259126373271865e-07,
"loss": 0.42584800720214844,
"step": 2600
},
{
"epoch": 5.489451476793249,
"grad_norm": 5.100452899932861,
"learning_rate": 4.714188061377942e-07,
"loss": 0.8994771242141724,
"step": 2602
},
{
"epoch": 5.493670886075949,
"grad_norm": 7.287261962890625,
"learning_rate": 4.7025570260297703e-07,
"loss": 0.8067635297775269,
"step": 2604
},
{
"epoch": 5.4978902953586495,
"grad_norm": 15.138601303100586,
"learning_rate": 4.6910195942229627e-07,
"loss": 0.13593333959579468,
"step": 2606
},
{
"epoch": 5.5021097046413505,
"grad_norm": 2.714247226715088,
"learning_rate": 4.6795758283912836e-07,
"loss": 0.3896440267562866,
"step": 2608
},
{
"epoch": 5.506329113924051,
"grad_norm": 3.3304672241210938,
"learning_rate": 4.668225790461631e-07,
"loss": 0.0639631599187851,
"step": 2610
},
{
"epoch": 5.510548523206751,
"grad_norm": 19.265941619873047,
"learning_rate": 4.6569695418537063e-07,
"loss": 0.2734604477882385,
"step": 2612
},
{
"epoch": 5.514767932489452,
"grad_norm": 2.5918619632720947,
"learning_rate": 4.645807143479674e-07,
"loss": 0.8366518616676331,
"step": 2614
},
{
"epoch": 5.518987341772152,
"grad_norm": 1.1537539958953857,
"learning_rate": 4.634738655743843e-07,
"loss": 0.4462703466415405,
"step": 2616
},
{
"epoch": 5.523206751054852,
"grad_norm": 2.6978986263275146,
"learning_rate": 4.6237641385423225e-07,
"loss": 0.4549875259399414,
"step": 2618
},
{
"epoch": 5.527426160337553,
"grad_norm": 2.130697727203369,
"learning_rate": 4.6128836512627204e-07,
"loss": 0.8581835627555847,
"step": 2620
},
{
"epoch": 5.531645569620253,
"grad_norm": 4.347284317016602,
"learning_rate": 4.602097252783805e-07,
"loss": 0.5586264133453369,
"step": 2622
},
{
"epoch": 5.5358649789029535,
"grad_norm": 14.522599220275879,
"learning_rate": 4.591405001475189e-07,
"loss": 0.8266869783401489,
"step": 2624
},
{
"epoch": 5.540084388185654,
"grad_norm": 7.911047458648682,
"learning_rate": 4.58080695519702e-07,
"loss": 0.44375789165496826,
"step": 2626
},
{
"epoch": 5.544303797468355,
"grad_norm": 4.837867736816406,
"learning_rate": 4.570303171299666e-07,
"loss": 0.6062820553779602,
"step": 2628
},
{
"epoch": 5.548523206751055,
"grad_norm": 5.2242021560668945,
"learning_rate": 4.5598937066233973e-07,
"loss": 0.7080090641975403,
"step": 2630
},
{
"epoch": 5.552742616033755,
"grad_norm": 4.157374858856201,
"learning_rate": 4.5495786174980867e-07,
"loss": 0.45279741287231445,
"step": 2632
},
{
"epoch": 5.556962025316456,
"grad_norm": 3.1067519187927246,
"learning_rate": 4.539357959742899e-07,
"loss": 0.4694240689277649,
"step": 2634
},
{
"epoch": 5.561181434599156,
"grad_norm": 2.8363306522369385,
"learning_rate": 4.5292317886659993e-07,
"loss": 0.37042319774627686,
"step": 2636
},
{
"epoch": 5.565400843881856,
"grad_norm": 5.392505168914795,
"learning_rate": 4.51920015906424e-07,
"loss": 0.4348013401031494,
"step": 2638
},
{
"epoch": 5.569620253164557,
"grad_norm": 1.3840664625167847,
"learning_rate": 4.5092631252228734e-07,
"loss": 0.2230294644832611,
"step": 2640
},
{
"epoch": 5.5738396624472575,
"grad_norm": 2.5080552101135254,
"learning_rate": 4.4994207409152575e-07,
"loss": 0.8967776298522949,
"step": 2642
},
{
"epoch": 5.578059071729958,
"grad_norm": 3.2199008464813232,
"learning_rate": 4.48967305940256e-07,
"loss": 0.9706035852432251,
"step": 2644
},
{
"epoch": 5.582278481012658,
"grad_norm": 11.19129753112793,
"learning_rate": 4.480020133433474e-07,
"loss": 0.626300573348999,
"step": 2646
},
{
"epoch": 5.586497890295359,
"grad_norm": 14.880667686462402,
"learning_rate": 4.47046201524393e-07,
"loss": 0.06408479064702988,
"step": 2648
},
{
"epoch": 5.590717299578059,
"grad_norm": 2.3462014198303223,
"learning_rate": 4.460998756556818e-07,
"loss": 0.44877690076828003,
"step": 2650
},
{
"epoch": 5.594936708860759,
"grad_norm": 3.08370041847229,
"learning_rate": 4.451630408581701e-07,
"loss": 0.3830834925174713,
"step": 2652
},
{
"epoch": 5.59915611814346,
"grad_norm": 7.73508358001709,
"learning_rate": 4.442357022014546e-07,
"loss": 0.15033870935440063,
"step": 2654
},
{
"epoch": 5.60337552742616,
"grad_norm": 4.020411014556885,
"learning_rate": 4.43317864703744e-07,
"loss": 0.5552294850349426,
"step": 2656
},
{
"epoch": 5.6075949367088604,
"grad_norm": 12.524031639099121,
"learning_rate": 4.4240953333183257e-07,
"loss": 0.1009381040930748,
"step": 2658
},
{
"epoch": 5.6118143459915615,
"grad_norm": 3.7477056980133057,
"learning_rate": 4.4151071300107296e-07,
"loss": 0.4878613352775574,
"step": 2660
},
{
"epoch": 5.616033755274262,
"grad_norm": 25.352882385253906,
"learning_rate": 4.406214085753499e-07,
"loss": 0.0786014273762703,
"step": 2662
},
{
"epoch": 5.620253164556962,
"grad_norm": 5.754502773284912,
"learning_rate": 4.3974162486705327e-07,
"loss": 0.424061119556427,
"step": 2664
},
{
"epoch": 5.624472573839663,
"grad_norm": 4.437866687774658,
"learning_rate": 4.38871366637053e-07,
"loss": 0.07941263914108276,
"step": 2666
},
{
"epoch": 5.628691983122363,
"grad_norm": 3.537459373474121,
"learning_rate": 4.380106385946721e-07,
"loss": 0.30082571506500244,
"step": 2668
},
{
"epoch": 5.632911392405063,
"grad_norm": 2.312814474105835,
"learning_rate": 4.3715944539766257e-07,
"loss": 0.71795254945755,
"step": 2670
},
{
"epoch": 5.637130801687764,
"grad_norm": 5.115408897399902,
"learning_rate": 4.3631779165217875e-07,
"loss": 0.811305820941925,
"step": 2672
},
{
"epoch": 5.641350210970464,
"grad_norm": 8.744047164916992,
"learning_rate": 4.354856819127537e-07,
"loss": 0.6766564249992371,
"step": 2674
},
{
"epoch": 5.6455696202531644,
"grad_norm": 2.2004096508026123,
"learning_rate": 4.346631206822732e-07,
"loss": 0.8192415237426758,
"step": 2676
},
{
"epoch": 5.649789029535865,
"grad_norm": 1.8391209840774536,
"learning_rate": 4.338501124119533e-07,
"loss": 0.5205031037330627,
"step": 2678
},
{
"epoch": 5.654008438818566,
"grad_norm": 3.9403841495513916,
"learning_rate": 4.330466615013138e-07,
"loss": 0.2361564040184021,
"step": 2680
},
{
"epoch": 5.658227848101266,
"grad_norm": 4.0212554931640625,
"learning_rate": 4.3225277229815673e-07,
"loss": 0.45385825634002686,
"step": 2682
},
{
"epoch": 5.662447257383966,
"grad_norm": 3.5017166137695312,
"learning_rate": 4.314684490985411e-07,
"loss": 0.2712249159812927,
"step": 2684
},
{
"epoch": 5.666666666666667,
"grad_norm": 2.6000726222991943,
"learning_rate": 4.3069369614676086e-07,
"loss": 0.9603966474533081,
"step": 2686
},
{
"epoch": 5.670886075949367,
"grad_norm": 2.9337501525878906,
"learning_rate": 4.2992851763532125e-07,
"loss": 0.5593338012695312,
"step": 2688
},
{
"epoch": 5.675105485232067,
"grad_norm": 3.656930923461914,
"learning_rate": 4.291729177049159e-07,
"loss": 1.0005125999450684,
"step": 2690
},
{
"epoch": 5.679324894514768,
"grad_norm": 12.878107070922852,
"learning_rate": 4.28426900444406e-07,
"loss": 0.04988168552517891,
"step": 2692
},
{
"epoch": 5.6835443037974684,
"grad_norm": 2.371689558029175,
"learning_rate": 4.2769046989079543e-07,
"loss": 0.8081762790679932,
"step": 2694
},
{
"epoch": 5.687763713080169,
"grad_norm": 5.237072944641113,
"learning_rate": 4.2696363002921135e-07,
"loss": 0.4558332860469818,
"step": 2696
},
{
"epoch": 5.691983122362869,
"grad_norm": 2.5988988876342773,
"learning_rate": 4.262463847928818e-07,
"loss": 0.8788666129112244,
"step": 2698
},
{
"epoch": 5.69620253164557,
"grad_norm": 3.3628621101379395,
"learning_rate": 4.2553873806311424e-07,
"loss": 0.8370002508163452,
"step": 2700
},
{
"epoch": 5.70042194092827,
"grad_norm": 3.688671588897705,
"learning_rate": 4.248406936692747e-07,
"loss": 0.6099220514297485,
"step": 2702
},
{
"epoch": 5.70464135021097,
"grad_norm": 1.2157199382781982,
"learning_rate": 4.2415225538876686e-07,
"loss": 0.49759507179260254,
"step": 2704
},
{
"epoch": 5.708860759493671,
"grad_norm": 0.465036541223526,
"learning_rate": 4.2347342694701206e-07,
"loss": 0.40582969784736633,
"step": 2706
},
{
"epoch": 5.713080168776371,
"grad_norm": 13.82797622680664,
"learning_rate": 4.2280421201742874e-07,
"loss": 0.11880761384963989,
"step": 2708
},
{
"epoch": 5.717299578059071,
"grad_norm": 1.5762630701065063,
"learning_rate": 4.221446142214125e-07,
"loss": 0.620478630065918,
"step": 2710
},
{
"epoch": 5.7215189873417724,
"grad_norm": 4.519263744354248,
"learning_rate": 4.214946371283172e-07,
"loss": 0.8996577262878418,
"step": 2712
},
{
"epoch": 5.725738396624473,
"grad_norm": 8.791622161865234,
"learning_rate": 4.2085428425543474e-07,
"loss": 0.6637638807296753,
"step": 2714
},
{
"epoch": 5.729957805907173,
"grad_norm": 3.510023832321167,
"learning_rate": 4.202235590679763e-07,
"loss": 0.77869713306427,
"step": 2716
},
{
"epoch": 5.734177215189874,
"grad_norm": 5.473074913024902,
"learning_rate": 4.1960246497905417e-07,
"loss": 0.8682685494422913,
"step": 2718
},
{
"epoch": 5.738396624472574,
"grad_norm": 2.2831952571868896,
"learning_rate": 4.1899100534966263e-07,
"loss": 0.8572003841400146,
"step": 2720
},
{
"epoch": 5.742616033755274,
"grad_norm": 4.826292037963867,
"learning_rate": 4.183891834886598e-07,
"loss": 0.834069013595581,
"step": 2722
},
{
"epoch": 5.746835443037975,
"grad_norm": 32.02092742919922,
"learning_rate": 4.177970026527499e-07,
"loss": 0.22675754129886627,
"step": 2724
},
{
"epoch": 5.751054852320675,
"grad_norm": 2.374525308609009,
"learning_rate": 4.1721446604646607e-07,
"loss": 0.6690686345100403,
"step": 2726
},
{
"epoch": 5.755274261603375,
"grad_norm": 2.256140947341919,
"learning_rate": 4.1664157682215173e-07,
"loss": 0.7398881316184998,
"step": 2728
},
{
"epoch": 5.759493670886076,
"grad_norm": 4.3521504402160645,
"learning_rate": 4.1607833807994547e-07,
"loss": 0.8732868432998657,
"step": 2730
},
{
"epoch": 5.763713080168777,
"grad_norm": 6.75162410736084,
"learning_rate": 4.155247528677621e-07,
"loss": 0.7909585237503052,
"step": 2732
},
{
"epoch": 5.767932489451477,
"grad_norm": 31.269031524658203,
"learning_rate": 4.1498082418127807e-07,
"loss": 0.2190740704536438,
"step": 2734
},
{
"epoch": 5.772151898734177,
"grad_norm": 7.679251194000244,
"learning_rate": 4.1444655496391376e-07,
"loss": 0.46999984979629517,
"step": 2736
},
{
"epoch": 5.776371308016878,
"grad_norm": 2.543074607849121,
"learning_rate": 4.139219481068185e-07,
"loss": 0.884986162185669,
"step": 2738
},
{
"epoch": 5.780590717299578,
"grad_norm": 2.4317591190338135,
"learning_rate": 4.13407006448855e-07,
"loss": 0.5444875955581665,
"step": 2740
},
{
"epoch": 5.784810126582278,
"grad_norm": 2.9350624084472656,
"learning_rate": 4.1290173277658303e-07,
"loss": 0.8912389278411865,
"step": 2742
},
{
"epoch": 5.789029535864979,
"grad_norm": 7.446691513061523,
"learning_rate": 4.124061298242451e-07,
"loss": 0.5339520573616028,
"step": 2744
},
{
"epoch": 5.793248945147679,
"grad_norm": 5.2088704109191895,
"learning_rate": 4.119202002737515e-07,
"loss": 0.45539939403533936,
"step": 2746
},
{
"epoch": 5.7974683544303796,
"grad_norm": 3.678557872772217,
"learning_rate": 4.1144394675466634e-07,
"loss": 0.8749001026153564,
"step": 2748
},
{
"epoch": 5.80168776371308,
"grad_norm": 10.216012954711914,
"learning_rate": 4.109773718441916e-07,
"loss": 0.7841247320175171,
"step": 2750
},
{
"epoch": 5.805907172995781,
"grad_norm": 2.440023422241211,
"learning_rate": 4.105204780671556e-07,
"loss": 0.8511307239532471,
"step": 2752
},
{
"epoch": 5.810126582278481,
"grad_norm": 7.605076789855957,
"learning_rate": 4.100732678959971e-07,
"loss": 1.0421419143676758,
"step": 2754
},
{
"epoch": 5.814345991561181,
"grad_norm": 4.731003284454346,
"learning_rate": 4.0963574375075354e-07,
"loss": 0.4821122884750366,
"step": 2756
},
{
"epoch": 5.818565400843882,
"grad_norm": 3.862736463546753,
"learning_rate": 4.092079079990471e-07,
"loss": 0.05994529277086258,
"step": 2758
},
{
"epoch": 5.822784810126582,
"grad_norm": 2.8706905841827393,
"learning_rate": 4.087897629560719e-07,
"loss": 0.6597020626068115,
"step": 2760
},
{
"epoch": 5.827004219409282,
"grad_norm": 10.528990745544434,
"learning_rate": 4.0838131088458207e-07,
"loss": 0.5567920804023743,
"step": 2762
},
{
"epoch": 5.831223628691983,
"grad_norm": 12.403848648071289,
"learning_rate": 4.079825539948785e-07,
"loss": 0.22084438800811768,
"step": 2764
},
{
"epoch": 5.8354430379746836,
"grad_norm": 3.479530096054077,
"learning_rate": 4.0759349444479853e-07,
"loss": 0.8606102466583252,
"step": 2766
},
{
"epoch": 5.839662447257384,
"grad_norm": 2.724365711212158,
"learning_rate": 4.072141343397021e-07,
"loss": 0.45490285754203796,
"step": 2768
},
{
"epoch": 5.843881856540085,
"grad_norm": 6.362490653991699,
"learning_rate": 4.068444757324621e-07,
"loss": 0.8239868879318237,
"step": 2770
},
{
"epoch": 5.848101265822785,
"grad_norm": 10.339574813842773,
"learning_rate": 4.064845206234523e-07,
"loss": 0.5215486884117126,
"step": 2772
},
{
"epoch": 5.852320675105485,
"grad_norm": 186.8642578125,
"learning_rate": 4.061342709605374e-07,
"loss": 0.5665589570999146,
"step": 2774
},
{
"epoch": 5.856540084388186,
"grad_norm": 2.5368990898132324,
"learning_rate": 4.057937286390615e-07,
"loss": 0.7514277100563049,
"step": 2776
},
{
"epoch": 5.860759493670886,
"grad_norm": 7.951842784881592,
"learning_rate": 4.0546289550183833e-07,
"loss": 0.8747674822807312,
"step": 2778
},
{
"epoch": 5.864978902953586,
"grad_norm": 4.173673152923584,
"learning_rate": 4.0514177333914147e-07,
"loss": 0.8620109558105469,
"step": 2780
},
{
"epoch": 5.869198312236287,
"grad_norm": 2.6262011528015137,
"learning_rate": 4.0483036388869426e-07,
"loss": 0.8278003931045532,
"step": 2782
},
{
"epoch": 5.8734177215189876,
"grad_norm": 3.4531075954437256,
"learning_rate": 4.045286688356607e-07,
"loss": 0.8439078330993652,
"step": 2784
},
{
"epoch": 5.877637130801688,
"grad_norm": 17.26287269592285,
"learning_rate": 4.0423668981263635e-07,
"loss": 0.2546153664588928,
"step": 2786
},
{
"epoch": 5.881856540084388,
"grad_norm": 2.9670450687408447,
"learning_rate": 4.039544283996389e-07,
"loss": 0.803874135017395,
"step": 2788
},
{
"epoch": 5.886075949367089,
"grad_norm": 1.728909969329834,
"learning_rate": 4.036818861241004e-07,
"loss": 0.11378484964370728,
"step": 2790
},
{
"epoch": 5.890295358649789,
"grad_norm": 15.379825592041016,
"learning_rate": 4.0341906446085865e-07,
"loss": 0.40370649099349976,
"step": 2792
},
{
"epoch": 5.894514767932489,
"grad_norm": 14.338972091674805,
"learning_rate": 4.0316596483214915e-07,
"loss": 0.7983355522155762,
"step": 2794
},
{
"epoch": 5.89873417721519,
"grad_norm": 3.563936710357666,
"learning_rate": 4.0292258860759767e-07,
"loss": 0.9050275087356567,
"step": 2796
},
{
"epoch": 5.90295358649789,
"grad_norm": 2.441664934158325,
"learning_rate": 4.026889371042125e-07,
"loss": 0.4420316219329834,
"step": 2798
},
{
"epoch": 5.9071729957805905,
"grad_norm": 2.3287241458892822,
"learning_rate": 4.024650115863774e-07,
"loss": 0.7599180936813354,
"step": 2800
},
{
"epoch": 5.911392405063291,
"grad_norm": 3.5945613384246826,
"learning_rate": 4.022508132658452e-07,
"loss": 0.6878820657730103,
"step": 2802
},
{
"epoch": 5.915611814345992,
"grad_norm": 12.153562545776367,
"learning_rate": 4.020463433017305e-07,
"loss": 0.40130820870399475,
"step": 2804
},
{
"epoch": 5.919831223628692,
"grad_norm": 3.069974899291992,
"learning_rate": 4.0185160280050384e-07,
"loss": 0.095822274684906,
"step": 2806
},
{
"epoch": 5.924050632911392,
"grad_norm": 8.717458724975586,
"learning_rate": 4.01666592815986e-07,
"loss": 0.9885622262954712,
"step": 2808
},
{
"epoch": 5.928270042194093,
"grad_norm": 7.206968307495117,
"learning_rate": 4.014913143493415e-07,
"loss": 0.04864209145307541,
"step": 2810
},
{
"epoch": 5.932489451476793,
"grad_norm": 3.1413886547088623,
"learning_rate": 4.0132576834907404e-07,
"loss": 0.43854427337646484,
"step": 2812
},
{
"epoch": 5.936708860759493,
"grad_norm": 0.46113014221191406,
"learning_rate": 4.0116995571102056e-07,
"loss": 0.4027542471885681,
"step": 2814
},
{
"epoch": 5.940928270042194,
"grad_norm": 3.120668888092041,
"learning_rate": 4.0102387727834705e-07,
"loss": 0.6854231357574463,
"step": 2816
},
{
"epoch": 5.9451476793248945,
"grad_norm": 2.3229949474334717,
"learning_rate": 4.008875338415438e-07,
"loss": 0.5028409361839294,
"step": 2818
},
{
"epoch": 5.949367088607595,
"grad_norm": 0.521416187286377,
"learning_rate": 4.007609261384207e-07,
"loss": 0.43289196491241455,
"step": 2820
},
{
"epoch": 5.953586497890296,
"grad_norm": 2.9964866638183594,
"learning_rate": 4.006440548541041e-07,
"loss": 0.9015544652938843,
"step": 2822
},
{
"epoch": 5.957805907172996,
"grad_norm": 1.731990933418274,
"learning_rate": 4.005369206210321e-07,
"loss": 0.43057486414909363,
"step": 2824
},
{
"epoch": 5.962025316455696,
"grad_norm": 2.3747055530548096,
"learning_rate": 4.0043952401895207e-07,
"loss": 0.8347324132919312,
"step": 2826
},
{
"epoch": 5.966244725738397,
"grad_norm": 1.1531779766082764,
"learning_rate": 4.0035186557491683e-07,
"loss": 0.44332531094551086,
"step": 2828
},
{
"epoch": 5.970464135021097,
"grad_norm": 2.191092014312744,
"learning_rate": 4.0027394576328213e-07,
"loss": 0.39579838514328003,
"step": 2830
},
{
"epoch": 5.974683544303797,
"grad_norm": 7.761366367340088,
"learning_rate": 4.0020576500570355e-07,
"loss": 1.0412178039550781,
"step": 2832
},
{
"epoch": 5.978902953586498,
"grad_norm": 0.7852330207824707,
"learning_rate": 4.0014732367113567e-07,
"loss": 0.36100465059280396,
"step": 2834
},
{
"epoch": 5.9831223628691985,
"grad_norm": 21.701784133911133,
"learning_rate": 4.000986220758279e-07,
"loss": 0.07913509011268616,
"step": 2836
},
{
"epoch": 5.987341772151899,
"grad_norm": 5.154250621795654,
"learning_rate": 4.0005966048332503e-07,
"loss": 0.5702348351478577,
"step": 2838
},
{
"epoch": 5.991561181434599,
"grad_norm": 1.1740047931671143,
"learning_rate": 4.0003043910446375e-07,
"loss": 0.47653162479400635,
"step": 2840
},
{
"epoch": 5.9957805907173,
"grad_norm": 6.092247009277344,
"learning_rate": 4.000109580973733e-07,
"loss": 0.811444878578186,
"step": 2842
},
{
"epoch": 6.0,
"grad_norm": 3.5838730335235596,
"learning_rate": 4.0000121756747285e-07,
"loss": 0.7996691465377808,
"step": 2844
},
{
"epoch": 6.0,
"step": 2844,
"total_flos": 5.392281114922451e+18,
"train_loss": 0.8344338661696338,
"train_runtime": 6866.9503,
"train_samples_per_second": 12.425,
"train_steps_per_second": 0.414
}
],
"logging_steps": 2,
"max_steps": 2844,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.392281114922451e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}