9b-107 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
4fbe7de verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1896,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004219409282700422,
"grad_norm": 8.808255195617676,
"learning_rate": 8.421052631578947e-08,
"loss": 2.1962406635284424,
"step": 2
},
{
"epoch": 0.008438818565400843,
"grad_norm": 11.09923267364502,
"learning_rate": 2.526315789473684e-07,
"loss": 1.780366063117981,
"step": 4
},
{
"epoch": 0.012658227848101266,
"grad_norm": 2.419753313064575,
"learning_rate": 4.2105263157894733e-07,
"loss": 1.93634033203125,
"step": 6
},
{
"epoch": 0.016877637130801686,
"grad_norm": 3.9049134254455566,
"learning_rate": 5.894736842105262e-07,
"loss": 1.943023443222046,
"step": 8
},
{
"epoch": 0.02109704641350211,
"grad_norm": 6.477982044219971,
"learning_rate": 7.578947368421053e-07,
"loss": 1.8409148454666138,
"step": 10
},
{
"epoch": 0.02531645569620253,
"grad_norm": 1.0134243965148926,
"learning_rate": 9.263157894736841e-07,
"loss": 1.3077127933502197,
"step": 12
},
{
"epoch": 0.029535864978902954,
"grad_norm": 3.247878074645996,
"learning_rate": 1.0947368421052632e-06,
"loss": 1.6219741106033325,
"step": 14
},
{
"epoch": 0.03375527426160337,
"grad_norm": 7.898465156555176,
"learning_rate": 1.263157894736842e-06,
"loss": 2.037022113800049,
"step": 16
},
{
"epoch": 0.0379746835443038,
"grad_norm": 1.3195950984954834,
"learning_rate": 1.431578947368421e-06,
"loss": 1.7350990772247314,
"step": 18
},
{
"epoch": 0.04219409282700422,
"grad_norm": 1.8259081840515137,
"learning_rate": 1.6e-06,
"loss": 1.8126976490020752,
"step": 20
},
{
"epoch": 0.046413502109704644,
"grad_norm": 1.4393107891082764,
"learning_rate": 1.768421052631579e-06,
"loss": 1.4626768827438354,
"step": 22
},
{
"epoch": 0.05063291139240506,
"grad_norm": 2.4846441745758057,
"learning_rate": 1.936842105263158e-06,
"loss": 1.600361704826355,
"step": 24
},
{
"epoch": 0.05485232067510549,
"grad_norm": 2.291980743408203,
"learning_rate": 2.1052631578947366e-06,
"loss": 1.3303472995758057,
"step": 26
},
{
"epoch": 0.05907172995780591,
"grad_norm": 1.7172958850860596,
"learning_rate": 2.273684210526316e-06,
"loss": 1.7306660413742065,
"step": 28
},
{
"epoch": 0.06329113924050633,
"grad_norm": 2.336642026901245,
"learning_rate": 2.4421052631578946e-06,
"loss": 1.3191598653793335,
"step": 30
},
{
"epoch": 0.06751054852320675,
"grad_norm": 1.4607104063034058,
"learning_rate": 2.6105263157894738e-06,
"loss": 1.550937533378601,
"step": 32
},
{
"epoch": 0.07172995780590717,
"grad_norm": 3.0410056114196777,
"learning_rate": 2.7789473684210525e-06,
"loss": 1.0037612915039062,
"step": 34
},
{
"epoch": 0.0759493670886076,
"grad_norm": 1.8816224336624146,
"learning_rate": 2.9473684210526313e-06,
"loss": 1.5629675388336182,
"step": 36
},
{
"epoch": 0.08016877637130802,
"grad_norm": 2.050464391708374,
"learning_rate": 3.1157894736842105e-06,
"loss": 1.1124558448791504,
"step": 38
},
{
"epoch": 0.08438818565400844,
"grad_norm": 1.7500700950622559,
"learning_rate": 3.2842105263157892e-06,
"loss": 1.5181232690811157,
"step": 40
},
{
"epoch": 0.08860759493670886,
"grad_norm": 3.6279757022857666,
"learning_rate": 3.4526315789473684e-06,
"loss": 1.064979076385498,
"step": 42
},
{
"epoch": 0.09282700421940929,
"grad_norm": 3.692965507507324,
"learning_rate": 3.621052631578947e-06,
"loss": 1.7742705345153809,
"step": 44
},
{
"epoch": 0.0970464135021097,
"grad_norm": 2.689681053161621,
"learning_rate": 3.789473684210526e-06,
"loss": 1.652271032333374,
"step": 46
},
{
"epoch": 0.10126582278481013,
"grad_norm": 1.1244308948516846,
"learning_rate": 3.957894736842105e-06,
"loss": 1.5283629894256592,
"step": 48
},
{
"epoch": 0.10548523206751055,
"grad_norm": 1.6453142166137695,
"learning_rate": 4.126315789473685e-06,
"loss": 0.7807677388191223,
"step": 50
},
{
"epoch": 0.10970464135021098,
"grad_norm": 1.3963041305541992,
"learning_rate": 4.294736842105263e-06,
"loss": 0.8492421507835388,
"step": 52
},
{
"epoch": 0.11392405063291139,
"grad_norm": 1.8241719007492065,
"learning_rate": 4.463157894736841e-06,
"loss": 0.7646088600158691,
"step": 54
},
{
"epoch": 0.11814345991561181,
"grad_norm": 5.430877208709717,
"learning_rate": 4.631578947368421e-06,
"loss": 1.385468602180481,
"step": 56
},
{
"epoch": 0.12236286919831224,
"grad_norm": 4.216091632843018,
"learning_rate": 4.8e-06,
"loss": 0.6626491546630859,
"step": 58
},
{
"epoch": 0.12658227848101267,
"grad_norm": 2.5527150630950928,
"learning_rate": 4.968421052631579e-06,
"loss": 1.4430313110351562,
"step": 60
},
{
"epoch": 0.1308016877637131,
"grad_norm": 2.92517352104187,
"learning_rate": 5.136842105263157e-06,
"loss": 1.4682120084762573,
"step": 62
},
{
"epoch": 0.1350210970464135,
"grad_norm": 0.9485960602760315,
"learning_rate": 5.305263157894736e-06,
"loss": 1.0755484104156494,
"step": 64
},
{
"epoch": 0.13924050632911392,
"grad_norm": 3.126896619796753,
"learning_rate": 5.473684210526316e-06,
"loss": 0.8076987266540527,
"step": 66
},
{
"epoch": 0.14345991561181434,
"grad_norm": 1.318830966949463,
"learning_rate": 5.6421052631578944e-06,
"loss": 1.0927525758743286,
"step": 68
},
{
"epoch": 0.14767932489451477,
"grad_norm": 3.404849052429199,
"learning_rate": 5.810526315789474e-06,
"loss": 0.9728879332542419,
"step": 70
},
{
"epoch": 0.1518987341772152,
"grad_norm": 3.532927989959717,
"learning_rate": 5.978947368421052e-06,
"loss": 1.265703797340393,
"step": 72
},
{
"epoch": 0.15611814345991562,
"grad_norm": 2.622828245162964,
"learning_rate": 6.147368421052631e-06,
"loss": 1.0082859992980957,
"step": 74
},
{
"epoch": 0.16033755274261605,
"grad_norm": 3.0084891319274902,
"learning_rate": 6.31578947368421e-06,
"loss": 0.8589051365852356,
"step": 76
},
{
"epoch": 0.16455696202531644,
"grad_norm": 1.5682191848754883,
"learning_rate": 6.484210526315789e-06,
"loss": 1.3428035974502563,
"step": 78
},
{
"epoch": 0.16877637130801687,
"grad_norm": 1.149340033531189,
"learning_rate": 6.652631578947369e-06,
"loss": 1.3348019123077393,
"step": 80
},
{
"epoch": 0.1729957805907173,
"grad_norm": 1.4813284873962402,
"learning_rate": 6.821052631578947e-06,
"loss": 1.3795466423034668,
"step": 82
},
{
"epoch": 0.17721518987341772,
"grad_norm": 2.05072283744812,
"learning_rate": 6.989473684210526e-06,
"loss": 0.6971267461776733,
"step": 84
},
{
"epoch": 0.18143459915611815,
"grad_norm": 1.8534572124481201,
"learning_rate": 7.157894736842105e-06,
"loss": 1.113297700881958,
"step": 86
},
{
"epoch": 0.18565400843881857,
"grad_norm": 1.253941535949707,
"learning_rate": 7.326315789473684e-06,
"loss": 1.2875044345855713,
"step": 88
},
{
"epoch": 0.189873417721519,
"grad_norm": 1.6247971057891846,
"learning_rate": 7.494736842105263e-06,
"loss": 0.6056119203567505,
"step": 90
},
{
"epoch": 0.1940928270042194,
"grad_norm": 1.780320167541504,
"learning_rate": 7.663157894736842e-06,
"loss": 1.676588773727417,
"step": 92
},
{
"epoch": 0.19831223628691982,
"grad_norm": 2.9397106170654297,
"learning_rate": 7.831578947368421e-06,
"loss": 0.941127359867096,
"step": 94
},
{
"epoch": 0.20253164556962025,
"grad_norm": 1.2082825899124146,
"learning_rate": 8e-06,
"loss": 1.4084690809249878,
"step": 96
},
{
"epoch": 0.20675105485232068,
"grad_norm": 1.0113117694854736,
"learning_rate": 7.999978091917096e-06,
"loss": 1.4652810096740723,
"step": 98
},
{
"epoch": 0.2109704641350211,
"grad_norm": 3.00728178024292,
"learning_rate": 7.999912367935033e-06,
"loss": 1.2376593351364136,
"step": 100
},
{
"epoch": 0.21518987341772153,
"grad_norm": 2.393007278442383,
"learning_rate": 7.999802828853748e-06,
"loss": 1.051348090171814,
"step": 102
},
{
"epoch": 0.21940928270042195,
"grad_norm": 1.1730682849884033,
"learning_rate": 7.999649476006456e-06,
"loss": 0.9889463782310486,
"step": 104
},
{
"epoch": 0.22362869198312235,
"grad_norm": 3.6534433364868164,
"learning_rate": 7.99945231125964e-06,
"loss": 0.9653201699256897,
"step": 106
},
{
"epoch": 0.22784810126582278,
"grad_norm": 1.5248578786849976,
"learning_rate": 7.999211337013023e-06,
"loss": 0.970741331577301,
"step": 108
},
{
"epoch": 0.2320675105485232,
"grad_norm": 1.0831537246704102,
"learning_rate": 7.998926556199543e-06,
"loss": 1.0245221853256226,
"step": 110
},
{
"epoch": 0.23628691983122363,
"grad_norm": 1.1785749197006226,
"learning_rate": 7.998597972285308e-06,
"loss": 1.3712621927261353,
"step": 112
},
{
"epoch": 0.24050632911392406,
"grad_norm": 1.3017958402633667,
"learning_rate": 7.998225589269567e-06,
"loss": 1.2707055807113647,
"step": 114
},
{
"epoch": 0.24472573839662448,
"grad_norm": 2.281855344772339,
"learning_rate": 7.997809411684651e-06,
"loss": 1.5997581481933594,
"step": 116
},
{
"epoch": 0.2489451476793249,
"grad_norm": 0.8561908602714539,
"learning_rate": 7.997349444595921e-06,
"loss": 1.2587316036224365,
"step": 118
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.5100749135017395,
"learning_rate": 7.996845693601713e-06,
"loss": 0.9957402348518372,
"step": 120
},
{
"epoch": 0.25738396624472576,
"grad_norm": 3.094721794128418,
"learning_rate": 7.996298164833256e-06,
"loss": 1.258643627166748,
"step": 122
},
{
"epoch": 0.2616033755274262,
"grad_norm": 1.02621328830719,
"learning_rate": 7.995706864954613e-06,
"loss": 1.2669998407363892,
"step": 124
},
{
"epoch": 0.26582278481012656,
"grad_norm": 0.9123193621635437,
"learning_rate": 7.995071801162584e-06,
"loss": 1.3570575714111328,
"step": 126
},
{
"epoch": 0.270042194092827,
"grad_norm": 1.731139898300171,
"learning_rate": 7.99439298118663e-06,
"loss": 0.8451032042503357,
"step": 128
},
{
"epoch": 0.2742616033755274,
"grad_norm": 1.6111301183700562,
"learning_rate": 7.993670413288777e-06,
"loss": 1.1453604698181152,
"step": 130
},
{
"epoch": 0.27848101265822783,
"grad_norm": 2.9632482528686523,
"learning_rate": 7.992904106263512e-06,
"loss": 1.2021801471710205,
"step": 132
},
{
"epoch": 0.28270042194092826,
"grad_norm": 3.683976411819458,
"learning_rate": 7.992094069437679e-06,
"loss": 0.8209038972854614,
"step": 134
},
{
"epoch": 0.2869198312236287,
"grad_norm": 0.8348782658576965,
"learning_rate": 7.991240312670361e-06,
"loss": 0.8820058703422546,
"step": 136
},
{
"epoch": 0.2911392405063291,
"grad_norm": 2.7286171913146973,
"learning_rate": 7.99034284635277e-06,
"loss": 1.361273169517517,
"step": 138
},
{
"epoch": 0.29535864978902954,
"grad_norm": 4.104984283447266,
"learning_rate": 7.989401681408107e-06,
"loss": 0.9259814023971558,
"step": 140
},
{
"epoch": 0.29957805907172996,
"grad_norm": 1.7529696226119995,
"learning_rate": 7.988416829291437e-06,
"loss": 1.2620042562484741,
"step": 142
},
{
"epoch": 0.3037974683544304,
"grad_norm": 3.8300654888153076,
"learning_rate": 7.987388301989553e-06,
"loss": 1.0700979232788086,
"step": 144
},
{
"epoch": 0.3080168776371308,
"grad_norm": 1.7690354585647583,
"learning_rate": 7.986316112020821e-06,
"loss": 1.3733104467391968,
"step": 146
},
{
"epoch": 0.31223628691983124,
"grad_norm": 1.0495412349700928,
"learning_rate": 7.985200272435035e-06,
"loss": 1.3526469469070435,
"step": 148
},
{
"epoch": 0.31645569620253167,
"grad_norm": 1.3418861627578735,
"learning_rate": 7.984040796813251e-06,
"loss": 1.2077337503433228,
"step": 150
},
{
"epoch": 0.3206751054852321,
"grad_norm": 1.4774577617645264,
"learning_rate": 7.982837699267632e-06,
"loss": 1.2690041065216064,
"step": 152
},
{
"epoch": 0.32489451476793246,
"grad_norm": 1.621232032775879,
"learning_rate": 7.981590994441264e-06,
"loss": 1.4557234048843384,
"step": 154
},
{
"epoch": 0.3291139240506329,
"grad_norm": 9.58849811553955,
"learning_rate": 7.98030069750799e-06,
"loss": 1.2517377138137817,
"step": 156
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.9245334267616272,
"learning_rate": 7.978966824172219e-06,
"loss": 1.312515377998352,
"step": 158
},
{
"epoch": 0.33755274261603374,
"grad_norm": 0.9059025049209595,
"learning_rate": 7.977589390668727e-06,
"loss": 1.2920206785202026,
"step": 160
},
{
"epoch": 0.34177215189873417,
"grad_norm": 4.1672210693359375,
"learning_rate": 7.976168413762478e-06,
"loss": 0.8602538108825684,
"step": 162
},
{
"epoch": 0.3459915611814346,
"grad_norm": 3.4024016857147217,
"learning_rate": 7.974703910748405e-06,
"loss": 1.214678168296814,
"step": 164
},
{
"epoch": 0.350210970464135,
"grad_norm": 2.5402605533599854,
"learning_rate": 7.973195899451203e-06,
"loss": 0.809662401676178,
"step": 166
},
{
"epoch": 0.35443037974683544,
"grad_norm": 2.9653215408325195,
"learning_rate": 7.971644398225114e-06,
"loss": 1.2221626043319702,
"step": 168
},
{
"epoch": 0.35864978902953587,
"grad_norm": 1.4855773448944092,
"learning_rate": 7.970049425953705e-06,
"loss": 1.3168489933013916,
"step": 170
},
{
"epoch": 0.3628691983122363,
"grad_norm": 2.3102357387542725,
"learning_rate": 7.968411002049635e-06,
"loss": 1.1956959962844849,
"step": 172
},
{
"epoch": 0.3670886075949367,
"grad_norm": 0.9286043643951416,
"learning_rate": 7.966729146454414e-06,
"loss": 1.0124504566192627,
"step": 174
},
{
"epoch": 0.37130801687763715,
"grad_norm": 2.3038041591644287,
"learning_rate": 7.965003879638177e-06,
"loss": 1.30778169631958,
"step": 176
},
{
"epoch": 0.3755274261603376,
"grad_norm": 1.806934118270874,
"learning_rate": 7.963235222599414e-06,
"loss": 1.307655930519104,
"step": 178
},
{
"epoch": 0.379746835443038,
"grad_norm": 1.0257319211959839,
"learning_rate": 7.961423196864727e-06,
"loss": 0.990490198135376,
"step": 180
},
{
"epoch": 0.38396624472573837,
"grad_norm": 0.9438347220420837,
"learning_rate": 7.95956782448857e-06,
"loss": 1.2663373947143555,
"step": 182
},
{
"epoch": 0.3881856540084388,
"grad_norm": 0.7051662802696228,
"learning_rate": 7.957669128052967e-06,
"loss": 1.264948844909668,
"step": 184
},
{
"epoch": 0.3924050632911392,
"grad_norm": 1.4901031255722046,
"learning_rate": 7.955727130667254e-06,
"loss": 0.5198807120323181,
"step": 186
},
{
"epoch": 0.39662447257383965,
"grad_norm": 1.6583565473556519,
"learning_rate": 7.953741855967786e-06,
"loss": 0.9574030041694641,
"step": 188
},
{
"epoch": 0.4008438818565401,
"grad_norm": 0.9092651009559631,
"learning_rate": 7.951713328117653e-06,
"loss": 1.0500378608703613,
"step": 190
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.398674726486206,
"learning_rate": 7.949641571806384e-06,
"loss": 1.2852396965026855,
"step": 192
},
{
"epoch": 0.4092827004219409,
"grad_norm": 1.3172980546951294,
"learning_rate": 7.947526612249655e-06,
"loss": 1.063001036643982,
"step": 194
},
{
"epoch": 0.41350210970464135,
"grad_norm": 1.0889501571655273,
"learning_rate": 7.945368475188967e-06,
"loss": 1.2641280889511108,
"step": 196
},
{
"epoch": 0.4177215189873418,
"grad_norm": 1.0665010213851929,
"learning_rate": 7.943167186891349e-06,
"loss": 1.0218112468719482,
"step": 198
},
{
"epoch": 0.4219409282700422,
"grad_norm": 3.58223032951355,
"learning_rate": 7.940922774149026e-06,
"loss": 1.0860857963562012,
"step": 200
},
{
"epoch": 0.42616033755274263,
"grad_norm": 1.089058756828308,
"learning_rate": 7.938635264279095e-06,
"loss": 1.11153244972229,
"step": 202
},
{
"epoch": 0.43037974683544306,
"grad_norm": 0.7927507162094116,
"learning_rate": 7.9363046851232e-06,
"loss": 0.9039996862411499,
"step": 204
},
{
"epoch": 0.4345991561181435,
"grad_norm": 1.210415005683899,
"learning_rate": 7.933931065047189e-06,
"loss": 1.3628909587860107,
"step": 206
},
{
"epoch": 0.4388185654008439,
"grad_norm": 2.830822706222534,
"learning_rate": 7.931514432940762e-06,
"loss": 0.7663958668708801,
"step": 208
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.9303013682365417,
"learning_rate": 7.92905481821713e-06,
"loss": 1.2602308988571167,
"step": 210
},
{
"epoch": 0.4472573839662447,
"grad_norm": 2.2079782485961914,
"learning_rate": 7.926552250812647e-06,
"loss": 0.7922911047935486,
"step": 212
},
{
"epoch": 0.45147679324894513,
"grad_norm": 1.6323434114456177,
"learning_rate": 7.92400676118646e-06,
"loss": 1.1895182132720947,
"step": 214
},
{
"epoch": 0.45569620253164556,
"grad_norm": 1.2990820407867432,
"learning_rate": 7.921418380320117e-06,
"loss": 1.236521601676941,
"step": 216
},
{
"epoch": 0.459915611814346,
"grad_norm": 1.2548811435699463,
"learning_rate": 7.918787139717211e-06,
"loss": 1.2785851955413818,
"step": 218
},
{
"epoch": 0.4641350210970464,
"grad_norm": 1.9813899993896484,
"learning_rate": 7.916113071402986e-06,
"loss": 1.230564832687378,
"step": 220
},
{
"epoch": 0.46835443037974683,
"grad_norm": 3.8140616416931152,
"learning_rate": 7.913396207923946e-06,
"loss": 1.2052173614501953,
"step": 222
},
{
"epoch": 0.47257383966244726,
"grad_norm": 1.2737183570861816,
"learning_rate": 7.910636582347466e-06,
"loss": 1.0253933668136597,
"step": 224
},
{
"epoch": 0.4767932489451477,
"grad_norm": 2.2479248046875,
"learning_rate": 7.907834228261378e-06,
"loss": 1.259740948677063,
"step": 226
},
{
"epoch": 0.4810126582278481,
"grad_norm": 2.846742630004883,
"learning_rate": 7.90498917977358e-06,
"loss": 0.782292366027832,
"step": 228
},
{
"epoch": 0.48523206751054854,
"grad_norm": 1.540499210357666,
"learning_rate": 7.9021014715116e-06,
"loss": 0.9024060368537903,
"step": 230
},
{
"epoch": 0.48945147679324896,
"grad_norm": 4.1563334465026855,
"learning_rate": 7.89917113862219e-06,
"loss": 0.8967229723930359,
"step": 232
},
{
"epoch": 0.4936708860759494,
"grad_norm": 1.8965283632278442,
"learning_rate": 7.896198216770892e-06,
"loss": 1.2712947130203247,
"step": 234
},
{
"epoch": 0.4978902953586498,
"grad_norm": 1.1842418909072876,
"learning_rate": 7.893182742141606e-06,
"loss": 1.312996506690979,
"step": 236
},
{
"epoch": 0.5021097046413502,
"grad_norm": 5.636916637420654,
"learning_rate": 7.890124751436146e-06,
"loss": 0.9852314591407776,
"step": 238
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2.3782975673675537,
"learning_rate": 7.887024281873801e-06,
"loss": 0.5027163028717041,
"step": 240
},
{
"epoch": 0.510548523206751,
"grad_norm": 2.305345296859741,
"learning_rate": 7.88388137119087e-06,
"loss": 1.4169656038284302,
"step": 242
},
{
"epoch": 0.5147679324894515,
"grad_norm": 3.1710622310638428,
"learning_rate": 7.880696057640214e-06,
"loss": 0.8661314845085144,
"step": 244
},
{
"epoch": 0.5189873417721519,
"grad_norm": 1.3543529510498047,
"learning_rate": 7.877468379990784e-06,
"loss": 1.4801579713821411,
"step": 246
},
{
"epoch": 0.5232067510548524,
"grad_norm": 1.236095666885376,
"learning_rate": 7.874198377527153e-06,
"loss": 1.1857268810272217,
"step": 248
},
{
"epoch": 0.5274261603375527,
"grad_norm": 1.6078490018844604,
"learning_rate": 7.870886090049034e-06,
"loss": 1.301129698753357,
"step": 250
},
{
"epoch": 0.5316455696202531,
"grad_norm": 2.838106632232666,
"learning_rate": 7.867531557870802e-06,
"loss": 1.1084915399551392,
"step": 252
},
{
"epoch": 0.5358649789029536,
"grad_norm": 0.7895591855049133,
"learning_rate": 7.864134821820989e-06,
"loss": 1.2187299728393555,
"step": 254
},
{
"epoch": 0.540084388185654,
"grad_norm": 0.9526500701904297,
"learning_rate": 7.860695923241808e-06,
"loss": 1.1880545616149902,
"step": 256
},
{
"epoch": 0.5443037974683544,
"grad_norm": 5.490752220153809,
"learning_rate": 7.857214903988633e-06,
"loss": 1.0243443250656128,
"step": 258
},
{
"epoch": 0.5485232067510548,
"grad_norm": 5.083505153656006,
"learning_rate": 7.853691806429497e-06,
"loss": 0.9623079299926758,
"step": 260
},
{
"epoch": 0.5527426160337553,
"grad_norm": 0.914940357208252,
"learning_rate": 7.850126673444574e-06,
"loss": 1.2602107524871826,
"step": 262
},
{
"epoch": 0.5569620253164557,
"grad_norm": 1.6456938982009888,
"learning_rate": 7.846519548425655e-06,
"loss": 1.2723337411880493,
"step": 264
},
{
"epoch": 0.5611814345991561,
"grad_norm": 1.0772303342819214,
"learning_rate": 7.84287047527563e-06,
"loss": 1.2261412143707275,
"step": 266
},
{
"epoch": 0.5654008438818565,
"grad_norm": 1.5974887609481812,
"learning_rate": 7.839179498407939e-06,
"loss": 1.5006755590438843,
"step": 268
},
{
"epoch": 0.569620253164557,
"grad_norm": 3.4865124225616455,
"learning_rate": 7.835446662746043e-06,
"loss": 1.0508859157562256,
"step": 270
},
{
"epoch": 0.5738396624472574,
"grad_norm": 1.8510380983352661,
"learning_rate": 7.831672013722869e-06,
"loss": 1.303536057472229,
"step": 272
},
{
"epoch": 0.5780590717299579,
"grad_norm": 1.9769766330718994,
"learning_rate": 7.827855597280267e-06,
"loss": 1.1014729738235474,
"step": 274
},
{
"epoch": 0.5822784810126582,
"grad_norm": 1.2615407705307007,
"learning_rate": 7.82399745986844e-06,
"loss": 1.247708797454834,
"step": 276
},
{
"epoch": 0.5864978902953587,
"grad_norm": 1.0950466394424438,
"learning_rate": 7.820097648445383e-06,
"loss": 1.3929113149642944,
"step": 278
},
{
"epoch": 0.5907172995780591,
"grad_norm": 0.7477395534515381,
"learning_rate": 7.816156210476316e-06,
"loss": 0.9548027515411377,
"step": 280
},
{
"epoch": 0.5949367088607594,
"grad_norm": 0.8824933171272278,
"learning_rate": 7.812173193933098e-06,
"loss": 1.321789264678955,
"step": 282
},
{
"epoch": 0.5991561181434599,
"grad_norm": 2.1495587825775146,
"learning_rate": 7.808148647293651e-06,
"loss": 1.5318031311035156,
"step": 284
},
{
"epoch": 0.6033755274261603,
"grad_norm": 0.366915225982666,
"learning_rate": 7.804082619541366e-06,
"loss": 1.1145987510681152,
"step": 286
},
{
"epoch": 0.6075949367088608,
"grad_norm": 0.9638784527778625,
"learning_rate": 7.799975160164503e-06,
"loss": 1.244531512260437,
"step": 288
},
{
"epoch": 0.6118143459915611,
"grad_norm": 1.1089756488800049,
"learning_rate": 7.795826319155596e-06,
"loss": 0.8029107451438904,
"step": 290
},
{
"epoch": 0.6160337552742616,
"grad_norm": 1.2907359600067139,
"learning_rate": 7.791636147010842e-06,
"loss": 0.660262942314148,
"step": 292
},
{
"epoch": 0.620253164556962,
"grad_norm": 1.7545690536499023,
"learning_rate": 7.787404694729485e-06,
"loss": 1.2182437181472778,
"step": 294
},
{
"epoch": 0.6244725738396625,
"grad_norm": 2.1209237575531006,
"learning_rate": 7.783132013813194e-06,
"loss": 0.698481559753418,
"step": 296
},
{
"epoch": 0.6286919831223629,
"grad_norm": 15.612320899963379,
"learning_rate": 7.778818156265443e-06,
"loss": 0.6525253057479858,
"step": 298
},
{
"epoch": 0.6329113924050633,
"grad_norm": 2.1628663539886475,
"learning_rate": 7.774463174590867e-06,
"loss": 1.7705399990081787,
"step": 300
},
{
"epoch": 0.6371308016877637,
"grad_norm": 0.9601294994354248,
"learning_rate": 7.770067121794634e-06,
"loss": 1.2569221258163452,
"step": 302
},
{
"epoch": 0.6413502109704642,
"grad_norm": 6.321805953979492,
"learning_rate": 7.765630051381794e-06,
"loss": 0.8693046569824219,
"step": 304
},
{
"epoch": 0.6455696202531646,
"grad_norm": 1.1084986925125122,
"learning_rate": 7.761152017356627e-06,
"loss": 0.9949886798858643,
"step": 306
},
{
"epoch": 0.6497890295358649,
"grad_norm": 0.5674229860305786,
"learning_rate": 7.75663307422199e-06,
"loss": 1.2497336864471436,
"step": 308
},
{
"epoch": 0.6540084388185654,
"grad_norm": 0.785914421081543,
"learning_rate": 7.75207327697865e-06,
"loss": 1.197920560836792,
"step": 310
},
{
"epoch": 0.6582278481012658,
"grad_norm": 1.9101108312606812,
"learning_rate": 7.747472681124616e-06,
"loss": 1.0015456676483154,
"step": 312
},
{
"epoch": 0.6624472573839663,
"grad_norm": 1.1794395446777344,
"learning_rate": 7.742831342654461e-06,
"loss": 1.199405312538147,
"step": 314
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.8022367358207703,
"learning_rate": 7.738149318058648e-06,
"loss": 1.1004928350448608,
"step": 316
},
{
"epoch": 0.6708860759493671,
"grad_norm": 0.7416080236434937,
"learning_rate": 7.733426664322834e-06,
"loss": 1.0781973600387573,
"step": 318
},
{
"epoch": 0.6751054852320675,
"grad_norm": 1.5531445741653442,
"learning_rate": 7.728663438927177e-06,
"loss": 1.0812546014785767,
"step": 320
},
{
"epoch": 0.679324894514768,
"grad_norm": 1.3288339376449585,
"learning_rate": 7.723859699845645e-06,
"loss": 0.8775804042816162,
"step": 322
},
{
"epoch": 0.6835443037974683,
"grad_norm": 1.0390150547027588,
"learning_rate": 7.7190155055453e-06,
"loss": 0.8802211284637451,
"step": 324
},
{
"epoch": 0.6877637130801688,
"grad_norm": 0.8296968936920166,
"learning_rate": 7.714130914985593e-06,
"loss": 0.8521700501441956,
"step": 326
},
{
"epoch": 0.6919831223628692,
"grad_norm": 0.742783784866333,
"learning_rate": 7.709205987617642e-06,
"loss": 0.8648751974105835,
"step": 328
},
{
"epoch": 0.6962025316455697,
"grad_norm": 2.3347666263580322,
"learning_rate": 7.704240783383513e-06,
"loss": 1.5764340162277222,
"step": 330
},
{
"epoch": 0.70042194092827,
"grad_norm": 1.7489248514175415,
"learning_rate": 7.699235362715488e-06,
"loss": 1.0720549821853638,
"step": 332
},
{
"epoch": 0.7046413502109705,
"grad_norm": 1.7539390325546265,
"learning_rate": 7.694189786535325e-06,
"loss": 1.3112399578094482,
"step": 334
},
{
"epoch": 0.7088607594936709,
"grad_norm": 2.141007900238037,
"learning_rate": 7.689104116253529e-06,
"loss": 1.2512861490249634,
"step": 336
},
{
"epoch": 0.7130801687763713,
"grad_norm": 1.4252289533615112,
"learning_rate": 7.683978413768591e-06,
"loss": 1.2772711515426636,
"step": 338
},
{
"epoch": 0.7172995780590717,
"grad_norm": 0.931917667388916,
"learning_rate": 7.678812741466241e-06,
"loss": 1.2473686933517456,
"step": 340
},
{
"epoch": 0.7215189873417721,
"grad_norm": 0.674299418926239,
"learning_rate": 7.673607162218688e-06,
"loss": 1.2429455518722534,
"step": 342
},
{
"epoch": 0.7257383966244726,
"grad_norm": 3.6710023880004883,
"learning_rate": 7.668361739383856e-06,
"loss": 1.0689202547073364,
"step": 344
},
{
"epoch": 0.729957805907173,
"grad_norm": 2.2613823413848877,
"learning_rate": 7.66307653680461e-06,
"loss": 0.8741579651832581,
"step": 346
},
{
"epoch": 0.7341772151898734,
"grad_norm": 0.8934361338615417,
"learning_rate": 7.657751618807982e-06,
"loss": 1.234643578529358,
"step": 348
},
{
"epoch": 0.7383966244725738,
"grad_norm": 2.2448158264160156,
"learning_rate": 7.652387050204386e-06,
"loss": 1.1629151105880737,
"step": 350
},
{
"epoch": 0.7426160337552743,
"grad_norm": 2.3110733032226562,
"learning_rate": 7.64698289628683e-06,
"loss": 1.074580192565918,
"step": 352
},
{
"epoch": 0.7468354430379747,
"grad_norm": 2.6933298110961914,
"learning_rate": 7.641539222830117e-06,
"loss": 0.6495164036750793,
"step": 354
},
{
"epoch": 0.7510548523206751,
"grad_norm": 8.03363037109375,
"learning_rate": 7.63605609609006e-06,
"loss": 1.0515384674072266,
"step": 356
},
{
"epoch": 0.7552742616033755,
"grad_norm": 1.2727802991867065,
"learning_rate": 7.630533582802647e-06,
"loss": 1.0052093267440796,
"step": 358
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.8008536100387573,
"learning_rate": 7.6249717501832616e-06,
"loss": 1.208338975906372,
"step": 360
},
{
"epoch": 0.7637130801687764,
"grad_norm": 1.467780351638794,
"learning_rate": 7.619370665925842e-06,
"loss": 0.9765693545341492,
"step": 362
},
{
"epoch": 0.7679324894514767,
"grad_norm": 1.806458830833435,
"learning_rate": 7.613730398202061e-06,
"loss": 1.5730071067810059,
"step": 364
},
{
"epoch": 0.7721518987341772,
"grad_norm": 2.95654034614563,
"learning_rate": 7.608051015660508e-06,
"loss": 0.6979476809501648,
"step": 366
},
{
"epoch": 0.7763713080168776,
"grad_norm": 1.1918550729751587,
"learning_rate": 7.60233258742584e-06,
"loss": 1.1616028547286987,
"step": 368
},
{
"epoch": 0.7805907172995781,
"grad_norm": 2.9640629291534424,
"learning_rate": 7.596575183097943e-06,
"loss": 1.4773938655853271,
"step": 370
},
{
"epoch": 0.7848101265822784,
"grad_norm": 3.126946210861206,
"learning_rate": 7.590778872751091e-06,
"loss": 0.9821402430534363,
"step": 372
},
{
"epoch": 0.7890295358649789,
"grad_norm": 1.5098716020584106,
"learning_rate": 7.58494372693309e-06,
"loss": 1.0515096187591553,
"step": 374
},
{
"epoch": 0.7932489451476793,
"grad_norm": 1.3018288612365723,
"learning_rate": 7.579069816664417e-06,
"loss": 1.1510859727859497,
"step": 376
},
{
"epoch": 0.7974683544303798,
"grad_norm": 0.4790183901786804,
"learning_rate": 7.573157213437353e-06,
"loss": 1.1152373552322388,
"step": 378
},
{
"epoch": 0.8016877637130801,
"grad_norm": 1.7124649286270142,
"learning_rate": 7.567205989215126e-06,
"loss": 0.729989230632782,
"step": 380
},
{
"epoch": 0.8059071729957806,
"grad_norm": 1.813360333442688,
"learning_rate": 7.5612162164310196e-06,
"loss": 1.2611396312713623,
"step": 382
},
{
"epoch": 0.810126582278481,
"grad_norm": 3.6355385780334473,
"learning_rate": 7.555187967987499e-06,
"loss": 0.9938818216323853,
"step": 384
},
{
"epoch": 0.8143459915611815,
"grad_norm": 1.1984269618988037,
"learning_rate": 7.549121317255322e-06,
"loss": 1.2364702224731445,
"step": 386
},
{
"epoch": 0.8185654008438819,
"grad_norm": 1.193095326423645,
"learning_rate": 7.543016338072653e-06,
"loss": 0.9437189102172852,
"step": 388
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.9609726071357727,
"learning_rate": 7.5368731047441486e-06,
"loss": 1.2113581895828247,
"step": 390
},
{
"epoch": 0.8270042194092827,
"grad_norm": 2.6752779483795166,
"learning_rate": 7.530691692040069e-06,
"loss": 0.9650623798370361,
"step": 392
},
{
"epoch": 0.8312236286919831,
"grad_norm": 1.805607795715332,
"learning_rate": 7.52447217519536e-06,
"loss": 0.9127689003944397,
"step": 394
},
{
"epoch": 0.8354430379746836,
"grad_norm": 1.9785407781600952,
"learning_rate": 7.5182146299087375e-06,
"loss": 1.2258358001708984,
"step": 396
},
{
"epoch": 0.8396624472573839,
"grad_norm": 0.2685816287994385,
"learning_rate": 7.51191913234177e-06,
"loss": 1.0780510902404785,
"step": 398
},
{
"epoch": 0.8438818565400844,
"grad_norm": 0.8069247007369995,
"learning_rate": 7.505585759117947e-06,
"loss": 0.9565885663032532,
"step": 400
},
{
"epoch": 0.8481012658227848,
"grad_norm": 2.5125067234039307,
"learning_rate": 7.499214587321749e-06,
"loss": 0.7042322754859924,
"step": 402
},
{
"epoch": 0.8523206751054853,
"grad_norm": 1.6951823234558105,
"learning_rate": 7.49280569449771e-06,
"loss": 1.1971807479858398,
"step": 404
},
{
"epoch": 0.8565400843881856,
"grad_norm": 0.7480567097663879,
"learning_rate": 7.486359158649471e-06,
"loss": 0.9361912608146667,
"step": 406
},
{
"epoch": 0.8607594936708861,
"grad_norm": 1.4761838912963867,
"learning_rate": 7.4798750582388354e-06,
"loss": 0.9626801609992981,
"step": 408
},
{
"epoch": 0.8649789029535865,
"grad_norm": 0.9039321541786194,
"learning_rate": 7.473353472184806e-06,
"loss": 1.2230124473571777,
"step": 410
},
{
"epoch": 0.869198312236287,
"grad_norm": 1.8411026000976562,
"learning_rate": 7.466794479862632e-06,
"loss": 0.838551938533783,
"step": 412
},
{
"epoch": 0.8734177215189873,
"grad_norm": 1.6176677942276,
"learning_rate": 7.460198161102841e-06,
"loss": 1.2056636810302734,
"step": 414
},
{
"epoch": 0.8776371308016878,
"grad_norm": 0.796684205532074,
"learning_rate": 7.453564596190265e-06,
"loss": 1.0609307289123535,
"step": 416
},
{
"epoch": 0.8818565400843882,
"grad_norm": 3.51039457321167,
"learning_rate": 7.446893865863063e-06,
"loss": 1.1577751636505127,
"step": 418
},
{
"epoch": 0.8860759493670886,
"grad_norm": 1.3933305740356445,
"learning_rate": 7.440186051311744e-06,
"loss": 0.9417897462844849,
"step": 420
},
{
"epoch": 0.890295358649789,
"grad_norm": 1.077215313911438,
"learning_rate": 7.433441234178174e-06,
"loss": 1.333181619644165,
"step": 422
},
{
"epoch": 0.8945147679324894,
"grad_norm": 3.3169748783111572,
"learning_rate": 7.426659496554582e-06,
"loss": 0.9721631407737732,
"step": 424
},
{
"epoch": 0.8987341772151899,
"grad_norm": 1.1560137271881104,
"learning_rate": 7.4198409209825615e-06,
"loss": 1.1756271123886108,
"step": 426
},
{
"epoch": 0.9029535864978903,
"grad_norm": 0.8857593536376953,
"learning_rate": 7.412985590452066e-06,
"loss": 1.037049651145935,
"step": 428
},
{
"epoch": 0.9071729957805907,
"grad_norm": 1.4016187191009521,
"learning_rate": 7.4060935884004045e-06,
"loss": 1.0027376413345337,
"step": 430
},
{
"epoch": 0.9113924050632911,
"grad_norm": 1.6030839681625366,
"learning_rate": 7.399164998711215e-06,
"loss": 1.0767489671707153,
"step": 432
},
{
"epoch": 0.9156118143459916,
"grad_norm": 1.2388867139816284,
"learning_rate": 7.392199905713454e-06,
"loss": 1.241571307182312,
"step": 434
},
{
"epoch": 0.919831223628692,
"grad_norm": 1.3980982303619385,
"learning_rate": 7.385198394180359e-06,
"loss": 0.8756187558174133,
"step": 436
},
{
"epoch": 0.9240506329113924,
"grad_norm": 1.7992874383926392,
"learning_rate": 7.378160549328429e-06,
"loss": 1.196347713470459,
"step": 438
},
{
"epoch": 0.9282700421940928,
"grad_norm": 0.9916715025901794,
"learning_rate": 7.371086456816381e-06,
"loss": 0.9922671318054199,
"step": 440
},
{
"epoch": 0.9324894514767933,
"grad_norm": 2.890634536743164,
"learning_rate": 7.363976202744106e-06,
"loss": 0.9319839477539062,
"step": 442
},
{
"epoch": 0.9367088607594937,
"grad_norm": 5.665074348449707,
"learning_rate": 7.356829873651623e-06,
"loss": 1.0942474603652954,
"step": 444
},
{
"epoch": 0.9409282700421941,
"grad_norm": 1.669215440750122,
"learning_rate": 7.3496475565180284e-06,
"loss": 1.2984267473220825,
"step": 446
},
{
"epoch": 0.9451476793248945,
"grad_norm": 0.8537882566452026,
"learning_rate": 7.342429338760431e-06,
"loss": 0.9971826076507568,
"step": 448
},
{
"epoch": 0.9493670886075949,
"grad_norm": 3.3685364723205566,
"learning_rate": 7.3351753082328946e-06,
"loss": 0.9323700666427612,
"step": 450
},
{
"epoch": 0.9535864978902954,
"grad_norm": 1.2475708723068237,
"learning_rate": 7.327885553225365e-06,
"loss": 1.2786669731140137,
"step": 452
},
{
"epoch": 0.9578059071729957,
"grad_norm": 1.7981699705123901,
"learning_rate": 7.320560162462594e-06,
"loss": 0.9830716848373413,
"step": 454
},
{
"epoch": 0.9620253164556962,
"grad_norm": 1.964571237564087,
"learning_rate": 7.313199225103068e-06,
"loss": 1.1577880382537842,
"step": 456
},
{
"epoch": 0.9662447257383966,
"grad_norm": 0.8031247854232788,
"learning_rate": 7.3058028307379104e-06,
"loss": 0.8746158480644226,
"step": 458
},
{
"epoch": 0.9704641350210971,
"grad_norm": 6.945025444030762,
"learning_rate": 7.298371069389798e-06,
"loss": 0.6917670369148254,
"step": 460
},
{
"epoch": 0.9746835443037974,
"grad_norm": 1.1903830766677856,
"learning_rate": 7.290904031511867e-06,
"loss": 0.8951276540756226,
"step": 462
},
{
"epoch": 0.9789029535864979,
"grad_norm": 1.7528347969055176,
"learning_rate": 7.28340180798661e-06,
"loss": 1.1649961471557617,
"step": 464
},
{
"epoch": 0.9831223628691983,
"grad_norm": 2.7463033199310303,
"learning_rate": 7.275864490124769e-06,
"loss": 0.7191216349601746,
"step": 466
},
{
"epoch": 0.9873417721518988,
"grad_norm": 1.2754981517791748,
"learning_rate": 7.268292169664222e-06,
"loss": 1.3055366277694702,
"step": 468
},
{
"epoch": 0.9915611814345991,
"grad_norm": 1.402946949005127,
"learning_rate": 7.260684938768874e-06,
"loss": 0.8869744539260864,
"step": 470
},
{
"epoch": 0.9957805907172996,
"grad_norm": 1.225201964378357,
"learning_rate": 7.253042890027527e-06,
"loss": 1.202407956123352,
"step": 472
},
{
"epoch": 1.0,
"grad_norm": 1.0377144813537598,
"learning_rate": 7.2453661164527565e-06,
"loss": 1.249975562095642,
"step": 474
},
{
"epoch": 1.0042194092827004,
"grad_norm": 2.005743980407715,
"learning_rate": 7.237654711479781e-06,
"loss": 0.9949838519096375,
"step": 476
},
{
"epoch": 1.0084388185654007,
"grad_norm": 1.0849618911743164,
"learning_rate": 7.2299087689653224e-06,
"loss": 1.1602823734283447,
"step": 478
},
{
"epoch": 1.0126582278481013,
"grad_norm": 1.2653388977050781,
"learning_rate": 7.222128383186464e-06,
"loss": 1.13376784324646,
"step": 480
},
{
"epoch": 1.0168776371308017,
"grad_norm": 1.4933967590332031,
"learning_rate": 7.214313648839504e-06,
"loss": 0.8098440766334534,
"step": 482
},
{
"epoch": 1.021097046413502,
"grad_norm": 1.3467376232147217,
"learning_rate": 7.206464661038802e-06,
"loss": 1.058078408241272,
"step": 484
},
{
"epoch": 1.0253164556962024,
"grad_norm": 1.572613000869751,
"learning_rate": 7.198581515315622e-06,
"loss": 0.46203434467315674,
"step": 486
},
{
"epoch": 1.029535864978903,
"grad_norm": 1.1707247495651245,
"learning_rate": 7.1906643076169736e-06,
"loss": 0.952732264995575,
"step": 488
},
{
"epoch": 1.0337552742616034,
"grad_norm": 0.9213857650756836,
"learning_rate": 7.182713134304431e-06,
"loss": 0.8125715851783752,
"step": 490
},
{
"epoch": 1.0379746835443038,
"grad_norm": 1.2644870281219482,
"learning_rate": 7.174728092152975e-06,
"loss": 1.1190340518951416,
"step": 492
},
{
"epoch": 1.0421940928270041,
"grad_norm": 2.924903392791748,
"learning_rate": 7.1667092783498105e-06,
"loss": 0.8107770085334778,
"step": 494
},
{
"epoch": 1.0464135021097047,
"grad_norm": 1.9948618412017822,
"learning_rate": 7.15865679049318e-06,
"loss": 0.9311866164207458,
"step": 496
},
{
"epoch": 1.0506329113924051,
"grad_norm": 0.8843758702278137,
"learning_rate": 7.150570726591178e-06,
"loss": 1.216412901878357,
"step": 498
},
{
"epoch": 1.0548523206751055,
"grad_norm": 1.182377815246582,
"learning_rate": 7.14245118506056e-06,
"loss": 1.0732381343841553,
"step": 500
},
{
"epoch": 1.0590717299578059,
"grad_norm": 1.3736239671707153,
"learning_rate": 7.134298264725542e-06,
"loss": 1.0816361904144287,
"step": 502
},
{
"epoch": 1.0632911392405062,
"grad_norm": 0.9931637644767761,
"learning_rate": 7.126112064816598e-06,
"loss": 1.20469331741333,
"step": 504
},
{
"epoch": 1.0675105485232068,
"grad_norm": 0.99040687084198,
"learning_rate": 7.117892684969255e-06,
"loss": 0.7193590402603149,
"step": 506
},
{
"epoch": 1.0717299578059072,
"grad_norm": 1.0574781894683838,
"learning_rate": 7.109640225222874e-06,
"loss": 1.030031442642212,
"step": 508
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.9034721255302429,
"learning_rate": 7.101354786019443e-06,
"loss": 1.0760937929153442,
"step": 510
},
{
"epoch": 1.080168776371308,
"grad_norm": 2.8123185634613037,
"learning_rate": 7.0930364682023446e-06,
"loss": 1.0546125173568726,
"step": 512
},
{
"epoch": 1.0843881856540085,
"grad_norm": 1.0355448722839355,
"learning_rate": 7.084685373015131e-06,
"loss": 1.060817003250122,
"step": 514
},
{
"epoch": 1.0886075949367089,
"grad_norm": 0.6953267455101013,
"learning_rate": 7.076301602100294e-06,
"loss": 1.0389786958694458,
"step": 516
},
{
"epoch": 1.0928270042194093,
"grad_norm": 1.1498650312423706,
"learning_rate": 7.067885257498027e-06,
"loss": 0.9518197774887085,
"step": 518
},
{
"epoch": 1.0970464135021096,
"grad_norm": 1.411102533340454,
"learning_rate": 7.059436441644984e-06,
"loss": 0.8960402011871338,
"step": 520
},
{
"epoch": 1.1012658227848102,
"grad_norm": 2.03861927986145,
"learning_rate": 7.0509552573730305e-06,
"loss": 0.8347494602203369,
"step": 522
},
{
"epoch": 1.1054852320675106,
"grad_norm": 1.5810351371765137,
"learning_rate": 7.0424418079079925e-06,
"loss": 0.9857693314552307,
"step": 524
},
{
"epoch": 1.109704641350211,
"grad_norm": 2.000030040740967,
"learning_rate": 7.033896196868403e-06,
"loss": 0.8366687893867493,
"step": 526
},
{
"epoch": 1.1139240506329113,
"grad_norm": 1.2057420015335083,
"learning_rate": 7.025318528264234e-06,
"loss": 1.3332631587982178,
"step": 528
},
{
"epoch": 1.1181434599156117,
"grad_norm": 0.9624310731887817,
"learning_rate": 7.016708906495641e-06,
"loss": 1.2037197351455688,
"step": 530
},
{
"epoch": 1.1223628691983123,
"grad_norm": 1.497033715248108,
"learning_rate": 7.008067436351683e-06,
"loss": 0.6526771783828735,
"step": 532
},
{
"epoch": 1.1265822784810127,
"grad_norm": 2.6542580127716064,
"learning_rate": 6.999394223009052e-06,
"loss": 0.8994975090026855,
"step": 534
},
{
"epoch": 1.130801687763713,
"grad_norm": 0.8712950944900513,
"learning_rate": 6.9906893720307895e-06,
"loss": 1.0523709058761597,
"step": 536
},
{
"epoch": 1.1350210970464134,
"grad_norm": 1.1105852127075195,
"learning_rate": 6.981952989365005e-06,
"loss": 0.8754544854164124,
"step": 538
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.9426999688148499,
"learning_rate": 6.973185181343585e-06,
"loss": 0.7062304019927979,
"step": 540
},
{
"epoch": 1.1434599156118144,
"grad_norm": 0.9875909090042114,
"learning_rate": 6.9643860546809e-06,
"loss": 0.7558496594429016,
"step": 542
},
{
"epoch": 1.1476793248945147,
"grad_norm": 1.1727705001831055,
"learning_rate": 6.955555716472503e-06,
"loss": 0.7818480730056763,
"step": 544
},
{
"epoch": 1.1518987341772151,
"grad_norm": 1.8272697925567627,
"learning_rate": 6.9466942741938275e-06,
"loss": 1.0798598527908325,
"step": 546
},
{
"epoch": 1.1561181434599157,
"grad_norm": 1.3200129270553589,
"learning_rate": 6.93780183569888e-06,
"loss": 1.1049277782440186,
"step": 548
},
{
"epoch": 1.160337552742616,
"grad_norm": 12.516786575317383,
"learning_rate": 6.928878509218929e-06,
"loss": 0.8925328850746155,
"step": 550
},
{
"epoch": 1.1645569620253164,
"grad_norm": 1.5449568033218384,
"learning_rate": 6.919924403361182e-06,
"loss": 1.3479973077774048,
"step": 552
},
{
"epoch": 1.1687763713080168,
"grad_norm": 0.9759451746940613,
"learning_rate": 6.910939627107469e-06,
"loss": 1.0944254398345947,
"step": 554
},
{
"epoch": 1.1729957805907172,
"grad_norm": 3.6769256591796875,
"learning_rate": 6.901924289812913e-06,
"loss": 0.6496379375457764,
"step": 556
},
{
"epoch": 1.1772151898734178,
"grad_norm": 1.0708627700805664,
"learning_rate": 6.892878501204603e-06,
"loss": 0.9399113059043884,
"step": 558
},
{
"epoch": 1.1814345991561181,
"grad_norm": 3.548515558242798,
"learning_rate": 6.883802371380252e-06,
"loss": 0.6334307789802551,
"step": 560
},
{
"epoch": 1.1856540084388185,
"grad_norm": 0.7087482810020447,
"learning_rate": 6.874696010806865e-06,
"loss": 0.6812013983726501,
"step": 562
},
{
"epoch": 1.189873417721519,
"grad_norm": 1.6790183782577515,
"learning_rate": 6.865559530319386e-06,
"loss": 1.2819935083389282,
"step": 564
},
{
"epoch": 1.1940928270042195,
"grad_norm": 2.5965490341186523,
"learning_rate": 6.8563930411193535e-06,
"loss": 1.02937912940979,
"step": 566
},
{
"epoch": 1.1983122362869199,
"grad_norm": 3.7219197750091553,
"learning_rate": 6.847196654773552e-06,
"loss": 0.7903206944465637,
"step": 568
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.9391790628433228,
"learning_rate": 6.837970483212643e-06,
"loss": 1.0360606908798218,
"step": 570
},
{
"epoch": 1.2067510548523206,
"grad_norm": 20.603315353393555,
"learning_rate": 6.828714638729809e-06,
"loss": 1.0591099262237549,
"step": 572
},
{
"epoch": 1.2109704641350212,
"grad_norm": 2.5088610649108887,
"learning_rate": 6.81942923397939e-06,
"loss": 1.0366530418395996,
"step": 574
},
{
"epoch": 1.2151898734177216,
"grad_norm": 0.9826826453208923,
"learning_rate": 6.810114381975507e-06,
"loss": 0.9062384963035583,
"step": 576
},
{
"epoch": 1.219409282700422,
"grad_norm": 1.3147906064987183,
"learning_rate": 6.800770196090688e-06,
"loss": 0.6110230684280396,
"step": 578
},
{
"epoch": 1.2236286919831223,
"grad_norm": 0.8988205194473267,
"learning_rate": 6.791396790054484e-06,
"loss": 0.910240888595581,
"step": 580
},
{
"epoch": 1.2278481012658227,
"grad_norm": 2.201284170150757,
"learning_rate": 6.781994277952099e-06,
"loss": 0.8457823395729065,
"step": 582
},
{
"epoch": 1.2320675105485233,
"grad_norm": 3.1297316551208496,
"learning_rate": 6.7725627742229815e-06,
"loss": 0.8808956146240234,
"step": 584
},
{
"epoch": 1.2362869198312236,
"grad_norm": 5.279428482055664,
"learning_rate": 6.763102393659446e-06,
"loss": 0.9118282198905945,
"step": 586
},
{
"epoch": 1.240506329113924,
"grad_norm": 1.449725866317749,
"learning_rate": 6.753613251405274e-06,
"loss": 0.8038244247436523,
"step": 588
},
{
"epoch": 1.2447257383966246,
"grad_norm": 1.0893408060073853,
"learning_rate": 6.744095462954303e-06,
"loss": 1.065926194190979,
"step": 590
},
{
"epoch": 1.248945147679325,
"grad_norm": 11.18133544921875,
"learning_rate": 6.734549144149036e-06,
"loss": 0.6128525733947754,
"step": 592
},
{
"epoch": 1.2531645569620253,
"grad_norm": 0.5239539742469788,
"learning_rate": 6.724974411179218e-06,
"loss": 0.8248177766799927,
"step": 594
},
{
"epoch": 1.2573839662447257,
"grad_norm": 3.62746000289917,
"learning_rate": 6.7153713805804285e-06,
"loss": 0.6825551986694336,
"step": 596
},
{
"epoch": 1.261603375527426,
"grad_norm": 3.413501501083374,
"learning_rate": 6.7057401692326625e-06,
"loss": 0.567305862903595,
"step": 598
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.6996157169342041,
"learning_rate": 6.696080894358908e-06,
"loss": 0.8849403262138367,
"step": 600
},
{
"epoch": 1.270042194092827,
"grad_norm": 6.248124599456787,
"learning_rate": 6.686393673523715e-06,
"loss": 1.3093706369400024,
"step": 602
},
{
"epoch": 1.2742616033755274,
"grad_norm": 0.9306197166442871,
"learning_rate": 6.6766786246317726e-06,
"loss": 1.0244123935699463,
"step": 604
},
{
"epoch": 1.2784810126582278,
"grad_norm": 2.1768555641174316,
"learning_rate": 6.666935865926468e-06,
"loss": 0.8419608473777771,
"step": 606
},
{
"epoch": 1.2827004219409281,
"grad_norm": 1.509337306022644,
"learning_rate": 6.65716551598845e-06,
"loss": 0.8019965291023254,
"step": 608
},
{
"epoch": 1.2869198312236287,
"grad_norm": 5.7914323806762695,
"learning_rate": 6.647367693734181e-06,
"loss": 0.8274118900299072,
"step": 610
},
{
"epoch": 1.2911392405063291,
"grad_norm": 3.3554115295410156,
"learning_rate": 6.637542518414495e-06,
"loss": 0.5377339124679565,
"step": 612
},
{
"epoch": 1.2953586497890295,
"grad_norm": 0.9977070093154907,
"learning_rate": 6.627690109613147e-06,
"loss": 0.6412088871002197,
"step": 614
},
{
"epoch": 1.29957805907173,
"grad_norm": 5.793771743774414,
"learning_rate": 6.617810587245352e-06,
"loss": 1.0477070808410645,
"step": 616
},
{
"epoch": 1.3037974683544304,
"grad_norm": 1.5624624490737915,
"learning_rate": 6.607904071556331e-06,
"loss": 1.0696133375167847,
"step": 618
},
{
"epoch": 1.3080168776371308,
"grad_norm": 0.9112898111343384,
"learning_rate": 6.597970683119841e-06,
"loss": 0.6664775609970093,
"step": 620
},
{
"epoch": 1.3122362869198312,
"grad_norm": 1.806433081626892,
"learning_rate": 6.588010542836715e-06,
"loss": 0.7590267062187195,
"step": 622
},
{
"epoch": 1.3164556962025316,
"grad_norm": 1.8049041032791138,
"learning_rate": 6.578023771933387e-06,
"loss": 0.8482476472854614,
"step": 624
},
{
"epoch": 1.3206751054852321,
"grad_norm": 0.6221198439598083,
"learning_rate": 6.568010491960412e-06,
"loss": 1.0443530082702637,
"step": 626
},
{
"epoch": 1.3248945147679325,
"grad_norm": 1.562410593032837,
"learning_rate": 6.557970824790997e-06,
"loss": 1.539845585823059,
"step": 628
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.906804084777832,
"learning_rate": 6.5479048926195106e-06,
"loss": 0.9005885124206543,
"step": 630
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.909907341003418,
"learning_rate": 6.53781281795999e-06,
"loss": 0.8963803648948669,
"step": 632
},
{
"epoch": 1.3375527426160336,
"grad_norm": 0.4755572974681854,
"learning_rate": 6.527694723644668e-06,
"loss": 0.766118049621582,
"step": 634
},
{
"epoch": 1.3417721518987342,
"grad_norm": 1.1491714715957642,
"learning_rate": 6.517550732822457e-06,
"loss": 0.611838161945343,
"step": 636
},
{
"epoch": 1.3459915611814346,
"grad_norm": 2.129890203475952,
"learning_rate": 6.507380968957463e-06,
"loss": 0.7923972606658936,
"step": 638
},
{
"epoch": 1.350210970464135,
"grad_norm": 0.800757646560669,
"learning_rate": 6.497185555827484e-06,
"loss": 1.1963096857070923,
"step": 640
},
{
"epoch": 1.3544303797468356,
"grad_norm": 1.0547887086868286,
"learning_rate": 6.486964617522494e-06,
"loss": 0.7548023462295532,
"step": 642
},
{
"epoch": 1.358649789029536,
"grad_norm": 1.0978026390075684,
"learning_rate": 6.476718278443137e-06,
"loss": 1.230237603187561,
"step": 644
},
{
"epoch": 1.3628691983122363,
"grad_norm": 0.8420884609222412,
"learning_rate": 6.4664466632992195e-06,
"loss": 1.0555733442306519,
"step": 646
},
{
"epoch": 1.3670886075949367,
"grad_norm": 13.5333890914917,
"learning_rate": 6.456149897108182e-06,
"loss": 0.8676448464393616,
"step": 648
},
{
"epoch": 1.371308016877637,
"grad_norm": 0.9475505352020264,
"learning_rate": 6.445828105193586e-06,
"loss": 1.2682842016220093,
"step": 650
},
{
"epoch": 1.3755274261603376,
"grad_norm": 0.9018502831459045,
"learning_rate": 6.4354814131835815e-06,
"loss": 1.0565340518951416,
"step": 652
},
{
"epoch": 1.379746835443038,
"grad_norm": 0.54316645860672,
"learning_rate": 6.425109947009384e-06,
"loss": 0.7839528322219849,
"step": 654
},
{
"epoch": 1.3839662447257384,
"grad_norm": 0.874775767326355,
"learning_rate": 6.414713832903737e-06,
"loss": 1.1698050498962402,
"step": 656
},
{
"epoch": 1.3881856540084387,
"grad_norm": 4.575730323791504,
"learning_rate": 6.404293197399381e-06,
"loss": 0.5863835215568542,
"step": 658
},
{
"epoch": 1.3924050632911391,
"grad_norm": 0.7375707030296326,
"learning_rate": 6.393848167327507e-06,
"loss": 1.086789608001709,
"step": 660
},
{
"epoch": 1.3966244725738397,
"grad_norm": 2.6595211029052734,
"learning_rate": 6.3833788698162205e-06,
"loss": 0.7023826241493225,
"step": 662
},
{
"epoch": 1.40084388185654,
"grad_norm": 1.631126046180725,
"learning_rate": 6.372885432288982e-06,
"loss": 1.0789552927017212,
"step": 664
},
{
"epoch": 1.4050632911392404,
"grad_norm": 0.8146964907646179,
"learning_rate": 6.362367982463073e-06,
"loss": 0.6926907300949097,
"step": 666
},
{
"epoch": 1.409282700421941,
"grad_norm": 0.9691137075424194,
"learning_rate": 6.351826648348027e-06,
"loss": 1.0910325050354004,
"step": 668
},
{
"epoch": 1.4135021097046414,
"grad_norm": 0.9395283460617065,
"learning_rate": 6.341261558244079e-06,
"loss": 1.0995792150497437,
"step": 670
},
{
"epoch": 1.4177215189873418,
"grad_norm": 1.9160293340682983,
"learning_rate": 6.3306728407406015e-06,
"loss": 1.3626757860183716,
"step": 672
},
{
"epoch": 1.4219409282700421,
"grad_norm": 1.089788556098938,
"learning_rate": 6.320060624714535e-06,
"loss": 0.6344588994979858,
"step": 674
},
{
"epoch": 1.4261603375527425,
"grad_norm": 0.6872738003730774,
"learning_rate": 6.309425039328834e-06,
"loss": 0.5873957872390747,
"step": 676
},
{
"epoch": 1.4303797468354431,
"grad_norm": 2.0739026069641113,
"learning_rate": 6.298766214030878e-06,
"loss": 0.9192869067192078,
"step": 678
},
{
"epoch": 1.4345991561181435,
"grad_norm": 1.781925082206726,
"learning_rate": 6.288084278550905e-06,
"loss": 1.0239968299865723,
"step": 680
},
{
"epoch": 1.4388185654008439,
"grad_norm": 1.0395771265029907,
"learning_rate": 6.2773793629004305e-06,
"loss": 0.7735893726348877,
"step": 682
},
{
"epoch": 1.4430379746835442,
"grad_norm": 2.4246017932891846,
"learning_rate": 6.2666515973706635e-06,
"loss": 1.135629415512085,
"step": 684
},
{
"epoch": 1.4472573839662446,
"grad_norm": 0.7276327013969421,
"learning_rate": 6.255901112530928e-06,
"loss": 0.7381588816642761,
"step": 686
},
{
"epoch": 1.4514767932489452,
"grad_norm": 1.8231748342514038,
"learning_rate": 6.245128039227063e-06,
"loss": 0.8623338341712952,
"step": 688
},
{
"epoch": 1.4556962025316456,
"grad_norm": 2.5722429752349854,
"learning_rate": 6.234332508579835e-06,
"loss": 1.0199339389801025,
"step": 690
},
{
"epoch": 1.459915611814346,
"grad_norm": 1.6927199363708496,
"learning_rate": 6.2235146519833465e-06,
"loss": 0.5960026383399963,
"step": 692
},
{
"epoch": 1.4641350210970465,
"grad_norm": 1.3126322031021118,
"learning_rate": 6.21267460110343e-06,
"loss": 1.2402201890945435,
"step": 694
},
{
"epoch": 1.4683544303797469,
"grad_norm": 2.016200304031372,
"learning_rate": 6.201812487876048e-06,
"loss": 0.5972878932952881,
"step": 696
},
{
"epoch": 1.4725738396624473,
"grad_norm": 0.913316011428833,
"learning_rate": 6.1909284445056886e-06,
"loss": 1.0932003259658813,
"step": 698
},
{
"epoch": 1.4767932489451476,
"grad_norm": 0.6828026175498962,
"learning_rate": 6.1800226034637514e-06,
"loss": 1.1358331441879272,
"step": 700
},
{
"epoch": 1.481012658227848,
"grad_norm": 2.209768295288086,
"learning_rate": 6.169095097486947e-06,
"loss": 0.9575251340866089,
"step": 702
},
{
"epoch": 1.4852320675105486,
"grad_norm": 1.6049399375915527,
"learning_rate": 6.158146059575663e-06,
"loss": 0.7674723863601685,
"step": 704
},
{
"epoch": 1.489451476793249,
"grad_norm": 3.575786828994751,
"learning_rate": 6.147175622992363e-06,
"loss": 1.086501121520996,
"step": 706
},
{
"epoch": 1.4936708860759493,
"grad_norm": 0.9102724194526672,
"learning_rate": 6.136183921259956e-06,
"loss": 1.1395413875579834,
"step": 708
},
{
"epoch": 1.49789029535865,
"grad_norm": 0.5558487176895142,
"learning_rate": 6.125171088160168e-06,
"loss": 0.9195235371589661,
"step": 710
},
{
"epoch": 1.50210970464135,
"grad_norm": 0.9477531313896179,
"learning_rate": 6.114137257731925e-06,
"loss": 0.4785539209842682,
"step": 712
},
{
"epoch": 1.5063291139240507,
"grad_norm": 0.8040223717689514,
"learning_rate": 6.10308256426971e-06,
"loss": 1.0396082401275635,
"step": 714
},
{
"epoch": 1.510548523206751,
"grad_norm": 0.9315183162689209,
"learning_rate": 6.092007142321932e-06,
"loss": 1.043006181716919,
"step": 716
},
{
"epoch": 1.5147679324894514,
"grad_norm": 0.7686951756477356,
"learning_rate": 6.080911126689296e-06,
"loss": 1.0344305038452148,
"step": 718
},
{
"epoch": 1.518987341772152,
"grad_norm": 2.7792484760284424,
"learning_rate": 6.069794652423152e-06,
"loss": 1.009570598602295,
"step": 720
},
{
"epoch": 1.5232067510548524,
"grad_norm": 2.2277615070343018,
"learning_rate": 6.058657854823854e-06,
"loss": 1.0374475717544556,
"step": 722
},
{
"epoch": 1.5274261603375527,
"grad_norm": 1.6987587213516235,
"learning_rate": 6.047500869439114e-06,
"loss": 1.1916974782943726,
"step": 724
},
{
"epoch": 1.5316455696202531,
"grad_norm": 4.649923801422119,
"learning_rate": 6.036323832062359e-06,
"loss": 0.5684564113616943,
"step": 726
},
{
"epoch": 1.5358649789029535,
"grad_norm": 2.5940101146698,
"learning_rate": 6.025126878731064e-06,
"loss": 0.3716410994529724,
"step": 728
},
{
"epoch": 1.540084388185654,
"grad_norm": 0.8527175784111023,
"learning_rate": 6.013910145725112e-06,
"loss": 0.8164302706718445,
"step": 730
},
{
"epoch": 1.5443037974683544,
"grad_norm": 0.9312422275543213,
"learning_rate": 6.002673769565118e-06,
"loss": 0.9368805885314941,
"step": 732
},
{
"epoch": 1.5485232067510548,
"grad_norm": 0.9247412085533142,
"learning_rate": 5.991417887010786e-06,
"loss": 1.1238614320755005,
"step": 734
},
{
"epoch": 1.5527426160337554,
"grad_norm": 0.723078727722168,
"learning_rate": 5.98014263505923e-06,
"loss": 0.8048302531242371,
"step": 736
},
{
"epoch": 1.5569620253164556,
"grad_norm": 1.92336106300354,
"learning_rate": 5.968848150943314e-06,
"loss": 0.8754326105117798,
"step": 738
},
{
"epoch": 1.5611814345991561,
"grad_norm": 1.0468974113464355,
"learning_rate": 5.957534572129979e-06,
"loss": 0.9829418659210205,
"step": 740
},
{
"epoch": 1.5654008438818565,
"grad_norm": 0.782278835773468,
"learning_rate": 5.946202036318572e-06,
"loss": 0.6887242197990417,
"step": 742
},
{
"epoch": 1.5696202531645569,
"grad_norm": 1.8223977088928223,
"learning_rate": 5.934850681439166e-06,
"loss": 0.5122029185295105,
"step": 744
},
{
"epoch": 1.5738396624472575,
"grad_norm": 1.1448414325714111,
"learning_rate": 5.923480645650887e-06,
"loss": 0.6803614497184753,
"step": 746
},
{
"epoch": 1.5780590717299579,
"grad_norm": 4.745306491851807,
"learning_rate": 5.912092067340226e-06,
"loss": 0.6753883361816406,
"step": 748
},
{
"epoch": 1.5822784810126582,
"grad_norm": 0.8308981657028198,
"learning_rate": 5.900685085119361e-06,
"loss": 1.0774937868118286,
"step": 750
},
{
"epoch": 1.5864978902953588,
"grad_norm": 1.0071589946746826,
"learning_rate": 5.889259837824464e-06,
"loss": 0.5942963361740112,
"step": 752
},
{
"epoch": 1.590717299578059,
"grad_norm": 0.7977795600891113,
"learning_rate": 5.8778164645140155e-06,
"loss": 0.644191563129425,
"step": 754
},
{
"epoch": 1.5949367088607596,
"grad_norm": 0.7821984887123108,
"learning_rate": 5.8663551044671125e-06,
"loss": 0.601950466632843,
"step": 756
},
{
"epoch": 1.59915611814346,
"grad_norm": 1.1435626745224,
"learning_rate": 5.854875897181766e-06,
"loss": 0.8324768543243408,
"step": 758
},
{
"epoch": 1.6033755274261603,
"grad_norm": 0.794941246509552,
"learning_rate": 5.843378982373218e-06,
"loss": 1.0321424007415771,
"step": 760
},
{
"epoch": 1.6075949367088609,
"grad_norm": 0.4165087938308716,
"learning_rate": 5.8318644999722194e-06,
"loss": 0.6179360747337341,
"step": 762
},
{
"epoch": 1.611814345991561,
"grad_norm": 1.0744069814682007,
"learning_rate": 5.820332590123348e-06,
"loss": 1.0869427919387817,
"step": 764
},
{
"epoch": 1.6160337552742616,
"grad_norm": 2.4255731105804443,
"learning_rate": 5.80878339318329e-06,
"loss": 0.9976139664649963,
"step": 766
},
{
"epoch": 1.620253164556962,
"grad_norm": 0.3937893807888031,
"learning_rate": 5.797217049719138e-06,
"loss": 0.8773806095123291,
"step": 768
},
{
"epoch": 1.6244725738396624,
"grad_norm": 1.528141975402832,
"learning_rate": 5.785633700506676e-06,
"loss": 1.0529608726501465,
"step": 770
},
{
"epoch": 1.628691983122363,
"grad_norm": 1.1761523485183716,
"learning_rate": 5.774033486528666e-06,
"loss": 1.1696523427963257,
"step": 772
},
{
"epoch": 1.6329113924050633,
"grad_norm": 0.6605724096298218,
"learning_rate": 5.762416548973137e-06,
"loss": 1.06764554977417,
"step": 774
},
{
"epoch": 1.6371308016877637,
"grad_norm": 1.0275272130966187,
"learning_rate": 5.750783029231662e-06,
"loss": 1.0699821710586548,
"step": 776
},
{
"epoch": 1.6413502109704643,
"grad_norm": 0.9171205759048462,
"learning_rate": 5.739133068897638e-06,
"loss": 0.7903687953948975,
"step": 778
},
{
"epoch": 1.6455696202531644,
"grad_norm": 2.0880374908447266,
"learning_rate": 5.727466809764562e-06,
"loss": 0.372045636177063,
"step": 780
},
{
"epoch": 1.649789029535865,
"grad_norm": 3.6843972206115723,
"learning_rate": 5.715784393824309e-06,
"loss": 1.0749914646148682,
"step": 782
},
{
"epoch": 1.6540084388185654,
"grad_norm": 1.0832284688949585,
"learning_rate": 5.7040859632653985e-06,
"loss": 0.9234107136726379,
"step": 784
},
{
"epoch": 1.6582278481012658,
"grad_norm": 1.366377353668213,
"learning_rate": 5.692371660471269e-06,
"loss": 1.0691020488739014,
"step": 786
},
{
"epoch": 1.6624472573839664,
"grad_norm": 1.1804683208465576,
"learning_rate": 5.680641628018539e-06,
"loss": 0.5163772702217102,
"step": 788
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.0049868822097778,
"learning_rate": 5.6688960086752775e-06,
"loss": 1.0653210878372192,
"step": 790
},
{
"epoch": 1.6708860759493671,
"grad_norm": 2.6457467079162598,
"learning_rate": 5.657134945399265e-06,
"loss": 0.6419547200202942,
"step": 792
},
{
"epoch": 1.6751054852320675,
"grad_norm": 0.2932789921760559,
"learning_rate": 5.645358581336249e-06,
"loss": 0.8718560338020325,
"step": 794
},
{
"epoch": 1.6793248945147679,
"grad_norm": 0.8630058169364929,
"learning_rate": 5.633567059818208e-06,
"loss": 1.0517830848693848,
"step": 796
},
{
"epoch": 1.6835443037974684,
"grad_norm": 4.652339935302734,
"learning_rate": 5.621760524361605e-06,
"loss": 0.8228880167007446,
"step": 798
},
{
"epoch": 1.6877637130801688,
"grad_norm": 1.278536081314087,
"learning_rate": 5.6099391186656375e-06,
"loss": 1.1810134649276733,
"step": 800
},
{
"epoch": 1.6919831223628692,
"grad_norm": 5.43027925491333,
"learning_rate": 5.598102986610493e-06,
"loss": 0.5597525238990784,
"step": 802
},
{
"epoch": 1.6962025316455698,
"grad_norm": 4.56681489944458,
"learning_rate": 5.586252272255595e-06,
"loss": 1.1962707042694092,
"step": 804
},
{
"epoch": 1.70042194092827,
"grad_norm": 0.887153148651123,
"learning_rate": 5.574387119837848e-06,
"loss": 1.0536723136901855,
"step": 806
},
{
"epoch": 1.7046413502109705,
"grad_norm": 8.7942533493042,
"learning_rate": 5.562507673769889e-06,
"loss": 0.7714130282402039,
"step": 808
},
{
"epoch": 1.7088607594936709,
"grad_norm": 2.3065688610076904,
"learning_rate": 5.550614078638324e-06,
"loss": 0.8722562193870544,
"step": 810
},
{
"epoch": 1.7130801687763713,
"grad_norm": 1.0666223764419556,
"learning_rate": 5.5387064792019686e-06,
"loss": 1.1128357648849487,
"step": 812
},
{
"epoch": 1.7172995780590719,
"grad_norm": 3.0997695922851562,
"learning_rate": 5.526785020390084e-06,
"loss": 1.4282304048538208,
"step": 814
},
{
"epoch": 1.721518987341772,
"grad_norm": 1.0092432498931885,
"learning_rate": 5.514849847300622e-06,
"loss": 1.1036298274993896,
"step": 816
},
{
"epoch": 1.7257383966244726,
"grad_norm": 3.5967748165130615,
"learning_rate": 5.502901105198449e-06,
"loss": 0.7901860475540161,
"step": 818
},
{
"epoch": 1.729957805907173,
"grad_norm": 2.40335750579834,
"learning_rate": 5.490938939513584e-06,
"loss": 0.3646574020385742,
"step": 820
},
{
"epoch": 1.7341772151898733,
"grad_norm": 3.5133466720581055,
"learning_rate": 5.478963495839425e-06,
"loss": 0.5445467233657837,
"step": 822
},
{
"epoch": 1.738396624472574,
"grad_norm": 0.9658949375152588,
"learning_rate": 5.466974919930979e-06,
"loss": 0.7141355276107788,
"step": 824
},
{
"epoch": 1.7426160337552743,
"grad_norm": 1.7418462038040161,
"learning_rate": 5.454973357703087e-06,
"loss": 0.8929092884063721,
"step": 826
},
{
"epoch": 1.7468354430379747,
"grad_norm": 8.529016494750977,
"learning_rate": 5.442958955228649e-06,
"loss": 0.9267692565917969,
"step": 828
},
{
"epoch": 1.7510548523206753,
"grad_norm": 0.9777578115463257,
"learning_rate": 5.430931858736848e-06,
"loss": 1.0351005792617798,
"step": 830
},
{
"epoch": 1.7552742616033754,
"grad_norm": 1.306839108467102,
"learning_rate": 5.418892214611364e-06,
"loss": 1.0336472988128662,
"step": 832
},
{
"epoch": 1.759493670886076,
"grad_norm": 0.9158060550689697,
"learning_rate": 5.406840169388598e-06,
"loss": 0.8349417448043823,
"step": 834
},
{
"epoch": 1.7637130801687764,
"grad_norm": 4.128747940063477,
"learning_rate": 5.394775869755888e-06,
"loss": 1.078331470489502,
"step": 836
},
{
"epoch": 1.7679324894514767,
"grad_norm": 2.023729085922241,
"learning_rate": 5.3826994625497186e-06,
"loss": 0.8993400931358337,
"step": 838
},
{
"epoch": 1.7721518987341773,
"grad_norm": 1.9145705699920654,
"learning_rate": 5.370611094753943e-06,
"loss": 0.756892740726471,
"step": 840
},
{
"epoch": 1.7763713080168775,
"grad_norm": 4.3195695877075195,
"learning_rate": 5.358510913497981e-06,
"loss": 0.8908122777938843,
"step": 842
},
{
"epoch": 1.780590717299578,
"grad_norm": 0.7751283645629883,
"learning_rate": 5.346399066055044e-06,
"loss": 0.4248788058757782,
"step": 844
},
{
"epoch": 1.7848101265822784,
"grad_norm": 0.7409003973007202,
"learning_rate": 5.33427569984033e-06,
"loss": 0.650154173374176,
"step": 846
},
{
"epoch": 1.7890295358649788,
"grad_norm": 1.8226172924041748,
"learning_rate": 5.322140962409236e-06,
"loss": 0.59881591796875,
"step": 848
},
{
"epoch": 1.7932489451476794,
"grad_norm": 1.4619311094284058,
"learning_rate": 5.3099950014555554e-06,
"loss": 0.7507359981536865,
"step": 850
},
{
"epoch": 1.7974683544303798,
"grad_norm": 1.0151058435440063,
"learning_rate": 5.29783796480969e-06,
"loss": 1.127907633781433,
"step": 852
},
{
"epoch": 1.8016877637130801,
"grad_norm": 2.056638240814209,
"learning_rate": 5.2856700004368425e-06,
"loss": 1.3744020462036133,
"step": 854
},
{
"epoch": 1.8059071729957807,
"grad_norm": 0.30007457733154297,
"learning_rate": 5.273491256435222e-06,
"loss": 0.8465395569801331,
"step": 856
},
{
"epoch": 1.810126582278481,
"grad_norm": 2.211362361907959,
"learning_rate": 5.2613018810342314e-06,
"loss": 0.9668091535568237,
"step": 858
},
{
"epoch": 1.8143459915611815,
"grad_norm": 0.9358858466148376,
"learning_rate": 5.24910202259268e-06,
"loss": 0.664305567741394,
"step": 860
},
{
"epoch": 1.8185654008438819,
"grad_norm": 1.0162758827209473,
"learning_rate": 5.236891829596958e-06,
"loss": 1.0983484983444214,
"step": 862
},
{
"epoch": 1.8227848101265822,
"grad_norm": 0.8416312336921692,
"learning_rate": 5.2246714506592454e-06,
"loss": 0.9112118482589722,
"step": 864
},
{
"epoch": 1.8270042194092828,
"grad_norm": 0.833525538444519,
"learning_rate": 5.212441034515695e-06,
"loss": 0.9819576740264893,
"step": 866
},
{
"epoch": 1.831223628691983,
"grad_norm": 21.00221824645996,
"learning_rate": 5.200200730024622e-06,
"loss": 0.9238821268081665,
"step": 868
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.7736468315124512,
"learning_rate": 5.187950686164699e-06,
"loss": 0.9560548663139343,
"step": 870
},
{
"epoch": 1.839662447257384,
"grad_norm": 2.4590296745300293,
"learning_rate": 5.175691052033133e-06,
"loss": 0.7176443338394165,
"step": 872
},
{
"epoch": 1.8438818565400843,
"grad_norm": 14.438817977905273,
"learning_rate": 5.163421976843859e-06,
"loss": 0.9139724373817444,
"step": 874
},
{
"epoch": 1.8481012658227849,
"grad_norm": 0.7402142286300659,
"learning_rate": 5.151143609925718e-06,
"loss": 1.0629327297210693,
"step": 876
},
{
"epoch": 1.8523206751054853,
"grad_norm": 2.3395838737487793,
"learning_rate": 5.138856100720645e-06,
"loss": 0.7460686564445496,
"step": 878
},
{
"epoch": 1.8565400843881856,
"grad_norm": 1.0752966403961182,
"learning_rate": 5.126559598781845e-06,
"loss": 0.6765896677970886,
"step": 880
},
{
"epoch": 1.8607594936708862,
"grad_norm": 1.11525559425354,
"learning_rate": 5.114254253771977e-06,
"loss": 0.8317286968231201,
"step": 882
},
{
"epoch": 1.8649789029535864,
"grad_norm": 1.3351662158966064,
"learning_rate": 5.1019402154613264e-06,
"loss": 0.6764845252037048,
"step": 884
},
{
"epoch": 1.869198312236287,
"grad_norm": 1.5033775568008423,
"learning_rate": 5.089617633725992e-06,
"loss": 0.7203776240348816,
"step": 886
},
{
"epoch": 1.8734177215189873,
"grad_norm": 0.8415127992630005,
"learning_rate": 5.07728665854605e-06,
"loss": 1.027212142944336,
"step": 888
},
{
"epoch": 1.8776371308016877,
"grad_norm": 6.813464641571045,
"learning_rate": 5.064947440003741e-06,
"loss": 0.3982529640197754,
"step": 890
},
{
"epoch": 1.8818565400843883,
"grad_norm": 1.178280234336853,
"learning_rate": 5.0526001282816285e-06,
"loss": 0.6589434146881104,
"step": 892
},
{
"epoch": 1.8860759493670884,
"grad_norm": 1.1725980043411255,
"learning_rate": 5.0402448736607874e-06,
"loss": 1.087322473526001,
"step": 894
},
{
"epoch": 1.890295358649789,
"grad_norm": 0.9317290782928467,
"learning_rate": 5.027881826518963e-06,
"loss": 1.2050056457519531,
"step": 896
},
{
"epoch": 1.8945147679324894,
"grad_norm": 3.785717010498047,
"learning_rate": 5.015511137328743e-06,
"loss": 0.7949274182319641,
"step": 898
},
{
"epoch": 1.8987341772151898,
"grad_norm": 1.9153603315353394,
"learning_rate": 5.003132956655735e-06,
"loss": 0.9485737085342407,
"step": 900
},
{
"epoch": 1.9029535864978904,
"grad_norm": 1.1202062368392944,
"learning_rate": 4.990747435156715e-06,
"loss": 1.1542925834655762,
"step": 902
},
{
"epoch": 1.9071729957805907,
"grad_norm": 5.2798967361450195,
"learning_rate": 4.978354723577818e-06,
"loss": 0.9438016414642334,
"step": 904
},
{
"epoch": 1.9113924050632911,
"grad_norm": 1.9839028120040894,
"learning_rate": 4.965954972752677e-06,
"loss": 1.31730055809021,
"step": 906
},
{
"epoch": 1.9156118143459917,
"grad_norm": 3.2454068660736084,
"learning_rate": 4.953548333600616e-06,
"loss": 0.43834638595581055,
"step": 908
},
{
"epoch": 1.9198312236286919,
"grad_norm": 0.7044057250022888,
"learning_rate": 4.9411349571247845e-06,
"loss": 1.0278995037078857,
"step": 910
},
{
"epoch": 1.9240506329113924,
"grad_norm": 4.6270246505737305,
"learning_rate": 4.928714994410341e-06,
"loss": 0.7902883887290955,
"step": 912
},
{
"epoch": 1.9282700421940928,
"grad_norm": 1.1006572246551514,
"learning_rate": 4.9162885966226035e-06,
"loss": 1.0976777076721191,
"step": 914
},
{
"epoch": 1.9324894514767932,
"grad_norm": 1.0053693056106567,
"learning_rate": 4.903855915005212e-06,
"loss": 0.7121254205703735,
"step": 916
},
{
"epoch": 1.9367088607594938,
"grad_norm": 0.7003014087677002,
"learning_rate": 4.8914171008782885e-06,
"loss": 1.054925560951233,
"step": 918
},
{
"epoch": 1.9409282700421941,
"grad_norm": 1.9592584371566772,
"learning_rate": 4.878972305636595e-06,
"loss": 0.46024253964424133,
"step": 920
},
{
"epoch": 1.9451476793248945,
"grad_norm": 0.7745406627655029,
"learning_rate": 4.86652168074769e-06,
"loss": 1.0552338361740112,
"step": 922
},
{
"epoch": 1.9493670886075949,
"grad_norm": 1.3688344955444336,
"learning_rate": 4.8540653777500865e-06,
"loss": 1.0862473249435425,
"step": 924
},
{
"epoch": 1.9535864978902953,
"grad_norm": 2.7827236652374268,
"learning_rate": 4.841603548251406e-06,
"loss": 0.8950420022010803,
"step": 926
},
{
"epoch": 1.9578059071729959,
"grad_norm": 0.7784646153450012,
"learning_rate": 4.829136343926532e-06,
"loss": 0.720669686794281,
"step": 928
},
{
"epoch": 1.9620253164556962,
"grad_norm": 1.3589555025100708,
"learning_rate": 4.816663916515772e-06,
"loss": 1.1043243408203125,
"step": 930
},
{
"epoch": 1.9662447257383966,
"grad_norm": 3.5423665046691895,
"learning_rate": 4.804186417822995e-06,
"loss": 0.870411217212677,
"step": 932
},
{
"epoch": 1.9704641350210972,
"grad_norm": 0.7861785292625427,
"learning_rate": 4.791703999713803e-06,
"loss": 1.1790004968643188,
"step": 934
},
{
"epoch": 1.9746835443037973,
"grad_norm": 0.7902828454971313,
"learning_rate": 4.779216814113667e-06,
"loss": 1.103920340538025,
"step": 936
},
{
"epoch": 1.978902953586498,
"grad_norm": 0.6530998945236206,
"learning_rate": 4.766725013006085e-06,
"loss": 0.7225710153579712,
"step": 938
},
{
"epoch": 1.9831223628691983,
"grad_norm": 1.588179588317871,
"learning_rate": 4.754228748430731e-06,
"loss": 1.0604408979415894,
"step": 940
},
{
"epoch": 1.9873417721518987,
"grad_norm": 1.7913926839828491,
"learning_rate": 4.741728172481607e-06,
"loss": 0.8651899099349976,
"step": 942
},
{
"epoch": 1.9915611814345993,
"grad_norm": 0.891537070274353,
"learning_rate": 4.729223437305187e-06,
"loss": 0.6996287107467651,
"step": 944
},
{
"epoch": 1.9957805907172996,
"grad_norm": 1.4173448085784912,
"learning_rate": 4.716714695098568e-06,
"loss": 1.0344507694244385,
"step": 946
},
{
"epoch": 2.0,
"grad_norm": 3.7296454906463623,
"learning_rate": 4.7042020981076185e-06,
"loss": 0.5512294173240662,
"step": 948
},
{
"epoch": 2.0042194092827006,
"grad_norm": 2.249424457550049,
"learning_rate": 4.69168579862512e-06,
"loss": 0.8092342615127563,
"step": 950
},
{
"epoch": 2.0084388185654007,
"grad_norm": 2.6464383602142334,
"learning_rate": 4.679165948988924e-06,
"loss": 0.47413283586502075,
"step": 952
},
{
"epoch": 2.0126582278481013,
"grad_norm": 1.5369104146957397,
"learning_rate": 4.666642701580086e-06,
"loss": 0.7702062129974365,
"step": 954
},
{
"epoch": 2.0168776371308015,
"grad_norm": 1.0920283794403076,
"learning_rate": 4.65411620882102e-06,
"loss": 0.8473414182662964,
"step": 956
},
{
"epoch": 2.021097046413502,
"grad_norm": 20.295406341552734,
"learning_rate": 4.6415866231736375e-06,
"loss": 0.6457698345184326,
"step": 958
},
{
"epoch": 2.0253164556962027,
"grad_norm": 3.8915340900421143,
"learning_rate": 4.629054097137493e-06,
"loss": 0.7031627893447876,
"step": 960
},
{
"epoch": 2.029535864978903,
"grad_norm": 1.0874841213226318,
"learning_rate": 4.616518783247934e-06,
"loss": 1.0022499561309814,
"step": 962
},
{
"epoch": 2.0337552742616034,
"grad_norm": 5.714715480804443,
"learning_rate": 4.603980834074232e-06,
"loss": 0.7056564688682556,
"step": 964
},
{
"epoch": 2.037974683544304,
"grad_norm": 0.8951921463012695,
"learning_rate": 4.591440402217741e-06,
"loss": 0.5630991458892822,
"step": 966
},
{
"epoch": 2.042194092827004,
"grad_norm": 4.608378887176514,
"learning_rate": 4.578897640310025e-06,
"loss": 0.6585802435874939,
"step": 968
},
{
"epoch": 2.0464135021097047,
"grad_norm": 1.6705124378204346,
"learning_rate": 4.566352701011013e-06,
"loss": 0.9024470448493958,
"step": 970
},
{
"epoch": 2.050632911392405,
"grad_norm": 2.591546058654785,
"learning_rate": 4.5538057370071315e-06,
"loss": 0.7236870527267456,
"step": 972
},
{
"epoch": 2.0548523206751055,
"grad_norm": 1.0205042362213135,
"learning_rate": 4.541256901009451e-06,
"loss": 0.7728800177574158,
"step": 974
},
{
"epoch": 2.059071729957806,
"grad_norm": 2.32804799079895,
"learning_rate": 4.528706345751826e-06,
"loss": 0.6220592856407166,
"step": 976
},
{
"epoch": 2.0632911392405062,
"grad_norm": 0.9847302436828613,
"learning_rate": 4.516154223989039e-06,
"loss": 0.6414508819580078,
"step": 978
},
{
"epoch": 2.067510548523207,
"grad_norm": 1.0494519472122192,
"learning_rate": 4.503600688494938e-06,
"loss": 0.5687150359153748,
"step": 980
},
{
"epoch": 2.071729957805907,
"grad_norm": 1.0996086597442627,
"learning_rate": 4.491045892060573e-06,
"loss": 0.9595503211021423,
"step": 982
},
{
"epoch": 2.0759493670886076,
"grad_norm": 1.6307997703552246,
"learning_rate": 4.478489987492346e-06,
"loss": 0.8499625325202942,
"step": 984
},
{
"epoch": 2.080168776371308,
"grad_norm": 1.1343793869018555,
"learning_rate": 4.465933127610145e-06,
"loss": 0.8802004456520081,
"step": 986
},
{
"epoch": 2.0843881856540083,
"grad_norm": 0.8233914375305176,
"learning_rate": 4.453375465245486e-06,
"loss": 0.8876461982727051,
"step": 988
},
{
"epoch": 2.088607594936709,
"grad_norm": 3.605290651321411,
"learning_rate": 4.44081715323965e-06,
"loss": 0.47245436906814575,
"step": 990
},
{
"epoch": 2.0928270042194095,
"grad_norm": 1.4245373010635376,
"learning_rate": 4.428258344441826e-06,
"loss": 0.4930482804775238,
"step": 992
},
{
"epoch": 2.0970464135021096,
"grad_norm": 1.0939189195632935,
"learning_rate": 4.415699191707251e-06,
"loss": 0.9832253456115723,
"step": 994
},
{
"epoch": 2.1012658227848102,
"grad_norm": 1.3786028623580933,
"learning_rate": 4.403139847895348e-06,
"loss": 0.8831475377082825,
"step": 996
},
{
"epoch": 2.1054852320675104,
"grad_norm": 0.33124950528144836,
"learning_rate": 4.39058046586786e-06,
"loss": 0.5398452877998352,
"step": 998
},
{
"epoch": 2.109704641350211,
"grad_norm": 2.1223366260528564,
"learning_rate": 4.3780211984870044e-06,
"loss": 1.0190367698669434,
"step": 1000
},
{
"epoch": 2.1139240506329116,
"grad_norm": 2.0882437229156494,
"learning_rate": 4.365462198613595e-06,
"loss": 0.8691745400428772,
"step": 1002
},
{
"epoch": 2.1181434599156117,
"grad_norm": 0.9551434516906738,
"learning_rate": 4.352903619105196e-06,
"loss": 0.8893840909004211,
"step": 1004
},
{
"epoch": 2.1223628691983123,
"grad_norm": 0.49108386039733887,
"learning_rate": 4.340345612814251e-06,
"loss": 0.5169594287872314,
"step": 1006
},
{
"epoch": 2.1265822784810124,
"grad_norm": 0.9406089186668396,
"learning_rate": 4.327788332586227e-06,
"loss": 0.5989170074462891,
"step": 1008
},
{
"epoch": 2.130801687763713,
"grad_norm": 1.099560022354126,
"learning_rate": 4.315231931257758e-06,
"loss": 0.5996731519699097,
"step": 1010
},
{
"epoch": 2.1350210970464136,
"grad_norm": 12.219691276550293,
"learning_rate": 4.302676561654775e-06,
"loss": 0.8513282537460327,
"step": 1012
},
{
"epoch": 2.1392405063291138,
"grad_norm": 2.0376791954040527,
"learning_rate": 4.290122376590656e-06,
"loss": 0.9961199164390564,
"step": 1014
},
{
"epoch": 2.1434599156118144,
"grad_norm": 1.4444695711135864,
"learning_rate": 4.2775695288643615e-06,
"loss": 0.4728237986564636,
"step": 1016
},
{
"epoch": 2.147679324894515,
"grad_norm": 1.0163081884384155,
"learning_rate": 4.2650181712585735e-06,
"loss": 0.7495555281639099,
"step": 1018
},
{
"epoch": 2.151898734177215,
"grad_norm": 1.1818724870681763,
"learning_rate": 4.252468456537838e-06,
"loss": 0.6457207202911377,
"step": 1020
},
{
"epoch": 2.1561181434599157,
"grad_norm": 2.961237907409668,
"learning_rate": 4.239920537446705e-06,
"loss": 0.7249948978424072,
"step": 1022
},
{
"epoch": 2.160337552742616,
"grad_norm": 2.8546791076660156,
"learning_rate": 4.227374566707871e-06,
"loss": 0.6750069856643677,
"step": 1024
},
{
"epoch": 2.1645569620253164,
"grad_norm": 1.0282621383666992,
"learning_rate": 4.214830697020316e-06,
"loss": 0.9150334000587463,
"step": 1026
},
{
"epoch": 2.168776371308017,
"grad_norm": 0.8248642086982727,
"learning_rate": 4.202289081057452e-06,
"loss": 0.9421663284301758,
"step": 1028
},
{
"epoch": 2.172995780590717,
"grad_norm": 0.9548051953315735,
"learning_rate": 4.189749871465253e-06,
"loss": 0.8729570508003235,
"step": 1030
},
{
"epoch": 2.1772151898734178,
"grad_norm": 0.8367507457733154,
"learning_rate": 4.177213220860416e-06,
"loss": 0.8981440663337708,
"step": 1032
},
{
"epoch": 2.181434599156118,
"grad_norm": 1.4248055219650269,
"learning_rate": 4.164679281828482e-06,
"loss": 0.8822668194770813,
"step": 1034
},
{
"epoch": 2.1856540084388185,
"grad_norm": 0.9020785689353943,
"learning_rate": 4.152148206921995e-06,
"loss": 0.8814399838447571,
"step": 1036
},
{
"epoch": 2.189873417721519,
"grad_norm": 1.4970018863677979,
"learning_rate": 4.139620148658634e-06,
"loss": 0.8485023379325867,
"step": 1038
},
{
"epoch": 2.1940928270042193,
"grad_norm": 1.1914066076278687,
"learning_rate": 4.127095259519368e-06,
"loss": 1.0057520866394043,
"step": 1040
},
{
"epoch": 2.19831223628692,
"grad_norm": 5.138652324676514,
"learning_rate": 4.114573691946591e-06,
"loss": 0.26296478509902954,
"step": 1042
},
{
"epoch": 2.2025316455696204,
"grad_norm": 1.1444544792175293,
"learning_rate": 4.102055598342269e-06,
"loss": 0.8880115747451782,
"step": 1044
},
{
"epoch": 2.2067510548523206,
"grad_norm": 1.740729808807373,
"learning_rate": 4.089541131066086e-06,
"loss": 0.5347674489021301,
"step": 1046
},
{
"epoch": 2.210970464135021,
"grad_norm": 1.3183239698410034,
"learning_rate": 4.077030442433593e-06,
"loss": 0.790450930595398,
"step": 1048
},
{
"epoch": 2.2151898734177213,
"grad_norm": 1.1291550397872925,
"learning_rate": 4.064523684714344e-06,
"loss": 0.8988840579986572,
"step": 1050
},
{
"epoch": 2.219409282700422,
"grad_norm": 2.9497318267822266,
"learning_rate": 4.052021010130056e-06,
"loss": 0.7755071520805359,
"step": 1052
},
{
"epoch": 2.2236286919831225,
"grad_norm": 2.4455068111419678,
"learning_rate": 4.039522570852745e-06,
"loss": 0.7849942445755005,
"step": 1054
},
{
"epoch": 2.2278481012658227,
"grad_norm": 0.9835525751113892,
"learning_rate": 4.0270285190028794e-06,
"loss": 0.7088072896003723,
"step": 1056
},
{
"epoch": 2.2320675105485233,
"grad_norm": 20.216365814208984,
"learning_rate": 4.014539006647528e-06,
"loss": 0.42411160469055176,
"step": 1058
},
{
"epoch": 2.2362869198312234,
"grad_norm": 0.8427597284317017,
"learning_rate": 4.002054185798509e-06,
"loss": 0.8620681762695312,
"step": 1060
},
{
"epoch": 2.240506329113924,
"grad_norm": 0.3895626366138458,
"learning_rate": 3.98957420841054e-06,
"loss": 0.6363852024078369,
"step": 1062
},
{
"epoch": 2.2447257383966246,
"grad_norm": 1.1307460069656372,
"learning_rate": 3.977099226379386e-06,
"loss": 0.4475446343421936,
"step": 1064
},
{
"epoch": 2.2489451476793247,
"grad_norm": 1.3451250791549683,
"learning_rate": 3.9646293915400145e-06,
"loss": 0.8441832661628723,
"step": 1066
},
{
"epoch": 2.2531645569620253,
"grad_norm": 1.8237205743789673,
"learning_rate": 3.952164855664745e-06,
"loss": 1.0592007637023926,
"step": 1068
},
{
"epoch": 2.257383966244726,
"grad_norm": 1.1085244417190552,
"learning_rate": 3.939705770461403e-06,
"loss": 1.0274057388305664,
"step": 1070
},
{
"epoch": 2.261603375527426,
"grad_norm": 1.4007558822631836,
"learning_rate": 3.927252287571472e-06,
"loss": 0.8607990145683289,
"step": 1072
},
{
"epoch": 2.2658227848101267,
"grad_norm": 3.7572860717773438,
"learning_rate": 3.914804558568251e-06,
"loss": 1.1480568647384644,
"step": 1074
},
{
"epoch": 2.270042194092827,
"grad_norm": 0.819203794002533,
"learning_rate": 3.902362734955003e-06,
"loss": 0.8235105872154236,
"step": 1076
},
{
"epoch": 2.2742616033755274,
"grad_norm": 0.528959333896637,
"learning_rate": 3.889926968163123e-06,
"loss": 0.5926033854484558,
"step": 1078
},
{
"epoch": 2.278481012658228,
"grad_norm": 1.5626213550567627,
"learning_rate": 3.877497409550281e-06,
"loss": 0.7218382358551025,
"step": 1080
},
{
"epoch": 2.282700421940928,
"grad_norm": 1.657475233078003,
"learning_rate": 3.8650742103985865e-06,
"loss": 0.33192554116249084,
"step": 1082
},
{
"epoch": 2.2869198312236287,
"grad_norm": 1.3998394012451172,
"learning_rate": 3.852657521912752e-06,
"loss": 0.5696985721588135,
"step": 1084
},
{
"epoch": 2.291139240506329,
"grad_norm": 0.8090922832489014,
"learning_rate": 3.840247495218242e-06,
"loss": 0.4131937325000763,
"step": 1086
},
{
"epoch": 2.2953586497890295,
"grad_norm": 1.96702241897583,
"learning_rate": 3.827844281359444e-06,
"loss": 0.5371357202529907,
"step": 1088
},
{
"epoch": 2.29957805907173,
"grad_norm": 0.4463783800601959,
"learning_rate": 3.815448031297822e-06,
"loss": 0.48086562752723694,
"step": 1090
},
{
"epoch": 2.3037974683544302,
"grad_norm": 2.2645716667175293,
"learning_rate": 3.8030588959100845e-06,
"loss": 0.759406328201294,
"step": 1092
},
{
"epoch": 2.308016877637131,
"grad_norm": 0.9995399117469788,
"learning_rate": 3.790677025986345e-06,
"loss": 0.5466501116752625,
"step": 1094
},
{
"epoch": 2.3122362869198314,
"grad_norm": 2.6267566680908203,
"learning_rate": 3.7783025722282897e-06,
"loss": 0.35581734776496887,
"step": 1096
},
{
"epoch": 2.3164556962025316,
"grad_norm": 2.8866639137268066,
"learning_rate": 3.765935685247338e-06,
"loss": 0.8641759157180786,
"step": 1098
},
{
"epoch": 2.320675105485232,
"grad_norm": 1.3129066228866577,
"learning_rate": 3.753576515562816e-06,
"loss": 0.7505000233650208,
"step": 1100
},
{
"epoch": 2.3248945147679323,
"grad_norm": 1.0732929706573486,
"learning_rate": 3.7412252136001213e-06,
"loss": 0.8979564905166626,
"step": 1102
},
{
"epoch": 2.329113924050633,
"grad_norm": 0.7349892854690552,
"learning_rate": 3.7288819296888898e-06,
"loss": 1.1566518545150757,
"step": 1104
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.2569828033447266,
"learning_rate": 3.716546814061171e-06,
"loss": 0.6977556347846985,
"step": 1106
},
{
"epoch": 2.3375527426160336,
"grad_norm": 2.377737522125244,
"learning_rate": 3.7042200168495946e-06,
"loss": 0.44933831691741943,
"step": 1108
},
{
"epoch": 2.3417721518987342,
"grad_norm": 1.586523413658142,
"learning_rate": 3.691901688085548e-06,
"loss": 0.8763599395751953,
"step": 1110
},
{
"epoch": 2.3459915611814344,
"grad_norm": 1.1108835935592651,
"learning_rate": 3.6795919776973473e-06,
"loss": 0.9540433287620544,
"step": 1112
},
{
"epoch": 2.350210970464135,
"grad_norm": 0.9403799176216125,
"learning_rate": 3.667291035508411e-06,
"loss": 1.034621000289917,
"step": 1114
},
{
"epoch": 2.3544303797468356,
"grad_norm": 0.43867227435112,
"learning_rate": 3.65499901123544e-06,
"loss": 0.4901280999183655,
"step": 1116
},
{
"epoch": 2.3586497890295357,
"grad_norm": 2.578577995300293,
"learning_rate": 3.642716054486595e-06,
"loss": 0.7974634170532227,
"step": 1118
},
{
"epoch": 2.3628691983122363,
"grad_norm": 1.1387748718261719,
"learning_rate": 3.630442314759671e-06,
"loss": 0.5818929672241211,
"step": 1120
},
{
"epoch": 2.367088607594937,
"grad_norm": 2.1316514015197754,
"learning_rate": 3.618177941440285e-06,
"loss": 0.7703042030334473,
"step": 1122
},
{
"epoch": 2.371308016877637,
"grad_norm": 0.8024750351905823,
"learning_rate": 3.605923083800051e-06,
"loss": 0.5044012069702148,
"step": 1124
},
{
"epoch": 2.3755274261603376,
"grad_norm": 1.283586025238037,
"learning_rate": 3.593677890994768e-06,
"loss": 0.663129448890686,
"step": 1126
},
{
"epoch": 2.379746835443038,
"grad_norm": 1.0305255651474,
"learning_rate": 3.581442512062602e-06,
"loss": 0.8820338249206543,
"step": 1128
},
{
"epoch": 2.3839662447257384,
"grad_norm": 2.735337972640991,
"learning_rate": 3.5692170959222735e-06,
"loss": 0.42376741766929626,
"step": 1130
},
{
"epoch": 2.388185654008439,
"grad_norm": 4.083228588104248,
"learning_rate": 3.5570017913712438e-06,
"loss": 0.3104958236217499,
"step": 1132
},
{
"epoch": 2.392405063291139,
"grad_norm": 1.5016670227050781,
"learning_rate": 3.5447967470839038e-06,
"loss": 0.34900638461112976,
"step": 1134
},
{
"epoch": 2.3966244725738397,
"grad_norm": 1.8445940017700195,
"learning_rate": 3.5326021116097655e-06,
"loss": 0.5472123026847839,
"step": 1136
},
{
"epoch": 2.40084388185654,
"grad_norm": 1.0861736536026,
"learning_rate": 3.520418033371655e-06,
"loss": 0.9556151628494263,
"step": 1138
},
{
"epoch": 2.4050632911392404,
"grad_norm": 3.4282290935516357,
"learning_rate": 3.5082446606639014e-06,
"loss": 0.7003535032272339,
"step": 1140
},
{
"epoch": 2.409282700421941,
"grad_norm": 3.9306039810180664,
"learning_rate": 3.4960821416505406e-06,
"loss": 0.24855707585811615,
"step": 1142
},
{
"epoch": 2.413502109704641,
"grad_norm": 0.9033668041229248,
"learning_rate": 3.4839306243635003e-06,
"loss": 0.7160732746124268,
"step": 1144
},
{
"epoch": 2.4177215189873418,
"grad_norm": 1.116635799407959,
"learning_rate": 3.4717902567008086e-06,
"loss": 0.9801563620567322,
"step": 1146
},
{
"epoch": 2.4219409282700424,
"grad_norm": 4.026218891143799,
"learning_rate": 3.459661186424787e-06,
"loss": 0.7096956968307495,
"step": 1148
},
{
"epoch": 2.4261603375527425,
"grad_norm": 1.311513066291809,
"learning_rate": 3.447543561160258e-06,
"loss": 0.9519820809364319,
"step": 1150
},
{
"epoch": 2.430379746835443,
"grad_norm": 3.234283208847046,
"learning_rate": 3.435437528392741e-06,
"loss": 0.6188116073608398,
"step": 1152
},
{
"epoch": 2.4345991561181437,
"grad_norm": 0.9476786851882935,
"learning_rate": 3.4233432354666666e-06,
"loss": 1.0032005310058594,
"step": 1154
},
{
"epoch": 2.438818565400844,
"grad_norm": 1.0260239839553833,
"learning_rate": 3.4112608295835718e-06,
"loss": 0.3281160891056061,
"step": 1156
},
{
"epoch": 2.4430379746835444,
"grad_norm": 0.7956026196479797,
"learning_rate": 3.3991904578003182e-06,
"loss": 0.627183735370636,
"step": 1158
},
{
"epoch": 2.4472573839662446,
"grad_norm": 0.9774817824363708,
"learning_rate": 3.3871322670273e-06,
"loss": 0.9342701435089111,
"step": 1160
},
{
"epoch": 2.451476793248945,
"grad_norm": 1.73080313205719,
"learning_rate": 3.3750864040266497e-06,
"loss": 0.5555570721626282,
"step": 1162
},
{
"epoch": 2.4556962025316453,
"grad_norm": 1.2167036533355713,
"learning_rate": 3.3630530154104603e-06,
"loss": 0.8571757674217224,
"step": 1164
},
{
"epoch": 2.459915611814346,
"grad_norm": 0.8792468905448914,
"learning_rate": 3.3510322476389953e-06,
"loss": 0.8499954342842102,
"step": 1166
},
{
"epoch": 2.4641350210970465,
"grad_norm": 0.3647661805152893,
"learning_rate": 3.33902424701891e-06,
"loss": 0.4817237854003906,
"step": 1168
},
{
"epoch": 2.4683544303797467,
"grad_norm": 1.5427345037460327,
"learning_rate": 3.327029159701465e-06,
"loss": 0.8259966373443604,
"step": 1170
},
{
"epoch": 2.4725738396624473,
"grad_norm": 0.9573671221733093,
"learning_rate": 3.315047131680755e-06,
"loss": 0.9262470006942749,
"step": 1172
},
{
"epoch": 2.476793248945148,
"grad_norm": 0.8954631686210632,
"learning_rate": 3.3030783087919253e-06,
"loss": 0.8667972087860107,
"step": 1174
},
{
"epoch": 2.481012658227848,
"grad_norm": 0.998231828212738,
"learning_rate": 3.291122836709402e-06,
"loss": 0.6898888349533081,
"step": 1176
},
{
"epoch": 2.4852320675105486,
"grad_norm": 3.1478688716888428,
"learning_rate": 3.2791808609451125e-06,
"loss": 0.3274869918823242,
"step": 1178
},
{
"epoch": 2.489451476793249,
"grad_norm": 11.714877128601074,
"learning_rate": 3.2672525268467225e-06,
"loss": 0.6489510536193848,
"step": 1180
},
{
"epoch": 2.4936708860759493,
"grad_norm": 1.9469349384307861,
"learning_rate": 3.2553379795958604e-06,
"loss": 0.6815069913864136,
"step": 1182
},
{
"epoch": 2.49789029535865,
"grad_norm": 2.3261117935180664,
"learning_rate": 3.2434373642063522e-06,
"loss": 0.3795571029186249,
"step": 1184
},
{
"epoch": 2.50210970464135,
"grad_norm": 2.7311949729919434,
"learning_rate": 3.2315508255224613e-06,
"loss": 0.3261902630329132,
"step": 1186
},
{
"epoch": 2.5063291139240507,
"grad_norm": 2.2631030082702637,
"learning_rate": 3.2196785082171147e-06,
"loss": 0.5865919589996338,
"step": 1188
},
{
"epoch": 2.510548523206751,
"grad_norm": 0.8359600305557251,
"learning_rate": 3.207820556790155e-06,
"loss": 0.8902769088745117,
"step": 1190
},
{
"epoch": 2.5147679324894514,
"grad_norm": 2.3550963401794434,
"learning_rate": 3.1959771155665715e-06,
"loss": 0.4082001745700836,
"step": 1192
},
{
"epoch": 2.518987341772152,
"grad_norm": 4.461960315704346,
"learning_rate": 3.184148328694748e-06,
"loss": 1.1846554279327393,
"step": 1194
},
{
"epoch": 2.523206751054852,
"grad_norm": 1.4942057132720947,
"learning_rate": 3.1723343401447107e-06,
"loss": 0.9881184697151184,
"step": 1196
},
{
"epoch": 2.5274261603375527,
"grad_norm": 2.0736021995544434,
"learning_rate": 3.160535293706369e-06,
"loss": 0.9017194509506226,
"step": 1198
},
{
"epoch": 2.5316455696202533,
"grad_norm": 3.7537925243377686,
"learning_rate": 3.148751332987772e-06,
"loss": 0.5090019106864929,
"step": 1200
},
{
"epoch": 2.5358649789029535,
"grad_norm": 1.3264377117156982,
"learning_rate": 3.1369826014133594e-06,
"loss": 0.67947918176651,
"step": 1202
},
{
"epoch": 2.540084388185654,
"grad_norm": 3.953713893890381,
"learning_rate": 3.125229242222211e-06,
"loss": 0.5951077342033386,
"step": 1204
},
{
"epoch": 2.5443037974683547,
"grad_norm": 0.990692675113678,
"learning_rate": 3.1134913984663093e-06,
"loss": 0.8030409812927246,
"step": 1206
},
{
"epoch": 2.548523206751055,
"grad_norm": 3.0001838207244873,
"learning_rate": 3.101769213008796e-06,
"loss": 0.6891695261001587,
"step": 1208
},
{
"epoch": 2.5527426160337554,
"grad_norm": 1.335438847541809,
"learning_rate": 3.0900628285222307e-06,
"loss": 0.9814665913581848,
"step": 1210
},
{
"epoch": 2.5569620253164556,
"grad_norm": 1.2493577003479004,
"learning_rate": 3.078372387486861e-06,
"loss": 0.9131478667259216,
"step": 1212
},
{
"epoch": 2.561181434599156,
"grad_norm": 2.756460428237915,
"learning_rate": 3.0666980321888823e-06,
"loss": 0.27317380905151367,
"step": 1214
},
{
"epoch": 2.5654008438818563,
"grad_norm": 3.6866559982299805,
"learning_rate": 3.055039904718706e-06,
"loss": 0.6986894011497498,
"step": 1216
},
{
"epoch": 2.569620253164557,
"grad_norm": 0.7736930847167969,
"learning_rate": 3.0433981469692346e-06,
"loss": 0.8533654808998108,
"step": 1218
},
{
"epoch": 2.5738396624472575,
"grad_norm": 6.2710161209106445,
"learning_rate": 3.0317729006341315e-06,
"loss": 0.5412061214447021,
"step": 1220
},
{
"epoch": 2.5780590717299576,
"grad_norm": 2.4914796352386475,
"learning_rate": 3.0201643072060964e-06,
"loss": 0.7507292628288269,
"step": 1222
},
{
"epoch": 2.5822784810126582,
"grad_norm": 4.1669840812683105,
"learning_rate": 3.0085725079751465e-06,
"loss": 0.599193274974823,
"step": 1224
},
{
"epoch": 2.586497890295359,
"grad_norm": 1.4165141582489014,
"learning_rate": 2.996997644026889e-06,
"loss": 0.542171835899353,
"step": 1226
},
{
"epoch": 2.590717299578059,
"grad_norm": 1.2593107223510742,
"learning_rate": 2.9854398562408144e-06,
"loss": 0.8244262933731079,
"step": 1228
},
{
"epoch": 2.5949367088607596,
"grad_norm": 1.6781362295150757,
"learning_rate": 2.9738992852885742e-06,
"loss": 1.0771939754486084,
"step": 1230
},
{
"epoch": 2.59915611814346,
"grad_norm": 1.0754374265670776,
"learning_rate": 2.9623760716322706e-06,
"loss": 0.7803739309310913,
"step": 1232
},
{
"epoch": 2.6033755274261603,
"grad_norm": 4.246564865112305,
"learning_rate": 2.950870355522748e-06,
"loss": 0.2662976384162903,
"step": 1234
},
{
"epoch": 2.607594936708861,
"grad_norm": 1.650658369064331,
"learning_rate": 2.939382276997886e-06,
"loss": 0.9140543937683105,
"step": 1236
},
{
"epoch": 2.611814345991561,
"grad_norm": 5.929245471954346,
"learning_rate": 2.9279119758808942e-06,
"loss": 1.1032469272613525,
"step": 1238
},
{
"epoch": 2.6160337552742616,
"grad_norm": 1.0307083129882812,
"learning_rate": 2.9164595917786088e-06,
"loss": 0.6352362632751465,
"step": 1240
},
{
"epoch": 2.620253164556962,
"grad_norm": 1.3630961179733276,
"learning_rate": 2.905025264079799e-06,
"loss": 0.8276194334030151,
"step": 1242
},
{
"epoch": 2.6244725738396624,
"grad_norm": 2.032569408416748,
"learning_rate": 2.8936091319534617e-06,
"loss": 0.4083612859249115,
"step": 1244
},
{
"epoch": 2.628691983122363,
"grad_norm": 0.8530462384223938,
"learning_rate": 2.8822113343471365e-06,
"loss": 0.6202731132507324,
"step": 1246
},
{
"epoch": 2.632911392405063,
"grad_norm": 1.9822677373886108,
"learning_rate": 2.8708320099852108e-06,
"loss": 1.1646617650985718,
"step": 1248
},
{
"epoch": 2.6371308016877637,
"grad_norm": 0.7690547108650208,
"learning_rate": 2.8594712973672276e-06,
"loss": 0.8482010364532471,
"step": 1250
},
{
"epoch": 2.6413502109704643,
"grad_norm": 8.547155380249023,
"learning_rate": 2.8481293347662067e-06,
"loss": 0.904060959815979,
"step": 1252
},
{
"epoch": 2.6455696202531644,
"grad_norm": 2.017336368560791,
"learning_rate": 2.8368062602269573e-06,
"loss": 0.3393191993236542,
"step": 1254
},
{
"epoch": 2.649789029535865,
"grad_norm": 1.945145845413208,
"learning_rate": 2.8255022115644017e-06,
"loss": 0.39150819182395935,
"step": 1256
},
{
"epoch": 2.6540084388185656,
"grad_norm": 1.3301414251327515,
"learning_rate": 2.8142173263618877e-06,
"loss": 0.7564312815666199,
"step": 1258
},
{
"epoch": 2.6582278481012658,
"grad_norm": 0.9791122078895569,
"learning_rate": 2.8029517419695303e-06,
"loss": 0.8787249326705933,
"step": 1260
},
{
"epoch": 2.6624472573839664,
"grad_norm": 1.0031580924987793,
"learning_rate": 2.7917055955025285e-06,
"loss": 0.8559532165527344,
"step": 1262
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.7568211555480957,
"learning_rate": 2.7804790238394958e-06,
"loss": 0.5114046931266785,
"step": 1264
},
{
"epoch": 2.670886075949367,
"grad_norm": 1.7229481935501099,
"learning_rate": 2.7692721636208013e-06,
"loss": 0.8251296281814575,
"step": 1266
},
{
"epoch": 2.6751054852320673,
"grad_norm": 0.9991238713264465,
"learning_rate": 2.7580851512469024e-06,
"loss": 0.6419144868850708,
"step": 1268
},
{
"epoch": 2.679324894514768,
"grad_norm": 1.1213876008987427,
"learning_rate": 2.746918122876686e-06,
"loss": 0.36948972940444946,
"step": 1270
},
{
"epoch": 2.6835443037974684,
"grad_norm": 1.1551014184951782,
"learning_rate": 2.7357712144258074e-06,
"loss": 0.8657974004745483,
"step": 1272
},
{
"epoch": 2.6877637130801686,
"grad_norm": 7.327043533325195,
"learning_rate": 2.724644561565042e-06,
"loss": 0.6017997860908508,
"step": 1274
},
{
"epoch": 2.691983122362869,
"grad_norm": 3.296600818634033,
"learning_rate": 2.713538299718631e-06,
"loss": 0.6844916343688965,
"step": 1276
},
{
"epoch": 2.6962025316455698,
"grad_norm": 0.31361812353134155,
"learning_rate": 2.702452564062635e-06,
"loss": 0.2726902365684509,
"step": 1278
},
{
"epoch": 2.70042194092827,
"grad_norm": 1.6500128507614136,
"learning_rate": 2.69138748952328e-06,
"loss": 0.8048746585845947,
"step": 1280
},
{
"epoch": 2.7046413502109705,
"grad_norm": 1.1757248640060425,
"learning_rate": 2.680343210775331e-06,
"loss": 0.9176240563392639,
"step": 1282
},
{
"epoch": 2.708860759493671,
"grad_norm": 2.345834493637085,
"learning_rate": 2.6693198622404403e-06,
"loss": 0.4069772958755493,
"step": 1284
},
{
"epoch": 2.7130801687763713,
"grad_norm": 5.173031330108643,
"learning_rate": 2.658317578085514e-06,
"loss": 0.4281209409236908,
"step": 1286
},
{
"epoch": 2.717299578059072,
"grad_norm": 0.6406076550483704,
"learning_rate": 2.647336492221082e-06,
"loss": 0.4584686756134033,
"step": 1288
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.30545204877853394,
"learning_rate": 2.636376738299666e-06,
"loss": 0.7299985289573669,
"step": 1290
},
{
"epoch": 2.7257383966244726,
"grad_norm": 2.3275787830352783,
"learning_rate": 2.6254384497141563e-06,
"loss": 0.8682552576065063,
"step": 1292
},
{
"epoch": 2.7299578059071727,
"grad_norm": 1.1502134799957275,
"learning_rate": 2.6145217595961786e-06,
"loss": 0.36897793412208557,
"step": 1294
},
{
"epoch": 2.7341772151898733,
"grad_norm": 0.9601994752883911,
"learning_rate": 2.603626800814486e-06,
"loss": 0.8473520278930664,
"step": 1296
},
{
"epoch": 2.738396624472574,
"grad_norm": 0.9873552322387695,
"learning_rate": 2.5927537059733337e-06,
"loss": 0.9228261113166809,
"step": 1298
},
{
"epoch": 2.742616033755274,
"grad_norm": 0.5264573097229004,
"learning_rate": 2.5819026074108695e-06,
"loss": 0.6119830012321472,
"step": 1300
},
{
"epoch": 2.7468354430379747,
"grad_norm": 0.9602957963943481,
"learning_rate": 2.5710736371975165e-06,
"loss": 0.9762548208236694,
"step": 1302
},
{
"epoch": 2.7510548523206753,
"grad_norm": 0.9380753040313721,
"learning_rate": 2.560266927134375e-06,
"loss": 0.5131715536117554,
"step": 1304
},
{
"epoch": 2.7552742616033754,
"grad_norm": 1.438719630241394,
"learning_rate": 2.549482608751613e-06,
"loss": 1.091052532196045,
"step": 1306
},
{
"epoch": 2.759493670886076,
"grad_norm": 1.7355360984802246,
"learning_rate": 2.5387208133068613e-06,
"loss": 0.9066473245620728,
"step": 1308
},
{
"epoch": 2.7637130801687766,
"grad_norm": 2.98097825050354,
"learning_rate": 2.5279816717836256e-06,
"loss": 0.7622301578521729,
"step": 1310
},
{
"epoch": 2.7679324894514767,
"grad_norm": 0.885686993598938,
"learning_rate": 2.5172653148896842e-06,
"loss": 0.9722012877464294,
"step": 1312
},
{
"epoch": 2.7721518987341773,
"grad_norm": 1.3240593671798706,
"learning_rate": 2.5065718730555033e-06,
"loss": 0.9415172338485718,
"step": 1314
},
{
"epoch": 2.7763713080168775,
"grad_norm": 1.9628123044967651,
"learning_rate": 2.4959014764326415e-06,
"loss": 0.6243242025375366,
"step": 1316
},
{
"epoch": 2.780590717299578,
"grad_norm": 3.583494186401367,
"learning_rate": 2.4852542548921747e-06,
"loss": 0.4649869501590729,
"step": 1318
},
{
"epoch": 2.7848101265822782,
"grad_norm": 0.94072425365448,
"learning_rate": 2.4746303380231085e-06,
"loss": 0.9694103002548218,
"step": 1320
},
{
"epoch": 2.789029535864979,
"grad_norm": 0.6174410581588745,
"learning_rate": 2.4640298551308073e-06,
"loss": 0.5571610331535339,
"step": 1322
},
{
"epoch": 2.7932489451476794,
"grad_norm": 2.0068700313568115,
"learning_rate": 2.453452935235412e-06,
"loss": 1.0208598375320435,
"step": 1324
},
{
"epoch": 2.7974683544303796,
"grad_norm": 1.8920451402664185,
"learning_rate": 2.442899707070277e-06,
"loss": 0.7713922262191772,
"step": 1326
},
{
"epoch": 2.80168776371308,
"grad_norm": 0.9682056903839111,
"learning_rate": 2.432370299080402e-06,
"loss": 0.5502282977104187,
"step": 1328
},
{
"epoch": 2.8059071729957807,
"grad_norm": 0.9725003838539124,
"learning_rate": 2.4218648394208675e-06,
"loss": 0.8966948986053467,
"step": 1330
},
{
"epoch": 2.810126582278481,
"grad_norm": 1.1623132228851318,
"learning_rate": 2.4113834559552725e-06,
"loss": 0.7290566563606262,
"step": 1332
},
{
"epoch": 2.8143459915611815,
"grad_norm": 1.2533057928085327,
"learning_rate": 2.4009262762541812e-06,
"loss": 0.4873872697353363,
"step": 1334
},
{
"epoch": 2.818565400843882,
"grad_norm": 0.42495617270469666,
"learning_rate": 2.3904934275935742e-06,
"loss": 0.6868776082992554,
"step": 1336
},
{
"epoch": 2.8227848101265822,
"grad_norm": 1.3464299440383911,
"learning_rate": 2.3800850369532913e-06,
"loss": 0.792182207107544,
"step": 1338
},
{
"epoch": 2.827004219409283,
"grad_norm": 1.2492246627807617,
"learning_rate": 2.3697012310154895e-06,
"loss": 0.8120459318161011,
"step": 1340
},
{
"epoch": 2.831223628691983,
"grad_norm": 1.79072105884552,
"learning_rate": 2.3593421361631063e-06,
"loss": 0.8677684664726257,
"step": 1342
},
{
"epoch": 2.8354430379746836,
"grad_norm": 1.2441151142120361,
"learning_rate": 2.3490078784783088e-06,
"loss": 1.0221854448318481,
"step": 1344
},
{
"epoch": 2.8396624472573837,
"grad_norm": 2.060967206954956,
"learning_rate": 2.3386985837409736e-06,
"loss": 0.6457461714744568,
"step": 1346
},
{
"epoch": 2.8438818565400843,
"grad_norm": 0.8780367970466614,
"learning_rate": 2.328414377427148e-06,
"loss": 0.514173686504364,
"step": 1348
},
{
"epoch": 2.848101265822785,
"grad_norm": 0.9615793228149414,
"learning_rate": 2.318155384707524e-06,
"loss": 0.9813417792320251,
"step": 1350
},
{
"epoch": 2.852320675105485,
"grad_norm": 0.7979256510734558,
"learning_rate": 2.3079217304459114e-06,
"loss": 0.6034799218177795,
"step": 1352
},
{
"epoch": 2.8565400843881856,
"grad_norm": 2.0170516967773438,
"learning_rate": 2.2977135391977264e-06,
"loss": 0.6767147779464722,
"step": 1354
},
{
"epoch": 2.8607594936708862,
"grad_norm": 2.4936254024505615,
"learning_rate": 2.287530935208469e-06,
"loss": 0.5042116045951843,
"step": 1356
},
{
"epoch": 2.8649789029535864,
"grad_norm": 1.2325421571731567,
"learning_rate": 2.277374042412214e-06,
"loss": 0.9337244033813477,
"step": 1358
},
{
"epoch": 2.869198312236287,
"grad_norm": 2.9698169231414795,
"learning_rate": 2.2672429844300972e-06,
"loss": 0.7304012179374695,
"step": 1360
},
{
"epoch": 2.8734177215189876,
"grad_norm": 1.5197981595993042,
"learning_rate": 2.257137884568819e-06,
"loss": 0.5767084956169128,
"step": 1362
},
{
"epoch": 2.8776371308016877,
"grad_norm": 2.374297857284546,
"learning_rate": 2.24705886581914e-06,
"loss": 0.9020572304725647,
"step": 1364
},
{
"epoch": 2.8818565400843883,
"grad_norm": 1.3976613283157349,
"learning_rate": 2.237006050854378e-06,
"loss": 0.8876560926437378,
"step": 1366
},
{
"epoch": 2.8860759493670884,
"grad_norm": 1.1186343431472778,
"learning_rate": 2.2269795620289255e-06,
"loss": 0.9599936008453369,
"step": 1368
},
{
"epoch": 2.890295358649789,
"grad_norm": 2.704097270965576,
"learning_rate": 2.2169795213767533e-06,
"loss": 0.8696321249008179,
"step": 1370
},
{
"epoch": 2.894514767932489,
"grad_norm": 7.440235614776611,
"learning_rate": 2.207006050609931e-06,
"loss": 0.3180171847343445,
"step": 1372
},
{
"epoch": 2.8987341772151898,
"grad_norm": 0.950478196144104,
"learning_rate": 2.1970592711171343e-06,
"loss": 0.6180795431137085,
"step": 1374
},
{
"epoch": 2.9029535864978904,
"grad_norm": 1.206428050994873,
"learning_rate": 2.1871393039621813e-06,
"loss": 0.8911280035972595,
"step": 1376
},
{
"epoch": 2.9071729957805905,
"grad_norm": 3.0545897483825684,
"learning_rate": 2.177246269882552e-06,
"loss": 0.752612292766571,
"step": 1378
},
{
"epoch": 2.911392405063291,
"grad_norm": 1.6597026586532593,
"learning_rate": 2.1673802892879202e-06,
"loss": 1.0073306560516357,
"step": 1380
},
{
"epoch": 2.9156118143459917,
"grad_norm": 2.8480212688446045,
"learning_rate": 2.1575414822586834e-06,
"loss": 0.49533841013908386,
"step": 1382
},
{
"epoch": 2.919831223628692,
"grad_norm": 2.9914588928222656,
"learning_rate": 2.1477299685445093e-06,
"loss": 0.6439518332481384,
"step": 1384
},
{
"epoch": 2.9240506329113924,
"grad_norm": 1.6400901079177856,
"learning_rate": 2.1379458675628758e-06,
"loss": 0.5329881906509399,
"step": 1386
},
{
"epoch": 2.928270042194093,
"grad_norm": 0.9584951400756836,
"learning_rate": 2.128189298397611e-06,
"loss": 0.9460800290107727,
"step": 1388
},
{
"epoch": 2.932489451476793,
"grad_norm": 1.2493575811386108,
"learning_rate": 2.118460379797452e-06,
"loss": 0.7834473848342896,
"step": 1390
},
{
"epoch": 2.9367088607594938,
"grad_norm": 1.484129548072815,
"learning_rate": 2.1087592301745965e-06,
"loss": 0.4930620491504669,
"step": 1392
},
{
"epoch": 2.9409282700421944,
"grad_norm": 1.0145891904830933,
"learning_rate": 2.0990859676032623e-06,
"loss": 0.4643522799015045,
"step": 1394
},
{
"epoch": 2.9451476793248945,
"grad_norm": 0.9809361696243286,
"learning_rate": 2.0894407098182474e-06,
"loss": 0.8622637987136841,
"step": 1396
},
{
"epoch": 2.9493670886075947,
"grad_norm": 3.8030622005462646,
"learning_rate": 2.0798235742134995e-06,
"loss": 0.6468316316604614,
"step": 1398
},
{
"epoch": 2.9535864978902953,
"grad_norm": 3.291412830352783,
"learning_rate": 2.0702346778406887e-06,
"loss": 0.871576726436615,
"step": 1400
},
{
"epoch": 2.957805907172996,
"grad_norm": 2.847675085067749,
"learning_rate": 2.0606741374077804e-06,
"loss": 0.6290037631988525,
"step": 1402
},
{
"epoch": 2.962025316455696,
"grad_norm": 0.8518403172492981,
"learning_rate": 2.0511420692776135e-06,
"loss": 0.8591277003288269,
"step": 1404
},
{
"epoch": 2.9662447257383966,
"grad_norm": 2.023810386657715,
"learning_rate": 2.041638589466487e-06,
"loss": 0.8211725354194641,
"step": 1406
},
{
"epoch": 2.970464135021097,
"grad_norm": 2.9551258087158203,
"learning_rate": 2.0321638136427495e-06,
"loss": 0.46553725004196167,
"step": 1408
},
{
"epoch": 2.9746835443037973,
"grad_norm": 3.8522558212280273,
"learning_rate": 2.0227178571253846e-06,
"loss": 0.7728868126869202,
"step": 1410
},
{
"epoch": 2.978902953586498,
"grad_norm": 0.8442367911338806,
"learning_rate": 2.013300834882615e-06,
"loss": 0.9526476860046387,
"step": 1412
},
{
"epoch": 2.9831223628691985,
"grad_norm": 2.8707711696624756,
"learning_rate": 2.0039128615304967e-06,
"loss": 0.6912641525268555,
"step": 1414
},
{
"epoch": 2.9873417721518987,
"grad_norm": 0.9124540686607361,
"learning_rate": 1.994554051331532e-06,
"loss": 0.7677329778671265,
"step": 1416
},
{
"epoch": 2.9915611814345993,
"grad_norm": 0.7803240418434143,
"learning_rate": 1.9852245181932674e-06,
"loss": 0.8512239456176758,
"step": 1418
},
{
"epoch": 2.9957805907173,
"grad_norm": 3.4592530727386475,
"learning_rate": 1.975924375666918e-06,
"loss": 0.8197758197784424,
"step": 1420
},
{
"epoch": 3.0,
"grad_norm": 1.0075371265411377,
"learning_rate": 1.9666537369459813e-06,
"loss": 0.26588016748428345,
"step": 1422
},
{
"epoch": 3.0042194092827006,
"grad_norm": 0.8261951208114624,
"learning_rate": 1.9574127148648586e-06,
"loss": 0.4992481768131256,
"step": 1424
},
{
"epoch": 3.0084388185654007,
"grad_norm": 1.8350886106491089,
"learning_rate": 1.94820142189748e-06,
"loss": 0.4615590572357178,
"step": 1426
},
{
"epoch": 3.0126582278481013,
"grad_norm": 7.030728816986084,
"learning_rate": 1.9390199701559407e-06,
"loss": 0.5607567429542542,
"step": 1428
},
{
"epoch": 3.0168776371308015,
"grad_norm": 1.843036413192749,
"learning_rate": 1.929868471389133e-06,
"loss": 0.1959325075149536,
"step": 1430
},
{
"epoch": 3.021097046413502,
"grad_norm": 1.2027599811553955,
"learning_rate": 1.920747036981388e-06,
"loss": 0.8035475611686707,
"step": 1432
},
{
"epoch": 3.0253164556962027,
"grad_norm": 1.0378309488296509,
"learning_rate": 1.9116557779511153e-06,
"loss": 0.7113970518112183,
"step": 1434
},
{
"epoch": 3.029535864978903,
"grad_norm": 1.079108715057373,
"learning_rate": 1.9025948049494587e-06,
"loss": 0.8759698271751404,
"step": 1436
},
{
"epoch": 3.0337552742616034,
"grad_norm": 1.387281060218811,
"learning_rate": 1.8935642282589452e-06,
"loss": 0.4212711453437805,
"step": 1438
},
{
"epoch": 3.037974683544304,
"grad_norm": 1.6048085689544678,
"learning_rate": 1.884564157792141e-06,
"loss": 0.7371959090232849,
"step": 1440
},
{
"epoch": 3.042194092827004,
"grad_norm": 0.33521798253059387,
"learning_rate": 1.87559470309032e-06,
"loss": 0.5267896056175232,
"step": 1442
},
{
"epoch": 3.0464135021097047,
"grad_norm": 1.3722892999649048,
"learning_rate": 1.8666559733221244e-06,
"loss": 0.657349169254303,
"step": 1444
},
{
"epoch": 3.050632911392405,
"grad_norm": 1.0858877897262573,
"learning_rate": 1.8577480772822405e-06,
"loss": 0.8311367034912109,
"step": 1446
},
{
"epoch": 3.0548523206751055,
"grad_norm": 5.020367622375488,
"learning_rate": 1.8488711233900686e-06,
"loss": 0.5246130228042603,
"step": 1448
},
{
"epoch": 3.059071729957806,
"grad_norm": 3.7570173740386963,
"learning_rate": 1.8400252196884106e-06,
"loss": 0.6080931425094604,
"step": 1450
},
{
"epoch": 3.0632911392405062,
"grad_norm": 1.1105659008026123,
"learning_rate": 1.8312104738421518e-06,
"loss": 0.8224632740020752,
"step": 1452
},
{
"epoch": 3.067510548523207,
"grad_norm": 3.6815249919891357,
"learning_rate": 1.8224269931369494e-06,
"loss": 0.6160001158714294,
"step": 1454
},
{
"epoch": 3.071729957805907,
"grad_norm": 9.295499801635742,
"learning_rate": 1.8136748844779257e-06,
"loss": 0.49316591024398804,
"step": 1456
},
{
"epoch": 3.0759493670886076,
"grad_norm": 4.4355974197387695,
"learning_rate": 1.8049542543883718e-06,
"loss": 0.6495121121406555,
"step": 1458
},
{
"epoch": 3.080168776371308,
"grad_norm": 2.505272626876831,
"learning_rate": 1.7962652090084483e-06,
"loss": 0.4862138032913208,
"step": 1460
},
{
"epoch": 3.0843881856540083,
"grad_norm": 0.9544802904129028,
"learning_rate": 1.7876078540938897e-06,
"loss": 0.7817291021347046,
"step": 1462
},
{
"epoch": 3.088607594936709,
"grad_norm": 0.9137688875198364,
"learning_rate": 1.778982295014725e-06,
"loss": 0.7803807258605957,
"step": 1464
},
{
"epoch": 3.0928270042194095,
"grad_norm": 0.9232447743415833,
"learning_rate": 1.7703886367539886e-06,
"loss": 0.7208024859428406,
"step": 1466
},
{
"epoch": 3.0970464135021096,
"grad_norm": 2.5386898517608643,
"learning_rate": 1.7618269839064476e-06,
"loss": 0.535610556602478,
"step": 1468
},
{
"epoch": 3.1012658227848102,
"grad_norm": 2.476505756378174,
"learning_rate": 1.7532974406773215e-06,
"loss": 0.11650805175304413,
"step": 1470
},
{
"epoch": 3.1054852320675104,
"grad_norm": 3.4205284118652344,
"learning_rate": 1.744800110881024e-06,
"loss": 0.9236214756965637,
"step": 1472
},
{
"epoch": 3.109704641350211,
"grad_norm": 0.38351741433143616,
"learning_rate": 1.7363350979398904e-06,
"loss": 0.3822326362133026,
"step": 1474
},
{
"epoch": 3.1139240506329116,
"grad_norm": 1.7231391668319702,
"learning_rate": 1.7279025048829247e-06,
"loss": 0.8056196570396423,
"step": 1476
},
{
"epoch": 3.1181434599156117,
"grad_norm": 1.3952598571777344,
"learning_rate": 1.7195024343445406e-06,
"loss": 0.8253889679908752,
"step": 1478
},
{
"epoch": 3.1223628691983123,
"grad_norm": 1.235793113708496,
"learning_rate": 1.711134988563318e-06,
"loss": 0.7869700193405151,
"step": 1480
},
{
"epoch": 3.1265822784810124,
"grad_norm": 1.5086437463760376,
"learning_rate": 1.7028002693807553e-06,
"loss": 0.74970543384552,
"step": 1482
},
{
"epoch": 3.130801687763713,
"grad_norm": 1.1958047151565552,
"learning_rate": 1.694498378240028e-06,
"loss": 0.7713515758514404,
"step": 1484
},
{
"epoch": 3.1350210970464136,
"grad_norm": 0.9930305480957031,
"learning_rate": 1.6862294161847582e-06,
"loss": 0.4803518056869507,
"step": 1486
},
{
"epoch": 3.1392405063291138,
"grad_norm": 1.338038444519043,
"learning_rate": 1.6779934838577833e-06,
"loss": 0.4478246569633484,
"step": 1488
},
{
"epoch": 3.1434599156118144,
"grad_norm": 1.8812412023544312,
"learning_rate": 1.6697906814999316e-06,
"loss": 0.8487708568572998,
"step": 1490
},
{
"epoch": 3.147679324894515,
"grad_norm": 1.079730749130249,
"learning_rate": 1.6616211089487968e-06,
"loss": 0.4909372329711914,
"step": 1492
},
{
"epoch": 3.151898734177215,
"grad_norm": 3.950795888900757,
"learning_rate": 1.653484865637532e-06,
"loss": 0.6456606388092041,
"step": 1494
},
{
"epoch": 3.1561181434599157,
"grad_norm": 0.8888868093490601,
"learning_rate": 1.645382050593633e-06,
"loss": 0.5738848447799683,
"step": 1496
},
{
"epoch": 3.160337552742616,
"grad_norm": 0.8062717318534851,
"learning_rate": 1.6373127624377361e-06,
"loss": 0.3924991488456726,
"step": 1498
},
{
"epoch": 3.1645569620253164,
"grad_norm": 1.1965993642807007,
"learning_rate": 1.6292770993824138e-06,
"loss": 0.4241105318069458,
"step": 1500
},
{
"epoch": 3.168776371308017,
"grad_norm": 1.7078224420547485,
"learning_rate": 1.621275159230986e-06,
"loss": 0.7920833230018616,
"step": 1502
},
{
"epoch": 3.172995780590717,
"grad_norm": 3.2493438720703125,
"learning_rate": 1.6133070393763222e-06,
"loss": 0.7387109994888306,
"step": 1504
},
{
"epoch": 3.1772151898734178,
"grad_norm": 1.1433643102645874,
"learning_rate": 1.605372836799664e-06,
"loss": 0.8177753686904907,
"step": 1506
},
{
"epoch": 3.181434599156118,
"grad_norm": 1.1686694622039795,
"learning_rate": 1.5974726480694356e-06,
"loss": 0.810562014579773,
"step": 1508
},
{
"epoch": 3.1856540084388185,
"grad_norm": 1.6440011262893677,
"learning_rate": 1.589606569340076e-06,
"loss": 0.8004451394081116,
"step": 1510
},
{
"epoch": 3.189873417721519,
"grad_norm": 3.572957754135132,
"learning_rate": 1.5817746963508675e-06,
"loss": 0.19780634343624115,
"step": 1512
},
{
"epoch": 3.1940928270042193,
"grad_norm": 1.8729281425476074,
"learning_rate": 1.5739771244247647e-06,
"loss": 0.8508098721504211,
"step": 1514
},
{
"epoch": 3.19831223628692,
"grad_norm": 0.22832605242729187,
"learning_rate": 1.5662139484672423e-06,
"loss": 0.5102086663246155,
"step": 1516
},
{
"epoch": 3.2025316455696204,
"grad_norm": 1.493944764137268,
"learning_rate": 1.558485262965135e-06,
"loss": 0.8561201691627502,
"step": 1518
},
{
"epoch": 3.2067510548523206,
"grad_norm": 2.02929949760437,
"learning_rate": 1.55079116198549e-06,
"loss": 0.7038779258728027,
"step": 1520
},
{
"epoch": 3.210970464135021,
"grad_norm": 2.459091901779175,
"learning_rate": 1.5431317391744167e-06,
"loss": 0.2252277433872223,
"step": 1522
},
{
"epoch": 3.2151898734177213,
"grad_norm": 2.103160858154297,
"learning_rate": 1.535507087755956e-06,
"loss": 0.548999011516571,
"step": 1524
},
{
"epoch": 3.219409282700422,
"grad_norm": 1.064772129058838,
"learning_rate": 1.527917300530938e-06,
"loss": 0.7090752124786377,
"step": 1526
},
{
"epoch": 3.2236286919831225,
"grad_norm": 0.5920833945274353,
"learning_rate": 1.5203624698758573e-06,
"loss": 0.28943130373954773,
"step": 1528
},
{
"epoch": 3.2278481012658227,
"grad_norm": 2.5098395347595215,
"learning_rate": 1.5128426877417428e-06,
"loss": 0.5822982788085938,
"step": 1530
},
{
"epoch": 3.2320675105485233,
"grad_norm": 0.6460347175598145,
"learning_rate": 1.5053580456530459e-06,
"loss": 0.15637226402759552,
"step": 1532
},
{
"epoch": 3.2362869198312234,
"grad_norm": 1.804608702659607,
"learning_rate": 1.4979086347065225e-06,
"loss": 0.7296754121780396,
"step": 1534
},
{
"epoch": 3.240506329113924,
"grad_norm": 1.5082496404647827,
"learning_rate": 1.4904945455701232e-06,
"loss": 0.7508465647697449,
"step": 1536
},
{
"epoch": 3.2447257383966246,
"grad_norm": 1.1056941747665405,
"learning_rate": 1.4831158684818917e-06,
"loss": 0.6265556812286377,
"step": 1538
},
{
"epoch": 3.2489451476793247,
"grad_norm": 2.1995933055877686,
"learning_rate": 1.4757726932488672e-06,
"loss": 0.5779432058334351,
"step": 1540
},
{
"epoch": 3.2531645569620253,
"grad_norm": 2.594663619995117,
"learning_rate": 1.4684651092459906e-06,
"loss": 0.4649961590766907,
"step": 1542
},
{
"epoch": 3.257383966244726,
"grad_norm": 2.5885109901428223,
"learning_rate": 1.4611932054150132e-06,
"loss": 0.5126054883003235,
"step": 1544
},
{
"epoch": 3.261603375527426,
"grad_norm": 2.8481526374816895,
"learning_rate": 1.4539570702634208e-06,
"loss": 0.49317800998687744,
"step": 1546
},
{
"epoch": 3.2658227848101267,
"grad_norm": 1.6855295896530151,
"learning_rate": 1.446756791863351e-06,
"loss": 0.6522631049156189,
"step": 1548
},
{
"epoch": 3.270042194092827,
"grad_norm": 2.981158971786499,
"learning_rate": 1.4395924578505253e-06,
"loss": 0.20762769877910614,
"step": 1550
},
{
"epoch": 3.2742616033755274,
"grad_norm": 0.8789273500442505,
"learning_rate": 1.4324641554231767e-06,
"loss": 0.2234586775302887,
"step": 1552
},
{
"epoch": 3.278481012658228,
"grad_norm": 5.3056182861328125,
"learning_rate": 1.4253719713409958e-06,
"loss": 0.40713340044021606,
"step": 1554
},
{
"epoch": 3.282700421940928,
"grad_norm": 0.8367089033126831,
"learning_rate": 1.41831599192407e-06,
"loss": 0.7326263189315796,
"step": 1556
},
{
"epoch": 3.2869198312236287,
"grad_norm": 1.1955314874649048,
"learning_rate": 1.4112963030518329e-06,
"loss": 0.5510862469673157,
"step": 1558
},
{
"epoch": 3.291139240506329,
"grad_norm": 1.1264405250549316,
"learning_rate": 1.4043129901620198e-06,
"loss": 0.44987189769744873,
"step": 1560
},
{
"epoch": 3.2953586497890295,
"grad_norm": 2.407663345336914,
"learning_rate": 1.397366138249633e-06,
"loss": 0.42221248149871826,
"step": 1562
},
{
"epoch": 3.29957805907173,
"grad_norm": 2.001704692840576,
"learning_rate": 1.3904558318658964e-06,
"loss": 0.7191241383552551,
"step": 1564
},
{
"epoch": 3.3037974683544302,
"grad_norm": 2.9357941150665283,
"learning_rate": 1.3835821551172352e-06,
"loss": 0.5609620809555054,
"step": 1566
},
{
"epoch": 3.308016877637131,
"grad_norm": 0.1518426090478897,
"learning_rate": 1.3767451916642502e-06,
"loss": 0.3671785891056061,
"step": 1568
},
{
"epoch": 3.3122362869198314,
"grad_norm": 2.9103848934173584,
"learning_rate": 1.3699450247206987e-06,
"loss": 0.3877882659435272,
"step": 1570
},
{
"epoch": 3.3164556962025316,
"grad_norm": 1.832383394241333,
"learning_rate": 1.363181737052479e-06,
"loss": 0.38887959718704224,
"step": 1572
},
{
"epoch": 3.320675105485232,
"grad_norm": 1.458479404449463,
"learning_rate": 1.3564554109766303e-06,
"loss": 0.87562096118927,
"step": 1574
},
{
"epoch": 3.3248945147679323,
"grad_norm": 1.4098705053329468,
"learning_rate": 1.3497661283603241e-06,
"loss": 0.618715763092041,
"step": 1576
},
{
"epoch": 3.329113924050633,
"grad_norm": 0.9463833570480347,
"learning_rate": 1.3431139706198703e-06,
"loss": 0.7363364100456238,
"step": 1578
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.1084620952606201,
"learning_rate": 1.336499018719726e-06,
"loss": 0.46182820200920105,
"step": 1580
},
{
"epoch": 3.3375527426160336,
"grad_norm": 2.6807730197906494,
"learning_rate": 1.3299213531715104e-06,
"loss": 0.4027124345302582,
"step": 1582
},
{
"epoch": 3.3417721518987342,
"grad_norm": 3.307328939437866,
"learning_rate": 1.3233810540330258e-06,
"loss": 0.7045289278030396,
"step": 1584
},
{
"epoch": 3.3459915611814344,
"grad_norm": 3.4561619758605957,
"learning_rate": 1.3168782009072792e-06,
"loss": 0.5450237989425659,
"step": 1586
},
{
"epoch": 3.350210970464135,
"grad_norm": 0.9626051783561707,
"learning_rate": 1.3104128729415191e-06,
"loss": 0.29501575231552124,
"step": 1588
},
{
"epoch": 3.3544303797468356,
"grad_norm": 0.9624335169792175,
"learning_rate": 1.3039851488262682e-06,
"loss": 0.7472168207168579,
"step": 1590
},
{
"epoch": 3.3586497890295357,
"grad_norm": 1.6804134845733643,
"learning_rate": 1.2975951067943673e-06,
"loss": 0.7001281976699829,
"step": 1592
},
{
"epoch": 3.3628691983122363,
"grad_norm": 0.3944559097290039,
"learning_rate": 1.2912428246200215e-06,
"loss": 0.45443102717399597,
"step": 1594
},
{
"epoch": 3.367088607594937,
"grad_norm": 3.5907063484191895,
"learning_rate": 1.2849283796178554e-06,
"loss": 0.32309669256210327,
"step": 1596
},
{
"epoch": 3.371308016877637,
"grad_norm": 1.1190893650054932,
"learning_rate": 1.2786518486419726e-06,
"loss": 0.369854599237442,
"step": 1598
},
{
"epoch": 3.3755274261603376,
"grad_norm": 5.47310733795166,
"learning_rate": 1.2724133080850176e-06,
"loss": 0.5572913289070129,
"step": 1600
},
{
"epoch": 3.379746835443038,
"grad_norm": 1.1562130451202393,
"learning_rate": 1.266212833877248e-06,
"loss": 0.4165474772453308,
"step": 1602
},
{
"epoch": 3.3839662447257384,
"grad_norm": 0.32527777552604675,
"learning_rate": 1.2600505014856088e-06,
"loss": 0.3750830888748169,
"step": 1604
},
{
"epoch": 3.388185654008439,
"grad_norm": 1.6657167673110962,
"learning_rate": 1.253926385912818e-06,
"loss": 0.8115463852882385,
"step": 1606
},
{
"epoch": 3.392405063291139,
"grad_norm": 1.3920835256576538,
"learning_rate": 1.2478405616964485e-06,
"loss": 0.4179677963256836,
"step": 1608
},
{
"epoch": 3.3966244725738397,
"grad_norm": 1.1664825677871704,
"learning_rate": 1.2417931029080215e-06,
"loss": 0.41709059476852417,
"step": 1610
},
{
"epoch": 3.40084388185654,
"grad_norm": 1.5139544010162354,
"learning_rate": 1.23578408315211e-06,
"loss": 0.7101098299026489,
"step": 1612
},
{
"epoch": 3.4050632911392404,
"grad_norm": 0.8697150945663452,
"learning_rate": 1.2298135755654378e-06,
"loss": 0.20523357391357422,
"step": 1614
},
{
"epoch": 3.409282700421941,
"grad_norm": 2.3192219734191895,
"learning_rate": 1.2238816528159904e-06,
"loss": 0.6002774238586426,
"step": 1616
},
{
"epoch": 3.413502109704641,
"grad_norm": 1.1426221132278442,
"learning_rate": 1.2179883871021322e-06,
"loss": 0.8457775712013245,
"step": 1618
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.9432998895645142,
"learning_rate": 1.2121338501517264e-06,
"loss": 0.7718835473060608,
"step": 1620
},
{
"epoch": 3.4219409282700424,
"grad_norm": 0.8319332599639893,
"learning_rate": 1.2063181132212632e-06,
"loss": 0.43066444993019104,
"step": 1622
},
{
"epoch": 3.4261603375527425,
"grad_norm": 6.067880630493164,
"learning_rate": 1.200541247094989e-06,
"loss": 0.23788659274578094,
"step": 1624
},
{
"epoch": 3.430379746835443,
"grad_norm": 5.315701007843018,
"learning_rate": 1.1948033220840512e-06,
"loss": 0.17813172936439514,
"step": 1626
},
{
"epoch": 3.4345991561181437,
"grad_norm": 1.0699946880340576,
"learning_rate": 1.1891044080256355e-06,
"loss": 0.67367023229599,
"step": 1628
},
{
"epoch": 3.438818565400844,
"grad_norm": 1.1057825088500977,
"learning_rate": 1.1834445742821226e-06,
"loss": 0.27095526456832886,
"step": 1630
},
{
"epoch": 3.4430379746835444,
"grad_norm": 1.340883493423462,
"learning_rate": 1.1778238897402362e-06,
"loss": 0.8471240401268005,
"step": 1632
},
{
"epoch": 3.4472573839662446,
"grad_norm": 0.34447231888771057,
"learning_rate": 1.1722424228102123e-06,
"loss": 0.4438764452934265,
"step": 1634
},
{
"epoch": 3.451476793248945,
"grad_norm": 3.183422327041626,
"learning_rate": 1.1667002414249631e-06,
"loss": 0.7752975225448608,
"step": 1636
},
{
"epoch": 3.4556962025316453,
"grad_norm": 2.9104697704315186,
"learning_rate": 1.1611974130392475e-06,
"loss": 0.9540504813194275,
"step": 1638
},
{
"epoch": 3.459915611814346,
"grad_norm": 1.6280553340911865,
"learning_rate": 1.1557340046288554e-06,
"loss": 0.8632485270500183,
"step": 1640
},
{
"epoch": 3.4641350210970465,
"grad_norm": 2.387031078338623,
"learning_rate": 1.1503100826897889e-06,
"loss": 0.8144734501838684,
"step": 1642
},
{
"epoch": 3.4683544303797467,
"grad_norm": 0.9215964674949646,
"learning_rate": 1.144925713237456e-06,
"loss": 0.20231464505195618,
"step": 1644
},
{
"epoch": 3.4725738396624473,
"grad_norm": 1.58251953125,
"learning_rate": 1.1395809618058614e-06,
"loss": 0.5774148106575012,
"step": 1646
},
{
"epoch": 3.476793248945148,
"grad_norm": 2.2536582946777344,
"learning_rate": 1.1342758934468158e-06,
"loss": 0.6982643604278564,
"step": 1648
},
{
"epoch": 3.481012658227848,
"grad_norm": 1.4097844362258911,
"learning_rate": 1.12901057272914e-06,
"loss": 0.38915500044822693,
"step": 1650
},
{
"epoch": 3.4852320675105486,
"grad_norm": 1.30046546459198,
"learning_rate": 1.1237850637378808e-06,
"loss": 0.6481969356536865,
"step": 1652
},
{
"epoch": 3.489451476793249,
"grad_norm": 0.18971386551856995,
"learning_rate": 1.1185994300735278e-06,
"loss": 0.3767941892147064,
"step": 1654
},
{
"epoch": 3.4936708860759493,
"grad_norm": 0.3824913203716278,
"learning_rate": 1.1134537348512443e-06,
"loss": 0.5739644169807434,
"step": 1656
},
{
"epoch": 3.49789029535865,
"grad_norm": 2.9707915782928467,
"learning_rate": 1.1083480407000954e-06,
"loss": 0.609894335269928,
"step": 1658
},
{
"epoch": 3.50210970464135,
"grad_norm": 1.3457541465759277,
"learning_rate": 1.103282409762287e-06,
"loss": 0.6929283142089844,
"step": 1660
},
{
"epoch": 3.5063291139240507,
"grad_norm": 2.39221453666687,
"learning_rate": 1.0982569036924092e-06,
"loss": 0.8087446093559265,
"step": 1662
},
{
"epoch": 3.510548523206751,
"grad_norm": 5.895007610321045,
"learning_rate": 1.0932715836566866e-06,
"loss": 0.3411268889904022,
"step": 1664
},
{
"epoch": 3.5147679324894514,
"grad_norm": 1.3041728734970093,
"learning_rate": 1.0883265103322333e-06,
"loss": 0.8067029714584351,
"step": 1666
},
{
"epoch": 3.518987341772152,
"grad_norm": 1.6455022096633911,
"learning_rate": 1.083421743906313e-06,
"loss": 0.4951574504375458,
"step": 1668
},
{
"epoch": 3.523206751054852,
"grad_norm": 1.431204915046692,
"learning_rate": 1.0785573440756093e-06,
"loss": 0.7452267408370972,
"step": 1670
},
{
"epoch": 3.5274261603375527,
"grad_norm": 7.941998481750488,
"learning_rate": 1.0737333700454966e-06,
"loss": 0.2036304473876953,
"step": 1672
},
{
"epoch": 3.5316455696202533,
"grad_norm": 1.081209659576416,
"learning_rate": 1.068949880529322e-06,
"loss": 0.4741116166114807,
"step": 1674
},
{
"epoch": 3.5358649789029535,
"grad_norm": 3.1109554767608643,
"learning_rate": 1.0642069337476872e-06,
"loss": 0.5494669675827026,
"step": 1676
},
{
"epoch": 3.540084388185654,
"grad_norm": 3.2354819774627686,
"learning_rate": 1.0595045874277425e-06,
"loss": 0.4578985571861267,
"step": 1678
},
{
"epoch": 3.5443037974683547,
"grad_norm": 1.4328290224075317,
"learning_rate": 1.0548428988024858e-06,
"loss": 0.7518556714057922,
"step": 1680
},
{
"epoch": 3.548523206751055,
"grad_norm": 1.069136619567871,
"learning_rate": 1.050221924610061e-06,
"loss": 0.567197859287262,
"step": 1682
},
{
"epoch": 3.5527426160337554,
"grad_norm": 21.512428283691406,
"learning_rate": 1.045641721093071e-06,
"loss": 0.6879177093505859,
"step": 1684
},
{
"epoch": 3.5569620253164556,
"grad_norm": 3.211840867996216,
"learning_rate": 1.041102343997893e-06,
"loss": 0.23187503218650818,
"step": 1686
},
{
"epoch": 3.561181434599156,
"grad_norm": 0.7154665589332581,
"learning_rate": 1.0366038485739996e-06,
"loss": 0.4495694935321808,
"step": 1688
},
{
"epoch": 3.5654008438818563,
"grad_norm": 1.3137481212615967,
"learning_rate": 1.032146289573284e-06,
"loss": 0.7427676320075989,
"step": 1690
},
{
"epoch": 3.569620253164557,
"grad_norm": 4.688238620758057,
"learning_rate": 1.027729721249399e-06,
"loss": 0.16239574551582336,
"step": 1692
},
{
"epoch": 3.5738396624472575,
"grad_norm": 0.26294824481010437,
"learning_rate": 1.023354197357091e-06,
"loss": 0.6016992926597595,
"step": 1694
},
{
"epoch": 3.5780590717299576,
"grad_norm": 2.513110637664795,
"learning_rate": 1.0190197711515498e-06,
"loss": 0.20142441987991333,
"step": 1696
},
{
"epoch": 3.5822784810126582,
"grad_norm": 1.3879189491271973,
"learning_rate": 1.014726495387757e-06,
"loss": 0.5553002953529358,
"step": 1698
},
{
"epoch": 3.586497890295359,
"grad_norm": 1.3632709980010986,
"learning_rate": 1.0104744223198471e-06,
"loss": 0.4727664589881897,
"step": 1700
},
{
"epoch": 3.590717299578059,
"grad_norm": 1.0121846199035645,
"learning_rate": 1.0062636037004696e-06,
"loss": 0.3748111128807068,
"step": 1702
},
{
"epoch": 3.5949367088607596,
"grad_norm": 1.831874132156372,
"learning_rate": 1.0020940907801604e-06,
"loss": 0.869547963142395,
"step": 1704
},
{
"epoch": 3.59915611814346,
"grad_norm": 7.198643684387207,
"learning_rate": 9.979659343067154e-07,
"loss": 0.5534847974777222,
"step": 1706
},
{
"epoch": 3.6033755274261603,
"grad_norm": 2.4725635051727295,
"learning_rate": 9.938791845245768e-07,
"loss": 0.5149208307266235,
"step": 1708
},
{
"epoch": 3.607594936708861,
"grad_norm": 0.5918768048286438,
"learning_rate": 9.898338911742186e-07,
"loss": 0.364676296710968,
"step": 1710
},
{
"epoch": 3.611814345991561,
"grad_norm": 1.779348611831665,
"learning_rate": 9.85830103491541e-07,
"loss": 0.7533677816390991,
"step": 1712
},
{
"epoch": 3.6160337552742616,
"grad_norm": 3.70202374458313,
"learning_rate": 9.818678702072734e-07,
"loss": 0.9169490933418274,
"step": 1714
},
{
"epoch": 3.620253164556962,
"grad_norm": 1.195534110069275,
"learning_rate": 9.779472395463802e-07,
"loss": 0.39904284477233887,
"step": 1716
},
{
"epoch": 3.6244725738396624,
"grad_norm": 1.971677303314209,
"learning_rate": 9.740682592274744e-07,
"loss": 0.3311789035797119,
"step": 1718
},
{
"epoch": 3.628691983122363,
"grad_norm": 1.8470239639282227,
"learning_rate": 9.702309764622328e-07,
"loss": 0.1799009144306183,
"step": 1720
},
{
"epoch": 3.632911392405063,
"grad_norm": 2.5582504272460938,
"learning_rate": 9.664354379548284e-07,
"loss": 0.8046585321426392,
"step": 1722
},
{
"epoch": 3.6371308016877637,
"grad_norm": 3.1312508583068848,
"learning_rate": 9.62681689901357e-07,
"loss": 0.3371848165988922,
"step": 1724
},
{
"epoch": 3.6413502109704643,
"grad_norm": 2.6263599395751953,
"learning_rate": 9.589697779892765e-07,
"loss": 0.2725059986114502,
"step": 1726
},
{
"epoch": 3.6455696202531644,
"grad_norm": 1.8412586450576782,
"learning_rate": 9.552997473968485e-07,
"loss": 0.8444567918777466,
"step": 1728
},
{
"epoch": 3.649789029535865,
"grad_norm": 2.1324095726013184,
"learning_rate": 9.516716427925936e-07,
"loss": 0.15560747683048248,
"step": 1730
},
{
"epoch": 3.6540084388185656,
"grad_norm": 3.671393394470215,
"learning_rate": 9.480855083347428e-07,
"loss": 0.7069560289382935,
"step": 1732
},
{
"epoch": 3.6582278481012658,
"grad_norm": 2.5802621841430664,
"learning_rate": 9.445413876707028e-07,
"loss": 0.2358541190624237,
"step": 1734
},
{
"epoch": 3.6624472573839664,
"grad_norm": 1.5859323740005493,
"learning_rate": 9.41039323936522e-07,
"loss": 0.20277546346187592,
"step": 1736
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.9495890736579895,
"learning_rate": 9.375793597563692e-07,
"loss": 0.5327252745628357,
"step": 1738
},
{
"epoch": 3.670886075949367,
"grad_norm": 5.723246097564697,
"learning_rate": 9.341615372420126e-07,
"loss": 0.2760300636291504,
"step": 1740
},
{
"epoch": 3.6751054852320673,
"grad_norm": 1.0034908056259155,
"learning_rate": 9.307858979923064e-07,
"loss": 0.905087411403656,
"step": 1742
},
{
"epoch": 3.679324894514768,
"grad_norm": 0.9102334976196289,
"learning_rate": 9.274524830926866e-07,
"loss": 0.40605294704437256,
"step": 1744
},
{
"epoch": 3.6835443037974684,
"grad_norm": 1.6663873195648193,
"learning_rate": 9.241613331146703e-07,
"loss": 0.4531800448894501,
"step": 1746
},
{
"epoch": 3.6877637130801686,
"grad_norm": 0.9356587529182434,
"learning_rate": 9.209124881153613e-07,
"loss": 0.8058107495307922,
"step": 1748
},
{
"epoch": 3.691983122362869,
"grad_norm": 1.130460500717163,
"learning_rate": 9.177059876369619e-07,
"loss": 0.5929072499275208,
"step": 1750
},
{
"epoch": 3.6962025316455698,
"grad_norm": 1.5055688619613647,
"learning_rate": 9.145418707062941e-07,
"loss": 0.7090030908584595,
"step": 1752
},
{
"epoch": 3.70042194092827,
"grad_norm": 1.6110928058624268,
"learning_rate": 9.114201758343216e-07,
"loss": 0.8376182913780212,
"step": 1754
},
{
"epoch": 3.7046413502109705,
"grad_norm": 2.095933198928833,
"learning_rate": 9.083409410156845e-07,
"loss": 0.6055005788803101,
"step": 1756
},
{
"epoch": 3.708860759493671,
"grad_norm": 4.733712673187256,
"learning_rate": 9.053042037282327e-07,
"loss": 0.6132983565330505,
"step": 1758
},
{
"epoch": 3.7130801687763713,
"grad_norm": 5.780431270599365,
"learning_rate": 9.023100009325733e-07,
"loss": 0.5792241096496582,
"step": 1760
},
{
"epoch": 3.717299578059072,
"grad_norm": 2.4617018699645996,
"learning_rate": 8.993583690716196e-07,
"loss": 0.16029909253120422,
"step": 1762
},
{
"epoch": 3.721518987341772,
"grad_norm": 22.150638580322266,
"learning_rate": 8.964493440701455e-07,
"loss": 0.41341426968574524,
"step": 1764
},
{
"epoch": 3.7257383966244726,
"grad_norm": 1.3757541179656982,
"learning_rate": 8.935829613343528e-07,
"loss": 0.6639930605888367,
"step": 1766
},
{
"epoch": 3.7299578059071727,
"grad_norm": 2.44677472114563,
"learning_rate": 8.907592557514363e-07,
"loss": 0.404757022857666,
"step": 1768
},
{
"epoch": 3.7341772151898733,
"grad_norm": 1.3542742729187012,
"learning_rate": 8.8797826168916e-07,
"loss": 0.539573073387146,
"step": 1770
},
{
"epoch": 3.738396624472574,
"grad_norm": 1.3471025228500366,
"learning_rate": 8.852400129954396e-07,
"loss": 0.7064318656921387,
"step": 1772
},
{
"epoch": 3.742616033755274,
"grad_norm": 0.8148087859153748,
"learning_rate": 8.825445429979306e-07,
"loss": 0.22752483189105988,
"step": 1774
},
{
"epoch": 3.7468354430379747,
"grad_norm": 0.2493860125541687,
"learning_rate": 8.798918845036217e-07,
"loss": 0.4672152101993561,
"step": 1776
},
{
"epoch": 3.7510548523206753,
"grad_norm": 1.1036282777786255,
"learning_rate": 8.772820697984369e-07,
"loss": 0.6906728148460388,
"step": 1778
},
{
"epoch": 3.7552742616033754,
"grad_norm": 1.202394962310791,
"learning_rate": 8.747151306468404e-07,
"loss": 0.689781904220581,
"step": 1780
},
{
"epoch": 3.759493670886076,
"grad_norm": 1.5368152856826782,
"learning_rate": 8.721910982914527e-07,
"loss": 0.6156559586524963,
"step": 1782
},
{
"epoch": 3.7637130801687766,
"grad_norm": 2.000227928161621,
"learning_rate": 8.697100034526685e-07,
"loss": 0.6539533734321594,
"step": 1784
},
{
"epoch": 3.7679324894514767,
"grad_norm": 1.3653666973114014,
"learning_rate": 8.672718763282814e-07,
"loss": 0.7773669362068176,
"step": 1786
},
{
"epoch": 3.7721518987341773,
"grad_norm": 4.830440521240234,
"learning_rate": 8.648767465931215e-07,
"loss": 0.11648596078157425,
"step": 1788
},
{
"epoch": 3.7763713080168775,
"grad_norm": 1.1531578302383423,
"learning_rate": 8.625246433986894e-07,
"loss": 0.3612111806869507,
"step": 1790
},
{
"epoch": 3.780590717299578,
"grad_norm": 1.085537075996399,
"learning_rate": 8.602155953728014e-07,
"loss": 0.7319397330284119,
"step": 1792
},
{
"epoch": 3.7848101265822782,
"grad_norm": 2.1902003288269043,
"learning_rate": 8.579496306192452e-07,
"loss": 0.42418360710144043,
"step": 1794
},
{
"epoch": 3.789029535864979,
"grad_norm": 2.2132256031036377,
"learning_rate": 8.557267767174329e-07,
"loss": 0.6966800093650818,
"step": 1796
},
{
"epoch": 3.7932489451476794,
"grad_norm": 15.147263526916504,
"learning_rate": 8.535470607220696e-07,
"loss": 0.7651135325431824,
"step": 1798
},
{
"epoch": 3.7974683544303796,
"grad_norm": 1.363494634628296,
"learning_rate": 8.514105091628205e-07,
"loss": 0.6677999496459961,
"step": 1800
},
{
"epoch": 3.80168776371308,
"grad_norm": 2.6623036861419678,
"learning_rate": 8.493171480439908e-07,
"loss": 0.8932458758354187,
"step": 1802
},
{
"epoch": 3.8059071729957807,
"grad_norm": 1.0123828649520874,
"learning_rate": 8.47267002844208e-07,
"loss": 0.3895692527294159,
"step": 1804
},
{
"epoch": 3.810126582278481,
"grad_norm": 7.874610900878906,
"learning_rate": 8.452600985161112e-07,
"loss": 0.0816773921251297,
"step": 1806
},
{
"epoch": 3.8143459915611815,
"grad_norm": 2.1802990436553955,
"learning_rate": 8.432964594860478e-07,
"loss": 0.7556171417236328,
"step": 1808
},
{
"epoch": 3.818565400843882,
"grad_norm": 1.1918423175811768,
"learning_rate": 8.413761096537786e-07,
"loss": 0.6875542402267456,
"step": 1810
},
{
"epoch": 3.8227848101265822,
"grad_norm": 1.7377078533172607,
"learning_rate": 8.394990723921816e-07,
"loss": 0.29866987466812134,
"step": 1812
},
{
"epoch": 3.827004219409283,
"grad_norm": 1.0950947999954224,
"learning_rate": 8.376653705469733e-07,
"loss": 0.7598391771316528,
"step": 1814
},
{
"epoch": 3.831223628691983,
"grad_norm": 2.3216395378112793,
"learning_rate": 8.358750264364267e-07,
"loss": 0.7117894291877747,
"step": 1816
},
{
"epoch": 3.8354430379746836,
"grad_norm": 4.284765720367432,
"learning_rate": 8.341280618511016e-07,
"loss": 0.6586706042289734,
"step": 1818
},
{
"epoch": 3.8396624472573837,
"grad_norm": 2.1526107788085938,
"learning_rate": 8.324244980535782e-07,
"loss": 0.5206190347671509,
"step": 1820
},
{
"epoch": 3.8438818565400843,
"grad_norm": 1.1617799997329712,
"learning_rate": 8.307643557781994e-07,
"loss": 0.7454214692115784,
"step": 1822
},
{
"epoch": 3.848101265822785,
"grad_norm": 1.9797450304031372,
"learning_rate": 8.291476552308179e-07,
"loss": 0.6207857728004456,
"step": 1824
},
{
"epoch": 3.852320675105485,
"grad_norm": 1.9322015047073364,
"learning_rate": 8.275744160885501e-07,
"loss": 0.685775876045227,
"step": 1826
},
{
"epoch": 3.8565400843881856,
"grad_norm": 2.0633387565612793,
"learning_rate": 8.260446574995363e-07,
"loss": 0.7667111754417419,
"step": 1828
},
{
"epoch": 3.8607594936708862,
"grad_norm": 1.2368988990783691,
"learning_rate": 8.245583980827098e-07,
"loss": 0.6670578718185425,
"step": 1830
},
{
"epoch": 3.8649789029535864,
"grad_norm": 2.3721048831939697,
"learning_rate": 8.231156559275666e-07,
"loss": 0.15816515684127808,
"step": 1832
},
{
"epoch": 3.869198312236287,
"grad_norm": 3.6182823181152344,
"learning_rate": 8.217164485939484e-07,
"loss": 0.4539300501346588,
"step": 1834
},
{
"epoch": 3.8734177215189876,
"grad_norm": 4.013774394989014,
"learning_rate": 8.203607931118281e-07,
"loss": 0.5095362663269043,
"step": 1836
},
{
"epoch": 3.8776371308016877,
"grad_norm": 2.4649147987365723,
"learning_rate": 8.190487059811013e-07,
"loss": 0.4961618483066559,
"step": 1838
},
{
"epoch": 3.8818565400843883,
"grad_norm": 4.491702079772949,
"learning_rate": 8.177802031713863e-07,
"loss": 0.7962309122085571,
"step": 1840
},
{
"epoch": 3.8860759493670884,
"grad_norm": 0.9100720286369324,
"learning_rate": 8.165553001218308e-07,
"loss": 0.4460848867893219,
"step": 1842
},
{
"epoch": 3.890295358649789,
"grad_norm": 0.9674596786499023,
"learning_rate": 8.153740117409218e-07,
"loss": 0.44232675433158875,
"step": 1844
},
{
"epoch": 3.894514767932489,
"grad_norm": 1.0640747547149658,
"learning_rate": 8.142363524063067e-07,
"loss": 0.7083509564399719,
"step": 1846
},
{
"epoch": 3.8987341772151898,
"grad_norm": 7.521092414855957,
"learning_rate": 8.131423359646147e-07,
"loss": 0.309792697429657,
"step": 1848
},
{
"epoch": 3.9029535864978904,
"grad_norm": 1.9627479314804077,
"learning_rate": 8.120919757312934e-07,
"loss": 0.7434027194976807,
"step": 1850
},
{
"epoch": 3.9071729957805905,
"grad_norm": 1.8858873844146729,
"learning_rate": 8.110852844904411e-07,
"loss": 0.7783426642417908,
"step": 1852
},
{
"epoch": 3.911392405063291,
"grad_norm": 1.578383445739746,
"learning_rate": 8.101222744946554e-07,
"loss": 0.7528443336486816,
"step": 1854
},
{
"epoch": 3.9156118143459917,
"grad_norm": 1.0352789163589478,
"learning_rate": 8.092029574648825e-07,
"loss": 0.6360561847686768,
"step": 1856
},
{
"epoch": 3.919831223628692,
"grad_norm": 1.9537928104400635,
"learning_rate": 8.08327344590275e-07,
"loss": 0.7542226314544678,
"step": 1858
},
{
"epoch": 3.9240506329113924,
"grad_norm": 1.5767334699630737,
"learning_rate": 8.074954465280533e-07,
"loss": 0.7059440016746521,
"step": 1860
},
{
"epoch": 3.928270042194093,
"grad_norm": 1.4566371440887451,
"learning_rate": 8.067072734033808e-07,
"loss": 0.44404223561286926,
"step": 1862
},
{
"epoch": 3.932489451476793,
"grad_norm": 1.6387444734573364,
"learning_rate": 8.05962834809236e-07,
"loss": 0.4295271039009094,
"step": 1864
},
{
"epoch": 3.9367088607594938,
"grad_norm": 4.41506290435791,
"learning_rate": 8.052621398062982e-07,
"loss": 0.9274621605873108,
"step": 1866
},
{
"epoch": 3.9409282700421944,
"grad_norm": 2.0459539890289307,
"learning_rate": 8.046051969228362e-07,
"loss": 0.6663318872451782,
"step": 1868
},
{
"epoch": 3.9451476793248945,
"grad_norm": 1.6559040546417236,
"learning_rate": 8.039920141546053e-07,
"loss": 0.5702696442604065,
"step": 1870
},
{
"epoch": 3.9493670886075947,
"grad_norm": 0.035610347986221313,
"learning_rate": 8.034225989647494e-07,
"loss": 0.2956307530403137,
"step": 1872
},
{
"epoch": 3.9535864978902953,
"grad_norm": 5.27208948135376,
"learning_rate": 8.028969582837097e-07,
"loss": 0.22891630232334137,
"step": 1874
},
{
"epoch": 3.957805907172996,
"grad_norm": 1.6149741411209106,
"learning_rate": 8.024150985091419e-07,
"loss": 0.5240350961685181,
"step": 1876
},
{
"epoch": 3.962025316455696,
"grad_norm": 3.051912307739258,
"learning_rate": 8.019770255058373e-07,
"loss": 0.6355645060539246,
"step": 1878
},
{
"epoch": 3.9662447257383966,
"grad_norm": 1.413213849067688,
"learning_rate": 8.015827446056511e-07,
"loss": 0.4071570634841919,
"step": 1880
},
{
"epoch": 3.970464135021097,
"grad_norm": 1.0149176120758057,
"learning_rate": 8.012322606074381e-07,
"loss": 0.6791200637817383,
"step": 1882
},
{
"epoch": 3.9746835443037973,
"grad_norm": 6.349963188171387,
"learning_rate": 8.009255777769939e-07,
"loss": 0.1739484965801239,
"step": 1884
},
{
"epoch": 3.978902953586498,
"grad_norm": 1.85853111743927,
"learning_rate": 8.006626998470039e-07,
"loss": 0.6670107245445251,
"step": 1886
},
{
"epoch": 3.9831223628691985,
"grad_norm": 1.893870234489441,
"learning_rate": 8.004436300169959e-07,
"loss": 0.5138272047042847,
"step": 1888
},
{
"epoch": 3.9873417721518987,
"grad_norm": 3.016247272491455,
"learning_rate": 8.002683709533043e-07,
"loss": 0.7126239538192749,
"step": 1890
},
{
"epoch": 3.9915611814345993,
"grad_norm": 2.8701586723327637,
"learning_rate": 8.001369247890338e-07,
"loss": 0.4470701813697815,
"step": 1892
},
{
"epoch": 3.9957805907173,
"grad_norm": 1.1190752983093262,
"learning_rate": 8.00049293124037e-07,
"loss": 0.6980884075164795,
"step": 1894
},
{
"epoch": 4.0,
"grad_norm": 1.7359868288040161,
"learning_rate": 8.000054770248921e-07,
"loss": 0.6384545564651489,
"step": 1896
},
{
"epoch": 4.0,
"step": 1896,
"total_flos": 3.5948540672197263e+18,
"train_loss": 0.8366947202287017,
"train_runtime": 8313.2571,
"train_samples_per_second": 6.842,
"train_steps_per_second": 0.228
}
],
"logging_steps": 2,
"max_steps": 1896,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.5948540672197263e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}