{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 990,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030303030303030303,
"grad_norm": 6.14251184463501,
"learning_rate": 1.0101010101010103e-07,
"loss": 0.8811,
"step": 1
},
{
"epoch": 0.006060606060606061,
"grad_norm": 5.7579426765441895,
"learning_rate": 2.0202020202020205e-07,
"loss": 0.8291,
"step": 2
},
{
"epoch": 0.00909090909090909,
"grad_norm": 6.182405471801758,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.8745,
"step": 3
},
{
"epoch": 0.012121212121212121,
"grad_norm": 5.871363162994385,
"learning_rate": 4.040404040404041e-07,
"loss": 0.8764,
"step": 4
},
{
"epoch": 0.015151515151515152,
"grad_norm": 5.848386287689209,
"learning_rate": 5.05050505050505e-07,
"loss": 0.8469,
"step": 5
},
{
"epoch": 0.01818181818181818,
"grad_norm": 5.616578102111816,
"learning_rate": 6.060606060606061e-07,
"loss": 0.8085,
"step": 6
},
{
"epoch": 0.021212121212121213,
"grad_norm": 6.289897441864014,
"learning_rate": 7.070707070707071e-07,
"loss": 0.8985,
"step": 7
},
{
"epoch": 0.024242424242424242,
"grad_norm": 5.57948112487793,
"learning_rate": 8.080808080808082e-07,
"loss": 0.8366,
"step": 8
},
{
"epoch": 0.02727272727272727,
"grad_norm": 5.735244274139404,
"learning_rate": 9.090909090909091e-07,
"loss": 0.86,
"step": 9
},
{
"epoch": 0.030303030303030304,
"grad_norm": 5.462663173675537,
"learning_rate": 1.01010101010101e-06,
"loss": 0.8469,
"step": 10
},
{
"epoch": 0.03333333333333333,
"grad_norm": 4.708677768707275,
"learning_rate": 1.111111111111111e-06,
"loss": 0.8173,
"step": 11
},
{
"epoch": 0.03636363636363636,
"grad_norm": 4.475161552429199,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.7915,
"step": 12
},
{
"epoch": 0.03939393939393939,
"grad_norm": 4.219878673553467,
"learning_rate": 1.3131313131313134e-06,
"loss": 0.8007,
"step": 13
},
{
"epoch": 0.04242424242424243,
"grad_norm": 2.822401762008667,
"learning_rate": 1.4141414141414143e-06,
"loss": 0.7783,
"step": 14
},
{
"epoch": 0.045454545454545456,
"grad_norm": 2.4995763301849365,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.7523,
"step": 15
},
{
"epoch": 0.048484848484848485,
"grad_norm": 2.4066433906555176,
"learning_rate": 1.6161616161616164e-06,
"loss": 0.7468,
"step": 16
},
{
"epoch": 0.051515151515151514,
"grad_norm": 2.2153358459472656,
"learning_rate": 1.7171717171717173e-06,
"loss": 0.7509,
"step": 17
},
{
"epoch": 0.05454545454545454,
"grad_norm": 2.120013952255249,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.7532,
"step": 18
},
{
"epoch": 0.05757575757575758,
"grad_norm": 1.9135936498641968,
"learning_rate": 1.9191919191919192e-06,
"loss": 0.721,
"step": 19
},
{
"epoch": 0.06060606060606061,
"grad_norm": 2.663780450820923,
"learning_rate": 2.02020202020202e-06,
"loss": 0.6962,
"step": 20
},
{
"epoch": 0.06363636363636363,
"grad_norm": 2.8611958026885986,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.6855,
"step": 21
},
{
"epoch": 0.06666666666666667,
"grad_norm": 3.166199207305908,
"learning_rate": 2.222222222222222e-06,
"loss": 0.7219,
"step": 22
},
{
"epoch": 0.0696969696969697,
"grad_norm": 2.878675937652588,
"learning_rate": 2.3232323232323234e-06,
"loss": 0.687,
"step": 23
},
{
"epoch": 0.07272727272727272,
"grad_norm": 2.6447913646698,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.6795,
"step": 24
},
{
"epoch": 0.07575757575757576,
"grad_norm": 2.3938543796539307,
"learning_rate": 2.5252525252525258e-06,
"loss": 0.6837,
"step": 25
},
{
"epoch": 0.07878787878787878,
"grad_norm": 1.7195990085601807,
"learning_rate": 2.6262626262626267e-06,
"loss": 0.6984,
"step": 26
},
{
"epoch": 0.08181818181818182,
"grad_norm": 1.28241765499115,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.6588,
"step": 27
},
{
"epoch": 0.08484848484848485,
"grad_norm": 1.2121256589889526,
"learning_rate": 2.8282828282828286e-06,
"loss": 0.643,
"step": 28
},
{
"epoch": 0.08787878787878788,
"grad_norm": 1.1997158527374268,
"learning_rate": 2.9292929292929295e-06,
"loss": 0.624,
"step": 29
},
{
"epoch": 0.09090909090909091,
"grad_norm": 1.0956838130950928,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.6108,
"step": 30
},
{
"epoch": 0.09393939393939393,
"grad_norm": 1.0332937240600586,
"learning_rate": 3.131313131313132e-06,
"loss": 0.6191,
"step": 31
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.1140916347503662,
"learning_rate": 3.232323232323233e-06,
"loss": 0.6529,
"step": 32
},
{
"epoch": 0.1,
"grad_norm": 0.9233384728431702,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6009,
"step": 33
},
{
"epoch": 0.10303030303030303,
"grad_norm": 0.8741363883018494,
"learning_rate": 3.4343434343434347e-06,
"loss": 0.6248,
"step": 34
},
{
"epoch": 0.10606060606060606,
"grad_norm": 0.7766424417495728,
"learning_rate": 3.5353535353535356e-06,
"loss": 0.6061,
"step": 35
},
{
"epoch": 0.10909090909090909,
"grad_norm": 0.7707573771476746,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.6274,
"step": 36
},
{
"epoch": 0.11212121212121212,
"grad_norm": 0.7274775505065918,
"learning_rate": 3.737373737373738e-06,
"loss": 0.6017,
"step": 37
},
{
"epoch": 0.11515151515151516,
"grad_norm": 0.8542383909225464,
"learning_rate": 3.8383838383838385e-06,
"loss": 0.5802,
"step": 38
},
{
"epoch": 0.11818181818181818,
"grad_norm": 0.8010094165802002,
"learning_rate": 3.93939393939394e-06,
"loss": 0.5663,
"step": 39
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.7044663429260254,
"learning_rate": 4.04040404040404e-06,
"loss": 0.5704,
"step": 40
},
{
"epoch": 0.12424242424242424,
"grad_norm": 0.6028207540512085,
"learning_rate": 4.141414141414142e-06,
"loss": 0.5762,
"step": 41
},
{
"epoch": 0.12727272727272726,
"grad_norm": 0.8017765283584595,
"learning_rate": 4.242424242424243e-06,
"loss": 0.6181,
"step": 42
},
{
"epoch": 0.1303030303030303,
"grad_norm": 0.6999064683914185,
"learning_rate": 4.343434343434344e-06,
"loss": 0.5961,
"step": 43
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.6400359272956848,
"learning_rate": 4.444444444444444e-06,
"loss": 0.5896,
"step": 44
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.6916729807853699,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.5754,
"step": 45
},
{
"epoch": 0.1393939393939394,
"grad_norm": 0.6609205007553101,
"learning_rate": 4.646464646464647e-06,
"loss": 0.5743,
"step": 46
},
{
"epoch": 0.14242424242424243,
"grad_norm": 0.5354890823364258,
"learning_rate": 4.747474747474748e-06,
"loss": 0.5791,
"step": 47
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.5994871854782104,
"learning_rate": 4.848484848484849e-06,
"loss": 0.5567,
"step": 48
},
{
"epoch": 0.1484848484848485,
"grad_norm": 0.5859790444374084,
"learning_rate": 4.94949494949495e-06,
"loss": 0.5482,
"step": 49
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.627583384513855,
"learning_rate": 5.0505050505050515e-06,
"loss": 0.5397,
"step": 50
},
{
"epoch": 0.15454545454545454,
"grad_norm": 0.48996925354003906,
"learning_rate": 5.151515151515152e-06,
"loss": 0.5541,
"step": 51
},
{
"epoch": 0.15757575757575756,
"grad_norm": 0.5651494264602661,
"learning_rate": 5.252525252525253e-06,
"loss": 0.573,
"step": 52
},
{
"epoch": 0.1606060606060606,
"grad_norm": 0.6122561097145081,
"learning_rate": 5.353535353535354e-06,
"loss": 0.5512,
"step": 53
},
{
"epoch": 0.16363636363636364,
"grad_norm": 0.49054795503616333,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.5583,
"step": 54
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.5040543079376221,
"learning_rate": 5.555555555555557e-06,
"loss": 0.5457,
"step": 55
},
{
"epoch": 0.1696969696969697,
"grad_norm": 0.5023638606071472,
"learning_rate": 5.656565656565657e-06,
"loss": 0.5338,
"step": 56
},
{
"epoch": 0.17272727272727273,
"grad_norm": 0.6167340874671936,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.5328,
"step": 57
},
{
"epoch": 0.17575757575757575,
"grad_norm": 0.5743213295936584,
"learning_rate": 5.858585858585859e-06,
"loss": 0.5312,
"step": 58
},
{
"epoch": 0.1787878787878788,
"grad_norm": 0.46841517090797424,
"learning_rate": 5.95959595959596e-06,
"loss": 0.5012,
"step": 59
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.5114443302154541,
"learning_rate": 6.060606060606061e-06,
"loss": 0.5377,
"step": 60
},
{
"epoch": 0.18484848484848485,
"grad_norm": 0.5205867886543274,
"learning_rate": 6.1616161616161615e-06,
"loss": 0.5505,
"step": 61
},
{
"epoch": 0.18787878787878787,
"grad_norm": 0.6010080575942993,
"learning_rate": 6.262626262626264e-06,
"loss": 0.5452,
"step": 62
},
{
"epoch": 0.19090909090909092,
"grad_norm": 0.5454538464546204,
"learning_rate": 6.363636363636364e-06,
"loss": 0.527,
"step": 63
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.5522276163101196,
"learning_rate": 6.464646464646466e-06,
"loss": 0.5355,
"step": 64
},
{
"epoch": 0.19696969696969696,
"grad_norm": 0.5053198933601379,
"learning_rate": 6.565656565656566e-06,
"loss": 0.5227,
"step": 65
},
{
"epoch": 0.2,
"grad_norm": 0.5276966094970703,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5341,
"step": 66
},
{
"epoch": 0.20303030303030303,
"grad_norm": 0.5439529418945312,
"learning_rate": 6.767676767676769e-06,
"loss": 0.519,
"step": 67
},
{
"epoch": 0.20606060606060606,
"grad_norm": 0.5188765525817871,
"learning_rate": 6.868686868686869e-06,
"loss": 0.5015,
"step": 68
},
{
"epoch": 0.20909090909090908,
"grad_norm": 0.5360773801803589,
"learning_rate": 6.969696969696971e-06,
"loss": 0.5246,
"step": 69
},
{
"epoch": 0.21212121212121213,
"grad_norm": 0.5436122417449951,
"learning_rate": 7.070707070707071e-06,
"loss": 0.4942,
"step": 70
},
{
"epoch": 0.21515151515151515,
"grad_norm": 0.5304339528083801,
"learning_rate": 7.171717171717172e-06,
"loss": 0.5126,
"step": 71
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.5600169897079468,
"learning_rate": 7.272727272727273e-06,
"loss": 0.5075,
"step": 72
},
{
"epoch": 0.22121212121212122,
"grad_norm": 0.6064160466194153,
"learning_rate": 7.373737373737374e-06,
"loss": 0.5287,
"step": 73
},
{
"epoch": 0.22424242424242424,
"grad_norm": 0.46182766556739807,
"learning_rate": 7.474747474747476e-06,
"loss": 0.5357,
"step": 74
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.5622665882110596,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.5162,
"step": 75
},
{
"epoch": 0.23030303030303031,
"grad_norm": 0.5098185539245605,
"learning_rate": 7.676767676767677e-06,
"loss": 0.5201,
"step": 76
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.5697974562644958,
"learning_rate": 7.77777777777778e-06,
"loss": 0.5319,
"step": 77
},
{
"epoch": 0.23636363636363636,
"grad_norm": 0.5686485171318054,
"learning_rate": 7.87878787878788e-06,
"loss": 0.5209,
"step": 78
},
{
"epoch": 0.23939393939393938,
"grad_norm": 0.541465699672699,
"learning_rate": 7.97979797979798e-06,
"loss": 0.5081,
"step": 79
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.5638330578804016,
"learning_rate": 8.08080808080808e-06,
"loss": 0.4961,
"step": 80
},
{
"epoch": 0.24545454545454545,
"grad_norm": 0.5761530995368958,
"learning_rate": 8.181818181818183e-06,
"loss": 0.5067,
"step": 81
},
{
"epoch": 0.24848484848484848,
"grad_norm": 0.48256605863571167,
"learning_rate": 8.282828282828283e-06,
"loss": 0.532,
"step": 82
},
{
"epoch": 0.2515151515151515,
"grad_norm": 0.6405602693557739,
"learning_rate": 8.383838383838384e-06,
"loss": 0.5021,
"step": 83
},
{
"epoch": 0.2545454545454545,
"grad_norm": 0.5617716908454895,
"learning_rate": 8.484848484848486e-06,
"loss": 0.4923,
"step": 84
},
{
"epoch": 0.25757575757575757,
"grad_norm": 0.5595013499259949,
"learning_rate": 8.585858585858587e-06,
"loss": 0.5194,
"step": 85
},
{
"epoch": 0.2606060606060606,
"grad_norm": 0.5715003609657288,
"learning_rate": 8.686868686868687e-06,
"loss": 0.4951,
"step": 86
},
{
"epoch": 0.2636363636363636,
"grad_norm": 0.5606926679611206,
"learning_rate": 8.787878787878788e-06,
"loss": 0.4966,
"step": 87
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.5606850981712341,
"learning_rate": 8.888888888888888e-06,
"loss": 0.4945,
"step": 88
},
{
"epoch": 0.2696969696969697,
"grad_norm": 0.5261890292167664,
"learning_rate": 8.98989898989899e-06,
"loss": 0.5184,
"step": 89
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.6513155698776245,
"learning_rate": 9.090909090909091e-06,
"loss": 0.5364,
"step": 90
},
{
"epoch": 0.27575757575757576,
"grad_norm": 0.501545786857605,
"learning_rate": 9.191919191919193e-06,
"loss": 0.5108,
"step": 91
},
{
"epoch": 0.2787878787878788,
"grad_norm": 0.5412594079971313,
"learning_rate": 9.292929292929294e-06,
"loss": 0.517,
"step": 92
},
{
"epoch": 0.2818181818181818,
"grad_norm": 0.6492443084716797,
"learning_rate": 9.393939393939396e-06,
"loss": 0.4978,
"step": 93
},
{
"epoch": 0.28484848484848485,
"grad_norm": 0.6265013217926025,
"learning_rate": 9.494949494949497e-06,
"loss": 0.5387,
"step": 94
},
{
"epoch": 0.2878787878787879,
"grad_norm": 0.6805964708328247,
"learning_rate": 9.595959595959597e-06,
"loss": 0.5024,
"step": 95
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.6327937841415405,
"learning_rate": 9.696969696969698e-06,
"loss": 0.4757,
"step": 96
},
{
"epoch": 0.29393939393939394,
"grad_norm": 0.6182631850242615,
"learning_rate": 9.797979797979798e-06,
"loss": 0.5209,
"step": 97
},
{
"epoch": 0.296969696969697,
"grad_norm": 0.564050018787384,
"learning_rate": 9.8989898989899e-06,
"loss": 0.513,
"step": 98
},
{
"epoch": 0.3,
"grad_norm": 0.5654965043067932,
"learning_rate": 1e-05,
"loss": 0.5094,
"step": 99
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.6344062089920044,
"learning_rate": 9.99996891979347e-06,
"loss": 0.508,
"step": 100
},
{
"epoch": 0.30606060606060603,
"grad_norm": 0.6290592551231384,
"learning_rate": 9.999875679560272e-06,
"loss": 0.5108,
"step": 101
},
{
"epoch": 0.3090909090909091,
"grad_norm": 0.5513220429420471,
"learning_rate": 9.999720280459576e-06,
"loss": 0.4855,
"step": 102
},
{
"epoch": 0.31212121212121213,
"grad_norm": 0.6386879682540894,
"learning_rate": 9.999502724423316e-06,
"loss": 0.4909,
"step": 103
},
{
"epoch": 0.3151515151515151,
"grad_norm": 0.555277407169342,
"learning_rate": 9.999223014156167e-06,
"loss": 0.4931,
"step": 104
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.5689773559570312,
"learning_rate": 9.99888115313551e-06,
"loss": 0.4886,
"step": 105
},
{
"epoch": 0.3212121212121212,
"grad_norm": 0.5847551226615906,
"learning_rate": 9.998477145611389e-06,
"loss": 0.518,
"step": 106
},
{
"epoch": 0.3242424242424242,
"grad_norm": 0.5526836514472961,
"learning_rate": 9.99801099660646e-06,
"loss": 0.5152,
"step": 107
},
{
"epoch": 0.32727272727272727,
"grad_norm": 0.590153694152832,
"learning_rate": 9.997482711915926e-06,
"loss": 0.5202,
"step": 108
},
{
"epoch": 0.3303030303030303,
"grad_norm": 0.524453341960907,
"learning_rate": 9.996892298107466e-06,
"loss": 0.5102,
"step": 109
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.5427029728889465,
"learning_rate": 9.996239762521152e-06,
"loss": 0.4645,
"step": 110
},
{
"epoch": 0.33636363636363636,
"grad_norm": 0.5274785757064819,
"learning_rate": 9.99552511326936e-06,
"loss": 0.5018,
"step": 111
},
{
"epoch": 0.3393939393939394,
"grad_norm": 0.5401912927627563,
"learning_rate": 9.99474835923667e-06,
"loss": 0.4916,
"step": 112
},
{
"epoch": 0.3424242424242424,
"grad_norm": 0.5039514303207397,
"learning_rate": 9.993909510079752e-06,
"loss": 0.4974,
"step": 113
},
{
"epoch": 0.34545454545454546,
"grad_norm": 0.5162566304206848,
"learning_rate": 9.993008576227248e-06,
"loss": 0.4974,
"step": 114
},
{
"epoch": 0.3484848484848485,
"grad_norm": 0.5239473581314087,
"learning_rate": 9.99204556887964e-06,
"loss": 0.4824,
"step": 115
},
{
"epoch": 0.3515151515151515,
"grad_norm": 0.46607157588005066,
"learning_rate": 9.991020500009118e-06,
"loss": 0.4699,
"step": 116
},
{
"epoch": 0.35454545454545455,
"grad_norm": 0.45448705554008484,
"learning_rate": 9.989933382359423e-06,
"loss": 0.4797,
"step": 117
},
{
"epoch": 0.3575757575757576,
"grad_norm": 0.5446470975875854,
"learning_rate": 9.988784229445689e-06,
"loss": 0.4839,
"step": 118
},
{
"epoch": 0.3606060606060606,
"grad_norm": 0.49788519740104675,
"learning_rate": 9.98757305555428e-06,
"loss": 0.5038,
"step": 119
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.5029739737510681,
"learning_rate": 9.986299875742612e-06,
"loss": 0.4795,
"step": 120
},
{
"epoch": 0.36666666666666664,
"grad_norm": 0.6325995326042175,
"learning_rate": 9.98496470583896e-06,
"loss": 0.5298,
"step": 121
},
{
"epoch": 0.3696969696969697,
"grad_norm": 0.5292807221412659,
"learning_rate": 9.98356756244227e-06,
"loss": 0.5072,
"step": 122
},
{
"epoch": 0.37272727272727274,
"grad_norm": 0.6047240495681763,
"learning_rate": 9.982108462921938e-06,
"loss": 0.497,
"step": 123
},
{
"epoch": 0.37575757575757573,
"grad_norm": 0.64040607213974,
"learning_rate": 9.980587425417612e-06,
"loss": 0.5047,
"step": 124
},
{
"epoch": 0.3787878787878788,
"grad_norm": 0.5388203263282776,
"learning_rate": 9.97900446883896e-06,
"loss": 0.489,
"step": 125
},
{
"epoch": 0.38181818181818183,
"grad_norm": 0.5968406200408936,
"learning_rate": 9.977359612865424e-06,
"loss": 0.5043,
"step": 126
},
{
"epoch": 0.38484848484848483,
"grad_norm": 0.5099135637283325,
"learning_rate": 9.975652877945991e-06,
"loss": 0.4939,
"step": 127
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.5063946843147278,
"learning_rate": 9.973884285298932e-06,
"loss": 0.4882,
"step": 128
},
{
"epoch": 0.39090909090909093,
"grad_norm": 0.5433374047279358,
"learning_rate": 9.972053856911534e-06,
"loss": 0.4902,
"step": 129
},
{
"epoch": 0.3939393939393939,
"grad_norm": 0.4687306880950928,
"learning_rate": 9.970161615539837e-06,
"loss": 0.4883,
"step": 130
},
{
"epoch": 0.396969696969697,
"grad_norm": 0.5378535985946655,
"learning_rate": 9.96820758470834e-06,
"loss": 0.5082,
"step": 131
},
{
"epoch": 0.4,
"grad_norm": 0.49391430616378784,
"learning_rate": 9.966191788709716e-06,
"loss": 0.4528,
"step": 132
},
{
"epoch": 0.403030303030303,
"grad_norm": 0.4575445055961609,
"learning_rate": 9.964114252604508e-06,
"loss": 0.4906,
"step": 133
},
{
"epoch": 0.40606060606060607,
"grad_norm": 0.602308452129364,
"learning_rate": 9.961975002220816e-06,
"loss": 0.4973,
"step": 134
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.4728878140449524,
"learning_rate": 9.959774064153977e-06,
"loss": 0.4803,
"step": 135
},
{
"epoch": 0.4121212121212121,
"grad_norm": 0.5318324565887451,
"learning_rate": 9.957511465766236e-06,
"loss": 0.4797,
"step": 136
},
{
"epoch": 0.41515151515151516,
"grad_norm": 0.5328473448753357,
"learning_rate": 9.955187235186403e-06,
"loss": 0.4889,
"step": 137
},
{
"epoch": 0.41818181818181815,
"grad_norm": 0.4802415370941162,
"learning_rate": 9.952801401309504e-06,
"loss": 0.5012,
"step": 138
},
{
"epoch": 0.4212121212121212,
"grad_norm": 0.5675839781761169,
"learning_rate": 9.950353993796424e-06,
"loss": 0.4973,
"step": 139
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.5941027998924255,
"learning_rate": 9.947845043073533e-06,
"loss": 0.4947,
"step": 140
},
{
"epoch": 0.42727272727272725,
"grad_norm": 0.5009533166885376,
"learning_rate": 9.945274580332316e-06,
"loss": 0.4963,
"step": 141
},
{
"epoch": 0.4303030303030303,
"grad_norm": 0.7182719111442566,
"learning_rate": 9.942642637528977e-06,
"loss": 0.492,
"step": 142
},
{
"epoch": 0.43333333333333335,
"grad_norm": 0.6205947995185852,
"learning_rate": 9.939949247384046e-06,
"loss": 0.4953,
"step": 143
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.581794023513794,
"learning_rate": 9.937194443381972e-06,
"loss": 0.4804,
"step": 144
},
{
"epoch": 0.4393939393939394,
"grad_norm": 0.5637320876121521,
"learning_rate": 9.934378259770708e-06,
"loss": 0.4819,
"step": 145
},
{
"epoch": 0.44242424242424244,
"grad_norm": 0.5580178499221802,
"learning_rate": 9.931500731561279e-06,
"loss": 0.4816,
"step": 146
},
{
"epoch": 0.44545454545454544,
"grad_norm": 0.5792706608772278,
"learning_rate": 9.928561894527354e-06,
"loss": 0.4912,
"step": 147
},
{
"epoch": 0.4484848484848485,
"grad_norm": 0.554824948310852,
"learning_rate": 9.925561785204797e-06,
"loss": 0.5057,
"step": 148
},
{
"epoch": 0.45151515151515154,
"grad_norm": 0.548604428768158,
"learning_rate": 9.922500440891217e-06,
"loss": 0.4796,
"step": 149
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.5710535645484924,
"learning_rate": 9.919377899645497e-06,
"loss": 0.4948,
"step": 150
},
{
"epoch": 0.4575757575757576,
"grad_norm": 0.5213266015052795,
"learning_rate": 9.916194200287329e-06,
"loss": 0.4715,
"step": 151
},
{
"epoch": 0.46060606060606063,
"grad_norm": 0.5729761719703674,
"learning_rate": 9.912949382396728e-06,
"loss": 0.4699,
"step": 152
},
{
"epoch": 0.4636363636363636,
"grad_norm": 0.5509311556816101,
"learning_rate": 9.909643486313533e-06,
"loss": 0.4757,
"step": 153
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.6052528619766235,
"learning_rate": 9.906276553136924e-06,
"loss": 0.5023,
"step": 154
},
{
"epoch": 0.4696969696969697,
"grad_norm": 0.6352577209472656,
"learning_rate": 9.902848624724887e-06,
"loss": 0.4839,
"step": 155
},
{
"epoch": 0.4727272727272727,
"grad_norm": 0.5328971743583679,
"learning_rate": 9.899359743693715e-06,
"loss": 0.4685,
"step": 156
},
{
"epoch": 0.47575757575757577,
"grad_norm": 0.6209010481834412,
"learning_rate": 9.895809953417464e-06,
"loss": 0.4765,
"step": 157
},
{
"epoch": 0.47878787878787876,
"grad_norm": 0.5842458605766296,
"learning_rate": 9.892199298027416e-06,
"loss": 0.492,
"step": 158
},
{
"epoch": 0.4818181818181818,
"grad_norm": 0.6122411489486694,
"learning_rate": 9.888527822411543e-06,
"loss": 0.4992,
"step": 159
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.6631482839584351,
"learning_rate": 9.88479557221393e-06,
"loss": 0.5166,
"step": 160
},
{
"epoch": 0.48787878787878786,
"grad_norm": 0.6380733847618103,
"learning_rate": 9.881002593834221e-06,
"loss": 0.5043,
"step": 161
},
{
"epoch": 0.4909090909090909,
"grad_norm": 0.6373774409294128,
"learning_rate": 9.877148934427037e-06,
"loss": 0.5015,
"step": 162
},
{
"epoch": 0.49393939393939396,
"grad_norm": 0.5428421497344971,
"learning_rate": 9.873234641901387e-06,
"loss": 0.5158,
"step": 163
},
{
"epoch": 0.49696969696969695,
"grad_norm": 0.5517195463180542,
"learning_rate": 9.869259764920081e-06,
"loss": 0.4559,
"step": 164
},
{
"epoch": 0.5,
"grad_norm": 0.6365246772766113,
"learning_rate": 9.86522435289912e-06,
"loss": 0.4833,
"step": 165
},
{
"epoch": 0.503030303030303,
"grad_norm": 0.49439719319343567,
"learning_rate": 9.861128456007076e-06,
"loss": 0.451,
"step": 166
},
{
"epoch": 0.5060606060606061,
"grad_norm": 0.6125072240829468,
"learning_rate": 9.85697212516448e-06,
"loss": 0.4971,
"step": 167
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.530681848526001,
"learning_rate": 9.85275541204318e-06,
"loss": 0.494,
"step": 168
},
{
"epoch": 0.5121212121212121,
"grad_norm": 0.4542466402053833,
"learning_rate": 9.848478369065703e-06,
"loss": 0.4688,
"step": 169
},
{
"epoch": 0.5151515151515151,
"grad_norm": 0.5714587569236755,
"learning_rate": 9.844141049404598e-06,
"loss": 0.4741,
"step": 170
},
{
"epoch": 0.5181818181818182,
"grad_norm": 0.5518572330474854,
"learning_rate": 9.839743506981783e-06,
"loss": 0.4873,
"step": 171
},
{
"epoch": 0.5212121212121212,
"grad_norm": 0.5865032076835632,
"learning_rate": 9.835285796467867e-06,
"loss": 0.4786,
"step": 172
},
{
"epoch": 0.5242424242424243,
"grad_norm": 0.5356167554855347,
"learning_rate": 9.830767973281477e-06,
"loss": 0.4615,
"step": 173
},
{
"epoch": 0.5272727272727272,
"grad_norm": 0.5017634630203247,
"learning_rate": 9.826190093588564e-06,
"loss": 0.4614,
"step": 174
},
{
"epoch": 0.5303030303030303,
"grad_norm": 0.5031861662864685,
"learning_rate": 9.821552214301705e-06,
"loss": 0.4873,
"step": 175
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.4886428117752075,
"learning_rate": 9.816854393079402e-06,
"loss": 0.494,
"step": 176
},
{
"epoch": 0.5363636363636364,
"grad_norm": 0.5461684465408325,
"learning_rate": 9.812096688325354e-06,
"loss": 0.5059,
"step": 177
},
{
"epoch": 0.5393939393939394,
"grad_norm": 0.4634988605976105,
"learning_rate": 9.80727915918774e-06,
"loss": 0.4627,
"step": 178
},
{
"epoch": 0.5424242424242425,
"grad_norm": 0.44447776675224304,
"learning_rate": 9.802401865558477e-06,
"loss": 0.4916,
"step": 179
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.46829262375831604,
"learning_rate": 9.797464868072489e-06,
"loss": 0.5005,
"step": 180
},
{
"epoch": 0.5484848484848485,
"grad_norm": 0.49591726064682007,
"learning_rate": 9.79246822810693e-06,
"loss": 0.4811,
"step": 181
},
{
"epoch": 0.5515151515151515,
"grad_norm": 0.5024096369743347,
"learning_rate": 9.787412007780445e-06,
"loss": 0.4904,
"step": 182
},
{
"epoch": 0.5545454545454546,
"grad_norm": 0.5566429495811462,
"learning_rate": 9.78229626995238e-06,
"loss": 0.4729,
"step": 183
},
{
"epoch": 0.5575757575757576,
"grad_norm": 0.5653529167175293,
"learning_rate": 9.777121078222015e-06,
"loss": 0.4747,
"step": 184
},
{
"epoch": 0.5606060606060606,
"grad_norm": 0.6398766040802002,
"learning_rate": 9.771886496927756e-06,
"loss": 0.4572,
"step": 185
},
{
"epoch": 0.5636363636363636,
"grad_norm": 0.5789164304733276,
"learning_rate": 9.766592591146353e-06,
"loss": 0.4813,
"step": 186
},
{
"epoch": 0.5666666666666667,
"grad_norm": 0.5712085366249084,
"learning_rate": 9.761239426692077e-06,
"loss": 0.4846,
"step": 187
},
{
"epoch": 0.5696969696969697,
"grad_norm": 0.5621404051780701,
"learning_rate": 9.755827070115915e-06,
"loss": 0.5014,
"step": 188
},
{
"epoch": 0.5727272727272728,
"grad_norm": 0.5247191190719604,
"learning_rate": 9.750355588704728e-06,
"loss": 0.4582,
"step": 189
},
{
"epoch": 0.5757575757575758,
"grad_norm": 0.5438876748085022,
"learning_rate": 9.744825050480425e-06,
"loss": 0.4495,
"step": 190
},
{
"epoch": 0.5787878787878787,
"grad_norm": 0.512938380241394,
"learning_rate": 9.739235524199117e-06,
"loss": 0.4682,
"step": 191
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.5813847184181213,
"learning_rate": 9.733587079350254e-06,
"loss": 0.4841,
"step": 192
},
{
"epoch": 0.5848484848484848,
"grad_norm": 0.5673701167106628,
"learning_rate": 9.727879786155767e-06,
"loss": 0.4633,
"step": 193
},
{
"epoch": 0.5878787878787879,
"grad_norm": 0.5400328040122986,
"learning_rate": 9.7221137155692e-06,
"loss": 0.4608,
"step": 194
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.47881829738616943,
"learning_rate": 9.716288939274818e-06,
"loss": 0.4873,
"step": 195
},
{
"epoch": 0.593939393939394,
"grad_norm": 0.5495237112045288,
"learning_rate": 9.710405529686722e-06,
"loss": 0.4739,
"step": 196
},
{
"epoch": 0.5969696969696969,
"grad_norm": 0.4859870374202728,
"learning_rate": 9.704463559947944e-06,
"loss": 0.4646,
"step": 197
},
{
"epoch": 0.6,
"grad_norm": 0.5082584619522095,
"learning_rate": 9.698463103929542e-06,
"loss": 0.4722,
"step": 198
},
{
"epoch": 0.603030303030303,
"grad_norm": 0.5721731781959534,
"learning_rate": 9.692404236229684e-06,
"loss": 0.4751,
"step": 199
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.5285456776618958,
"learning_rate": 9.686287032172712e-06,
"loss": 0.4754,
"step": 200
},
{
"epoch": 0.6090909090909091,
"grad_norm": 0.4908636510372162,
"learning_rate": 9.680111567808212e-06,
"loss": 0.4788,
"step": 201
},
{
"epoch": 0.6121212121212121,
"grad_norm": 0.6461144089698792,
"learning_rate": 9.673877919910069e-06,
"loss": 0.4782,
"step": 202
},
{
"epoch": 0.6151515151515151,
"grad_norm": 0.535092830657959,
"learning_rate": 9.667586165975507e-06,
"loss": 0.4753,
"step": 203
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.6900569796562195,
"learning_rate": 9.66123638422413e-06,
"loss": 0.4644,
"step": 204
},
{
"epoch": 0.6212121212121212,
"grad_norm": 0.5520806908607483,
"learning_rate": 9.65482865359695e-06,
"loss": 0.4667,
"step": 205
},
{
"epoch": 0.6242424242424243,
"grad_norm": 0.6971328258514404,
"learning_rate": 9.648363053755406e-06,
"loss": 0.4874,
"step": 206
},
{
"epoch": 0.6272727272727273,
"grad_norm": 0.5781907439231873,
"learning_rate": 9.641839665080363e-06,
"loss": 0.4733,
"step": 207
},
{
"epoch": 0.6303030303030303,
"grad_norm": 0.5814110636711121,
"learning_rate": 9.635258568671135e-06,
"loss": 0.4882,
"step": 208
},
{
"epoch": 0.6333333333333333,
"grad_norm": 0.5779009461402893,
"learning_rate": 9.628619846344453e-06,
"loss": 0.468,
"step": 209
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.6527352333068848,
"learning_rate": 9.621923580633462e-06,
"loss": 0.4706,
"step": 210
},
{
"epoch": 0.6393939393939394,
"grad_norm": 0.6215603351593018,
"learning_rate": 9.615169854786688e-06,
"loss": 0.4663,
"step": 211
},
{
"epoch": 0.6424242424242425,
"grad_norm": 0.5920988321304321,
"learning_rate": 9.608358752767013e-06,
"loss": 0.4747,
"step": 212
},
{
"epoch": 0.6454545454545455,
"grad_norm": 0.5126884579658508,
"learning_rate": 9.601490359250616e-06,
"loss": 0.4635,
"step": 213
},
{
"epoch": 0.6484848484848484,
"grad_norm": 0.5980071425437927,
"learning_rate": 9.594564759625936e-06,
"loss": 0.4874,
"step": 214
},
{
"epoch": 0.6515151515151515,
"grad_norm": 0.6158670783042908,
"learning_rate": 9.587582039992598e-06,
"loss": 0.4775,
"step": 215
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.6119588017463684,
"learning_rate": 9.580542287160348e-06,
"loss": 0.4727,
"step": 216
},
{
"epoch": 0.6575757575757576,
"grad_norm": 0.533804178237915,
"learning_rate": 9.573445588647978e-06,
"loss": 0.4448,
"step": 217
},
{
"epoch": 0.6606060606060606,
"grad_norm": 0.6370444297790527,
"learning_rate": 9.566292032682228e-06,
"loss": 0.4698,
"step": 218
},
{
"epoch": 0.6636363636363637,
"grad_norm": 0.6251326203346252,
"learning_rate": 9.559081708196696e-06,
"loss": 0.4641,
"step": 219
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.4930393099784851,
"learning_rate": 9.551814704830734e-06,
"loss": 0.4629,
"step": 220
},
{
"epoch": 0.6696969696969697,
"grad_norm": 0.6046044230461121,
"learning_rate": 9.544491112928327e-06,
"loss": 0.4745,
"step": 221
},
{
"epoch": 0.6727272727272727,
"grad_norm": 0.5868167877197266,
"learning_rate": 9.537111023536973e-06,
"loss": 0.4862,
"step": 222
},
{
"epoch": 0.6757575757575758,
"grad_norm": 0.4980459213256836,
"learning_rate": 9.529674528406556e-06,
"loss": 0.4836,
"step": 223
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.5966117978096008,
"learning_rate": 9.522181719988196e-06,
"loss": 0.4719,
"step": 224
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.5293205380439758,
"learning_rate": 9.514632691433108e-06,
"loss": 0.4874,
"step": 225
},
{
"epoch": 0.6848484848484848,
"grad_norm": 0.4588055908679962,
"learning_rate": 9.507027536591436e-06,
"loss": 0.4798,
"step": 226
},
{
"epoch": 0.6878787878787879,
"grad_norm": 0.4997522234916687,
"learning_rate": 9.499366350011093e-06,
"loss": 0.4543,
"step": 227
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.5391511917114258,
"learning_rate": 9.491649226936586e-06,
"loss": 0.483,
"step": 228
},
{
"epoch": 0.693939393939394,
"grad_norm": 0.525338351726532,
"learning_rate": 9.483876263307825e-06,
"loss": 0.4771,
"step": 229
},
{
"epoch": 0.696969696969697,
"grad_norm": 0.47747671604156494,
"learning_rate": 9.476047555758938e-06,
"loss": 0.4869,
"step": 230
},
{
"epoch": 0.7,
"grad_norm": 0.4778452515602112,
"learning_rate": 9.468163201617063e-06,
"loss": 0.47,
"step": 231
},
{
"epoch": 0.703030303030303,
"grad_norm": 0.7326351404190063,
"learning_rate": 9.460223298901138e-06,
"loss": 0.4854,
"step": 232
},
{
"epoch": 0.706060606060606,
"grad_norm": 0.5560402274131775,
"learning_rate": 9.452227946320697e-06,
"loss": 0.4767,
"step": 233
},
{
"epoch": 0.7090909090909091,
"grad_norm": 0.6292885541915894,
"learning_rate": 9.444177243274619e-06,
"loss": 0.4675,
"step": 234
},
{
"epoch": 0.7121212121212122,
"grad_norm": 0.5762522220611572,
"learning_rate": 9.436071289849909e-06,
"loss": 0.4809,
"step": 235
},
{
"epoch": 0.7151515151515152,
"grad_norm": 0.5423203110694885,
"learning_rate": 9.42791018682045e-06,
"loss": 0.4767,
"step": 236
},
{
"epoch": 0.7181818181818181,
"grad_norm": 0.6047458052635193,
"learning_rate": 9.419694035645753e-06,
"loss": 0.4682,
"step": 237
},
{
"epoch": 0.7212121212121212,
"grad_norm": 0.4815622568130493,
"learning_rate": 9.411422938469683e-06,
"loss": 0.4563,
"step": 238
},
{
"epoch": 0.7242424242424242,
"grad_norm": 0.4963318705558777,
"learning_rate": 9.403096998119206e-06,
"loss": 0.4552,
"step": 239
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.5713395476341248,
"learning_rate": 9.394716318103098e-06,
"loss": 0.4446,
"step": 240
},
{
"epoch": 0.7303030303030303,
"grad_norm": 0.5686458349227905,
"learning_rate": 9.386281002610669e-06,
"loss": 0.4611,
"step": 241
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.6181318759918213,
"learning_rate": 9.377791156510456e-06,
"loss": 0.4524,
"step": 242
},
{
"epoch": 0.7363636363636363,
"grad_norm": 0.5240927934646606,
"learning_rate": 9.369246885348926e-06,
"loss": 0.4558,
"step": 243
},
{
"epoch": 0.7393939393939394,
"grad_norm": 0.6781958341598511,
"learning_rate": 9.360648295349165e-06,
"loss": 0.4712,
"step": 244
},
{
"epoch": 0.7424242424242424,
"grad_norm": 0.553178608417511,
"learning_rate": 9.351995493409556e-06,
"loss": 0.4772,
"step": 245
},
{
"epoch": 0.7454545454545455,
"grad_norm": 0.5585768222808838,
"learning_rate": 9.343288587102444e-06,
"loss": 0.4679,
"step": 246
},
{
"epoch": 0.7484848484848485,
"grad_norm": 0.5500451326370239,
"learning_rate": 9.334527684672809e-06,
"loss": 0.4862,
"step": 247
},
{
"epoch": 0.7515151515151515,
"grad_norm": 0.5181669592857361,
"learning_rate": 9.325712895036916e-06,
"loss": 0.476,
"step": 248
},
{
"epoch": 0.7545454545454545,
"grad_norm": 0.5939860343933105,
"learning_rate": 9.316844327780955e-06,
"loss": 0.4764,
"step": 249
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.46427208185195923,
"learning_rate": 9.307922093159688e-06,
"loss": 0.4702,
"step": 250
},
{
"epoch": 0.7606060606060606,
"grad_norm": 0.47566163539886475,
"learning_rate": 9.298946302095074e-06,
"loss": 0.4544,
"step": 251
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.5817909836769104,
"learning_rate": 9.289917066174887e-06,
"loss": 0.4495,
"step": 252
},
{
"epoch": 0.7666666666666667,
"grad_norm": 0.544357180595398,
"learning_rate": 9.280834497651334e-06,
"loss": 0.4721,
"step": 253
},
{
"epoch": 0.7696969696969697,
"grad_norm": 0.5736443996429443,
"learning_rate": 9.271698709439658e-06,
"loss": 0.4719,
"step": 254
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.5211672186851501,
"learning_rate": 9.262509815116732e-06,
"loss": 0.4758,
"step": 255
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.5502645373344421,
"learning_rate": 9.253267928919652e-06,
"loss": 0.4706,
"step": 256
},
{
"epoch": 0.7787878787878788,
"grad_norm": 0.49078571796417236,
"learning_rate": 9.243973165744306e-06,
"loss": 0.4553,
"step": 257
},
{
"epoch": 0.7818181818181819,
"grad_norm": 0.5883756279945374,
"learning_rate": 9.234625641143962e-06,
"loss": 0.4508,
"step": 258
},
{
"epoch": 0.7848484848484848,
"grad_norm": 0.5382703542709351,
"learning_rate": 9.225225471327815e-06,
"loss": 0.4647,
"step": 259
},
{
"epoch": 0.7878787878787878,
"grad_norm": 0.5602476000785828,
"learning_rate": 9.215772773159556e-06,
"loss": 0.4871,
"step": 260
},
{
"epoch": 0.7909090909090909,
"grad_norm": 0.5052843689918518,
"learning_rate": 9.206267664155906e-06,
"loss": 0.4474,
"step": 261
},
{
"epoch": 0.793939393939394,
"grad_norm": 0.720058023929596,
"learning_rate": 9.196710262485168e-06,
"loss": 0.4899,
"step": 262
},
{
"epoch": 0.796969696969697,
"grad_norm": 0.564479410648346,
"learning_rate": 9.187100686965749e-06,
"loss": 0.4789,
"step": 263
},
{
"epoch": 0.8,
"grad_norm": 0.5418474078178406,
"learning_rate": 9.177439057064684e-06,
"loss": 0.4816,
"step": 264
},
{
"epoch": 0.803030303030303,
"grad_norm": 0.49409568309783936,
"learning_rate": 9.167725492896153e-06,
"loss": 0.4764,
"step": 265
},
{
"epoch": 0.806060606060606,
"grad_norm": 0.5403966903686523,
"learning_rate": 9.157960115219993e-06,
"loss": 0.487,
"step": 266
},
{
"epoch": 0.8090909090909091,
"grad_norm": 0.5134335160255432,
"learning_rate": 9.148143045440181e-06,
"loss": 0.4652,
"step": 267
},
{
"epoch": 0.8121212121212121,
"grad_norm": 0.4440280497074127,
"learning_rate": 9.138274405603342e-06,
"loss": 0.4638,
"step": 268
},
{
"epoch": 0.8151515151515152,
"grad_norm": 0.5449512600898743,
"learning_rate": 9.128354318397223e-06,
"loss": 0.4619,
"step": 269
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.6011729836463928,
"learning_rate": 9.118382907149164e-06,
"loss": 0.4817,
"step": 270
},
{
"epoch": 0.8212121212121212,
"grad_norm": 0.49820375442504883,
"learning_rate": 9.108360295824576e-06,
"loss": 0.4436,
"step": 271
},
{
"epoch": 0.8242424242424242,
"grad_norm": 0.5298901796340942,
"learning_rate": 9.098286609025392e-06,
"loss": 0.4925,
"step": 272
},
{
"epoch": 0.8272727272727273,
"grad_norm": 0.49690747261047363,
"learning_rate": 9.088161971988517e-06,
"loss": 0.4512,
"step": 273
},
{
"epoch": 0.8303030303030303,
"grad_norm": 0.4673530161380768,
"learning_rate": 9.077986510584273e-06,
"loss": 0.4775,
"step": 274
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.5070790648460388,
"learning_rate": 9.067760351314838e-06,
"loss": 0.4772,
"step": 275
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.5117365121841431,
"learning_rate": 9.057483621312671e-06,
"loss": 0.4577,
"step": 276
},
{
"epoch": 0.8393939393939394,
"grad_norm": 0.5128085613250732,
"learning_rate": 9.047156448338927e-06,
"loss": 0.4464,
"step": 277
},
{
"epoch": 0.8424242424242424,
"grad_norm": 0.4917905628681183,
"learning_rate": 9.036778960781874e-06,
"loss": 0.4682,
"step": 278
},
{
"epoch": 0.8454545454545455,
"grad_norm": 0.5163683891296387,
"learning_rate": 9.026351287655294e-06,
"loss": 0.4475,
"step": 279
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.4706951379776001,
"learning_rate": 9.01587355859688e-06,
"loss": 0.4527,
"step": 280
},
{
"epoch": 0.8515151515151516,
"grad_norm": 0.5465824604034424,
"learning_rate": 9.005345903866627e-06,
"loss": 0.4636,
"step": 281
},
{
"epoch": 0.8545454545454545,
"grad_norm": 0.4861038327217102,
"learning_rate": 8.994768454345207e-06,
"loss": 0.459,
"step": 282
},
{
"epoch": 0.8575757575757575,
"grad_norm": 0.5132073760032654,
"learning_rate": 8.984141341532346e-06,
"loss": 0.4323,
"step": 283
},
{
"epoch": 0.8606060606060606,
"grad_norm": 0.48941031098365784,
"learning_rate": 8.973464697545191e-06,
"loss": 0.4689,
"step": 284
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.5348917841911316,
"learning_rate": 8.96273865511666e-06,
"loss": 0.4598,
"step": 285
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.47538742423057556,
"learning_rate": 8.951963347593797e-06,
"loss": 0.4761,
"step": 286
},
{
"epoch": 0.8696969696969697,
"grad_norm": 0.5884383320808411,
"learning_rate": 8.941138908936118e-06,
"loss": 0.4705,
"step": 287
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.5642337203025818,
"learning_rate": 8.930265473713939e-06,
"loss": 0.4701,
"step": 288
},
{
"epoch": 0.8757575757575757,
"grad_norm": 0.5007638931274414,
"learning_rate": 8.9193431771067e-06,
"loss": 0.4705,
"step": 289
},
{
"epoch": 0.8787878787878788,
"grad_norm": 0.5597187876701355,
"learning_rate": 8.908372154901302e-06,
"loss": 0.4762,
"step": 290
},
{
"epoch": 0.8818181818181818,
"grad_norm": 0.5402477383613586,
"learning_rate": 8.897352543490396e-06,
"loss": 0.4683,
"step": 291
},
{
"epoch": 0.8848484848484849,
"grad_norm": 0.49998798966407776,
"learning_rate": 8.8862844798707e-06,
"loss": 0.4806,
"step": 292
},
{
"epoch": 0.8878787878787879,
"grad_norm": 0.5093072056770325,
"learning_rate": 8.875168101641294e-06,
"loss": 0.4809,
"step": 293
},
{
"epoch": 0.8909090909090909,
"grad_norm": 0.4708082377910614,
"learning_rate": 8.864003547001916e-06,
"loss": 0.4241,
"step": 294
},
{
"epoch": 0.8939393939393939,
"grad_norm": 0.5265612602233887,
"learning_rate": 8.852790954751229e-06,
"loss": 0.4352,
"step": 295
},
{
"epoch": 0.896969696969697,
"grad_norm": 0.45366954803466797,
"learning_rate": 8.841530464285105e-06,
"loss": 0.4517,
"step": 296
},
{
"epoch": 0.9,
"grad_norm": 0.49041837453842163,
"learning_rate": 8.83022221559489e-06,
"loss": 0.4538,
"step": 297
},
{
"epoch": 0.9030303030303031,
"grad_norm": 0.4863709509372711,
"learning_rate": 8.81886634926567e-06,
"loss": 0.4656,
"step": 298
},
{
"epoch": 0.906060606060606,
"grad_norm": 0.48961129784584045,
"learning_rate": 8.807463006474514e-06,
"loss": 0.4439,
"step": 299
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.5433361530303955,
"learning_rate": 8.796012328988716e-06,
"loss": 0.4766,
"step": 300
},
{
"epoch": 0.9121212121212121,
"grad_norm": 0.48235633969306946,
"learning_rate": 8.78451445916405e-06,
"loss": 0.4461,
"step": 301
},
{
"epoch": 0.9151515151515152,
"grad_norm": 0.532062292098999,
"learning_rate": 8.772969539942981e-06,
"loss": 0.4732,
"step": 302
},
{
"epoch": 0.9181818181818182,
"grad_norm": 0.45535221695899963,
"learning_rate": 8.7613777148529e-06,
"loss": 0.4664,
"step": 303
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.4744938313961029,
"learning_rate": 8.749739128004329e-06,
"loss": 0.4818,
"step": 304
},
{
"epoch": 0.9242424242424242,
"grad_norm": 0.5316322445869446,
"learning_rate": 8.738053924089149e-06,
"loss": 0.4609,
"step": 305
},
{
"epoch": 0.9272727272727272,
"grad_norm": 0.5303252935409546,
"learning_rate": 8.726322248378775e-06,
"loss": 0.4457,
"step": 306
},
{
"epoch": 0.9303030303030303,
"grad_norm": 0.5524774193763733,
"learning_rate": 8.714544246722369e-06,
"loss": 0.474,
"step": 307
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.49604108929634094,
"learning_rate": 8.702720065545024e-06,
"loss": 0.455,
"step": 308
},
{
"epoch": 0.9363636363636364,
"grad_norm": 0.5008924603462219,
"learning_rate": 8.690849851845933e-06,
"loss": 0.4653,
"step": 309
},
{
"epoch": 0.9393939393939394,
"grad_norm": 0.5711862444877625,
"learning_rate": 8.678933753196577e-06,
"loss": 0.4701,
"step": 310
},
{
"epoch": 0.9424242424242424,
"grad_norm": 0.44712546467781067,
"learning_rate": 8.666971917738876e-06,
"loss": 0.4547,
"step": 311
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.5717918872833252,
"learning_rate": 8.65496449418336e-06,
"loss": 0.4591,
"step": 312
},
{
"epoch": 0.9484848484848485,
"grad_norm": 0.5090660452842712,
"learning_rate": 8.642911631807306e-06,
"loss": 0.4589,
"step": 313
},
{
"epoch": 0.9515151515151515,
"grad_norm": 0.4903143346309662,
"learning_rate": 8.630813480452898e-06,
"loss": 0.4571,
"step": 314
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.6317908763885498,
"learning_rate": 8.61867019052535e-06,
"loss": 0.4663,
"step": 315
},
{
"epoch": 0.9575757575757575,
"grad_norm": 0.4388335943222046,
"learning_rate": 8.606481912991052e-06,
"loss": 0.4626,
"step": 316
},
{
"epoch": 0.9606060606060606,
"grad_norm": 0.5695369839668274,
"learning_rate": 8.594248799375671e-06,
"loss": 0.462,
"step": 317
},
{
"epoch": 0.9636363636363636,
"grad_norm": 0.518403947353363,
"learning_rate": 8.581971001762287e-06,
"loss": 0.4578,
"step": 318
},
{
"epoch": 0.9666666666666667,
"grad_norm": 0.5362844467163086,
"learning_rate": 8.569648672789496e-06,
"loss": 0.4529,
"step": 319
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.499873548746109,
"learning_rate": 8.557281965649508e-06,
"loss": 0.46,
"step": 320
},
{
"epoch": 0.9727272727272728,
"grad_norm": 0.5259467363357544,
"learning_rate": 8.54487103408625e-06,
"loss": 0.4496,
"step": 321
},
{
"epoch": 0.9757575757575757,
"grad_norm": 0.48420971632003784,
"learning_rate": 8.532416032393447e-06,
"loss": 0.4633,
"step": 322
},
{
"epoch": 0.9787878787878788,
"grad_norm": 0.49494659900665283,
"learning_rate": 8.51991711541271e-06,
"loss": 0.4509,
"step": 323
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.49319571256637573,
"learning_rate": 8.507374438531606e-06,
"loss": 0.4639,
"step": 324
},
{
"epoch": 0.9848484848484849,
"grad_norm": 0.48394539952278137,
"learning_rate": 8.494788157681733e-06,
"loss": 0.4536,
"step": 325
},
{
"epoch": 0.9878787878787879,
"grad_norm": 0.5943865776062012,
"learning_rate": 8.482158429336769e-06,
"loss": 0.4599,
"step": 326
},
{
"epoch": 0.990909090909091,
"grad_norm": 0.4566466808319092,
"learning_rate": 8.469485410510545e-06,
"loss": 0.4593,
"step": 327
},
{
"epoch": 0.9939393939393939,
"grad_norm": 0.5179756283760071,
"learning_rate": 8.456769258755078e-06,
"loss": 0.453,
"step": 328
},
{
"epoch": 0.996969696969697,
"grad_norm": 0.6326143145561218,
"learning_rate": 8.444010132158614e-06,
"loss": 0.4932,
"step": 329
},
{
"epoch": 1.0,
"grad_norm": 0.48770657181739807,
"learning_rate": 8.43120818934367e-06,
"loss": 0.4645,
"step": 330
},
{
"epoch": 1.003030303030303,
"grad_norm": 0.47949346899986267,
"learning_rate": 8.418363589465055e-06,
"loss": 0.4263,
"step": 331
},
{
"epoch": 1.006060606060606,
"grad_norm": 0.5179343819618225,
"learning_rate": 8.405476492207902e-06,
"loss": 0.3961,
"step": 332
},
{
"epoch": 1.009090909090909,
"grad_norm": 0.505066990852356,
"learning_rate": 8.392547057785662e-06,
"loss": 0.4154,
"step": 333
},
{
"epoch": 1.0121212121212122,
"grad_norm": 0.44572776556015015,
"learning_rate": 8.379575446938136e-06,
"loss": 0.4076,
"step": 334
},
{
"epoch": 1.0151515151515151,
"grad_norm": 0.4563922584056854,
"learning_rate": 8.366561820929457e-06,
"loss": 0.3917,
"step": 335
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.46021389961242676,
"learning_rate": 8.353506341546106e-06,
"loss": 0.4092,
"step": 336
},
{
"epoch": 1.0212121212121212,
"grad_norm": 0.5978601574897766,
"learning_rate": 8.340409171094874e-06,
"loss": 0.436,
"step": 337
},
{
"epoch": 1.0242424242424242,
"grad_norm": 0.4990571439266205,
"learning_rate": 8.32727047240087e-06,
"loss": 0.4089,
"step": 338
},
{
"epoch": 1.0272727272727273,
"grad_norm": 0.5260419249534607,
"learning_rate": 8.314090408805481e-06,
"loss": 0.4224,
"step": 339
},
{
"epoch": 1.0303030303030303,
"grad_norm": 0.5074383020401001,
"learning_rate": 8.300869144164346e-06,
"loss": 0.4389,
"step": 340
},
{
"epoch": 1.0333333333333334,
"grad_norm": 0.4625466465950012,
"learning_rate": 8.28760684284532e-06,
"loss": 0.4164,
"step": 341
},
{
"epoch": 1.0363636363636364,
"grad_norm": 0.4897065758705139,
"learning_rate": 8.274303669726427e-06,
"loss": 0.4112,
"step": 342
},
{
"epoch": 1.0393939393939393,
"grad_norm": 0.46699315309524536,
"learning_rate": 8.260959790193815e-06,
"loss": 0.4313,
"step": 343
},
{
"epoch": 1.0424242424242425,
"grad_norm": 0.44992008805274963,
"learning_rate": 8.247575370139695e-06,
"loss": 0.4215,
"step": 344
},
{
"epoch": 1.0454545454545454,
"grad_norm": 0.48453715443611145,
"learning_rate": 8.234150575960288e-06,
"loss": 0.3819,
"step": 345
},
{
"epoch": 1.0484848484848486,
"grad_norm": 0.46452537178993225,
"learning_rate": 8.220685574553739e-06,
"loss": 0.3959,
"step": 346
},
{
"epoch": 1.0515151515151515,
"grad_norm": 0.5160481333732605,
"learning_rate": 8.207180533318061e-06,
"loss": 0.4091,
"step": 347
},
{
"epoch": 1.0545454545454545,
"grad_norm": 0.5158939957618713,
"learning_rate": 8.193635620149041e-06,
"loss": 0.4102,
"step": 348
},
{
"epoch": 1.0575757575757576,
"grad_norm": 0.4726713299751282,
"learning_rate": 8.180051003438158e-06,
"loss": 0.4367,
"step": 349
},
{
"epoch": 1.0606060606060606,
"grad_norm": 0.5599554777145386,
"learning_rate": 8.16642685207049e-06,
"loss": 0.4145,
"step": 350
},
{
"epoch": 1.0636363636363637,
"grad_norm": 0.5167244672775269,
"learning_rate": 8.152763335422612e-06,
"loss": 0.4276,
"step": 351
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.4800064265727997,
"learning_rate": 8.139060623360494e-06,
"loss": 0.3964,
"step": 352
},
{
"epoch": 1.0696969696969698,
"grad_norm": 0.5460079312324524,
"learning_rate": 8.125318886237382e-06,
"loss": 0.4498,
"step": 353
},
{
"epoch": 1.0727272727272728,
"grad_norm": 0.5078116655349731,
"learning_rate": 8.111538294891684e-06,
"loss": 0.4302,
"step": 354
},
{
"epoch": 1.0757575757575757,
"grad_norm": 0.5369091033935547,
"learning_rate": 8.097719020644855e-06,
"loss": 0.4339,
"step": 355
},
{
"epoch": 1.0787878787878789,
"grad_norm": 0.47746509313583374,
"learning_rate": 8.083861235299253e-06,
"loss": 0.4207,
"step": 356
},
{
"epoch": 1.0818181818181818,
"grad_norm": 0.4835416376590729,
"learning_rate": 8.06996511113601e-06,
"loss": 0.4294,
"step": 357
},
{
"epoch": 1.084848484848485,
"grad_norm": 0.47259601950645447,
"learning_rate": 8.05603082091289e-06,
"loss": 0.4291,
"step": 358
},
{
"epoch": 1.087878787878788,
"grad_norm": 0.43381235003471375,
"learning_rate": 8.04205853786214e-06,
"loss": 0.3999,
"step": 359
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.469703733921051,
"learning_rate": 8.028048435688333e-06,
"loss": 0.4106,
"step": 360
},
{
"epoch": 1.093939393939394,
"grad_norm": 0.44487234950065613,
"learning_rate": 8.014000688566224e-06,
"loss": 0.3955,
"step": 361
},
{
"epoch": 1.096969696969697,
"grad_norm": 0.444295197725296,
"learning_rate": 7.999915471138562e-06,
"loss": 0.4258,
"step": 362
},
{
"epoch": 1.1,
"grad_norm": 0.4215056300163269,
"learning_rate": 7.985792958513932e-06,
"loss": 0.4327,
"step": 363
},
{
"epoch": 1.103030303030303,
"grad_norm": 0.5015363097190857,
"learning_rate": 7.971633326264581e-06,
"loss": 0.4093,
"step": 364
},
{
"epoch": 1.106060606060606,
"grad_norm": 0.4623165428638458,
"learning_rate": 7.957436750424223e-06,
"loss": 0.4187,
"step": 365
},
{
"epoch": 1.1090909090909091,
"grad_norm": 0.5218795537948608,
"learning_rate": 7.943203407485864e-06,
"loss": 0.4277,
"step": 366
},
{
"epoch": 1.112121212121212,
"grad_norm": 0.46455255150794983,
"learning_rate": 7.928933474399601e-06,
"loss": 0.435,
"step": 367
},
{
"epoch": 1.1151515151515152,
"grad_norm": 0.5003970265388489,
"learning_rate": 7.91462712857042e-06,
"loss": 0.432,
"step": 368
},
{
"epoch": 1.1181818181818182,
"grad_norm": 0.46695271134376526,
"learning_rate": 7.900284547855992e-06,
"loss": 0.407,
"step": 369
},
{
"epoch": 1.121212121212121,
"grad_norm": 0.45484066009521484,
"learning_rate": 7.885905910564466e-06,
"loss": 0.4107,
"step": 370
},
{
"epoch": 1.1242424242424243,
"grad_norm": 0.49817416071891785,
"learning_rate": 7.87149139545225e-06,
"loss": 0.4096,
"step": 371
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.4525980055332184,
"learning_rate": 7.857041181721788e-06,
"loss": 0.4368,
"step": 372
},
{
"epoch": 1.1303030303030304,
"grad_norm": 0.5966557860374451,
"learning_rate": 7.842555449019326e-06,
"loss": 0.4166,
"step": 373
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.42670103907585144,
"learning_rate": 7.828034377432694e-06,
"loss": 0.4333,
"step": 374
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.5341598987579346,
"learning_rate": 7.813478147489052e-06,
"loss": 0.4469,
"step": 375
},
{
"epoch": 1.1393939393939394,
"grad_norm": 0.5007196068763733,
"learning_rate": 7.798886940152654e-06,
"loss": 0.4077,
"step": 376
},
{
"epoch": 1.1424242424242423,
"grad_norm": 0.5085919499397278,
"learning_rate": 7.784260936822592e-06,
"loss": 0.454,
"step": 377
},
{
"epoch": 1.1454545454545455,
"grad_norm": 0.523694634437561,
"learning_rate": 7.769600319330553e-06,
"loss": 0.431,
"step": 378
},
{
"epoch": 1.1484848484848484,
"grad_norm": 0.4709513783454895,
"learning_rate": 7.75490526993854e-06,
"loss": 0.4246,
"step": 379
},
{
"epoch": 1.1515151515151516,
"grad_norm": 0.4955805540084839,
"learning_rate": 7.740175971336624e-06,
"loss": 0.4506,
"step": 380
},
{
"epoch": 1.1545454545454545,
"grad_norm": 0.4655817449092865,
"learning_rate": 7.725412606640658e-06,
"loss": 0.4353,
"step": 381
},
{
"epoch": 1.1575757575757575,
"grad_norm": 0.46022751927375793,
"learning_rate": 7.710615359390018e-06,
"loss": 0.4161,
"step": 382
},
{
"epoch": 1.1606060606060606,
"grad_norm": 0.5289224982261658,
"learning_rate": 7.6957844135453e-06,
"loss": 0.43,
"step": 383
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.5375812649726868,
"learning_rate": 7.680919953486047e-06,
"loss": 0.4231,
"step": 384
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.5493893027305603,
"learning_rate": 7.666022164008458e-06,
"loss": 0.4442,
"step": 385
},
{
"epoch": 1.1696969696969697,
"grad_norm": 0.4879584014415741,
"learning_rate": 7.651091230323079e-06,
"loss": 0.4197,
"step": 386
},
{
"epoch": 1.1727272727272728,
"grad_norm": 0.5651521682739258,
"learning_rate": 7.636127338052513e-06,
"loss": 0.4246,
"step": 387
},
{
"epoch": 1.1757575757575758,
"grad_norm": 0.4888427257537842,
"learning_rate": 7.621130673229105e-06,
"loss": 0.438,
"step": 388
},
{
"epoch": 1.1787878787878787,
"grad_norm": 0.5484802722930908,
"learning_rate": 7.606101422292629e-06,
"loss": 0.4293,
"step": 389
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.4991911053657532,
"learning_rate": 7.5910397720879785e-06,
"loss": 0.4191,
"step": 390
},
{
"epoch": 1.1848484848484848,
"grad_norm": 0.49404406547546387,
"learning_rate": 7.575945909862829e-06,
"loss": 0.4173,
"step": 391
},
{
"epoch": 1.187878787878788,
"grad_norm": 0.5090894103050232,
"learning_rate": 7.5608200232653254e-06,
"loss": 0.4404,
"step": 392
},
{
"epoch": 1.190909090909091,
"grad_norm": 0.5557565093040466,
"learning_rate": 7.545662300341736e-06,
"loss": 0.4463,
"step": 393
},
{
"epoch": 1.1939393939393939,
"grad_norm": 0.5056024193763733,
"learning_rate": 7.530472929534126e-06,
"loss": 0.4203,
"step": 394
},
{
"epoch": 1.196969696969697,
"grad_norm": 0.479216068983078,
"learning_rate": 7.515252099678011e-06,
"loss": 0.4024,
"step": 395
},
{
"epoch": 1.2,
"grad_norm": 0.5049172639846802,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4178,
"step": 396
},
{
"epoch": 1.2030303030303031,
"grad_norm": 0.4803786873817444,
"learning_rate": 7.484716820115461e-06,
"loss": 0.4151,
"step": 397
},
{
"epoch": 1.206060606060606,
"grad_norm": 0.48565956950187683,
"learning_rate": 7.469402750026147e-06,
"loss": 0.3898,
"step": 398
},
{
"epoch": 1.209090909090909,
"grad_norm": 0.5321947932243347,
"learning_rate": 7.454057980117842e-06,
"loss": 0.4268,
"step": 399
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.4305421710014343,
"learning_rate": 7.438682701157993e-06,
"loss": 0.4046,
"step": 400
},
{
"epoch": 1.215151515151515,
"grad_norm": 0.5250205993652344,
"learning_rate": 7.423277104293338e-06,
"loss": 0.4104,
"step": 401
},
{
"epoch": 1.2181818181818183,
"grad_norm": 0.4442998170852661,
"learning_rate": 7.407841381047533e-06,
"loss": 0.4201,
"step": 402
},
{
"epoch": 1.2212121212121212,
"grad_norm": 0.44337543845176697,
"learning_rate": 7.392375723318761e-06,
"loss": 0.4325,
"step": 403
},
{
"epoch": 1.2242424242424241,
"grad_norm": 0.4957628548145294,
"learning_rate": 7.376880323377357e-06,
"loss": 0.4318,
"step": 404
},
{
"epoch": 1.2272727272727273,
"grad_norm": 0.4373667538166046,
"learning_rate": 7.361355373863415e-06,
"loss": 0.427,
"step": 405
},
{
"epoch": 1.2303030303030302,
"grad_norm": 0.48402732610702515,
"learning_rate": 7.345801067784388e-06,
"loss": 0.4319,
"step": 406
},
{
"epoch": 1.2333333333333334,
"grad_norm": 0.43947574496269226,
"learning_rate": 7.330217598512696e-06,
"loss": 0.4327,
"step": 407
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.4845799207687378,
"learning_rate": 7.314605159783313e-06,
"loss": 0.4284,
"step": 408
},
{
"epoch": 1.2393939393939393,
"grad_norm": 0.4592854380607605,
"learning_rate": 7.298963945691371e-06,
"loss": 0.4347,
"step": 409
},
{
"epoch": 1.2424242424242424,
"grad_norm": 0.4303816556930542,
"learning_rate": 7.283294150689735e-06,
"loss": 0.4342,
"step": 410
},
{
"epoch": 1.2454545454545454,
"grad_norm": 0.46144893765449524,
"learning_rate": 7.2675959695865896e-06,
"loss": 0.4362,
"step": 411
},
{
"epoch": 1.2484848484848485,
"grad_norm": 0.5031886100769043,
"learning_rate": 7.251869597543019e-06,
"loss": 0.4114,
"step": 412
},
{
"epoch": 1.2515151515151515,
"grad_norm": 0.40648922324180603,
"learning_rate": 7.2361152300705795e-06,
"loss": 0.397,
"step": 413
},
{
"epoch": 1.2545454545454544,
"grad_norm": 0.4165934920310974,
"learning_rate": 7.2203330630288714e-06,
"loss": 0.4176,
"step": 414
},
{
"epoch": 1.2575757575757576,
"grad_norm": 0.4433961510658264,
"learning_rate": 7.2045232926230965e-06,
"loss": 0.4051,
"step": 415
},
{
"epoch": 1.2606060606060607,
"grad_norm": 0.4428854286670685,
"learning_rate": 7.188686115401628e-06,
"loss": 0.4109,
"step": 416
},
{
"epoch": 1.2636363636363637,
"grad_norm": 0.4384450316429138,
"learning_rate": 7.172821728253563e-06,
"loss": 0.4092,
"step": 417
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.5005664825439453,
"learning_rate": 7.156930328406268e-06,
"loss": 0.4125,
"step": 418
},
{
"epoch": 1.2696969696969698,
"grad_norm": 0.46654659509658813,
"learning_rate": 7.141012113422942e-06,
"loss": 0.419,
"step": 419
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.44787606596946716,
"learning_rate": 7.1250672812001505e-06,
"loss": 0.4199,
"step": 420
},
{
"epoch": 1.2757575757575759,
"grad_norm": 0.5100336670875549,
"learning_rate": 7.109096029965362e-06,
"loss": 0.4294,
"step": 421
},
{
"epoch": 1.2787878787878788,
"grad_norm": 0.4483383297920227,
"learning_rate": 7.093098558274494e-06,
"loss": 0.4091,
"step": 422
},
{
"epoch": 1.2818181818181817,
"grad_norm": 0.4621780216693878,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.4448,
"step": 423
},
{
"epoch": 1.284848484848485,
"grad_norm": 0.4376955032348633,
"learning_rate": 7.061025749375572e-06,
"loss": 0.4014,
"step": 424
},
{
"epoch": 1.2878787878787878,
"grad_norm": 0.4119098484516144,
"learning_rate": 7.044950810899332e-06,
"loss": 0.4251,
"step": 425
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.4029601514339447,
"learning_rate": 7.02885044942567e-06,
"loss": 0.4169,
"step": 426
},
{
"epoch": 1.293939393939394,
"grad_norm": 0.40098485350608826,
"learning_rate": 7.012724865115615e-06,
"loss": 0.42,
"step": 427
},
{
"epoch": 1.2969696969696969,
"grad_norm": 0.45420369505882263,
"learning_rate": 6.996574258443761e-06,
"loss": 0.4084,
"step": 428
},
{
"epoch": 1.3,
"grad_norm": 0.44172826409339905,
"learning_rate": 6.980398830195785e-06,
"loss": 0.4141,
"step": 429
},
{
"epoch": 1.303030303030303,
"grad_norm": 0.42184752225875854,
"learning_rate": 6.964198781465948e-06,
"loss": 0.444,
"step": 430
},
{
"epoch": 1.3060606060606061,
"grad_norm": 0.47455379366874695,
"learning_rate": 6.947974313654592e-06,
"loss": 0.4211,
"step": 431
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.4578656852245331,
"learning_rate": 6.931725628465643e-06,
"loss": 0.4372,
"step": 432
},
{
"epoch": 1.312121212121212,
"grad_norm": 0.49530062079429626,
"learning_rate": 6.9154529279040985e-06,
"loss": 0.4534,
"step": 433
},
{
"epoch": 1.3151515151515152,
"grad_norm": 0.42749521136283875,
"learning_rate": 6.899156414273514e-06,
"loss": 0.4357,
"step": 434
},
{
"epoch": 1.3181818181818181,
"grad_norm": 0.3963139057159424,
"learning_rate": 6.882836290173493e-06,
"loss": 0.4072,
"step": 435
},
{
"epoch": 1.3212121212121213,
"grad_norm": 0.4464685618877411,
"learning_rate": 6.866492758497171e-06,
"loss": 0.3984,
"step": 436
},
{
"epoch": 1.3242424242424242,
"grad_norm": 0.4427390992641449,
"learning_rate": 6.850126022428678e-06,
"loss": 0.4386,
"step": 437
},
{
"epoch": 1.3272727272727272,
"grad_norm": 0.4357937276363373,
"learning_rate": 6.833736285440632e-06,
"loss": 0.4021,
"step": 438
},
{
"epoch": 1.3303030303030303,
"grad_norm": 0.4386712610721588,
"learning_rate": 6.817323751291598e-06,
"loss": 0.4415,
"step": 439
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.4566034972667694,
"learning_rate": 6.800888624023552e-06,
"loss": 0.3911,
"step": 440
},
{
"epoch": 1.3363636363636364,
"grad_norm": 0.4068944454193115,
"learning_rate": 6.78443110795936e-06,
"loss": 0.392,
"step": 441
},
{
"epoch": 1.3393939393939394,
"grad_norm": 0.49135804176330566,
"learning_rate": 6.767951407700217e-06,
"loss": 0.4302,
"step": 442
},
{
"epoch": 1.3424242424242423,
"grad_norm": 0.43315646052360535,
"learning_rate": 6.75144972812312e-06,
"loss": 0.428,
"step": 443
},
{
"epoch": 1.3454545454545455,
"grad_norm": 0.4170973598957062,
"learning_rate": 6.734926274378313e-06,
"loss": 0.4313,
"step": 444
},
{
"epoch": 1.3484848484848486,
"grad_norm": 0.4999806582927704,
"learning_rate": 6.7183812518867365e-06,
"loss": 0.417,
"step": 445
},
{
"epoch": 1.3515151515151516,
"grad_norm": 0.4573524594306946,
"learning_rate": 6.701814866337477e-06,
"loss": 0.4292,
"step": 446
},
{
"epoch": 1.3545454545454545,
"grad_norm": 0.458232045173645,
"learning_rate": 6.685227323685209e-06,
"loss": 0.4202,
"step": 447
},
{
"epoch": 1.3575757575757577,
"grad_norm": 0.4656910002231598,
"learning_rate": 6.668618830147634e-06,
"loss": 0.3984,
"step": 448
},
{
"epoch": 1.3606060606060606,
"grad_norm": 0.4599440097808838,
"learning_rate": 6.651989592202913e-06,
"loss": 0.4037,
"step": 449
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.4676293134689331,
"learning_rate": 6.635339816587109e-06,
"loss": 0.4133,
"step": 450
},
{
"epoch": 1.3666666666666667,
"grad_norm": 0.4331243634223938,
"learning_rate": 6.618669710291607e-06,
"loss": 0.4101,
"step": 451
},
{
"epoch": 1.3696969696969696,
"grad_norm": 0.4712773561477661,
"learning_rate": 6.601979480560543e-06,
"loss": 0.4254,
"step": 452
},
{
"epoch": 1.3727272727272728,
"grad_norm": 0.41984647512435913,
"learning_rate": 6.5852693348882345e-06,
"loss": 0.4276,
"step": 453
},
{
"epoch": 1.3757575757575757,
"grad_norm": 0.4325491786003113,
"learning_rate": 6.568539481016593e-06,
"loss": 0.4147,
"step": 454
},
{
"epoch": 1.378787878787879,
"grad_norm": 0.4297987222671509,
"learning_rate": 6.551790126932543e-06,
"loss": 0.4157,
"step": 455
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.4344542920589447,
"learning_rate": 6.535021480865439e-06,
"loss": 0.4204,
"step": 456
},
{
"epoch": 1.3848484848484848,
"grad_norm": 0.3979593515396118,
"learning_rate": 6.5182337512844725e-06,
"loss": 0.422,
"step": 457
},
{
"epoch": 1.387878787878788,
"grad_norm": 0.4758434593677521,
"learning_rate": 6.501427146896087e-06,
"loss": 0.4221,
"step": 458
},
{
"epoch": 1.3909090909090909,
"grad_norm": 0.4996965527534485,
"learning_rate": 6.484601876641375e-06,
"loss": 0.4045,
"step": 459
},
{
"epoch": 1.393939393939394,
"grad_norm": 0.4164718985557556,
"learning_rate": 6.467758149693486e-06,
"loss": 0.4302,
"step": 460
},
{
"epoch": 1.396969696969697,
"grad_norm": 0.4488878548145294,
"learning_rate": 6.450896175455027e-06,
"loss": 0.4217,
"step": 461
},
{
"epoch": 1.4,
"grad_norm": 0.4654311239719391,
"learning_rate": 6.434016163555452e-06,
"loss": 0.4089,
"step": 462
},
{
"epoch": 1.403030303030303,
"grad_norm": 0.435861736536026,
"learning_rate": 6.417118323848465e-06,
"loss": 0.46,
"step": 463
},
{
"epoch": 1.406060606060606,
"grad_norm": 0.4406987726688385,
"learning_rate": 6.400202866409405e-06,
"loss": 0.4263,
"step": 464
},
{
"epoch": 1.4090909090909092,
"grad_norm": 0.4633047580718994,
"learning_rate": 6.383270001532636e-06,
"loss": 0.4009,
"step": 465
},
{
"epoch": 1.412121212121212,
"grad_norm": 0.44259321689605713,
"learning_rate": 6.366319939728934e-06,
"loss": 0.3968,
"step": 466
},
{
"epoch": 1.415151515151515,
"grad_norm": 0.46387195587158203,
"learning_rate": 6.3493528917228664e-06,
"loss": 0.4453,
"step": 467
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.4395183026790619,
"learning_rate": 6.332369068450175e-06,
"loss": 0.412,
"step": 468
},
{
"epoch": 1.4212121212121211,
"grad_norm": 0.5252392292022705,
"learning_rate": 6.315368681055157e-06,
"loss": 0.4286,
"step": 469
},
{
"epoch": 1.4242424242424243,
"grad_norm": 0.4034399390220642,
"learning_rate": 6.29835194088803e-06,
"loss": 0.4491,
"step": 470
},
{
"epoch": 1.4272727272727272,
"grad_norm": 0.46798762679100037,
"learning_rate": 6.2813190595023135e-06,
"loss": 0.3974,
"step": 471
},
{
"epoch": 1.4303030303030302,
"grad_norm": 0.4931304454803467,
"learning_rate": 6.264270248652199e-06,
"loss": 0.4235,
"step": 472
},
{
"epoch": 1.4333333333333333,
"grad_norm": 0.42226365208625793,
"learning_rate": 6.247205720289907e-06,
"loss": 0.3972,
"step": 473
},
{
"epoch": 1.4363636363636363,
"grad_norm": 0.4756371080875397,
"learning_rate": 6.230125686563068e-06,
"loss": 0.4113,
"step": 474
},
{
"epoch": 1.4393939393939394,
"grad_norm": 0.487985223531723,
"learning_rate": 6.213030359812069e-06,
"loss": 0.4119,
"step": 475
},
{
"epoch": 1.4424242424242424,
"grad_norm": 0.4072030484676361,
"learning_rate": 6.195919952567426e-06,
"loss": 0.4205,
"step": 476
},
{
"epoch": 1.4454545454545453,
"grad_norm": 0.5348814129829407,
"learning_rate": 6.178794677547138e-06,
"loss": 0.4297,
"step": 477
},
{
"epoch": 1.4484848484848485,
"grad_norm": 0.4264541268348694,
"learning_rate": 6.161654747654033e-06,
"loss": 0.4111,
"step": 478
},
{
"epoch": 1.4515151515151516,
"grad_norm": 0.4194830358028412,
"learning_rate": 6.14450037597314e-06,
"loss": 0.4151,
"step": 479
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.4842208921909332,
"learning_rate": 6.127331775769023e-06,
"loss": 0.4336,
"step": 480
},
{
"epoch": 1.4575757575757575,
"grad_norm": 0.4886646866798401,
"learning_rate": 6.110149160483139e-06,
"loss": 0.3926,
"step": 481
},
{
"epoch": 1.4606060606060607,
"grad_norm": 0.47493505477905273,
"learning_rate": 6.092952743731179e-06,
"loss": 0.4072,
"step": 482
},
{
"epoch": 1.4636363636363636,
"grad_norm": 0.45231956243515015,
"learning_rate": 6.07574273930042e-06,
"loss": 0.4128,
"step": 483
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.4732300937175751,
"learning_rate": 6.058519361147055e-06,
"loss": 0.4063,
"step": 484
},
{
"epoch": 1.4696969696969697,
"grad_norm": 0.45487740635871887,
"learning_rate": 6.041282823393546e-06,
"loss": 0.3982,
"step": 485
},
{
"epoch": 1.4727272727272727,
"grad_norm": 0.40932172536849976,
"learning_rate": 6.024033340325954e-06,
"loss": 0.4367,
"step": 486
},
{
"epoch": 1.4757575757575758,
"grad_norm": 0.51799076795578,
"learning_rate": 6.006771126391278e-06,
"loss": 0.4075,
"step": 487
},
{
"epoch": 1.4787878787878788,
"grad_norm": 0.46568235754966736,
"learning_rate": 5.989496396194787e-06,
"loss": 0.4234,
"step": 488
},
{
"epoch": 1.481818181818182,
"grad_norm": 0.39532041549682617,
"learning_rate": 5.972209364497355e-06,
"loss": 0.4265,
"step": 489
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.42684099078178406,
"learning_rate": 5.954910246212787e-06,
"loss": 0.4404,
"step": 490
},
{
"epoch": 1.4878787878787878,
"grad_norm": 0.469150185585022,
"learning_rate": 5.937599256405151e-06,
"loss": 0.4242,
"step": 491
},
{
"epoch": 1.490909090909091,
"grad_norm": 0.4257190525531769,
"learning_rate": 5.920276610286102e-06,
"loss": 0.4154,
"step": 492
},
{
"epoch": 1.493939393939394,
"grad_norm": 0.4545654058456421,
"learning_rate": 5.90294252321221e-06,
"loss": 0.4277,
"step": 493
},
{
"epoch": 1.496969696969697,
"grad_norm": 0.4450318217277527,
"learning_rate": 5.885597210682273e-06,
"loss": 0.4402,
"step": 494
},
{
"epoch": 1.5,
"grad_norm": 0.46171948313713074,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.4083,
"step": 495
},
{
"epoch": 1.503030303030303,
"grad_norm": 0.49079129099845886,
"learning_rate": 5.850873771944581e-06,
"loss": 0.4058,
"step": 496
},
{
"epoch": 1.506060606060606,
"grad_norm": 0.43085163831710815,
"learning_rate": 5.833496077421485e-06,
"loss": 0.4304,
"step": 497
},
{
"epoch": 1.509090909090909,
"grad_norm": 0.45978105068206787,
"learning_rate": 5.816108020806297e-06,
"loss": 0.406,
"step": 498
},
{
"epoch": 1.5121212121212122,
"grad_norm": 0.5006170868873596,
"learning_rate": 5.798709818268775e-06,
"loss": 0.4051,
"step": 499
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.47631126642227173,
"learning_rate": 5.781301686104808e-06,
"loss": 0.4223,
"step": 500
},
{
"epoch": 1.518181818181818,
"grad_norm": 0.4285683333873749,
"learning_rate": 5.763883840733736e-06,
"loss": 0.4246,
"step": 501
},
{
"epoch": 1.5212121212121212,
"grad_norm": 0.4717022478580475,
"learning_rate": 5.746456498695648e-06,
"loss": 0.4322,
"step": 502
},
{
"epoch": 1.5242424242424244,
"grad_norm": 0.43785741925239563,
"learning_rate": 5.729019876648704e-06,
"loss": 0.4181,
"step": 503
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.4523105025291443,
"learning_rate": 5.711574191366427e-06,
"loss": 0.3962,
"step": 504
},
{
"epoch": 1.5303030303030303,
"grad_norm": 0.46577563881874084,
"learning_rate": 5.694119659735018e-06,
"loss": 0.4185,
"step": 505
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.48579537868499756,
"learning_rate": 5.6766564987506564e-06,
"loss": 0.4373,
"step": 506
},
{
"epoch": 1.5363636363636364,
"grad_norm": 0.3950510621070862,
"learning_rate": 5.659184925516802e-06,
"loss": 0.4182,
"step": 507
},
{
"epoch": 1.5393939393939395,
"grad_norm": 0.3853079080581665,
"learning_rate": 5.641705157241497e-06,
"loss": 0.4022,
"step": 508
},
{
"epoch": 1.5424242424242425,
"grad_norm": 0.40813571214675903,
"learning_rate": 5.624217411234667e-06,
"loss": 0.4155,
"step": 509
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.4753287434577942,
"learning_rate": 5.60672190490541e-06,
"loss": 0.3863,
"step": 510
},
{
"epoch": 1.5484848484848484,
"grad_norm": 0.47361883521080017,
"learning_rate": 5.58921885575931e-06,
"loss": 0.4325,
"step": 511
},
{
"epoch": 1.5515151515151515,
"grad_norm": 0.3923182487487793,
"learning_rate": 5.571708481395719e-06,
"loss": 0.4046,
"step": 512
},
{
"epoch": 1.5545454545454547,
"grad_norm": 0.43248647451400757,
"learning_rate": 5.5541909995050554e-06,
"loss": 0.4127,
"step": 513
},
{
"epoch": 1.5575757575757576,
"grad_norm": 0.4522246718406677,
"learning_rate": 5.536666627866104e-06,
"loss": 0.4001,
"step": 514
},
{
"epoch": 1.5606060606060606,
"grad_norm": 0.4674279987812042,
"learning_rate": 5.519135584343301e-06,
"loss": 0.4105,
"step": 515
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.488587886095047,
"learning_rate": 5.5015980868840254e-06,
"loss": 0.424,
"step": 516
},
{
"epoch": 1.5666666666666667,
"grad_norm": 0.45722997188568115,
"learning_rate": 5.484054353515896e-06,
"loss": 0.4318,
"step": 517
},
{
"epoch": 1.5696969696969698,
"grad_norm": 0.4935101568698883,
"learning_rate": 5.466504602344055e-06,
"loss": 0.4218,
"step": 518
},
{
"epoch": 1.5727272727272728,
"grad_norm": 0.47893407940864563,
"learning_rate": 5.448949051548459e-06,
"loss": 0.4053,
"step": 519
},
{
"epoch": 1.5757575757575757,
"grad_norm": 0.41225844621658325,
"learning_rate": 5.431387919381166e-06,
"loss": 0.397,
"step": 520
},
{
"epoch": 1.5787878787878786,
"grad_norm": 0.4375216066837311,
"learning_rate": 5.41382142416362e-06,
"loss": 0.41,
"step": 521
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.5372921228408813,
"learning_rate": 5.396249784283943e-06,
"loss": 0.4104,
"step": 522
},
{
"epoch": 1.584848484848485,
"grad_norm": 0.4583219885826111,
"learning_rate": 5.3786732181942135e-06,
"loss": 0.4085,
"step": 523
},
{
"epoch": 1.587878787878788,
"grad_norm": 0.37842702865600586,
"learning_rate": 5.361091944407751e-06,
"loss": 0.4033,
"step": 524
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.44202497601509094,
"learning_rate": 5.343506181496405e-06,
"loss": 0.4111,
"step": 525
},
{
"epoch": 1.593939393939394,
"grad_norm": 0.4649200141429901,
"learning_rate": 5.3259161480878354e-06,
"loss": 0.4304,
"step": 526
},
{
"epoch": 1.596969696969697,
"grad_norm": 0.41630348563194275,
"learning_rate": 5.308322062862786e-06,
"loss": 0.436,
"step": 527
},
{
"epoch": 1.6,
"grad_norm": 0.4201316833496094,
"learning_rate": 5.290724144552379e-06,
"loss": 0.3961,
"step": 528
},
{
"epoch": 1.603030303030303,
"grad_norm": 0.4639657437801361,
"learning_rate": 5.2731226119353915e-06,
"loss": 0.4089,
"step": 529
},
{
"epoch": 1.606060606060606,
"grad_norm": 0.4132433533668518,
"learning_rate": 5.255517683835528e-06,
"loss": 0.4272,
"step": 530
},
{
"epoch": 1.6090909090909091,
"grad_norm": 0.49520424008369446,
"learning_rate": 5.237909579118713e-06,
"loss": 0.4125,
"step": 531
},
{
"epoch": 1.612121212121212,
"grad_norm": 0.4128758907318115,
"learning_rate": 5.220298516690353e-06,
"loss": 0.4158,
"step": 532
},
{
"epoch": 1.6151515151515152,
"grad_norm": 0.46247929334640503,
"learning_rate": 5.202684715492635e-06,
"loss": 0.4225,
"step": 533
},
{
"epoch": 1.6181818181818182,
"grad_norm": 0.47613954544067383,
"learning_rate": 5.185068394501791e-06,
"loss": 0.4263,
"step": 534
},
{
"epoch": 1.621212121212121,
"grad_norm": 0.39713340997695923,
"learning_rate": 5.1674497727253766e-06,
"loss": 0.4323,
"step": 535
},
{
"epoch": 1.6242424242424243,
"grad_norm": 0.424244225025177,
"learning_rate": 5.149829069199555e-06,
"loss": 0.4082,
"step": 536
},
{
"epoch": 1.6272727272727274,
"grad_norm": 0.41466566920280457,
"learning_rate": 5.132206502986368e-06,
"loss": 0.4023,
"step": 537
},
{
"epoch": 1.6303030303030304,
"grad_norm": 0.38547638058662415,
"learning_rate": 5.114582293171012e-06,
"loss": 0.4127,
"step": 538
},
{
"epoch": 1.6333333333333333,
"grad_norm": 0.404697984457016,
"learning_rate": 5.096956658859122e-06,
"loss": 0.4204,
"step": 539
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.4179372489452362,
"learning_rate": 5.07932981917404e-06,
"loss": 0.4346,
"step": 540
},
{
"epoch": 1.6393939393939394,
"grad_norm": 0.38099920749664307,
"learning_rate": 5.061701993254092e-06,
"loss": 0.4371,
"step": 541
},
{
"epoch": 1.6424242424242426,
"grad_norm": 0.4002698063850403,
"learning_rate": 5.044073400249867e-06,
"loss": 0.4108,
"step": 542
},
{
"epoch": 1.6454545454545455,
"grad_norm": 0.43330150842666626,
"learning_rate": 5.026444259321489e-06,
"loss": 0.4197,
"step": 543
},
{
"epoch": 1.6484848484848484,
"grad_norm": 0.4145337641239166,
"learning_rate": 5.008814789635894e-06,
"loss": 0.4141,
"step": 544
},
{
"epoch": 1.6515151515151514,
"grad_norm": 0.3934999108314514,
"learning_rate": 4.9911852103641065e-06,
"loss": 0.3995,
"step": 545
},
{
"epoch": 1.6545454545454545,
"grad_norm": 0.4316452443599701,
"learning_rate": 4.973555740678512e-06,
"loss": 0.4192,
"step": 546
},
{
"epoch": 1.6575757575757577,
"grad_norm": 0.38727515935897827,
"learning_rate": 4.955926599750134e-06,
"loss": 0.4063,
"step": 547
},
{
"epoch": 1.6606060606060606,
"grad_norm": 0.46579864621162415,
"learning_rate": 4.938298006745909e-06,
"loss": 0.4264,
"step": 548
},
{
"epoch": 1.6636363636363636,
"grad_norm": 0.4145257771015167,
"learning_rate": 4.9206701808259605e-06,
"loss": 0.4188,
"step": 549
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.48353952169418335,
"learning_rate": 4.903043341140879e-06,
"loss": 0.4312,
"step": 550
},
{
"epoch": 1.6696969696969697,
"grad_norm": 0.42981258034706116,
"learning_rate": 4.885417706828989e-06,
"loss": 0.4192,
"step": 551
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.4582386612892151,
"learning_rate": 4.867793497013634e-06,
"loss": 0.4116,
"step": 552
},
{
"epoch": 1.6757575757575758,
"grad_norm": 0.44898802042007446,
"learning_rate": 4.850170930800447e-06,
"loss": 0.4165,
"step": 553
},
{
"epoch": 1.6787878787878787,
"grad_norm": 0.3963959217071533,
"learning_rate": 4.832550227274624e-06,
"loss": 0.4181,
"step": 554
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.45196351408958435,
"learning_rate": 4.81493160549821e-06,
"loss": 0.4227,
"step": 555
},
{
"epoch": 1.6848484848484848,
"grad_norm": 0.458886057138443,
"learning_rate": 4.7973152845073666e-06,
"loss": 0.4223,
"step": 556
},
{
"epoch": 1.687878787878788,
"grad_norm": 0.38994601368904114,
"learning_rate": 4.779701483309648e-06,
"loss": 0.4004,
"step": 557
},
{
"epoch": 1.690909090909091,
"grad_norm": 0.4091901481151581,
"learning_rate": 4.762090420881289e-06,
"loss": 0.4087,
"step": 558
},
{
"epoch": 1.6939393939393939,
"grad_norm": 0.4189499020576477,
"learning_rate": 4.7444823161644725e-06,
"loss": 0.4146,
"step": 559
},
{
"epoch": 1.696969696969697,
"grad_norm": 0.41353610157966614,
"learning_rate": 4.726877388064609e-06,
"loss": 0.4073,
"step": 560
},
{
"epoch": 1.7,
"grad_norm": 0.43993616104125977,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.4257,
"step": 561
},
{
"epoch": 1.7030303030303031,
"grad_norm": 0.4330032169818878,
"learning_rate": 4.691677937137217e-06,
"loss": 0.4148,
"step": 562
},
{
"epoch": 1.706060606060606,
"grad_norm": 0.4641442894935608,
"learning_rate": 4.674083851912167e-06,
"loss": 0.4046,
"step": 563
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.4123784899711609,
"learning_rate": 4.6564938185035954e-06,
"loss": 0.4219,
"step": 564
},
{
"epoch": 1.7121212121212122,
"grad_norm": 0.4760873317718506,
"learning_rate": 4.638908055592252e-06,
"loss": 0.4103,
"step": 565
},
{
"epoch": 1.7151515151515153,
"grad_norm": 0.45056799054145813,
"learning_rate": 4.62132678180579e-06,
"loss": 0.407,
"step": 566
},
{
"epoch": 1.7181818181818183,
"grad_norm": 0.4206244647502899,
"learning_rate": 4.603750215716057e-06,
"loss": 0.4194,
"step": 567
},
{
"epoch": 1.7212121212121212,
"grad_norm": 0.40745237469673157,
"learning_rate": 4.58617857583638e-06,
"loss": 0.4139,
"step": 568
},
{
"epoch": 1.7242424242424241,
"grad_norm": 0.38897326588630676,
"learning_rate": 4.568612080618836e-06,
"loss": 0.3957,
"step": 569
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.42113196849823,
"learning_rate": 4.551050948451542e-06,
"loss": 0.4134,
"step": 570
},
{
"epoch": 1.7303030303030305,
"grad_norm": 0.39686131477355957,
"learning_rate": 4.533495397655946e-06,
"loss": 0.3993,
"step": 571
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.43916943669319153,
"learning_rate": 4.515945646484105e-06,
"loss": 0.4104,
"step": 572
},
{
"epoch": 1.7363636363636363,
"grad_norm": 0.4046113193035126,
"learning_rate": 4.498401913115975e-06,
"loss": 0.4172,
"step": 573
},
{
"epoch": 1.7393939393939393,
"grad_norm": 0.39034318923950195,
"learning_rate": 4.4808644156567e-06,
"loss": 0.4322,
"step": 574
},
{
"epoch": 1.7424242424242424,
"grad_norm": 0.41494661569595337,
"learning_rate": 4.463333372133897e-06,
"loss": 0.3948,
"step": 575
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.40686750411987305,
"learning_rate": 4.445809000494945e-06,
"loss": 0.4179,
"step": 576
},
{
"epoch": 1.7484848484848485,
"grad_norm": 0.4119640290737152,
"learning_rate": 4.428291518604283e-06,
"loss": 0.4149,
"step": 577
},
{
"epoch": 1.7515151515151515,
"grad_norm": 0.44735684990882874,
"learning_rate": 4.410781144240692e-06,
"loss": 0.4385,
"step": 578
},
{
"epoch": 1.7545454545454544,
"grad_norm": 0.47033703327178955,
"learning_rate": 4.393278095094591e-06,
"loss": 0.435,
"step": 579
},
{
"epoch": 1.7575757575757576,
"grad_norm": 0.4110475182533264,
"learning_rate": 4.3757825887653345e-06,
"loss": 0.4069,
"step": 580
},
{
"epoch": 1.7606060606060607,
"grad_norm": 0.3891106843948364,
"learning_rate": 4.358294842758504e-06,
"loss": 0.431,
"step": 581
},
{
"epoch": 1.7636363636363637,
"grad_norm": 0.4366550147533417,
"learning_rate": 4.340815074483199e-06,
"loss": 0.4322,
"step": 582
},
{
"epoch": 1.7666666666666666,
"grad_norm": 0.40945494174957275,
"learning_rate": 4.323343501249346e-06,
"loss": 0.4164,
"step": 583
},
{
"epoch": 1.7696969696969695,
"grad_norm": 0.4169653654098511,
"learning_rate": 4.305880340264985e-06,
"loss": 0.4078,
"step": 584
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.43987858295440674,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.4,
"step": 585
},
{
"epoch": 1.7757575757575759,
"grad_norm": 0.434929221868515,
"learning_rate": 4.270980123351299e-06,
"loss": 0.4413,
"step": 586
},
{
"epoch": 1.7787878787878788,
"grad_norm": 0.4042617380619049,
"learning_rate": 4.2535435013043535e-06,
"loss": 0.4075,
"step": 587
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.4270903468132019,
"learning_rate": 4.2361161592662655e-06,
"loss": 0.4372,
"step": 588
},
{
"epoch": 1.7848484848484847,
"grad_norm": 0.37257200479507446,
"learning_rate": 4.218698313895192e-06,
"loss": 0.4094,
"step": 589
},
{
"epoch": 1.7878787878787878,
"grad_norm": 0.3988351821899414,
"learning_rate": 4.2012901817312255e-06,
"loss": 0.4247,
"step": 590
},
{
"epoch": 1.790909090909091,
"grad_norm": 0.4023801386356354,
"learning_rate": 4.183891979193703e-06,
"loss": 0.4177,
"step": 591
},
{
"epoch": 1.793939393939394,
"grad_norm": 0.3908040225505829,
"learning_rate": 4.166503922578516e-06,
"loss": 0.4479,
"step": 592
},
{
"epoch": 1.7969696969696969,
"grad_norm": 0.39427265524864197,
"learning_rate": 4.149126228055419e-06,
"loss": 0.3807,
"step": 593
},
{
"epoch": 1.8,
"grad_norm": 0.3684083819389343,
"learning_rate": 4.131759111665349e-06,
"loss": 0.4074,
"step": 594
},
{
"epoch": 1.803030303030303,
"grad_norm": 0.4389978349208832,
"learning_rate": 4.114402789317729e-06,
"loss": 0.414,
"step": 595
},
{
"epoch": 1.8060606060606061,
"grad_norm": 0.4645283818244934,
"learning_rate": 4.097057476787792e-06,
"loss": 0.4244,
"step": 596
},
{
"epoch": 1.809090909090909,
"grad_norm": 0.4098891317844391,
"learning_rate": 4.079723389713899e-06,
"loss": 0.4185,
"step": 597
},
{
"epoch": 1.812121212121212,
"grad_norm": 0.44468870759010315,
"learning_rate": 4.06240074359485e-06,
"loss": 0.4144,
"step": 598
},
{
"epoch": 1.8151515151515152,
"grad_norm": 0.40740299224853516,
"learning_rate": 4.045089753787214e-06,
"loss": 0.4147,
"step": 599
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.426737517118454,
"learning_rate": 4.027790635502646e-06,
"loss": 0.415,
"step": 600
},
{
"epoch": 1.8212121212121213,
"grad_norm": 0.4545744061470032,
"learning_rate": 4.010503603805214e-06,
"loss": 0.4163,
"step": 601
},
{
"epoch": 1.8242424242424242,
"grad_norm": 0.3721947968006134,
"learning_rate": 3.993228873608724e-06,
"loss": 0.4145,
"step": 602
},
{
"epoch": 1.8272727272727272,
"grad_norm": 0.4343280494213104,
"learning_rate": 3.975966659674048e-06,
"loss": 0.4032,
"step": 603
},
{
"epoch": 1.8303030303030303,
"grad_norm": 0.3900534212589264,
"learning_rate": 3.958717176606456e-06,
"loss": 0.4288,
"step": 604
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.45665907859802246,
"learning_rate": 3.941480638852948e-06,
"loss": 0.3978,
"step": 605
},
{
"epoch": 1.8363636363636364,
"grad_norm": 0.3952561616897583,
"learning_rate": 3.924257260699583e-06,
"loss": 0.4265,
"step": 606
},
{
"epoch": 1.8393939393939394,
"grad_norm": 0.4145510494709015,
"learning_rate": 3.907047256268822e-06,
"loss": 0.4165,
"step": 607
},
{
"epoch": 1.8424242424242423,
"grad_norm": 0.40336254239082336,
"learning_rate": 3.8898508395168645e-06,
"loss": 0.4075,
"step": 608
},
{
"epoch": 1.8454545454545455,
"grad_norm": 0.41499027609825134,
"learning_rate": 3.872668224230979e-06,
"loss": 0.4098,
"step": 609
},
{
"epoch": 1.8484848484848486,
"grad_norm": 0.45877882838249207,
"learning_rate": 3.855499624026861e-06,
"loss": 0.414,
"step": 610
},
{
"epoch": 1.8515151515151516,
"grad_norm": 0.4164450764656067,
"learning_rate": 3.838345252345968e-06,
"loss": 0.4238,
"step": 611
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.41379204392433167,
"learning_rate": 3.821205322452863e-06,
"loss": 0.4007,
"step": 612
},
{
"epoch": 1.8575757575757574,
"grad_norm": 0.38134729862213135,
"learning_rate": 3.804080047432574e-06,
"loss": 0.4093,
"step": 613
},
{
"epoch": 1.8606060606060606,
"grad_norm": 0.4260639548301697,
"learning_rate": 3.786969640187932e-06,
"loss": 0.4107,
"step": 614
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.3941526710987091,
"learning_rate": 3.769874313436933e-06,
"loss": 0.4056,
"step": 615
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.44000816345214844,
"learning_rate": 3.752794279710094e-06,
"loss": 0.4266,
"step": 616
},
{
"epoch": 1.8696969696969696,
"grad_norm": 0.4201476275920868,
"learning_rate": 3.735729751347803e-06,
"loss": 0.4383,
"step": 617
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.3836321234703064,
"learning_rate": 3.7186809404976877e-06,
"loss": 0.4197,
"step": 618
},
{
"epoch": 1.8757575757575757,
"grad_norm": 0.394233763217926,
"learning_rate": 3.701648059111972e-06,
"loss": 0.4059,
"step": 619
},
{
"epoch": 1.878787878787879,
"grad_norm": 0.4040839672088623,
"learning_rate": 3.6846313189448447e-06,
"loss": 0.413,
"step": 620
},
{
"epoch": 1.8818181818181818,
"grad_norm": 0.39362630248069763,
"learning_rate": 3.667630931549826e-06,
"loss": 0.3943,
"step": 621
},
{
"epoch": 1.8848484848484848,
"grad_norm": 0.36628755927085876,
"learning_rate": 3.6506471082771357e-06,
"loss": 0.4315,
"step": 622
},
{
"epoch": 1.887878787878788,
"grad_norm": 0.4035452902317047,
"learning_rate": 3.6336800602710676e-06,
"loss": 0.4104,
"step": 623
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.3663199543952942,
"learning_rate": 3.6167299984673655e-06,
"loss": 0.4189,
"step": 624
},
{
"epoch": 1.893939393939394,
"grad_norm": 0.4386783242225647,
"learning_rate": 3.5997971335905966e-06,
"loss": 0.437,
"step": 625
},
{
"epoch": 1.896969696969697,
"grad_norm": 0.4115349352359772,
"learning_rate": 3.582881676151536e-06,
"loss": 0.4128,
"step": 626
},
{
"epoch": 1.9,
"grad_norm": 0.3740464150905609,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.4024,
"step": 627
},
{
"epoch": 1.903030303030303,
"grad_norm": 0.39950650930404663,
"learning_rate": 3.549103824544975e-06,
"loss": 0.4143,
"step": 628
},
{
"epoch": 1.906060606060606,
"grad_norm": 0.4304462671279907,
"learning_rate": 3.5322418503065148e-06,
"loss": 0.4266,
"step": 629
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.385495662689209,
"learning_rate": 3.5153981233586277e-06,
"loss": 0.4211,
"step": 630
},
{
"epoch": 1.912121212121212,
"grad_norm": 0.37890446186065674,
"learning_rate": 3.498572853103915e-06,
"loss": 0.4192,
"step": 631
},
{
"epoch": 1.915151515151515,
"grad_norm": 0.3713083267211914,
"learning_rate": 3.481766248715528e-06,
"loss": 0.4203,
"step": 632
},
{
"epoch": 1.9181818181818182,
"grad_norm": 0.38436827063560486,
"learning_rate": 3.4649785191345613e-06,
"loss": 0.3995,
"step": 633
},
{
"epoch": 1.9212121212121214,
"grad_norm": 0.3605766296386719,
"learning_rate": 3.4482098730674577e-06,
"loss": 0.3961,
"step": 634
},
{
"epoch": 1.9242424242424243,
"grad_norm": 0.3878589868545532,
"learning_rate": 3.4314605189834076e-06,
"loss": 0.4062,
"step": 635
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.4051166772842407,
"learning_rate": 3.4147306651117663e-06,
"loss": 0.4083,
"step": 636
},
{
"epoch": 1.9303030303030302,
"grad_norm": 0.3763566017150879,
"learning_rate": 3.398020519439459e-06,
"loss": 0.3853,
"step": 637
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.3822113573551178,
"learning_rate": 3.3813302897083955e-06,
"loss": 0.4061,
"step": 638
},
{
"epoch": 1.9363636363636365,
"grad_norm": 0.42593175172805786,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.4307,
"step": 639
},
{
"epoch": 1.9393939393939394,
"grad_norm": 0.42011937499046326,
"learning_rate": 3.348010407797088e-06,
"loss": 0.4172,
"step": 640
},
{
"epoch": 1.9424242424242424,
"grad_norm": 0.38729822635650635,
"learning_rate": 3.3313811698523677e-06,
"loss": 0.4147,
"step": 641
},
{
"epoch": 1.9454545454545453,
"grad_norm": 0.4373640716075897,
"learning_rate": 3.3147726763147913e-06,
"loss": 0.4049,
"step": 642
},
{
"epoch": 1.9484848484848485,
"grad_norm": 0.3927765488624573,
"learning_rate": 3.298185133662525e-06,
"loss": 0.4158,
"step": 643
},
{
"epoch": 1.9515151515151516,
"grad_norm": 0.40047505497932434,
"learning_rate": 3.2816187481132655e-06,
"loss": 0.4098,
"step": 644
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.4280295670032501,
"learning_rate": 3.2650737256216885e-06,
"loss": 0.4316,
"step": 645
},
{
"epoch": 1.9575757575757575,
"grad_norm": 0.41508781909942627,
"learning_rate": 3.2485502718768814e-06,
"loss": 0.4319,
"step": 646
},
{
"epoch": 1.9606060606060605,
"grad_norm": 0.4096485376358032,
"learning_rate": 3.2320485922997842e-06,
"loss": 0.4026,
"step": 647
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.37917032837867737,
"learning_rate": 3.2155688920406415e-06,
"loss": 0.4001,
"step": 648
},
{
"epoch": 1.9666666666666668,
"grad_norm": 0.4002001881599426,
"learning_rate": 3.1991113759764493e-06,
"loss": 0.4111,
"step": 649
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.4140879213809967,
"learning_rate": 3.1826762487084053e-06,
"loss": 0.4144,
"step": 650
},
{
"epoch": 1.9727272727272727,
"grad_norm": 0.3691096305847168,
"learning_rate": 3.16626371455937e-06,
"loss": 0.4068,
"step": 651
},
{
"epoch": 1.9757575757575756,
"grad_norm": 0.42086780071258545,
"learning_rate": 3.149873977571324e-06,
"loss": 0.3825,
"step": 652
},
{
"epoch": 1.9787878787878788,
"grad_norm": 0.4335564374923706,
"learning_rate": 3.133507241502832e-06,
"loss": 0.4465,
"step": 653
},
{
"epoch": 1.981818181818182,
"grad_norm": 0.4249853491783142,
"learning_rate": 3.1171637098265063e-06,
"loss": 0.4113,
"step": 654
},
{
"epoch": 1.9848484848484849,
"grad_norm": 0.43183475732803345,
"learning_rate": 3.1008435857264862e-06,
"loss": 0.4142,
"step": 655
},
{
"epoch": 1.9878787878787878,
"grad_norm": 0.4180411994457245,
"learning_rate": 3.0845470720959027e-06,
"loss": 0.4122,
"step": 656
},
{
"epoch": 1.990909090909091,
"grad_norm": 0.44475847482681274,
"learning_rate": 3.0682743715343565e-06,
"loss": 0.3895,
"step": 657
},
{
"epoch": 1.993939393939394,
"grad_norm": 0.4236146807670593,
"learning_rate": 3.0520256863454077e-06,
"loss": 0.416,
"step": 658
},
{
"epoch": 1.996969696969697,
"grad_norm": 0.40536797046661377,
"learning_rate": 3.035801218534054e-06,
"loss": 0.421,
"step": 659
},
{
"epoch": 2.0,
"grad_norm": 0.4580300748348236,
"learning_rate": 3.019601169804216e-06,
"loss": 0.4115,
"step": 660
},
{
"epoch": 2.003030303030303,
"grad_norm": 0.5088746547698975,
"learning_rate": 3.00342574155624e-06,
"loss": 0.4071,
"step": 661
},
{
"epoch": 2.006060606060606,
"grad_norm": 0.43281951546669006,
"learning_rate": 2.9872751348843875e-06,
"loss": 0.3828,
"step": 662
},
{
"epoch": 2.0090909090909093,
"grad_norm": 0.40724921226501465,
"learning_rate": 2.9711495505743317e-06,
"loss": 0.4039,
"step": 663
},
{
"epoch": 2.012121212121212,
"grad_norm": 0.39697274565696716,
"learning_rate": 2.9550491891006704e-06,
"loss": 0.3864,
"step": 664
},
{
"epoch": 2.015151515151515,
"grad_norm": 0.42503514885902405,
"learning_rate": 2.938974250624429e-06,
"loss": 0.3684,
"step": 665
},
{
"epoch": 2.018181818181818,
"grad_norm": 0.4273051917552948,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.3763,
"step": 666
},
{
"epoch": 2.021212121212121,
"grad_norm": 0.41625821590423584,
"learning_rate": 2.906901441725507e-06,
"loss": 0.4116,
"step": 667
},
{
"epoch": 2.0242424242424244,
"grad_norm": 0.40681958198547363,
"learning_rate": 2.8909039700346385e-06,
"loss": 0.3684,
"step": 668
},
{
"epoch": 2.0272727272727273,
"grad_norm": 0.4323454797267914,
"learning_rate": 2.8749327187998516e-06,
"loss": 0.4021,
"step": 669
},
{
"epoch": 2.0303030303030303,
"grad_norm": 0.3722967505455017,
"learning_rate": 2.858987886577058e-06,
"loss": 0.3634,
"step": 670
},
{
"epoch": 2.033333333333333,
"grad_norm": 0.3991397023200989,
"learning_rate": 2.843069671593734e-06,
"loss": 0.38,
"step": 671
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.42172548174858093,
"learning_rate": 2.8271782717464413e-06,
"loss": 0.3739,
"step": 672
},
{
"epoch": 2.0393939393939395,
"grad_norm": 0.39456048607826233,
"learning_rate": 2.811313884598373e-06,
"loss": 0.3701,
"step": 673
},
{
"epoch": 2.0424242424242425,
"grad_norm": 0.3840007483959198,
"learning_rate": 2.795476707376905e-06,
"loss": 0.3874,
"step": 674
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.35414138436317444,
"learning_rate": 2.7796669369711294e-06,
"loss": 0.3882,
"step": 675
},
{
"epoch": 2.0484848484848484,
"grad_norm": 0.4287824034690857,
"learning_rate": 2.7638847699294196e-06,
"loss": 0.3977,
"step": 676
},
{
"epoch": 2.0515151515151517,
"grad_norm": 0.402102530002594,
"learning_rate": 2.7481304024569823e-06,
"loss": 0.3907,
"step": 677
},
{
"epoch": 2.0545454545454547,
"grad_norm": 0.39035528898239136,
"learning_rate": 2.7324040304134125e-06,
"loss": 0.3939,
"step": 678
},
{
"epoch": 2.0575757575757576,
"grad_norm": 0.3569696843624115,
"learning_rate": 2.716705849310265e-06,
"loss": 0.3851,
"step": 679
},
{
"epoch": 2.0606060606060606,
"grad_norm": 0.39394411444664,
"learning_rate": 2.701036054308629e-06,
"loss": 0.3803,
"step": 680
},
{
"epoch": 2.0636363636363635,
"grad_norm": 0.371428519487381,
"learning_rate": 2.685394840216688e-06,
"loss": 0.3746,
"step": 681
},
{
"epoch": 2.066666666666667,
"grad_norm": 0.3792346715927124,
"learning_rate": 2.6697824014873076e-06,
"loss": 0.3466,
"step": 682
},
{
"epoch": 2.06969696969697,
"grad_norm": 0.37582284212112427,
"learning_rate": 2.654198932215613e-06,
"loss": 0.3879,
"step": 683
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.38081496953964233,
"learning_rate": 2.6386446261365874e-06,
"loss": 0.3783,
"step": 684
},
{
"epoch": 2.0757575757575757,
"grad_norm": 0.3656942546367645,
"learning_rate": 2.623119676622645e-06,
"loss": 0.3796,
"step": 685
},
{
"epoch": 2.0787878787878786,
"grad_norm": 0.36368727684020996,
"learning_rate": 2.607624276681241e-06,
"loss": 0.3497,
"step": 686
},
{
"epoch": 2.081818181818182,
"grad_norm": 0.3967862129211426,
"learning_rate": 2.5921586189524694e-06,
"loss": 0.3823,
"step": 687
},
{
"epoch": 2.084848484848485,
"grad_norm": 0.37508049607276917,
"learning_rate": 2.5767228957066635e-06,
"loss": 0.3784,
"step": 688
},
{
"epoch": 2.087878787878788,
"grad_norm": 0.3605888783931732,
"learning_rate": 2.561317298842008e-06,
"loss": 0.3858,
"step": 689
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.3733239769935608,
"learning_rate": 2.5459420198821604e-06,
"loss": 0.3648,
"step": 690
},
{
"epoch": 2.0939393939393938,
"grad_norm": 0.3924459218978882,
"learning_rate": 2.530597249973856e-06,
"loss": 0.3747,
"step": 691
},
{
"epoch": 2.096969696969697,
"grad_norm": 0.37564197182655334,
"learning_rate": 2.51528317988454e-06,
"loss": 0.3581,
"step": 692
},
{
"epoch": 2.1,
"grad_norm": 0.3885200321674347,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3786,
"step": 693
},
{
"epoch": 2.103030303030303,
"grad_norm": 0.36674779653549194,
"learning_rate": 2.4847479003219926e-06,
"loss": 0.3707,
"step": 694
},
{
"epoch": 2.106060606060606,
"grad_norm": 0.363193154335022,
"learning_rate": 2.4695270704658753e-06,
"loss": 0.3921,
"step": 695
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.4087706506252289,
"learning_rate": 2.454337699658267e-06,
"loss": 0.3795,
"step": 696
},
{
"epoch": 2.1121212121212123,
"grad_norm": 0.37886175513267517,
"learning_rate": 2.439179976734677e-06,
"loss": 0.3594,
"step": 697
},
{
"epoch": 2.1151515151515152,
"grad_norm": 0.3760082721710205,
"learning_rate": 2.4240540901371727e-06,
"loss": 0.3462,
"step": 698
},
{
"epoch": 2.118181818181818,
"grad_norm": 0.38874131441116333,
"learning_rate": 2.4089602279120224e-06,
"loss": 0.374,
"step": 699
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.39027926325798035,
"learning_rate": 2.393898577707371e-06,
"loss": 0.353,
"step": 700
},
{
"epoch": 2.124242424242424,
"grad_norm": 0.34905844926834106,
"learning_rate": 2.3788693267708975e-06,
"loss": 0.3712,
"step": 701
},
{
"epoch": 2.1272727272727274,
"grad_norm": 0.3495723307132721,
"learning_rate": 2.363872661947488e-06,
"loss": 0.3594,
"step": 702
},
{
"epoch": 2.1303030303030304,
"grad_norm": 0.3776390850543976,
"learning_rate": 2.3489087696769225e-06,
"loss": 0.3803,
"step": 703
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.3674999177455902,
"learning_rate": 2.333977835991545e-06,
"loss": 0.3787,
"step": 704
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.37632232904434204,
"learning_rate": 2.319080046513954e-06,
"loss": 0.3864,
"step": 705
},
{
"epoch": 2.1393939393939396,
"grad_norm": 0.3536114990711212,
"learning_rate": 2.3042155864547024e-06,
"loss": 0.3877,
"step": 706
},
{
"epoch": 2.1424242424242426,
"grad_norm": 0.37344491481781006,
"learning_rate": 2.2893846406099847e-06,
"loss": 0.3827,
"step": 707
},
{
"epoch": 2.1454545454545455,
"grad_norm": 0.34745898842811584,
"learning_rate": 2.274587393359342e-06,
"loss": 0.3708,
"step": 708
},
{
"epoch": 2.1484848484848484,
"grad_norm": 0.35172003507614136,
"learning_rate": 2.2598240286633787e-06,
"loss": 0.3932,
"step": 709
},
{
"epoch": 2.1515151515151514,
"grad_norm": 0.35878872871398926,
"learning_rate": 2.245094730061463e-06,
"loss": 0.3473,
"step": 710
},
{
"epoch": 2.1545454545454543,
"grad_norm": 0.359260231256485,
"learning_rate": 2.230399680669449e-06,
"loss": 0.397,
"step": 711
},
{
"epoch": 2.1575757575757577,
"grad_norm": 0.3743046522140503,
"learning_rate": 2.215739063177409e-06,
"loss": 0.3694,
"step": 712
},
{
"epoch": 2.1606060606060606,
"grad_norm": 0.37723642587661743,
"learning_rate": 2.2011130598473498e-06,
"loss": 0.3703,
"step": 713
},
{
"epoch": 2.1636363636363636,
"grad_norm": 0.3454684615135193,
"learning_rate": 2.1865218525109496e-06,
"loss": 0.3637,
"step": 714
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.3447061777114868,
"learning_rate": 2.171965622567308e-06,
"loss": 0.3653,
"step": 715
},
{
"epoch": 2.16969696969697,
"grad_norm": 0.35390952229499817,
"learning_rate": 2.1574445509806764e-06,
"loss": 0.3695,
"step": 716
},
{
"epoch": 2.172727272727273,
"grad_norm": 0.3610280752182007,
"learning_rate": 2.1429588182782147e-06,
"loss": 0.3456,
"step": 717
},
{
"epoch": 2.175757575757576,
"grad_norm": 0.3974767029285431,
"learning_rate": 2.1285086045477515e-06,
"loss": 0.3989,
"step": 718
},
{
"epoch": 2.1787878787878787,
"grad_norm": 0.38098010420799255,
"learning_rate": 2.1140940894355345e-06,
"loss": 0.3752,
"step": 719
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.37288904190063477,
"learning_rate": 2.09971545214401e-06,
"loss": 0.378,
"step": 720
},
{
"epoch": 2.184848484848485,
"grad_norm": 0.34389159083366394,
"learning_rate": 2.0853728714295807e-06,
"loss": 0.3581,
"step": 721
},
{
"epoch": 2.187878787878788,
"grad_norm": 0.35233569145202637,
"learning_rate": 2.0710665256003994e-06,
"loss": 0.3677,
"step": 722
},
{
"epoch": 2.190909090909091,
"grad_norm": 0.37292009592056274,
"learning_rate": 2.0567965925141366e-06,
"loss": 0.3593,
"step": 723
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.39068883657455444,
"learning_rate": 2.0425632495757776e-06,
"loss": 0.3873,
"step": 724
},
{
"epoch": 2.196969696969697,
"grad_norm": 0.386063814163208,
"learning_rate": 2.028366673735421e-06,
"loss": 0.3601,
"step": 725
},
{
"epoch": 2.2,
"grad_norm": 0.410195529460907,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.3771,
"step": 726
},
{
"epoch": 2.203030303030303,
"grad_norm": 0.34756624698638916,
"learning_rate": 2.0000845288614396e-06,
"loss": 0.377,
"step": 727
},
{
"epoch": 2.206060606060606,
"grad_norm": 0.4086437523365021,
"learning_rate": 1.9859993114337773e-06,
"loss": 0.3604,
"step": 728
},
{
"epoch": 2.209090909090909,
"grad_norm": 0.42168623208999634,
"learning_rate": 1.971951564311668e-06,
"loss": 0.3511,
"step": 729
},
{
"epoch": 2.212121212121212,
"grad_norm": 0.39504092931747437,
"learning_rate": 1.9579414621378624e-06,
"loss": 0.3804,
"step": 730
},
{
"epoch": 2.2151515151515153,
"grad_norm": 0.3468715250492096,
"learning_rate": 1.943969179087112e-06,
"loss": 0.3705,
"step": 731
},
{
"epoch": 2.2181818181818183,
"grad_norm": 0.34179291129112244,
"learning_rate": 1.9300348888639915e-06,
"loss": 0.3765,
"step": 732
},
{
"epoch": 2.221212121212121,
"grad_norm": 0.3874762952327728,
"learning_rate": 1.916138764700747e-06,
"loss": 0.3444,
"step": 733
},
{
"epoch": 2.224242424242424,
"grad_norm": 0.36602982878685,
"learning_rate": 1.902280979355146e-06,
"loss": 0.3676,
"step": 734
},
{
"epoch": 2.227272727272727,
"grad_norm": 0.37903892993927,
"learning_rate": 1.8884617051083183e-06,
"loss": 0.3611,
"step": 735
},
{
"epoch": 2.2303030303030305,
"grad_norm": 0.35178276896476746,
"learning_rate": 1.8746811137626208e-06,
"loss": 0.3738,
"step": 736
},
{
"epoch": 2.2333333333333334,
"grad_norm": 0.3382170796394348,
"learning_rate": 1.8609393766395083e-06,
"loss": 0.3325,
"step": 737
},
{
"epoch": 2.2363636363636363,
"grad_norm": 0.3748219907283783,
"learning_rate": 1.8472366645773892e-06,
"loss": 0.38,
"step": 738
},
{
"epoch": 2.2393939393939393,
"grad_norm": 0.38781046867370605,
"learning_rate": 1.8335731479295105e-06,
"loss": 0.3989,
"step": 739
},
{
"epoch": 2.242424242424242,
"grad_norm": 0.3649122714996338,
"learning_rate": 1.8199489965618433e-06,
"loss": 0.3884,
"step": 740
},
{
"epoch": 2.2454545454545456,
"grad_norm": 0.35382363200187683,
"learning_rate": 1.8063643798509594e-06,
"loss": 0.3715,
"step": 741
},
{
"epoch": 2.2484848484848485,
"grad_norm": 0.37757110595703125,
"learning_rate": 1.7928194666819398e-06,
"loss": 0.3854,
"step": 742
},
{
"epoch": 2.2515151515151515,
"grad_norm": 0.3414751887321472,
"learning_rate": 1.7793144254462601e-06,
"loss": 0.3622,
"step": 743
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.36525917053222656,
"learning_rate": 1.7658494240397127e-06,
"loss": 0.3773,
"step": 744
},
{
"epoch": 2.257575757575758,
"grad_norm": 0.3339369297027588,
"learning_rate": 1.7524246298603053e-06,
"loss": 0.3663,
"step": 745
},
{
"epoch": 2.2606060606060607,
"grad_norm": 0.3457212746143341,
"learning_rate": 1.739040209806186e-06,
"loss": 0.3829,
"step": 746
},
{
"epoch": 2.2636363636363637,
"grad_norm": 0.3451812267303467,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.3826,
"step": 747
},
{
"epoch": 2.2666666666666666,
"grad_norm": 0.35478025674819946,
"learning_rate": 1.7123931571546826e-06,
"loss": 0.3446,
"step": 748
},
{
"epoch": 2.2696969696969695,
"grad_norm": 0.3515215218067169,
"learning_rate": 1.6991308558356545e-06,
"loss": 0.3868,
"step": 749
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.3347879946231842,
"learning_rate": 1.68590959119452e-06,
"loss": 0.3574,
"step": 750
},
{
"epoch": 2.275757575757576,
"grad_norm": 0.3563007116317749,
"learning_rate": 1.6727295275991311e-06,
"loss": 0.3976,
"step": 751
},
{
"epoch": 2.278787878787879,
"grad_norm": 0.3718889057636261,
"learning_rate": 1.6595908289051266e-06,
"loss": 0.38,
"step": 752
},
{
"epoch": 2.2818181818181817,
"grad_norm": 0.35888758301734924,
"learning_rate": 1.646493658453896e-06,
"loss": 0.3813,
"step": 753
},
{
"epoch": 2.2848484848484847,
"grad_norm": 0.34056735038757324,
"learning_rate": 1.6334381790705439e-06,
"loss": 0.3976,
"step": 754
},
{
"epoch": 2.287878787878788,
"grad_norm": 0.3530712127685547,
"learning_rate": 1.6204245530618662e-06,
"loss": 0.3947,
"step": 755
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.3484843373298645,
"learning_rate": 1.6074529422143398e-06,
"loss": 0.3775,
"step": 756
},
{
"epoch": 2.293939393939394,
"grad_norm": 0.3389803469181061,
"learning_rate": 1.5945235077921011e-06,
"loss": 0.3649,
"step": 757
},
{
"epoch": 2.296969696969697,
"grad_norm": 0.35619989037513733,
"learning_rate": 1.5816364105349451e-06,
"loss": 0.3794,
"step": 758
},
{
"epoch": 2.3,
"grad_norm": 0.3550373315811157,
"learning_rate": 1.5687918106563326e-06,
"loss": 0.3992,
"step": 759
},
{
"epoch": 2.303030303030303,
"grad_norm": 0.34751543402671814,
"learning_rate": 1.5559898678413898e-06,
"loss": 0.3791,
"step": 760
},
{
"epoch": 2.306060606060606,
"grad_norm": 0.35747015476226807,
"learning_rate": 1.5432307412449244e-06,
"loss": 0.4039,
"step": 761
},
{
"epoch": 2.309090909090909,
"grad_norm": 0.35150715708732605,
"learning_rate": 1.5305145894894547e-06,
"loss": 0.3784,
"step": 762
},
{
"epoch": 2.312121212121212,
"grad_norm": 0.35025930404663086,
"learning_rate": 1.517841570663231e-06,
"loss": 0.3682,
"step": 763
},
{
"epoch": 2.315151515151515,
"grad_norm": 0.3555355370044708,
"learning_rate": 1.5052118423182688e-06,
"loss": 0.3708,
"step": 764
},
{
"epoch": 2.3181818181818183,
"grad_norm": 0.37226593494415283,
"learning_rate": 1.4926255614683931e-06,
"loss": 0.3976,
"step": 765
},
{
"epoch": 2.3212121212121213,
"grad_norm": 0.36371520161628723,
"learning_rate": 1.48008288458729e-06,
"loss": 0.3852,
"step": 766
},
{
"epoch": 2.324242424242424,
"grad_norm": 0.3754095137119293,
"learning_rate": 1.4675839676065534e-06,
"loss": 0.3574,
"step": 767
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.38677626848220825,
"learning_rate": 1.4551289659137497e-06,
"loss": 0.3849,
"step": 768
},
{
"epoch": 2.33030303030303,
"grad_norm": 0.35939913988113403,
"learning_rate": 1.442718034350492e-06,
"loss": 0.3761,
"step": 769
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.3449608087539673,
"learning_rate": 1.4303513272105057e-06,
"loss": 0.3897,
"step": 770
},
{
"epoch": 2.3363636363636364,
"grad_norm": 0.34984907507896423,
"learning_rate": 1.4180289982377138e-06,
"loss": 0.3793,
"step": 771
},
{
"epoch": 2.3393939393939394,
"grad_norm": 0.38575461506843567,
"learning_rate": 1.4057512006243312e-06,
"loss": 0.4093,
"step": 772
},
{
"epoch": 2.3424242424242423,
"grad_norm": 0.38205093145370483,
"learning_rate": 1.3935180870089503e-06,
"loss": 0.3901,
"step": 773
},
{
"epoch": 2.3454545454545457,
"grad_norm": 0.353354275226593,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.3753,
"step": 774
},
{
"epoch": 2.3484848484848486,
"grad_norm": 0.3562178313732147,
"learning_rate": 1.3691865195471037e-06,
"loss": 0.372,
"step": 775
},
{
"epoch": 2.3515151515151516,
"grad_norm": 0.325185626745224,
"learning_rate": 1.357088368192696e-06,
"loss": 0.3668,
"step": 776
},
{
"epoch": 2.3545454545454545,
"grad_norm": 0.3486754596233368,
"learning_rate": 1.345035505816642e-06,
"loss": 0.3965,
"step": 777
},
{
"epoch": 2.3575757575757574,
"grad_norm": 0.36219727993011475,
"learning_rate": 1.3330280822611246e-06,
"loss": 0.3498,
"step": 778
},
{
"epoch": 2.3606060606060604,
"grad_norm": 0.37098029255867004,
"learning_rate": 1.3210662468034246e-06,
"loss": 0.3865,
"step": 779
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.35625922679901123,
"learning_rate": 1.3091501481540676e-06,
"loss": 0.3711,
"step": 780
},
{
"epoch": 2.3666666666666667,
"grad_norm": 0.36688151955604553,
"learning_rate": 1.297279934454978e-06,
"loss": 0.3715,
"step": 781
},
{
"epoch": 2.3696969696969696,
"grad_norm": 0.36043912172317505,
"learning_rate": 1.2854557532776323e-06,
"loss": 0.3581,
"step": 782
},
{
"epoch": 2.3727272727272726,
"grad_norm": 0.33972594141960144,
"learning_rate": 1.2736777516212267e-06,
"loss": 0.3637,
"step": 783
},
{
"epoch": 2.375757575757576,
"grad_norm": 0.3574168384075165,
"learning_rate": 1.2619460759108521e-06,
"loss": 0.3751,
"step": 784
},
{
"epoch": 2.378787878787879,
"grad_norm": 0.35279446840286255,
"learning_rate": 1.250260871995671e-06,
"loss": 0.4013,
"step": 785
},
{
"epoch": 2.381818181818182,
"grad_norm": 0.3540102541446686,
"learning_rate": 1.238622285147103e-06,
"loss": 0.3383,
"step": 786
},
{
"epoch": 2.3848484848484848,
"grad_norm": 0.37313178181648254,
"learning_rate": 1.2270304600570193e-06,
"loss": 0.3719,
"step": 787
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.3584001958370209,
"learning_rate": 1.2154855408359507e-06,
"loss": 0.3605,
"step": 788
},
{
"epoch": 2.390909090909091,
"grad_norm": 0.36906707286834717,
"learning_rate": 1.2039876710112847e-06,
"loss": 0.3893,
"step": 789
},
{
"epoch": 2.393939393939394,
"grad_norm": 0.33588269352912903,
"learning_rate": 1.1925369935254872e-06,
"loss": 0.3639,
"step": 790
},
{
"epoch": 2.396969696969697,
"grad_norm": 0.3435981869697571,
"learning_rate": 1.1811336507343296e-06,
"loss": 0.3848,
"step": 791
},
{
"epoch": 2.4,
"grad_norm": 0.35146281123161316,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3858,
"step": 792
},
{
"epoch": 2.403030303030303,
"grad_norm": 0.35498523712158203,
"learning_rate": 1.1584695357148968e-06,
"loss": 0.3549,
"step": 793
},
{
"epoch": 2.4060606060606062,
"grad_norm": 0.34189391136169434,
"learning_rate": 1.1472090452487728e-06,
"loss": 0.3652,
"step": 794
},
{
"epoch": 2.409090909090909,
"grad_norm": 0.35392066836357117,
"learning_rate": 1.135996452998085e-06,
"loss": 0.3623,
"step": 795
},
{
"epoch": 2.412121212121212,
"grad_norm": 0.36120468378067017,
"learning_rate": 1.1248318983587052e-06,
"loss": 0.3887,
"step": 796
},
{
"epoch": 2.415151515151515,
"grad_norm": 0.334212064743042,
"learning_rate": 1.1137155201293021e-06,
"loss": 0.3756,
"step": 797
},
{
"epoch": 2.418181818181818,
"grad_norm": 0.33589741587638855,
"learning_rate": 1.1026474565096068e-06,
"loss": 0.3827,
"step": 798
},
{
"epoch": 2.4212121212121214,
"grad_norm": 0.3547380864620209,
"learning_rate": 1.0916278450986983e-06,
"loss": 0.3803,
"step": 799
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.3462902307510376,
"learning_rate": 1.0806568228932995e-06,
"loss": 0.365,
"step": 800
},
{
"epoch": 2.4272727272727272,
"grad_norm": 0.37670135498046875,
"learning_rate": 1.0697345262860638e-06,
"loss": 0.3824,
"step": 801
},
{
"epoch": 2.43030303030303,
"grad_norm": 0.35779649019241333,
"learning_rate": 1.0588610910638825e-06,
"loss": 0.3607,
"step": 802
},
{
"epoch": 2.4333333333333336,
"grad_norm": 0.35179200768470764,
"learning_rate": 1.0480366524062041e-06,
"loss": 0.3784,
"step": 803
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.35515978932380676,
"learning_rate": 1.0372613448833429e-06,
"loss": 0.3727,
"step": 804
},
{
"epoch": 2.4393939393939394,
"grad_norm": 0.34938177466392517,
"learning_rate": 1.0265353024548103e-06,
"loss": 0.3747,
"step": 805
},
{
"epoch": 2.4424242424242424,
"grad_norm": 0.3501240909099579,
"learning_rate": 1.0158586584676533e-06,
"loss": 0.3809,
"step": 806
},
{
"epoch": 2.4454545454545453,
"grad_norm": 0.35644036531448364,
"learning_rate": 1.0052315456547934e-06,
"loss": 0.3535,
"step": 807
},
{
"epoch": 2.4484848484848483,
"grad_norm": 0.3286396861076355,
"learning_rate": 9.94654096133374e-07,
"loss": 0.3669,
"step": 808
},
{
"epoch": 2.4515151515151516,
"grad_norm": 0.3664129972457886,
"learning_rate": 9.841264414031198e-07,
"loss": 0.3937,
"step": 809
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.34694886207580566,
"learning_rate": 9.73648712344707e-07,
"loss": 0.3958,
"step": 810
},
{
"epoch": 2.4575757575757575,
"grad_norm": 0.3321562111377716,
"learning_rate": 9.632210392181274e-07,
"loss": 0.3559,
"step": 811
},
{
"epoch": 2.4606060606060605,
"grad_norm": 0.34712937474250793,
"learning_rate": 9.528435516610729e-07,
"loss": 0.3665,
"step": 812
},
{
"epoch": 2.463636363636364,
"grad_norm": 0.32587698101997375,
"learning_rate": 9.425163786873292e-07,
"loss": 0.3712,
"step": 813
},
{
"epoch": 2.466666666666667,
"grad_norm": 0.3688627779483795,
"learning_rate": 9.322396486851626e-07,
"loss": 0.3859,
"step": 814
},
{
"epoch": 2.4696969696969697,
"grad_norm": 0.3505525588989258,
"learning_rate": 9.220134894157285e-07,
"loss": 0.3716,
"step": 815
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.34579452872276306,
"learning_rate": 9.118380280114858e-07,
"loss": 0.3613,
"step": 816
},
{
"epoch": 2.4757575757575756,
"grad_norm": 0.34811607003211975,
"learning_rate": 9.017133909746095e-07,
"loss": 0.3963,
"step": 817
},
{
"epoch": 2.4787878787878785,
"grad_norm": 0.32743358612060547,
"learning_rate": 8.916397041754238e-07,
"loss": 0.379,
"step": 818
},
{
"epoch": 2.481818181818182,
"grad_norm": 0.3497425615787506,
"learning_rate": 8.816170928508367e-07,
"loss": 0.3966,
"step": 819
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.3664569854736328,
"learning_rate": 8.716456816027791e-07,
"loss": 0.3804,
"step": 820
},
{
"epoch": 2.487878787878788,
"grad_norm": 0.3476223349571228,
"learning_rate": 8.617255943966579e-07,
"loss": 0.3947,
"step": 821
},
{
"epoch": 2.4909090909090907,
"grad_norm": 0.35610514879226685,
"learning_rate": 8.518569545598198e-07,
"loss": 0.377,
"step": 822
},
{
"epoch": 2.493939393939394,
"grad_norm": 0.3268238306045532,
"learning_rate": 8.420398847800093e-07,
"loss": 0.3939,
"step": 823
},
{
"epoch": 2.496969696969697,
"grad_norm": 0.3369584381580353,
"learning_rate": 8.322745071038474e-07,
"loss": 0.4004,
"step": 824
},
{
"epoch": 2.5,
"grad_norm": 0.3504047691822052,
"learning_rate": 8.225609429353187e-07,
"loss": 0.3667,
"step": 825
},
{
"epoch": 2.503030303030303,
"grad_norm": 0.36456790566444397,
"learning_rate": 8.128993130342538e-07,
"loss": 0.3786,
"step": 826
},
{
"epoch": 2.506060606060606,
"grad_norm": 0.34629517793655396,
"learning_rate": 8.032897375148324e-07,
"loss": 0.3591,
"step": 827
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.3457425832748413,
"learning_rate": 7.937323358440935e-07,
"loss": 0.4037,
"step": 828
},
{
"epoch": 2.512121212121212,
"grad_norm": 0.3337177336215973,
"learning_rate": 7.84227226840445e-07,
"loss": 0.4071,
"step": 829
},
{
"epoch": 2.515151515151515,
"grad_norm": 0.34807246923446655,
"learning_rate": 7.747745286721852e-07,
"loss": 0.3927,
"step": 830
},
{
"epoch": 2.518181818181818,
"grad_norm": 0.33292651176452637,
"learning_rate": 7.653743588560387e-07,
"loss": 0.3601,
"step": 831
},
{
"epoch": 2.5212121212121215,
"grad_norm": 0.3402540385723114,
"learning_rate": 7.560268342556948e-07,
"loss": 0.3879,
"step": 832
},
{
"epoch": 2.5242424242424244,
"grad_norm": 0.32598671317100525,
"learning_rate": 7.467320710803505e-07,
"loss": 0.377,
"step": 833
},
{
"epoch": 2.5272727272727273,
"grad_norm": 0.33640727400779724,
"learning_rate": 7.374901848832683e-07,
"loss": 0.3761,
"step": 834
},
{
"epoch": 2.5303030303030303,
"grad_norm": 0.3488267958164215,
"learning_rate": 7.283012905603437e-07,
"loss": 0.3617,
"step": 835
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.3477902412414551,
"learning_rate": 7.191655023486682e-07,
"loss": 0.3518,
"step": 836
},
{
"epoch": 2.536363636363636,
"grad_norm": 0.3296281695365906,
"learning_rate": 7.100829338251147e-07,
"loss": 0.3777,
"step": 837
},
{
"epoch": 2.5393939393939395,
"grad_norm": 0.3473811149597168,
"learning_rate": 7.010536979049277e-07,
"loss": 0.3688,
"step": 838
},
{
"epoch": 2.5424242424242425,
"grad_norm": 0.3439598083496094,
"learning_rate": 6.920779068403127e-07,
"loss": 0.379,
"step": 839
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.3552432656288147,
"learning_rate": 6.831556722190453e-07,
"loss": 0.3718,
"step": 840
},
{
"epoch": 2.5484848484848484,
"grad_norm": 0.3223789930343628,
"learning_rate": 6.74287104963085e-07,
"loss": 0.3776,
"step": 841
},
{
"epoch": 2.5515151515151517,
"grad_norm": 0.31215062737464905,
"learning_rate": 6.654723153271913e-07,
"loss": 0.394,
"step": 842
},
{
"epoch": 2.5545454545454547,
"grad_norm": 0.3523038625717163,
"learning_rate": 6.567114128975571e-07,
"loss": 0.3655,
"step": 843
},
{
"epoch": 2.5575757575757576,
"grad_norm": 0.3508360981941223,
"learning_rate": 6.480045065904461e-07,
"loss": 0.3657,
"step": 844
},
{
"epoch": 2.5606060606060606,
"grad_norm": 0.3497951030731201,
"learning_rate": 6.393517046508363e-07,
"loss": 0.3754,
"step": 845
},
{
"epoch": 2.5636363636363635,
"grad_norm": 0.3315987288951874,
"learning_rate": 6.307531146510754e-07,
"loss": 0.4066,
"step": 846
},
{
"epoch": 2.5666666666666664,
"grad_norm": 0.33407914638519287,
"learning_rate": 6.222088434895462e-07,
"loss": 0.3518,
"step": 847
},
{
"epoch": 2.56969696969697,
"grad_norm": 0.33812716603279114,
"learning_rate": 6.137189973893331e-07,
"loss": 0.3476,
"step": 848
},
{
"epoch": 2.5727272727272728,
"grad_norm": 0.3333025574684143,
"learning_rate": 6.052836818969027e-07,
"loss": 0.3766,
"step": 849
},
{
"epoch": 2.5757575757575757,
"grad_norm": 0.3389793932437897,
"learning_rate": 5.969030018807953e-07,
"loss": 0.3737,
"step": 850
},
{
"epoch": 2.5787878787878786,
"grad_norm": 0.3426154553890228,
"learning_rate": 5.885770615303182e-07,
"loss": 0.366,
"step": 851
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.3441448211669922,
"learning_rate": 5.803059643542491e-07,
"loss": 0.3675,
"step": 852
},
{
"epoch": 2.584848484848485,
"grad_norm": 0.3547995984554291,
"learning_rate": 5.720898131795494e-07,
"loss": 0.3675,
"step": 853
},
{
"epoch": 2.587878787878788,
"grad_norm": 0.32679978013038635,
"learning_rate": 5.639287101500923e-07,
"loss": 0.3864,
"step": 854
},
{
"epoch": 2.590909090909091,
"grad_norm": 0.3426445424556732,
"learning_rate": 5.558227567253832e-07,
"loss": 0.3681,
"step": 855
},
{
"epoch": 2.5939393939393938,
"grad_norm": 0.33950114250183105,
"learning_rate": 5.477720536793035e-07,
"loss": 0.3866,
"step": 856
},
{
"epoch": 2.5969696969696967,
"grad_norm": 0.35901251435279846,
"learning_rate": 5.397767010988614e-07,
"loss": 0.3779,
"step": 857
},
{
"epoch": 2.6,
"grad_norm": 0.34515222907066345,
"learning_rate": 5.318367983829393e-07,
"loss": 0.3632,
"step": 858
},
{
"epoch": 2.603030303030303,
"grad_norm": 0.34496960043907166,
"learning_rate": 5.239524442410627e-07,
"loss": 0.4095,
"step": 859
},
{
"epoch": 2.606060606060606,
"grad_norm": 0.33476129174232483,
"learning_rate": 5.16123736692175e-07,
"loss": 0.3637,
"step": 860
},
{
"epoch": 2.6090909090909093,
"grad_norm": 0.31554561853408813,
"learning_rate": 5.083507730634152e-07,
"loss": 0.3949,
"step": 861
},
{
"epoch": 2.6121212121212123,
"grad_norm": 0.3599357306957245,
"learning_rate": 5.006336499889075e-07,
"loss": 0.3639,
"step": 862
},
{
"epoch": 2.6151515151515152,
"grad_norm": 0.3472028970718384,
"learning_rate": 4.929724634085664e-07,
"loss": 0.3628,
"step": 863
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.32713690400123596,
"learning_rate": 4.853673085668947e-07,
"loss": 0.3867,
"step": 864
},
{
"epoch": 2.621212121212121,
"grad_norm": 0.3341231346130371,
"learning_rate": 4.778182800118053e-07,
"loss": 0.368,
"step": 865
},
{
"epoch": 2.624242424242424,
"grad_norm": 0.3199952244758606,
"learning_rate": 4.7032547159344466e-07,
"loss": 0.3857,
"step": 866
},
{
"epoch": 2.6272727272727274,
"grad_norm": 0.34618866443634033,
"learning_rate": 4.628889764630279e-07,
"loss": 0.3761,
"step": 867
},
{
"epoch": 2.6303030303030304,
"grad_norm": 0.3341084420681,
"learning_rate": 4.5550888707167505e-07,
"loss": 0.3584,
"step": 868
},
{
"epoch": 2.6333333333333333,
"grad_norm": 0.3550304174423218,
"learning_rate": 4.481852951692672e-07,
"loss": 0.3694,
"step": 869
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.3328497111797333,
"learning_rate": 4.4091829180330503e-07,
"loss": 0.3694,
"step": 870
},
{
"epoch": 2.6393939393939396,
"grad_norm": 0.32884612679481506,
"learning_rate": 4.33707967317773e-07,
"loss": 0.3703,
"step": 871
},
{
"epoch": 2.6424242424242426,
"grad_norm": 0.36629319190979004,
"learning_rate": 4.26554411352022e-07,
"loss": 0.3762,
"step": 872
},
{
"epoch": 2.6454545454545455,
"grad_norm": 0.3324334919452667,
"learning_rate": 4.194577128396521e-07,
"loss": 0.3664,
"step": 873
},
{
"epoch": 2.6484848484848484,
"grad_norm": 0.3406725227832794,
"learning_rate": 4.1241796000740296e-07,
"loss": 0.3995,
"step": 874
},
{
"epoch": 2.6515151515151514,
"grad_norm": 0.3479924201965332,
"learning_rate": 4.054352403740641e-07,
"loss": 0.3846,
"step": 875
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.3131670355796814,
"learning_rate": 3.985096407493838e-07,
"loss": 0.3806,
"step": 876
},
{
"epoch": 2.6575757575757577,
"grad_norm": 0.3157781958580017,
"learning_rate": 3.916412472329884e-07,
"loss": 0.3826,
"step": 877
},
{
"epoch": 2.6606060606060606,
"grad_norm": 0.3542560338973999,
"learning_rate": 3.8483014521331184e-07,
"loss": 0.3611,
"step": 878
},
{
"epoch": 2.6636363636363636,
"grad_norm": 0.34196749329566956,
"learning_rate": 3.7807641936653984e-07,
"loss": 0.3815,
"step": 879
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.3302360475063324,
"learning_rate": 3.7138015365554834e-07,
"loss": 0.3746,
"step": 880
},
{
"epoch": 2.66969696969697,
"grad_norm": 0.3370774984359741,
"learning_rate": 3.6474143132886607e-07,
"loss": 0.3691,
"step": 881
},
{
"epoch": 2.672727272727273,
"grad_norm": 0.3441545367240906,
"learning_rate": 3.581603349196372e-07,
"loss": 0.3679,
"step": 882
},
{
"epoch": 2.675757575757576,
"grad_norm": 0.3569502830505371,
"learning_rate": 3.516369462445968e-07,
"loss": 0.354,
"step": 883
},
{
"epoch": 2.6787878787878787,
"grad_norm": 0.34038347005844116,
"learning_rate": 3.4517134640305097e-07,
"loss": 0.4109,
"step": 884
},
{
"epoch": 2.6818181818181817,
"grad_norm": 0.325907438993454,
"learning_rate": 3.3876361577587115e-07,
"loss": 0.3982,
"step": 885
},
{
"epoch": 2.6848484848484846,
"grad_norm": 0.32801732420921326,
"learning_rate": 3.324138340244948e-07,
"loss": 0.3947,
"step": 886
},
{
"epoch": 2.687878787878788,
"grad_norm": 0.3471670150756836,
"learning_rate": 3.261220800899323e-07,
"loss": 0.3918,
"step": 887
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.34977656602859497,
"learning_rate": 3.1988843219178776e-07,
"loss": 0.3774,
"step": 888
},
{
"epoch": 2.693939393939394,
"grad_norm": 0.3262813985347748,
"learning_rate": 3.1371296782728875e-07,
"loss": 0.3744,
"step": 889
},
{
"epoch": 2.6969696969696972,
"grad_norm": 0.33313867449760437,
"learning_rate": 3.0759576377031697e-07,
"loss": 0.3755,
"step": 890
},
{
"epoch": 2.7,
"grad_norm": 0.3310135006904602,
"learning_rate": 3.015368960704584e-07,
"loss": 0.388,
"step": 891
},
{
"epoch": 2.703030303030303,
"grad_norm": 0.342500776052475,
"learning_rate": 2.955364400520583e-07,
"loss": 0.3875,
"step": 892
},
{
"epoch": 2.706060606060606,
"grad_norm": 0.3611791133880615,
"learning_rate": 2.8959447031327916e-07,
"loss": 0.3618,
"step": 893
},
{
"epoch": 2.709090909090909,
"grad_norm": 0.32603222131729126,
"learning_rate": 2.8371106072518194e-07,
"loss": 0.3921,
"step": 894
},
{
"epoch": 2.712121212121212,
"grad_norm": 0.3323443531990051,
"learning_rate": 2.7788628443080003e-07,
"loss": 0.3544,
"step": 895
},
{
"epoch": 2.7151515151515153,
"grad_norm": 0.31575021147727966,
"learning_rate": 2.7212021384423415e-07,
"loss": 0.363,
"step": 896
},
{
"epoch": 2.7181818181818183,
"grad_norm": 0.34948068857192993,
"learning_rate": 2.664129206497479e-07,
"loss": 0.3999,
"step": 897
},
{
"epoch": 2.721212121212121,
"grad_norm": 0.3439748287200928,
"learning_rate": 2.6076447580088426e-07,
"loss": 0.3744,
"step": 898
},
{
"epoch": 2.724242424242424,
"grad_norm": 0.33368557691574097,
"learning_rate": 2.5517494951957544e-07,
"loss": 0.3768,
"step": 899
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.33264872431755066,
"learning_rate": 2.4964441129527337e-07,
"loss": 0.3942,
"step": 900
},
{
"epoch": 2.7303030303030305,
"grad_norm": 0.36262091994285583,
"learning_rate": 2.441729298840861e-07,
"loss": 0.381,
"step": 901
},
{
"epoch": 2.7333333333333334,
"grad_norm": 0.3555268943309784,
"learning_rate": 2.3876057330792344e-07,
"loss": 0.3638,
"step": 902
},
{
"epoch": 2.7363636363636363,
"grad_norm": 0.33674266934394836,
"learning_rate": 2.3340740885364922e-07,
"loss": 0.3484,
"step": 903
},
{
"epoch": 2.7393939393939393,
"grad_norm": 0.34257423877716064,
"learning_rate": 2.2811350307224534e-07,
"loss": 0.3826,
"step": 904
},
{
"epoch": 2.742424242424242,
"grad_norm": 0.3373337984085083,
"learning_rate": 2.2287892177798642e-07,
"loss": 0.364,
"step": 905
},
{
"epoch": 2.7454545454545456,
"grad_norm": 0.3389836847782135,
"learning_rate": 2.1770373004762035e-07,
"loss": 0.3689,
"step": 906
},
{
"epoch": 2.7484848484848485,
"grad_norm": 0.32391688227653503,
"learning_rate": 2.1258799221955618e-07,
"loss": 0.3962,
"step": 907
},
{
"epoch": 2.7515151515151515,
"grad_norm": 0.3334949314594269,
"learning_rate": 2.0753177189307138e-07,
"loss": 0.3799,
"step": 908
},
{
"epoch": 2.7545454545454544,
"grad_norm": 0.3442675769329071,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.3931,
"step": 909
},
{
"epoch": 2.757575757575758,
"grad_norm": 0.3674370348453522,
"learning_rate": 1.9759813444152342e-07,
"loss": 0.3712,
"step": 910
},
{
"epoch": 2.7606060606060607,
"grad_norm": 0.33962857723236084,
"learning_rate": 1.9272084081226272e-07,
"loss": 0.3621,
"step": 911
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.31753936409950256,
"learning_rate": 1.8790331167464758e-07,
"loss": 0.383,
"step": 912
},
{
"epoch": 2.7666666666666666,
"grad_norm": 0.33068257570266724,
"learning_rate": 1.8314560692059836e-07,
"loss": 0.3827,
"step": 913
},
{
"epoch": 2.7696969696969695,
"grad_norm": 0.32340729236602783,
"learning_rate": 1.7844778569829412e-07,
"loss": 0.3736,
"step": 914
},
{
"epoch": 2.7727272727272725,
"grad_norm": 0.3291246294975281,
"learning_rate": 1.738099064114368e-07,
"loss": 0.3689,
"step": 915
},
{
"epoch": 2.775757575757576,
"grad_norm": 0.32609397172927856,
"learning_rate": 1.6923202671852379e-07,
"loss": 0.4106,
"step": 916
},
{
"epoch": 2.778787878787879,
"grad_norm": 0.3484046459197998,
"learning_rate": 1.6471420353213362e-07,
"loss": 0.3779,
"step": 917
},
{
"epoch": 2.7818181818181817,
"grad_norm": 0.32795125246047974,
"learning_rate": 1.6025649301821877e-07,
"loss": 0.3986,
"step": 918
},
{
"epoch": 2.7848484848484847,
"grad_norm": 0.3558945655822754,
"learning_rate": 1.5585895059540336e-07,
"loss": 0.3638,
"step": 919
},
{
"epoch": 2.787878787878788,
"grad_norm": 0.32118871808052063,
"learning_rate": 1.5152163093429762e-07,
"loss": 0.3879,
"step": 920
},
{
"epoch": 2.790909090909091,
"grad_norm": 0.34035980701446533,
"learning_rate": 1.4724458795681962e-07,
"loss": 0.3742,
"step": 921
},
{
"epoch": 2.793939393939394,
"grad_norm": 0.3438761234283447,
"learning_rate": 1.4302787483551962e-07,
"loss": 0.381,
"step": 922
},
{
"epoch": 2.796969696969697,
"grad_norm": 0.3301991820335388,
"learning_rate": 1.388715439929239e-07,
"loss": 0.3468,
"step": 923
},
{
"epoch": 2.8,
"grad_norm": 0.36519476771354675,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.3593,
"step": 924
},
{
"epoch": 2.8030303030303028,
"grad_norm": 0.3334772288799286,
"learning_rate": 1.3074023507991917e-07,
"loss": 0.3835,
"step": 925
},
{
"epoch": 2.806060606060606,
"grad_norm": 0.3869641125202179,
"learning_rate": 1.267653580986139e-07,
"loss": 0.3666,
"step": 926
},
{
"epoch": 2.809090909090909,
"grad_norm": 0.3358801007270813,
"learning_rate": 1.2285106557296479e-07,
"loss": 0.3873,
"step": 927
},
{
"epoch": 2.812121212121212,
"grad_norm": 0.32431620359420776,
"learning_rate": 1.1899740616578004e-07,
"loss": 0.3619,
"step": 928
},
{
"epoch": 2.8151515151515154,
"grad_norm": 0.34791114926338196,
"learning_rate": 1.1520442778607032e-07,
"loss": 0.3643,
"step": 929
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.33882835507392883,
"learning_rate": 1.1147217758845752e-07,
"loss": 0.3669,
"step": 930
},
{
"epoch": 2.8212121212121213,
"grad_norm": 0.35345929861068726,
"learning_rate": 1.0780070197258408e-07,
"loss": 0.3998,
"step": 931
},
{
"epoch": 2.824242424242424,
"grad_norm": 0.4318199157714844,
"learning_rate": 1.0419004658253795e-07,
"loss": 0.3586,
"step": 932
},
{
"epoch": 2.827272727272727,
"grad_norm": 0.33187335729599,
"learning_rate": 1.0064025630628583e-07,
"loss": 0.3688,
"step": 933
},
{
"epoch": 2.83030303030303,
"grad_norm": 0.33873116970062256,
"learning_rate": 9.715137527511298e-08,
"loss": 0.3752,
"step": 934
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.3571045994758606,
"learning_rate": 9.372344686307655e-08,
"loss": 0.3614,
"step": 935
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.3408893346786499,
"learning_rate": 9.035651368646647e-08,
"loss": 0.3765,
"step": 936
},
{
"epoch": 2.8393939393939394,
"grad_norm": 0.35720014572143555,
"learning_rate": 8.705061760327372e-08,
"loss": 0.354,
"step": 937
},
{
"epoch": 2.8424242424242423,
"grad_norm": 0.33762773871421814,
"learning_rate": 8.380579971267178e-08,
"loss": 0.3729,
"step": 938
},
{
"epoch": 2.8454545454545457,
"grad_norm": 0.3154089152812958,
"learning_rate": 8.06221003545038e-08,
"loss": 0.3574,
"step": 939
},
{
"epoch": 2.8484848484848486,
"grad_norm": 0.3321691155433655,
"learning_rate": 7.749955910878459e-08,
"loss": 0.3676,
"step": 940
},
{
"epoch": 2.8515151515151516,
"grad_norm": 0.3242235779762268,
"learning_rate": 7.443821479520441e-08,
"loss": 0.3932,
"step": 941
},
{
"epoch": 2.8545454545454545,
"grad_norm": 0.3255268931388855,
"learning_rate": 7.143810547264762e-08,
"loss": 0.3812,
"step": 942
},
{
"epoch": 2.8575757575757574,
"grad_norm": 0.3191761076450348,
"learning_rate": 6.849926843872257e-08,
"loss": 0.3922,
"step": 943
},
{
"epoch": 2.8606060606060604,
"grad_norm": 0.34105202555656433,
"learning_rate": 6.562174022929358e-08,
"loss": 0.3734,
"step": 944
},
{
"epoch": 2.8636363636363638,
"grad_norm": 0.3678226172924042,
"learning_rate": 6.280555661802857e-08,
"loss": 0.3707,
"step": 945
},
{
"epoch": 2.8666666666666667,
"grad_norm": 0.34355148673057556,
"learning_rate": 6.005075261595495e-08,
"loss": 0.3835,
"step": 946
},
{
"epoch": 2.8696969696969696,
"grad_norm": 0.3354954123497009,
"learning_rate": 5.735736247102497e-08,
"loss": 0.3691,
"step": 947
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.3545462489128113,
"learning_rate": 5.472541966768552e-08,
"loss": 0.3765,
"step": 948
},
{
"epoch": 2.875757575757576,
"grad_norm": 0.30889591574668884,
"learning_rate": 5.215495692646788e-08,
"loss": 0.3946,
"step": 949
},
{
"epoch": 2.878787878787879,
"grad_norm": 0.32154256105422974,
"learning_rate": 4.9646006203577515e-08,
"loss": 0.3881,
"step": 950
},
{
"epoch": 2.881818181818182,
"grad_norm": 0.33414721488952637,
"learning_rate": 4.719859869049659e-08,
"loss": 0.3714,
"step": 951
},
{
"epoch": 2.8848484848484848,
"grad_norm": 0.34182053804397583,
"learning_rate": 4.481276481359764e-08,
"loss": 0.3876,
"step": 952
},
{
"epoch": 2.8878787878787877,
"grad_norm": 0.34174323081970215,
"learning_rate": 4.2488534233764425e-08,
"loss": 0.3727,
"step": 953
},
{
"epoch": 2.8909090909090907,
"grad_norm": 0.3377932012081146,
"learning_rate": 4.02259358460233e-08,
"loss": 0.352,
"step": 954
},
{
"epoch": 2.893939393939394,
"grad_norm": 0.353381484746933,
"learning_rate": 3.8024997779185225e-08,
"loss": 0.375,
"step": 955
},
{
"epoch": 2.896969696969697,
"grad_norm": 0.32608339190483093,
"learning_rate": 3.588574739549322e-08,
"loss": 0.3759,
"step": 956
},
{
"epoch": 2.9,
"grad_norm": 0.3262963891029358,
"learning_rate": 3.3808211290284886e-08,
"loss": 0.3906,
"step": 957
},
{
"epoch": 2.9030303030303033,
"grad_norm": 0.33256980776786804,
"learning_rate": 3.179241529166099e-08,
"loss": 0.3642,
"step": 958
},
{
"epoch": 2.9060606060606062,
"grad_norm": 0.32789772748947144,
"learning_rate": 2.983838446016407e-08,
"loss": 0.3708,
"step": 959
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.34206628799438477,
"learning_rate": 2.7946143088466437e-08,
"loss": 0.3714,
"step": 960
},
{
"epoch": 2.912121212121212,
"grad_norm": 0.3323133885860443,
"learning_rate": 2.6115714701069327e-08,
"loss": 0.3664,
"step": 961
},
{
"epoch": 2.915151515151515,
"grad_norm": 0.3356127440929413,
"learning_rate": 2.434712205400924e-08,
"loss": 0.3686,
"step": 962
},
{
"epoch": 2.918181818181818,
"grad_norm": 0.32426542043685913,
"learning_rate": 2.264038713457706e-08,
"loss": 0.3526,
"step": 963
},
{
"epoch": 2.9212121212121214,
"grad_norm": 0.3274119198322296,
"learning_rate": 2.0995531161041028e-08,
"loss": 0.3462,
"step": 964
},
{
"epoch": 2.9242424242424243,
"grad_norm": 0.33501943945884705,
"learning_rate": 1.94125745823881e-08,
"loss": 0.3877,
"step": 965
},
{
"epoch": 2.9272727272727272,
"grad_norm": 0.3432499170303345,
"learning_rate": 1.789153707806357e-08,
"loss": 0.4045,
"step": 966
},
{
"epoch": 2.93030303030303,
"grad_norm": 0.3250257074832916,
"learning_rate": 1.6432437557732383e-08,
"loss": 0.3964,
"step": 967
},
{
"epoch": 2.9333333333333336,
"grad_norm": 0.34239816665649414,
"learning_rate": 1.5035294161039882e-08,
"loss": 0.3536,
"step": 968
},
{
"epoch": 2.9363636363636365,
"grad_norm": 0.32236921787261963,
"learning_rate": 1.3700124257388092e-08,
"loss": 0.4131,
"step": 969
},
{
"epoch": 2.9393939393939394,
"grad_norm": 0.3150605857372284,
"learning_rate": 1.2426944445719791e-08,
"loss": 0.3854,
"step": 970
},
{
"epoch": 2.9424242424242424,
"grad_norm": 0.33636316657066345,
"learning_rate": 1.1215770554312e-08,
"loss": 0.356,
"step": 971
},
{
"epoch": 2.9454545454545453,
"grad_norm": 0.31633424758911133,
"learning_rate": 1.006661764057837e-08,
"loss": 0.3695,
"step": 972
},
{
"epoch": 2.9484848484848483,
"grad_norm": 0.3274213671684265,
"learning_rate": 8.979499990882102e-09,
"loss": 0.3941,
"step": 973
},
{
"epoch": 2.9515151515151516,
"grad_norm": 0.3197808265686035,
"learning_rate": 7.954431120359985e-09,
"loss": 0.3808,
"step": 974
},
{
"epoch": 2.9545454545454546,
"grad_norm": 0.3330005705356598,
"learning_rate": 6.991423772753636e-09,
"loss": 0.3913,
"step": 975
},
{
"epoch": 2.9575757575757575,
"grad_norm": 0.3442140817642212,
"learning_rate": 6.090489920249076e-09,
"loss": 0.3846,
"step": 976
},
{
"epoch": 2.9606060606060605,
"grad_norm": 0.3263948857784271,
"learning_rate": 5.2516407633312895e-09,
"loss": 0.3581,
"step": 977
},
{
"epoch": 2.963636363636364,
"grad_norm": 0.31933942437171936,
"learning_rate": 4.474886730641004e-09,
"loss": 0.3722,
"step": 978
},
{
"epoch": 2.966666666666667,
"grad_norm": 0.316170334815979,
"learning_rate": 3.760237478849793e-09,
"loss": 0.363,
"step": 979
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.3238048553466797,
"learning_rate": 3.1077018925351753e-09,
"loss": 0.3703,
"step": 980
},
{
"epoch": 2.9727272727272727,
"grad_norm": 0.3422054350376129,
"learning_rate": 2.5172880840745873e-09,
"loss": 0.3714,
"step": 981
},
{
"epoch": 2.9757575757575756,
"grad_norm": 0.2909259796142578,
"learning_rate": 1.989003393539912e-09,
"loss": 0.4171,
"step": 982
},
{
"epoch": 2.9787878787878785,
"grad_norm": 0.32509645819664,
"learning_rate": 1.5228543886114389e-09,
"loss": 0.37,
"step": 983
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.33289819955825806,
"learning_rate": 1.118846864490708e-09,
"loss": 0.3711,
"step": 984
},
{
"epoch": 2.984848484848485,
"grad_norm": 0.32753217220306396,
"learning_rate": 7.769858438338995e-10,
"loss": 0.3621,
"step": 985
},
{
"epoch": 2.987878787878788,
"grad_norm": 0.3397236764431,
"learning_rate": 4.972755766846637e-10,
"loss": 0.364,
"step": 986
},
{
"epoch": 2.990909090909091,
"grad_norm": 0.3434215188026428,
"learning_rate": 2.797195404247166e-10,
"loss": 0.358,
"step": 987
},
{
"epoch": 2.993939393939394,
"grad_norm": 0.33324527740478516,
"learning_rate": 1.2432043972832042e-10,
"loss": 0.3845,
"step": 988
},
{
"epoch": 2.996969696969697,
"grad_norm": 0.32625526189804077,
"learning_rate": 3.108020653008748e-11,
"loss": 0.3751,
"step": 989
},
{
"epoch": 3.0,
"grad_norm": 0.3243604302406311,
"learning_rate": 0.0,
"loss": 0.3597,
"step": 990
},
{
"epoch": 3.0,
"step": 990,
"total_flos": 1.437255561186902e+18,
"train_loss": 0.0,
"train_runtime": 6.6052,
"train_samples_per_second": 14386.761,
"train_steps_per_second": 149.881
}
],
"logging_steps": 1,
"max_steps": 990,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.437255561186902e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}