flyingbugs's picture
Model save
1607104 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1992,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015060240963855422,
"grad_norm": 39.57606131842853,
"learning_rate": 2.5000000000000004e-07,
"loss": 11.9627,
"step": 1
},
{
"epoch": 0.0030120481927710845,
"grad_norm": 38.57745223428044,
"learning_rate": 5.000000000000001e-07,
"loss": 11.889,
"step": 2
},
{
"epoch": 0.004518072289156626,
"grad_norm": 43.73792985337445,
"learning_rate": 7.5e-07,
"loss": 11.7109,
"step": 3
},
{
"epoch": 0.006024096385542169,
"grad_norm": 39.29710621094019,
"learning_rate": 1.0000000000000002e-06,
"loss": 12.0491,
"step": 4
},
{
"epoch": 0.007530120481927711,
"grad_norm": 40.79593949570384,
"learning_rate": 1.25e-06,
"loss": 11.7384,
"step": 5
},
{
"epoch": 0.009036144578313253,
"grad_norm": 40.60300822379552,
"learning_rate": 1.5e-06,
"loss": 11.815,
"step": 6
},
{
"epoch": 0.010542168674698794,
"grad_norm": 37.62161247172016,
"learning_rate": 1.7500000000000002e-06,
"loss": 12.0194,
"step": 7
},
{
"epoch": 0.012048192771084338,
"grad_norm": 40.66649147908786,
"learning_rate": 2.0000000000000003e-06,
"loss": 11.7432,
"step": 8
},
{
"epoch": 0.01355421686746988,
"grad_norm": 42.1011872606964,
"learning_rate": 2.25e-06,
"loss": 11.6626,
"step": 9
},
{
"epoch": 0.015060240963855422,
"grad_norm": 48.896184208388576,
"learning_rate": 2.5e-06,
"loss": 10.8882,
"step": 10
},
{
"epoch": 0.016566265060240965,
"grad_norm": 48.215525906713054,
"learning_rate": 2.7500000000000004e-06,
"loss": 10.7452,
"step": 11
},
{
"epoch": 0.018072289156626505,
"grad_norm": 51.51217061983061,
"learning_rate": 3e-06,
"loss": 10.6458,
"step": 12
},
{
"epoch": 0.01957831325301205,
"grad_norm": 70.47183758440084,
"learning_rate": 3.2500000000000002e-06,
"loss": 7.2207,
"step": 13
},
{
"epoch": 0.02108433734939759,
"grad_norm": 71.09316703950228,
"learning_rate": 3.5000000000000004e-06,
"loss": 6.9514,
"step": 14
},
{
"epoch": 0.022590361445783132,
"grad_norm": 63.55566519192783,
"learning_rate": 3.75e-06,
"loss": 6.1593,
"step": 15
},
{
"epoch": 0.024096385542168676,
"grad_norm": 58.953339529342415,
"learning_rate": 4.000000000000001e-06,
"loss": 5.9159,
"step": 16
},
{
"epoch": 0.025602409638554216,
"grad_norm": 26.676160229250186,
"learning_rate": 4.250000000000001e-06,
"loss": 3.7025,
"step": 17
},
{
"epoch": 0.02710843373493976,
"grad_norm": 10.23672437423773,
"learning_rate": 4.5e-06,
"loss": 2.7642,
"step": 18
},
{
"epoch": 0.0286144578313253,
"grad_norm": 8.131301558409636,
"learning_rate": 4.75e-06,
"loss": 2.6027,
"step": 19
},
{
"epoch": 0.030120481927710843,
"grad_norm": 6.680310775221112,
"learning_rate": 5e-06,
"loss": 2.4538,
"step": 20
},
{
"epoch": 0.03162650602409638,
"grad_norm": 5.737976638517552,
"learning_rate": 5.25e-06,
"loss": 2.6364,
"step": 21
},
{
"epoch": 0.03313253012048193,
"grad_norm": 5.062025622839122,
"learning_rate": 5.500000000000001e-06,
"loss": 2.2649,
"step": 22
},
{
"epoch": 0.03463855421686747,
"grad_norm": 11.061052422183486,
"learning_rate": 5.750000000000001e-06,
"loss": 2.2775,
"step": 23
},
{
"epoch": 0.03614457831325301,
"grad_norm": 7.455210356356891,
"learning_rate": 6e-06,
"loss": 1.8131,
"step": 24
},
{
"epoch": 0.03765060240963856,
"grad_norm": 12.643017361462327,
"learning_rate": 6.25e-06,
"loss": 2.0669,
"step": 25
},
{
"epoch": 0.0391566265060241,
"grad_norm": 3.62360846454253,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.0419,
"step": 26
},
{
"epoch": 0.04066265060240964,
"grad_norm": 2.726262580179469,
"learning_rate": 6.750000000000001e-06,
"loss": 1.9748,
"step": 27
},
{
"epoch": 0.04216867469879518,
"grad_norm": 2.0669438402549423,
"learning_rate": 7.000000000000001e-06,
"loss": 2.097,
"step": 28
},
{
"epoch": 0.043674698795180725,
"grad_norm": 1.5755084948901494,
"learning_rate": 7.25e-06,
"loss": 1.8668,
"step": 29
},
{
"epoch": 0.045180722891566265,
"grad_norm": 1.6544362972997315,
"learning_rate": 7.5e-06,
"loss": 1.817,
"step": 30
},
{
"epoch": 0.046686746987951805,
"grad_norm": 1.5612278534770148,
"learning_rate": 7.75e-06,
"loss": 1.7516,
"step": 31
},
{
"epoch": 0.04819277108433735,
"grad_norm": 1.5594338826745717,
"learning_rate": 8.000000000000001e-06,
"loss": 1.8749,
"step": 32
},
{
"epoch": 0.04969879518072289,
"grad_norm": 1.1045308165060217,
"learning_rate": 8.25e-06,
"loss": 1.7344,
"step": 33
},
{
"epoch": 0.05120481927710843,
"grad_norm": 0.9038318617781625,
"learning_rate": 8.500000000000002e-06,
"loss": 1.5401,
"step": 34
},
{
"epoch": 0.05271084337349398,
"grad_norm": 0.8886116091710398,
"learning_rate": 8.75e-06,
"loss": 1.487,
"step": 35
},
{
"epoch": 0.05421686746987952,
"grad_norm": 1.125340855973849,
"learning_rate": 9e-06,
"loss": 1.7029,
"step": 36
},
{
"epoch": 0.05572289156626506,
"grad_norm": 1.0943609976235324,
"learning_rate": 9.25e-06,
"loss": 1.8399,
"step": 37
},
{
"epoch": 0.0572289156626506,
"grad_norm": 0.7602105483509752,
"learning_rate": 9.5e-06,
"loss": 1.5791,
"step": 38
},
{
"epoch": 0.058734939759036146,
"grad_norm": 0.8216337616107,
"learning_rate": 9.750000000000002e-06,
"loss": 1.425,
"step": 39
},
{
"epoch": 0.060240963855421686,
"grad_norm": 1.0203425484269777,
"learning_rate": 1e-05,
"loss": 1.7001,
"step": 40
},
{
"epoch": 0.061746987951807226,
"grad_norm": 0.7518667296811056,
"learning_rate": 1.025e-05,
"loss": 1.6928,
"step": 41
},
{
"epoch": 0.06325301204819277,
"grad_norm": 0.6184057626397994,
"learning_rate": 1.05e-05,
"loss": 1.6731,
"step": 42
},
{
"epoch": 0.06475903614457831,
"grad_norm": 1.085261961241021,
"learning_rate": 1.075e-05,
"loss": 1.6487,
"step": 43
},
{
"epoch": 0.06626506024096386,
"grad_norm": 0.761278335512061,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.4524,
"step": 44
},
{
"epoch": 0.0677710843373494,
"grad_norm": 0.5698924234269683,
"learning_rate": 1.125e-05,
"loss": 1.4021,
"step": 45
},
{
"epoch": 0.06927710843373494,
"grad_norm": 0.603495938887024,
"learning_rate": 1.1500000000000002e-05,
"loss": 1.4194,
"step": 46
},
{
"epoch": 0.07078313253012049,
"grad_norm": 0.6353052036926367,
"learning_rate": 1.175e-05,
"loss": 1.262,
"step": 47
},
{
"epoch": 0.07228915662650602,
"grad_norm": 0.5235664912789643,
"learning_rate": 1.2e-05,
"loss": 1.5307,
"step": 48
},
{
"epoch": 0.07379518072289157,
"grad_norm": 0.5307550251947036,
"learning_rate": 1.225e-05,
"loss": 1.552,
"step": 49
},
{
"epoch": 0.07530120481927711,
"grad_norm": 0.5913304069868457,
"learning_rate": 1.25e-05,
"loss": 1.388,
"step": 50
},
{
"epoch": 0.07680722891566265,
"grad_norm": 0.5185830792589633,
"learning_rate": 1.2750000000000002e-05,
"loss": 1.5412,
"step": 51
},
{
"epoch": 0.0783132530120482,
"grad_norm": 0.6589642094176167,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.4116,
"step": 52
},
{
"epoch": 0.07981927710843373,
"grad_norm": 0.5155504378659284,
"learning_rate": 1.3250000000000002e-05,
"loss": 1.3184,
"step": 53
},
{
"epoch": 0.08132530120481928,
"grad_norm": 0.5925421613149038,
"learning_rate": 1.3500000000000001e-05,
"loss": 1.4131,
"step": 54
},
{
"epoch": 0.08283132530120482,
"grad_norm": 0.6866152502871953,
"learning_rate": 1.3750000000000002e-05,
"loss": 1.4009,
"step": 55
},
{
"epoch": 0.08433734939759036,
"grad_norm": 0.4876547398731001,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.4006,
"step": 56
},
{
"epoch": 0.0858433734939759,
"grad_norm": 0.560086124656963,
"learning_rate": 1.4249999999999999e-05,
"loss": 1.6387,
"step": 57
},
{
"epoch": 0.08734939759036145,
"grad_norm": 0.7834357280155554,
"learning_rate": 1.45e-05,
"loss": 1.3298,
"step": 58
},
{
"epoch": 0.08885542168674698,
"grad_norm": 0.5561096772474583,
"learning_rate": 1.475e-05,
"loss": 1.3909,
"step": 59
},
{
"epoch": 0.09036144578313253,
"grad_norm": 0.4993183864941936,
"learning_rate": 1.5e-05,
"loss": 1.422,
"step": 60
},
{
"epoch": 0.09186746987951808,
"grad_norm": 0.5864527696714835,
"learning_rate": 1.525e-05,
"loss": 1.3084,
"step": 61
},
{
"epoch": 0.09337349397590361,
"grad_norm": 0.46989435754740533,
"learning_rate": 1.55e-05,
"loss": 1.5295,
"step": 62
},
{
"epoch": 0.09487951807228916,
"grad_norm": 0.4636151318017185,
"learning_rate": 1.575e-05,
"loss": 1.2964,
"step": 63
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.49657129941674627,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3435,
"step": 64
},
{
"epoch": 0.09789156626506024,
"grad_norm": 0.5137608549362312,
"learning_rate": 1.6250000000000002e-05,
"loss": 1.5097,
"step": 65
},
{
"epoch": 0.09939759036144578,
"grad_norm": 0.5331450997022084,
"learning_rate": 1.65e-05,
"loss": 1.3683,
"step": 66
},
{
"epoch": 0.10090361445783133,
"grad_norm": 1.1738837732867826,
"learning_rate": 1.675e-05,
"loss": 1.4534,
"step": 67
},
{
"epoch": 0.10240963855421686,
"grad_norm": 1.4818755662288532,
"learning_rate": 1.7000000000000003e-05,
"loss": 1.3104,
"step": 68
},
{
"epoch": 0.10391566265060241,
"grad_norm": 0.46998351441307423,
"learning_rate": 1.725e-05,
"loss": 1.3173,
"step": 69
},
{
"epoch": 0.10542168674698796,
"grad_norm": 0.6300461082248988,
"learning_rate": 1.75e-05,
"loss": 1.6391,
"step": 70
},
{
"epoch": 0.10692771084337349,
"grad_norm": 0.4538484820968132,
"learning_rate": 1.775e-05,
"loss": 1.2369,
"step": 71
},
{
"epoch": 0.10843373493975904,
"grad_norm": 0.5693123792497679,
"learning_rate": 1.8e-05,
"loss": 1.3977,
"step": 72
},
{
"epoch": 0.10993975903614457,
"grad_norm": 0.4267785641823739,
"learning_rate": 1.825e-05,
"loss": 1.2441,
"step": 73
},
{
"epoch": 0.11144578313253012,
"grad_norm": 0.45372952789848425,
"learning_rate": 1.85e-05,
"loss": 1.4498,
"step": 74
},
{
"epoch": 0.11295180722891567,
"grad_norm": 0.5964933709321214,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.3772,
"step": 75
},
{
"epoch": 0.1144578313253012,
"grad_norm": 0.44113701176750414,
"learning_rate": 1.9e-05,
"loss": 1.3311,
"step": 76
},
{
"epoch": 0.11596385542168675,
"grad_norm": 0.5424749212034263,
"learning_rate": 1.925e-05,
"loss": 1.3406,
"step": 77
},
{
"epoch": 0.11746987951807229,
"grad_norm": 0.5602026883197359,
"learning_rate": 1.9500000000000003e-05,
"loss": 1.3433,
"step": 78
},
{
"epoch": 0.11897590361445783,
"grad_norm": 0.5161550636965766,
"learning_rate": 1.9750000000000002e-05,
"loss": 1.2833,
"step": 79
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.5494278834050665,
"learning_rate": 2e-05,
"loss": 1.3428,
"step": 80
},
{
"epoch": 0.12198795180722892,
"grad_norm": 0.4749743434369092,
"learning_rate": 2.025e-05,
"loss": 1.361,
"step": 81
},
{
"epoch": 0.12349397590361445,
"grad_norm": 0.5152746225616003,
"learning_rate": 2.05e-05,
"loss": 1.3434,
"step": 82
},
{
"epoch": 0.125,
"grad_norm": 0.49769467362448544,
"learning_rate": 2.075e-05,
"loss": 1.3549,
"step": 83
},
{
"epoch": 0.12650602409638553,
"grad_norm": 0.42586193224308666,
"learning_rate": 2.1e-05,
"loss": 1.2761,
"step": 84
},
{
"epoch": 0.1280120481927711,
"grad_norm": 0.43049747901533697,
"learning_rate": 2.125e-05,
"loss": 1.1048,
"step": 85
},
{
"epoch": 0.12951807228915663,
"grad_norm": 0.4300040045304476,
"learning_rate": 2.15e-05,
"loss": 1.3222,
"step": 86
},
{
"epoch": 0.13102409638554216,
"grad_norm": 0.4539707985400512,
"learning_rate": 2.175e-05,
"loss": 1.2956,
"step": 87
},
{
"epoch": 0.13253012048192772,
"grad_norm": 0.5113659970146665,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.1825,
"step": 88
},
{
"epoch": 0.13403614457831325,
"grad_norm": 0.4696855119335388,
"learning_rate": 2.2250000000000002e-05,
"loss": 1.33,
"step": 89
},
{
"epoch": 0.1355421686746988,
"grad_norm": 0.48740020480971413,
"learning_rate": 2.25e-05,
"loss": 1.3285,
"step": 90
},
{
"epoch": 0.13704819277108435,
"grad_norm": 0.6120882011653199,
"learning_rate": 2.275e-05,
"loss": 1.2955,
"step": 91
},
{
"epoch": 0.13855421686746988,
"grad_norm": 0.626070737084123,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.2564,
"step": 92
},
{
"epoch": 0.14006024096385541,
"grad_norm": 0.5115876117219039,
"learning_rate": 2.3250000000000003e-05,
"loss": 1.1585,
"step": 93
},
{
"epoch": 0.14156626506024098,
"grad_norm": 0.5312439490389347,
"learning_rate": 2.35e-05,
"loss": 1.199,
"step": 94
},
{
"epoch": 0.1430722891566265,
"grad_norm": 0.5178218308751487,
"learning_rate": 2.375e-05,
"loss": 1.2843,
"step": 95
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.5638561495484555,
"learning_rate": 2.4e-05,
"loss": 1.3353,
"step": 96
},
{
"epoch": 0.1460843373493976,
"grad_norm": 0.47928329189503854,
"learning_rate": 2.425e-05,
"loss": 1.212,
"step": 97
},
{
"epoch": 0.14759036144578314,
"grad_norm": 0.636746255511944,
"learning_rate": 2.45e-05,
"loss": 1.1688,
"step": 98
},
{
"epoch": 0.14909638554216867,
"grad_norm": 0.47863944376154033,
"learning_rate": 2.4750000000000002e-05,
"loss": 1.1455,
"step": 99
},
{
"epoch": 0.15060240963855423,
"grad_norm": 0.48856286767729595,
"learning_rate": 2.5e-05,
"loss": 1.2307,
"step": 100
},
{
"epoch": 0.15210843373493976,
"grad_norm": 0.5041475886641232,
"learning_rate": 2.525e-05,
"loss": 1.2202,
"step": 101
},
{
"epoch": 0.1536144578313253,
"grad_norm": 0.5009317818018456,
"learning_rate": 2.5500000000000003e-05,
"loss": 1.2094,
"step": 102
},
{
"epoch": 0.15512048192771086,
"grad_norm": 0.5400496694807283,
"learning_rate": 2.5750000000000002e-05,
"loss": 1.1183,
"step": 103
},
{
"epoch": 0.1566265060240964,
"grad_norm": 0.7255362096563035,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.2641,
"step": 104
},
{
"epoch": 0.15813253012048192,
"grad_norm": 0.6139535115904823,
"learning_rate": 2.625e-05,
"loss": 1.1794,
"step": 105
},
{
"epoch": 0.15963855421686746,
"grad_norm": 0.5593844805675523,
"learning_rate": 2.6500000000000004e-05,
"loss": 1.2002,
"step": 106
},
{
"epoch": 0.16114457831325302,
"grad_norm": 0.6011044609093333,
"learning_rate": 2.6750000000000003e-05,
"loss": 1.2662,
"step": 107
},
{
"epoch": 0.16265060240963855,
"grad_norm": 0.5616663132349249,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.2525,
"step": 108
},
{
"epoch": 0.16415662650602408,
"grad_norm": 0.7481729798627802,
"learning_rate": 2.725e-05,
"loss": 1.3143,
"step": 109
},
{
"epoch": 0.16566265060240964,
"grad_norm": 0.618561988141154,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.1892,
"step": 110
},
{
"epoch": 0.16716867469879518,
"grad_norm": 0.6350493291213389,
"learning_rate": 2.7750000000000004e-05,
"loss": 1.2683,
"step": 111
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.6182365948984953,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.0567,
"step": 112
},
{
"epoch": 0.17018072289156627,
"grad_norm": 0.5282439604462374,
"learning_rate": 2.825e-05,
"loss": 1.374,
"step": 113
},
{
"epoch": 0.1716867469879518,
"grad_norm": 0.5875069638458675,
"learning_rate": 2.8499999999999998e-05,
"loss": 1.273,
"step": 114
},
{
"epoch": 0.17319277108433734,
"grad_norm": 0.45724819984799503,
"learning_rate": 2.8749999999999997e-05,
"loss": 1.0741,
"step": 115
},
{
"epoch": 0.1746987951807229,
"grad_norm": 0.548430511572042,
"learning_rate": 2.9e-05,
"loss": 1.264,
"step": 116
},
{
"epoch": 0.17620481927710843,
"grad_norm": 0.5978381716686159,
"learning_rate": 2.925e-05,
"loss": 1.1547,
"step": 117
},
{
"epoch": 0.17771084337349397,
"grad_norm": 0.5942228844049342,
"learning_rate": 2.95e-05,
"loss": 1.279,
"step": 118
},
{
"epoch": 0.17921686746987953,
"grad_norm": 0.5407899077606231,
"learning_rate": 2.975e-05,
"loss": 1.2298,
"step": 119
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.7609777257326175,
"learning_rate": 3e-05,
"loss": 1.3364,
"step": 120
},
{
"epoch": 0.1822289156626506,
"grad_norm": 0.6831522869129811,
"learning_rate": 3.025e-05,
"loss": 1.3858,
"step": 121
},
{
"epoch": 0.18373493975903615,
"grad_norm": 0.7504294787815489,
"learning_rate": 3.05e-05,
"loss": 1.2534,
"step": 122
},
{
"epoch": 0.1852409638554217,
"grad_norm": 0.4906159103936579,
"learning_rate": 3.075e-05,
"loss": 1.0826,
"step": 123
},
{
"epoch": 0.18674698795180722,
"grad_norm": 0.8235335479725208,
"learning_rate": 3.1e-05,
"loss": 1.2604,
"step": 124
},
{
"epoch": 0.18825301204819278,
"grad_norm": 1.0335951169911453,
"learning_rate": 3.125e-05,
"loss": 1.1844,
"step": 125
},
{
"epoch": 0.1897590361445783,
"grad_norm": 0.6322445023656,
"learning_rate": 3.15e-05,
"loss": 1.1518,
"step": 126
},
{
"epoch": 0.19126506024096385,
"grad_norm": 0.6539601888332923,
"learning_rate": 3.175e-05,
"loss": 1.2635,
"step": 127
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.8805441625508611,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.2456,
"step": 128
},
{
"epoch": 0.19427710843373494,
"grad_norm": 0.6362640617693966,
"learning_rate": 3.2250000000000005e-05,
"loss": 1.1606,
"step": 129
},
{
"epoch": 0.19578313253012047,
"grad_norm": 0.6543773930077222,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.3733,
"step": 130
},
{
"epoch": 0.19728915662650603,
"grad_norm": 0.8456329148794505,
"learning_rate": 3.275e-05,
"loss": 1.2881,
"step": 131
},
{
"epoch": 0.19879518072289157,
"grad_norm": 0.7798143144373343,
"learning_rate": 3.3e-05,
"loss": 1.2539,
"step": 132
},
{
"epoch": 0.2003012048192771,
"grad_norm": 0.5649146772162791,
"learning_rate": 3.325e-05,
"loss": 1.1565,
"step": 133
},
{
"epoch": 0.20180722891566266,
"grad_norm": 0.7868074993906898,
"learning_rate": 3.35e-05,
"loss": 1.2336,
"step": 134
},
{
"epoch": 0.2033132530120482,
"grad_norm": 0.8539054957048405,
"learning_rate": 3.375000000000001e-05,
"loss": 1.2119,
"step": 135
},
{
"epoch": 0.20481927710843373,
"grad_norm": 0.608074392131221,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.1515,
"step": 136
},
{
"epoch": 0.2063253012048193,
"grad_norm": 0.8305689711715424,
"learning_rate": 3.4250000000000006e-05,
"loss": 1.182,
"step": 137
},
{
"epoch": 0.20783132530120482,
"grad_norm": 1.4659619654883613,
"learning_rate": 3.45e-05,
"loss": 1.342,
"step": 138
},
{
"epoch": 0.20933734939759036,
"grad_norm": 1.0012967715150258,
"learning_rate": 3.475e-05,
"loss": 1.134,
"step": 139
},
{
"epoch": 0.21084337349397592,
"grad_norm": 1.841449726119881,
"learning_rate": 3.5e-05,
"loss": 1.2503,
"step": 140
},
{
"epoch": 0.21234939759036145,
"grad_norm": 1.7472556026235277,
"learning_rate": 3.525e-05,
"loss": 1.1299,
"step": 141
},
{
"epoch": 0.21385542168674698,
"grad_norm": 1.3403460128662175,
"learning_rate": 3.55e-05,
"loss": 1.0762,
"step": 142
},
{
"epoch": 0.21536144578313254,
"grad_norm": 1.252908657410089,
"learning_rate": 3.575e-05,
"loss": 1.2084,
"step": 143
},
{
"epoch": 0.21686746987951808,
"grad_norm": 1.4333911136243944,
"learning_rate": 3.6e-05,
"loss": 1.0785,
"step": 144
},
{
"epoch": 0.2183734939759036,
"grad_norm": 0.6748341835741349,
"learning_rate": 3.625e-05,
"loss": 1.1928,
"step": 145
},
{
"epoch": 0.21987951807228914,
"grad_norm": 1.5586371750373669,
"learning_rate": 3.65e-05,
"loss": 1.1381,
"step": 146
},
{
"epoch": 0.2213855421686747,
"grad_norm": 0.6783961919229362,
"learning_rate": 3.675e-05,
"loss": 1.3883,
"step": 147
},
{
"epoch": 0.22289156626506024,
"grad_norm": 1.8591814216635019,
"learning_rate": 3.7e-05,
"loss": 1.3179,
"step": 148
},
{
"epoch": 0.22439759036144577,
"grad_norm": 1.3822936108243598,
"learning_rate": 3.7250000000000004e-05,
"loss": 1.2124,
"step": 149
},
{
"epoch": 0.22590361445783133,
"grad_norm": 2.7785491488634264,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.1767,
"step": 150
},
{
"epoch": 0.22740963855421686,
"grad_norm": 3.0818450530681893,
"learning_rate": 3.775e-05,
"loss": 1.1505,
"step": 151
},
{
"epoch": 0.2289156626506024,
"grad_norm": 0.773086741639908,
"learning_rate": 3.8e-05,
"loss": 1.0885,
"step": 152
},
{
"epoch": 0.23042168674698796,
"grad_norm": 2.3053968508239526,
"learning_rate": 3.825e-05,
"loss": 1.3184,
"step": 153
},
{
"epoch": 0.2319277108433735,
"grad_norm": 1.8672692391834331,
"learning_rate": 3.85e-05,
"loss": 1.2499,
"step": 154
},
{
"epoch": 0.23343373493975902,
"grad_norm": 4.330568800320398,
"learning_rate": 3.875e-05,
"loss": 1.1837,
"step": 155
},
{
"epoch": 0.23493975903614459,
"grad_norm": 2.7646592356040003,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.2772,
"step": 156
},
{
"epoch": 0.23644578313253012,
"grad_norm": 1.595555200127894,
"learning_rate": 3.9250000000000005e-05,
"loss": 1.2407,
"step": 157
},
{
"epoch": 0.23795180722891565,
"grad_norm": 2.015882701771897,
"learning_rate": 3.9500000000000005e-05,
"loss": 1.3639,
"step": 158
},
{
"epoch": 0.2394578313253012,
"grad_norm": 2.4247022341569573,
"learning_rate": 3.9750000000000004e-05,
"loss": 1.2166,
"step": 159
},
{
"epoch": 0.24096385542168675,
"grad_norm": 1.201182586017534,
"learning_rate": 4e-05,
"loss": 1.0983,
"step": 160
},
{
"epoch": 0.24246987951807228,
"grad_norm": 1.3150470550474023,
"learning_rate": 4.025e-05,
"loss": 1.1311,
"step": 161
},
{
"epoch": 0.24397590361445784,
"grad_norm": 1.4097973347611912,
"learning_rate": 4.05e-05,
"loss": 1.1095,
"step": 162
},
{
"epoch": 0.24548192771084337,
"grad_norm": 1.2035175427147458,
"learning_rate": 4.075e-05,
"loss": 1.2158,
"step": 163
},
{
"epoch": 0.2469879518072289,
"grad_norm": 1.322678662967718,
"learning_rate": 4.1e-05,
"loss": 1.1124,
"step": 164
},
{
"epoch": 0.24849397590361447,
"grad_norm": 1.1201026910380605,
"learning_rate": 4.125e-05,
"loss": 1.1738,
"step": 165
},
{
"epoch": 0.25,
"grad_norm": 1.3309670495547214,
"learning_rate": 4.15e-05,
"loss": 1.2588,
"step": 166
},
{
"epoch": 0.25150602409638556,
"grad_norm": 1.446319804446339,
"learning_rate": 4.175e-05,
"loss": 1.0585,
"step": 167
},
{
"epoch": 0.25301204819277107,
"grad_norm": 0.7234375262162984,
"learning_rate": 4.2e-05,
"loss": 1.1338,
"step": 168
},
{
"epoch": 0.2545180722891566,
"grad_norm": 1.3503681347948917,
"learning_rate": 4.2250000000000004e-05,
"loss": 1.0938,
"step": 169
},
{
"epoch": 0.2560240963855422,
"grad_norm": 0.9766181855110214,
"learning_rate": 4.25e-05,
"loss": 1.0734,
"step": 170
},
{
"epoch": 0.2575301204819277,
"grad_norm": 1.4335268658024551,
"learning_rate": 4.275e-05,
"loss": 1.2955,
"step": 171
},
{
"epoch": 0.25903614457831325,
"grad_norm": 1.3606865757997293,
"learning_rate": 4.3e-05,
"loss": 1.1185,
"step": 172
},
{
"epoch": 0.2605421686746988,
"grad_norm": 0.7179564585307041,
"learning_rate": 4.325e-05,
"loss": 1.3014,
"step": 173
},
{
"epoch": 0.2620481927710843,
"grad_norm": 1.482316708521197,
"learning_rate": 4.35e-05,
"loss": 1.221,
"step": 174
},
{
"epoch": 0.2635542168674699,
"grad_norm": 0.8323067807154744,
"learning_rate": 4.375e-05,
"loss": 1.2386,
"step": 175
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.8773497783196046,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.0735,
"step": 176
},
{
"epoch": 0.26656626506024095,
"grad_norm": 0.8080641709179126,
"learning_rate": 4.4250000000000005e-05,
"loss": 1.1619,
"step": 177
},
{
"epoch": 0.2680722891566265,
"grad_norm": 0.9841467149957353,
"learning_rate": 4.4500000000000004e-05,
"loss": 1.3064,
"step": 178
},
{
"epoch": 0.26957831325301207,
"grad_norm": 0.7316088166486044,
"learning_rate": 4.4750000000000004e-05,
"loss": 1.1456,
"step": 179
},
{
"epoch": 0.2710843373493976,
"grad_norm": 0.7523828074326532,
"learning_rate": 4.5e-05,
"loss": 1.3101,
"step": 180
},
{
"epoch": 0.27259036144578314,
"grad_norm": 3.372899742507733,
"learning_rate": 4.525e-05,
"loss": 1.2082,
"step": 181
},
{
"epoch": 0.2740963855421687,
"grad_norm": 1.0840530542822477,
"learning_rate": 4.55e-05,
"loss": 1.2206,
"step": 182
},
{
"epoch": 0.2756024096385542,
"grad_norm": 1.4187564966633521,
"learning_rate": 4.575e-05,
"loss": 1.2211,
"step": 183
},
{
"epoch": 0.27710843373493976,
"grad_norm": 0.8850993935732638,
"learning_rate": 4.600000000000001e-05,
"loss": 1.0761,
"step": 184
},
{
"epoch": 0.2786144578313253,
"grad_norm": 1.8544169886923978,
"learning_rate": 4.6250000000000006e-05,
"loss": 1.0796,
"step": 185
},
{
"epoch": 0.28012048192771083,
"grad_norm": 1.0767191188725969,
"learning_rate": 4.6500000000000005e-05,
"loss": 1.14,
"step": 186
},
{
"epoch": 0.2816265060240964,
"grad_norm": 2.3502056771455067,
"learning_rate": 4.6750000000000005e-05,
"loss": 1.2778,
"step": 187
},
{
"epoch": 0.28313253012048195,
"grad_norm": 1.813897135362189,
"learning_rate": 4.7e-05,
"loss": 1.1488,
"step": 188
},
{
"epoch": 0.28463855421686746,
"grad_norm": 1.3120380258694695,
"learning_rate": 4.7249999999999997e-05,
"loss": 1.0994,
"step": 189
},
{
"epoch": 0.286144578313253,
"grad_norm": 1.4609354194796313,
"learning_rate": 4.75e-05,
"loss": 1.2513,
"step": 190
},
{
"epoch": 0.2876506024096386,
"grad_norm": 1.338319833787497,
"learning_rate": 4.775e-05,
"loss": 1.2829,
"step": 191
},
{
"epoch": 0.2891566265060241,
"grad_norm": 1.061771437097537,
"learning_rate": 4.8e-05,
"loss": 1.1399,
"step": 192
},
{
"epoch": 0.29066265060240964,
"grad_norm": 2.2518205398036244,
"learning_rate": 4.825e-05,
"loss": 1.0902,
"step": 193
},
{
"epoch": 0.2921686746987952,
"grad_norm": 2.501749016333484,
"learning_rate": 4.85e-05,
"loss": 1.1557,
"step": 194
},
{
"epoch": 0.2936746987951807,
"grad_norm": 0.986670899997431,
"learning_rate": 4.875e-05,
"loss": 1.2294,
"step": 195
},
{
"epoch": 0.29518072289156627,
"grad_norm": 2.203564827742068,
"learning_rate": 4.9e-05,
"loss": 1.049,
"step": 196
},
{
"epoch": 0.29668674698795183,
"grad_norm": 0.8490130543132344,
"learning_rate": 4.9250000000000004e-05,
"loss": 1.0062,
"step": 197
},
{
"epoch": 0.29819277108433734,
"grad_norm": 1.834859368403903,
"learning_rate": 4.9500000000000004e-05,
"loss": 1.1523,
"step": 198
},
{
"epoch": 0.2996987951807229,
"grad_norm": 1.0082103447014106,
"learning_rate": 4.975e-05,
"loss": 1.2407,
"step": 199
},
{
"epoch": 0.30120481927710846,
"grad_norm": 2.484362515364646,
"learning_rate": 5e-05,
"loss": 1.2052,
"step": 200
},
{
"epoch": 0.30271084337349397,
"grad_norm": 2.807293589334073,
"learning_rate": 4.997209821428572e-05,
"loss": 1.0995,
"step": 201
},
{
"epoch": 0.3042168674698795,
"grad_norm": 0.9629680174399969,
"learning_rate": 4.9944196428571435e-05,
"loss": 1.0819,
"step": 202
},
{
"epoch": 0.3057228915662651,
"grad_norm": 2.6482798882637257,
"learning_rate": 4.9916294642857145e-05,
"loss": 1.1511,
"step": 203
},
{
"epoch": 0.3072289156626506,
"grad_norm": 2.591142055611865,
"learning_rate": 4.9888392857142854e-05,
"loss": 1.1257,
"step": 204
},
{
"epoch": 0.30873493975903615,
"grad_norm": 1.244664077898625,
"learning_rate": 4.986049107142857e-05,
"loss": 1.2029,
"step": 205
},
{
"epoch": 0.3102409638554217,
"grad_norm": 1.4149817517398067,
"learning_rate": 4.983258928571429e-05,
"loss": 1.0481,
"step": 206
},
{
"epoch": 0.3117469879518072,
"grad_norm": 1.710955644020336,
"learning_rate": 4.9804687500000004e-05,
"loss": 1.0415,
"step": 207
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.7871423097569887,
"learning_rate": 4.977678571428572e-05,
"loss": 1.1626,
"step": 208
},
{
"epoch": 0.3147590361445783,
"grad_norm": 1.6533122473343886,
"learning_rate": 4.974888392857143e-05,
"loss": 1.1399,
"step": 209
},
{
"epoch": 0.31626506024096385,
"grad_norm": 1.062090177273032,
"learning_rate": 4.9720982142857146e-05,
"loss": 1.0609,
"step": 210
},
{
"epoch": 0.3177710843373494,
"grad_norm": 1.3358528924045199,
"learning_rate": 4.9693080357142856e-05,
"loss": 1.0546,
"step": 211
},
{
"epoch": 0.3192771084337349,
"grad_norm": 1.2699082435429738,
"learning_rate": 4.966517857142857e-05,
"loss": 1.3104,
"step": 212
},
{
"epoch": 0.3207831325301205,
"grad_norm": 1.631520467551371,
"learning_rate": 4.963727678571429e-05,
"loss": 1.1091,
"step": 213
},
{
"epoch": 0.32228915662650603,
"grad_norm": 1.2841795005461383,
"learning_rate": 4.9609375000000005e-05,
"loss": 1.1702,
"step": 214
},
{
"epoch": 0.32379518072289154,
"grad_norm": 2.11801132990623,
"learning_rate": 4.9581473214285714e-05,
"loss": 1.2395,
"step": 215
},
{
"epoch": 0.3253012048192771,
"grad_norm": 1.7630852566359885,
"learning_rate": 4.955357142857143e-05,
"loss": 1.0816,
"step": 216
},
{
"epoch": 0.32680722891566266,
"grad_norm": 1.8649444527091168,
"learning_rate": 4.952566964285715e-05,
"loss": 1.0944,
"step": 217
},
{
"epoch": 0.32831325301204817,
"grad_norm": 1.603212202589856,
"learning_rate": 4.949776785714286e-05,
"loss": 1.1128,
"step": 218
},
{
"epoch": 0.32981927710843373,
"grad_norm": 1.3817148805369528,
"learning_rate": 4.946986607142857e-05,
"loss": 1.0143,
"step": 219
},
{
"epoch": 0.3313253012048193,
"grad_norm": 1.9878154605365819,
"learning_rate": 4.944196428571429e-05,
"loss": 1.1158,
"step": 220
},
{
"epoch": 0.3328313253012048,
"grad_norm": 1.3198018552904554,
"learning_rate": 4.94140625e-05,
"loss": 1.2964,
"step": 221
},
{
"epoch": 0.33433734939759036,
"grad_norm": 2.0072371055012557,
"learning_rate": 4.9386160714285716e-05,
"loss": 1.0403,
"step": 222
},
{
"epoch": 0.3358433734939759,
"grad_norm": 1.3148461408166356,
"learning_rate": 4.935825892857143e-05,
"loss": 0.9557,
"step": 223
},
{
"epoch": 0.3373493975903614,
"grad_norm": 1.4391527281143421,
"learning_rate": 4.933035714285715e-05,
"loss": 1.146,
"step": 224
},
{
"epoch": 0.338855421686747,
"grad_norm": 2.0567966670673847,
"learning_rate": 4.930245535714286e-05,
"loss": 1.0913,
"step": 225
},
{
"epoch": 0.34036144578313254,
"grad_norm": 0.8486481409603515,
"learning_rate": 4.9274553571428574e-05,
"loss": 1.21,
"step": 226
},
{
"epoch": 0.34186746987951805,
"grad_norm": 0.7682819967080843,
"learning_rate": 4.9246651785714284e-05,
"loss": 1.2319,
"step": 227
},
{
"epoch": 0.3433734939759036,
"grad_norm": 0.7183741214872998,
"learning_rate": 4.921875e-05,
"loss": 1.0864,
"step": 228
},
{
"epoch": 0.34487951807228917,
"grad_norm": 0.850567254195245,
"learning_rate": 4.919084821428572e-05,
"loss": 1.0155,
"step": 229
},
{
"epoch": 0.3463855421686747,
"grad_norm": 0.7147187377321871,
"learning_rate": 4.916294642857143e-05,
"loss": 1.1591,
"step": 230
},
{
"epoch": 0.34789156626506024,
"grad_norm": 0.6308795014146827,
"learning_rate": 4.913504464285715e-05,
"loss": 1.1556,
"step": 231
},
{
"epoch": 0.3493975903614458,
"grad_norm": 0.5235892375314235,
"learning_rate": 4.910714285714286e-05,
"loss": 1.0519,
"step": 232
},
{
"epoch": 0.3509036144578313,
"grad_norm": 2.3647614127653918,
"learning_rate": 4.9079241071428576e-05,
"loss": 1.1801,
"step": 233
},
{
"epoch": 0.35240963855421686,
"grad_norm": 1.2636782024703317,
"learning_rate": 4.9051339285714285e-05,
"loss": 1.0888,
"step": 234
},
{
"epoch": 0.3539156626506024,
"grad_norm": 1.452216077255174,
"learning_rate": 4.90234375e-05,
"loss": 1.1946,
"step": 235
},
{
"epoch": 0.35542168674698793,
"grad_norm": 1.431058262283714,
"learning_rate": 4.899553571428572e-05,
"loss": 1.1393,
"step": 236
},
{
"epoch": 0.3569277108433735,
"grad_norm": 1.5447839137148456,
"learning_rate": 4.8967633928571434e-05,
"loss": 1.1518,
"step": 237
},
{
"epoch": 0.35843373493975905,
"grad_norm": 1.3321646900352668,
"learning_rate": 4.893973214285715e-05,
"loss": 1.2129,
"step": 238
},
{
"epoch": 0.35993975903614456,
"grad_norm": 1.6041879864334145,
"learning_rate": 4.891183035714286e-05,
"loss": 1.1473,
"step": 239
},
{
"epoch": 0.3614457831325301,
"grad_norm": 1.1604595848270383,
"learning_rate": 4.888392857142857e-05,
"loss": 1.0934,
"step": 240
},
{
"epoch": 0.3629518072289157,
"grad_norm": 2.1163407553243307,
"learning_rate": 4.8856026785714286e-05,
"loss": 1.105,
"step": 241
},
{
"epoch": 0.3644578313253012,
"grad_norm": 1.8247461907959592,
"learning_rate": 4.8828125e-05,
"loss": 1.2259,
"step": 242
},
{
"epoch": 0.36596385542168675,
"grad_norm": 1.6880899716207813,
"learning_rate": 4.880022321428572e-05,
"loss": 1.1582,
"step": 243
},
{
"epoch": 0.3674698795180723,
"grad_norm": 1.945949991185642,
"learning_rate": 4.8772321428571435e-05,
"loss": 1.0984,
"step": 244
},
{
"epoch": 0.3689759036144578,
"grad_norm": 0.8083465654697327,
"learning_rate": 4.8744419642857145e-05,
"loss": 1.2172,
"step": 245
},
{
"epoch": 0.3704819277108434,
"grad_norm": 1.3108711367199426,
"learning_rate": 4.8716517857142855e-05,
"loss": 1.1721,
"step": 246
},
{
"epoch": 0.37198795180722893,
"grad_norm": 0.5855610261806767,
"learning_rate": 4.868861607142857e-05,
"loss": 1.101,
"step": 247
},
{
"epoch": 0.37349397590361444,
"grad_norm": 1.2212964794899914,
"learning_rate": 4.866071428571429e-05,
"loss": 1.1313,
"step": 248
},
{
"epoch": 0.375,
"grad_norm": 1.0063435061999662,
"learning_rate": 4.8632812500000004e-05,
"loss": 0.9664,
"step": 249
},
{
"epoch": 0.37650602409638556,
"grad_norm": 0.473003506269346,
"learning_rate": 4.860491071428572e-05,
"loss": 1.0709,
"step": 250
},
{
"epoch": 0.37801204819277107,
"grad_norm": 0.6820715341678072,
"learning_rate": 4.857700892857143e-05,
"loss": 1.2115,
"step": 251
},
{
"epoch": 0.3795180722891566,
"grad_norm": 0.5335642842726658,
"learning_rate": 4.8549107142857146e-05,
"loss": 1.1002,
"step": 252
},
{
"epoch": 0.3810240963855422,
"grad_norm": 0.5512515177615058,
"learning_rate": 4.8521205357142856e-05,
"loss": 1.1927,
"step": 253
},
{
"epoch": 0.3825301204819277,
"grad_norm": 1.0690031356718976,
"learning_rate": 4.849330357142857e-05,
"loss": 1.0297,
"step": 254
},
{
"epoch": 0.38403614457831325,
"grad_norm": 0.7003395703715207,
"learning_rate": 4.846540178571429e-05,
"loss": 1.1792,
"step": 255
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.4871631349176089,
"learning_rate": 4.8437500000000005e-05,
"loss": 1.0313,
"step": 256
},
{
"epoch": 0.3870481927710843,
"grad_norm": 0.6252189228536179,
"learning_rate": 4.8409598214285715e-05,
"loss": 1.0858,
"step": 257
},
{
"epoch": 0.3885542168674699,
"grad_norm": 0.7303291619404965,
"learning_rate": 4.838169642857143e-05,
"loss": 1.1193,
"step": 258
},
{
"epoch": 0.39006024096385544,
"grad_norm": 0.6196210211779465,
"learning_rate": 4.835379464285715e-05,
"loss": 1.1132,
"step": 259
},
{
"epoch": 0.39156626506024095,
"grad_norm": 0.4613735798032197,
"learning_rate": 4.832589285714286e-05,
"loss": 1.1769,
"step": 260
},
{
"epoch": 0.3930722891566265,
"grad_norm": 0.8724249592082794,
"learning_rate": 4.8297991071428573e-05,
"loss": 1.0633,
"step": 261
},
{
"epoch": 0.39457831325301207,
"grad_norm": 0.6436609538908512,
"learning_rate": 4.827008928571429e-05,
"loss": 0.8923,
"step": 262
},
{
"epoch": 0.3960843373493976,
"grad_norm": 0.6961730289258252,
"learning_rate": 4.82421875e-05,
"loss": 1.101,
"step": 263
},
{
"epoch": 0.39759036144578314,
"grad_norm": 1.1666860817560007,
"learning_rate": 4.8214285714285716e-05,
"loss": 1.0223,
"step": 264
},
{
"epoch": 0.3990963855421687,
"grad_norm": 0.8060079738768117,
"learning_rate": 4.818638392857143e-05,
"loss": 1.1685,
"step": 265
},
{
"epoch": 0.4006024096385542,
"grad_norm": 0.6068978262664019,
"learning_rate": 4.815848214285715e-05,
"loss": 1.2158,
"step": 266
},
{
"epoch": 0.40210843373493976,
"grad_norm": 0.6133589630474396,
"learning_rate": 4.813058035714286e-05,
"loss": 1.134,
"step": 267
},
{
"epoch": 0.4036144578313253,
"grad_norm": 0.5655602311860805,
"learning_rate": 4.8102678571428575e-05,
"loss": 1.1884,
"step": 268
},
{
"epoch": 0.40512048192771083,
"grad_norm": 0.6304054537890617,
"learning_rate": 4.8074776785714284e-05,
"loss": 1.0413,
"step": 269
},
{
"epoch": 0.4066265060240964,
"grad_norm": 0.8627697754260435,
"learning_rate": 4.8046875e-05,
"loss": 1.0705,
"step": 270
},
{
"epoch": 0.40813253012048195,
"grad_norm": 0.7094121861835088,
"learning_rate": 4.801897321428572e-05,
"loss": 1.1068,
"step": 271
},
{
"epoch": 0.40963855421686746,
"grad_norm": 0.5620756718286611,
"learning_rate": 4.7991071428571433e-05,
"loss": 1.1286,
"step": 272
},
{
"epoch": 0.411144578313253,
"grad_norm": 0.8297215285339797,
"learning_rate": 4.796316964285715e-05,
"loss": 0.9547,
"step": 273
},
{
"epoch": 0.4126506024096386,
"grad_norm": 0.5653620112559294,
"learning_rate": 4.793526785714286e-05,
"loss": 1.0796,
"step": 274
},
{
"epoch": 0.4141566265060241,
"grad_norm": 0.5058016537443488,
"learning_rate": 4.790736607142857e-05,
"loss": 1.0754,
"step": 275
},
{
"epoch": 0.41566265060240964,
"grad_norm": 0.5718375025598983,
"learning_rate": 4.7879464285714285e-05,
"loss": 1.1264,
"step": 276
},
{
"epoch": 0.4171686746987952,
"grad_norm": 0.6083124264681959,
"learning_rate": 4.78515625e-05,
"loss": 1.1462,
"step": 277
},
{
"epoch": 0.4186746987951807,
"grad_norm": 0.6214272312149512,
"learning_rate": 4.782366071428572e-05,
"loss": 1.1854,
"step": 278
},
{
"epoch": 0.42018072289156627,
"grad_norm": 0.6911162713381959,
"learning_rate": 4.7795758928571435e-05,
"loss": 1.142,
"step": 279
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.5524545372998528,
"learning_rate": 4.7767857142857144e-05,
"loss": 1.0226,
"step": 280
},
{
"epoch": 0.42319277108433734,
"grad_norm": 0.7868835084355736,
"learning_rate": 4.7739955357142854e-05,
"loss": 1.0698,
"step": 281
},
{
"epoch": 0.4246987951807229,
"grad_norm": 0.5748798967151076,
"learning_rate": 4.771205357142857e-05,
"loss": 1.1523,
"step": 282
},
{
"epoch": 0.42620481927710846,
"grad_norm": 0.984453017404523,
"learning_rate": 4.768415178571429e-05,
"loss": 1.1704,
"step": 283
},
{
"epoch": 0.42771084337349397,
"grad_norm": 0.7478577493878779,
"learning_rate": 4.765625e-05,
"loss": 1.0541,
"step": 284
},
{
"epoch": 0.4292168674698795,
"grad_norm": 1.0316405869831138,
"learning_rate": 4.762834821428572e-05,
"loss": 1.1742,
"step": 285
},
{
"epoch": 0.4307228915662651,
"grad_norm": 0.9411713937287206,
"learning_rate": 4.7600446428571436e-05,
"loss": 1.1275,
"step": 286
},
{
"epoch": 0.4322289156626506,
"grad_norm": 0.790899789671601,
"learning_rate": 4.7572544642857145e-05,
"loss": 1.0086,
"step": 287
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.9413498730674247,
"learning_rate": 4.7544642857142855e-05,
"loss": 1.14,
"step": 288
},
{
"epoch": 0.4352409638554217,
"grad_norm": 0.9627651044056021,
"learning_rate": 4.751674107142857e-05,
"loss": 1.0731,
"step": 289
},
{
"epoch": 0.4367469879518072,
"grad_norm": 0.6242926613973893,
"learning_rate": 4.748883928571429e-05,
"loss": 1.1256,
"step": 290
},
{
"epoch": 0.4382530120481928,
"grad_norm": 0.851461421349802,
"learning_rate": 4.7460937500000004e-05,
"loss": 1.0941,
"step": 291
},
{
"epoch": 0.4397590361445783,
"grad_norm": 0.7495510668130898,
"learning_rate": 4.743303571428572e-05,
"loss": 1.0394,
"step": 292
},
{
"epoch": 0.44126506024096385,
"grad_norm": 0.7195590612170437,
"learning_rate": 4.740513392857143e-05,
"loss": 1.1445,
"step": 293
},
{
"epoch": 0.4427710843373494,
"grad_norm": 0.5960303701573098,
"learning_rate": 4.7377232142857147e-05,
"loss": 1.0585,
"step": 294
},
{
"epoch": 0.4442771084337349,
"grad_norm": 0.9842342043781458,
"learning_rate": 4.7349330357142856e-05,
"loss": 1.0377,
"step": 295
},
{
"epoch": 0.4457831325301205,
"grad_norm": 1.0537294003669568,
"learning_rate": 4.732142857142857e-05,
"loss": 1.083,
"step": 296
},
{
"epoch": 0.44728915662650603,
"grad_norm": 0.65566664626814,
"learning_rate": 4.729352678571429e-05,
"loss": 1.1926,
"step": 297
},
{
"epoch": 0.44879518072289154,
"grad_norm": 1.1848990165916284,
"learning_rate": 4.7265625000000005e-05,
"loss": 1.0994,
"step": 298
},
{
"epoch": 0.4503012048192771,
"grad_norm": 0.9891837032009826,
"learning_rate": 4.7237723214285715e-05,
"loss": 1.173,
"step": 299
},
{
"epoch": 0.45180722891566266,
"grad_norm": 0.6688119337187576,
"learning_rate": 4.720982142857143e-05,
"loss": 1.1737,
"step": 300
},
{
"epoch": 0.45331325301204817,
"grad_norm": 1.3142909754099252,
"learning_rate": 4.718191964285715e-05,
"loss": 1.1526,
"step": 301
},
{
"epoch": 0.45481927710843373,
"grad_norm": 0.5657599508166598,
"learning_rate": 4.715401785714286e-05,
"loss": 0.9977,
"step": 302
},
{
"epoch": 0.4563253012048193,
"grad_norm": 1.2536012478516283,
"learning_rate": 4.7126116071428574e-05,
"loss": 1.0355,
"step": 303
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.8305598736430878,
"learning_rate": 4.709821428571429e-05,
"loss": 1.0141,
"step": 304
},
{
"epoch": 0.45933734939759036,
"grad_norm": 0.5666233932254965,
"learning_rate": 4.70703125e-05,
"loss": 1.1752,
"step": 305
},
{
"epoch": 0.4608433734939759,
"grad_norm": 0.8260580089665397,
"learning_rate": 4.7042410714285716e-05,
"loss": 1.0258,
"step": 306
},
{
"epoch": 0.4623493975903614,
"grad_norm": 0.8400449995564603,
"learning_rate": 4.701450892857143e-05,
"loss": 1.0304,
"step": 307
},
{
"epoch": 0.463855421686747,
"grad_norm": 0.6206327848818953,
"learning_rate": 4.698660714285715e-05,
"loss": 1.1415,
"step": 308
},
{
"epoch": 0.46536144578313254,
"grad_norm": 0.6179564541534572,
"learning_rate": 4.695870535714286e-05,
"loss": 1.2258,
"step": 309
},
{
"epoch": 0.46686746987951805,
"grad_norm": 0.7585172715247736,
"learning_rate": 4.6930803571428575e-05,
"loss": 1.0665,
"step": 310
},
{
"epoch": 0.4683734939759036,
"grad_norm": 0.6519740872909565,
"learning_rate": 4.6902901785714285e-05,
"loss": 1.0575,
"step": 311
},
{
"epoch": 0.46987951807228917,
"grad_norm": 0.5348307727480986,
"learning_rate": 4.6875e-05,
"loss": 1.0592,
"step": 312
},
{
"epoch": 0.4713855421686747,
"grad_norm": 0.582404450830819,
"learning_rate": 4.684709821428572e-05,
"loss": 1.0107,
"step": 313
},
{
"epoch": 0.47289156626506024,
"grad_norm": 0.5575064869464086,
"learning_rate": 4.6819196428571434e-05,
"loss": 1.0666,
"step": 314
},
{
"epoch": 0.4743975903614458,
"grad_norm": 0.8064772943831982,
"learning_rate": 4.679129464285715e-05,
"loss": 1.0164,
"step": 315
},
{
"epoch": 0.4759036144578313,
"grad_norm": 0.5740805150731614,
"learning_rate": 4.676339285714286e-05,
"loss": 1.1175,
"step": 316
},
{
"epoch": 0.47740963855421686,
"grad_norm": 0.637076313731006,
"learning_rate": 4.673549107142857e-05,
"loss": 1.1172,
"step": 317
},
{
"epoch": 0.4789156626506024,
"grad_norm": 0.5881085905964725,
"learning_rate": 4.6707589285714286e-05,
"loss": 1.2374,
"step": 318
},
{
"epoch": 0.48042168674698793,
"grad_norm": 0.6973661856474144,
"learning_rate": 4.66796875e-05,
"loss": 1.0896,
"step": 319
},
{
"epoch": 0.4819277108433735,
"grad_norm": 1.1323534611631751,
"learning_rate": 4.665178571428572e-05,
"loss": 1.1134,
"step": 320
},
{
"epoch": 0.48343373493975905,
"grad_norm": 0.6468024439890357,
"learning_rate": 4.6623883928571435e-05,
"loss": 1.1543,
"step": 321
},
{
"epoch": 0.48493975903614456,
"grad_norm": 0.6212620214284293,
"learning_rate": 4.6595982142857145e-05,
"loss": 1.1858,
"step": 322
},
{
"epoch": 0.4864457831325301,
"grad_norm": 0.5920187453423337,
"learning_rate": 4.6568080357142854e-05,
"loss": 1.1841,
"step": 323
},
{
"epoch": 0.4879518072289157,
"grad_norm": 0.7522252767909109,
"learning_rate": 4.654017857142857e-05,
"loss": 1.0614,
"step": 324
},
{
"epoch": 0.4894578313253012,
"grad_norm": 0.7720395110920457,
"learning_rate": 4.651227678571429e-05,
"loss": 1.0174,
"step": 325
},
{
"epoch": 0.49096385542168675,
"grad_norm": 0.5179137926554194,
"learning_rate": 4.6484375e-05,
"loss": 1.1899,
"step": 326
},
{
"epoch": 0.4924698795180723,
"grad_norm": 0.664356590249216,
"learning_rate": 4.645647321428572e-05,
"loss": 1.1706,
"step": 327
},
{
"epoch": 0.4939759036144578,
"grad_norm": 0.5263451119796876,
"learning_rate": 4.642857142857143e-05,
"loss": 1.171,
"step": 328
},
{
"epoch": 0.4954819277108434,
"grad_norm": 0.6114326540624144,
"learning_rate": 4.6400669642857146e-05,
"loss": 1.0609,
"step": 329
},
{
"epoch": 0.49698795180722893,
"grad_norm": 0.6758407174730853,
"learning_rate": 4.6372767857142855e-05,
"loss": 1.1003,
"step": 330
},
{
"epoch": 0.49849397590361444,
"grad_norm": 0.581585715051734,
"learning_rate": 4.634486607142857e-05,
"loss": 0.9861,
"step": 331
},
{
"epoch": 0.5,
"grad_norm": 0.5988217364814561,
"learning_rate": 4.631696428571429e-05,
"loss": 1.0703,
"step": 332
},
{
"epoch": 0.5015060240963856,
"grad_norm": 0.5515737128525399,
"learning_rate": 4.6289062500000005e-05,
"loss": 1.1344,
"step": 333
},
{
"epoch": 0.5030120481927711,
"grad_norm": 0.588593134249206,
"learning_rate": 4.6261160714285714e-05,
"loss": 1.1005,
"step": 334
},
{
"epoch": 0.5045180722891566,
"grad_norm": 0.5383505046146273,
"learning_rate": 4.623325892857143e-05,
"loss": 0.9466,
"step": 335
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.5048430115850058,
"learning_rate": 4.620535714285715e-05,
"loss": 1.1279,
"step": 336
},
{
"epoch": 0.5075301204819277,
"grad_norm": 0.487591855315763,
"learning_rate": 4.6177455357142857e-05,
"loss": 1.0164,
"step": 337
},
{
"epoch": 0.5090361445783133,
"grad_norm": 0.5524820921251307,
"learning_rate": 4.614955357142857e-05,
"loss": 1.054,
"step": 338
},
{
"epoch": 0.5105421686746988,
"grad_norm": 0.5744986926349741,
"learning_rate": 4.612165178571429e-05,
"loss": 1.0984,
"step": 339
},
{
"epoch": 0.5120481927710844,
"grad_norm": 0.5508993572471312,
"learning_rate": 4.609375e-05,
"loss": 1.0286,
"step": 340
},
{
"epoch": 0.5135542168674698,
"grad_norm": 0.43166815795807073,
"learning_rate": 4.6065848214285715e-05,
"loss": 1.1016,
"step": 341
},
{
"epoch": 0.5150602409638554,
"grad_norm": 0.7094909764579079,
"learning_rate": 4.603794642857143e-05,
"loss": 1.0311,
"step": 342
},
{
"epoch": 0.516566265060241,
"grad_norm": 0.9744107241541942,
"learning_rate": 4.601004464285715e-05,
"loss": 1.1499,
"step": 343
},
{
"epoch": 0.5180722891566265,
"grad_norm": 0.4515776055497446,
"learning_rate": 4.598214285714286e-05,
"loss": 1.0415,
"step": 344
},
{
"epoch": 0.5195783132530121,
"grad_norm": 0.9818306316338725,
"learning_rate": 4.5954241071428574e-05,
"loss": 1.1366,
"step": 345
},
{
"epoch": 0.5210843373493976,
"grad_norm": 0.6024720525198606,
"learning_rate": 4.592633928571429e-05,
"loss": 0.9559,
"step": 346
},
{
"epoch": 0.5225903614457831,
"grad_norm": 0.592032069497474,
"learning_rate": 4.58984375e-05,
"loss": 1.0116,
"step": 347
},
{
"epoch": 0.5240963855421686,
"grad_norm": 0.5102200581640144,
"learning_rate": 4.5870535714285716e-05,
"loss": 1.0141,
"step": 348
},
{
"epoch": 0.5256024096385542,
"grad_norm": 0.672267320385825,
"learning_rate": 4.584263392857143e-05,
"loss": 0.9747,
"step": 349
},
{
"epoch": 0.5271084337349398,
"grad_norm": 0.938335760524442,
"learning_rate": 4.581473214285715e-05,
"loss": 1.1767,
"step": 350
},
{
"epoch": 0.5286144578313253,
"grad_norm": 0.7656088723888667,
"learning_rate": 4.578683035714286e-05,
"loss": 0.9704,
"step": 351
},
{
"epoch": 0.5301204819277109,
"grad_norm": 1.070174105464317,
"learning_rate": 4.5758928571428575e-05,
"loss": 1.1489,
"step": 352
},
{
"epoch": 0.5316265060240963,
"grad_norm": 0.6687072295947085,
"learning_rate": 4.5731026785714285e-05,
"loss": 1.1995,
"step": 353
},
{
"epoch": 0.5331325301204819,
"grad_norm": 0.7277253224117494,
"learning_rate": 4.5703125e-05,
"loss": 0.9879,
"step": 354
},
{
"epoch": 0.5346385542168675,
"grad_norm": 0.49034491112587963,
"learning_rate": 4.567522321428572e-05,
"loss": 1.0221,
"step": 355
},
{
"epoch": 0.536144578313253,
"grad_norm": 1.060991642886161,
"learning_rate": 4.5647321428571434e-05,
"loss": 1.2066,
"step": 356
},
{
"epoch": 0.5376506024096386,
"grad_norm": 0.7469884743687695,
"learning_rate": 4.561941964285715e-05,
"loss": 1.0678,
"step": 357
},
{
"epoch": 0.5391566265060241,
"grad_norm": 0.7600879744808855,
"learning_rate": 4.559151785714286e-05,
"loss": 1.1091,
"step": 358
},
{
"epoch": 0.5406626506024096,
"grad_norm": 0.7678747907647254,
"learning_rate": 4.556361607142857e-05,
"loss": 1.1494,
"step": 359
},
{
"epoch": 0.5421686746987951,
"grad_norm": 1.2055152589349136,
"learning_rate": 4.5535714285714286e-05,
"loss": 1.058,
"step": 360
},
{
"epoch": 0.5436746987951807,
"grad_norm": 0.6724818408231992,
"learning_rate": 4.55078125e-05,
"loss": 1.1077,
"step": 361
},
{
"epoch": 0.5451807228915663,
"grad_norm": 0.948351667341676,
"learning_rate": 4.547991071428572e-05,
"loss": 1.1105,
"step": 362
},
{
"epoch": 0.5466867469879518,
"grad_norm": 0.7459221254292225,
"learning_rate": 4.5452008928571435e-05,
"loss": 1.1048,
"step": 363
},
{
"epoch": 0.5481927710843374,
"grad_norm": 0.6446186731602351,
"learning_rate": 4.5424107142857145e-05,
"loss": 1.0731,
"step": 364
},
{
"epoch": 0.5496987951807228,
"grad_norm": 0.6235438385434885,
"learning_rate": 4.5396205357142854e-05,
"loss": 0.9971,
"step": 365
},
{
"epoch": 0.5512048192771084,
"grad_norm": 0.856252275108117,
"learning_rate": 4.536830357142857e-05,
"loss": 1.0534,
"step": 366
},
{
"epoch": 0.552710843373494,
"grad_norm": 0.7215170271626542,
"learning_rate": 4.534040178571429e-05,
"loss": 1.0996,
"step": 367
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.641882526212357,
"learning_rate": 4.5312500000000004e-05,
"loss": 1.0795,
"step": 368
},
{
"epoch": 0.5557228915662651,
"grad_norm": 0.7016746500903057,
"learning_rate": 4.528459821428572e-05,
"loss": 1.1164,
"step": 369
},
{
"epoch": 0.5572289156626506,
"grad_norm": 0.7780313378842422,
"learning_rate": 4.525669642857143e-05,
"loss": 0.9507,
"step": 370
},
{
"epoch": 0.5587349397590361,
"grad_norm": 0.8925575799409526,
"learning_rate": 4.5228794642857146e-05,
"loss": 1.0685,
"step": 371
},
{
"epoch": 0.5602409638554217,
"grad_norm": 0.8329945669972127,
"learning_rate": 4.5200892857142856e-05,
"loss": 1.1421,
"step": 372
},
{
"epoch": 0.5617469879518072,
"grad_norm": 0.5016476396946253,
"learning_rate": 4.517299107142857e-05,
"loss": 1.0305,
"step": 373
},
{
"epoch": 0.5632530120481928,
"grad_norm": 1.2663234219391322,
"learning_rate": 4.514508928571429e-05,
"loss": 1.1224,
"step": 374
},
{
"epoch": 0.5647590361445783,
"grad_norm": 0.6107780642457702,
"learning_rate": 4.5117187500000005e-05,
"loss": 1.0856,
"step": 375
},
{
"epoch": 0.5662650602409639,
"grad_norm": 0.8888456544902237,
"learning_rate": 4.5089285714285714e-05,
"loss": 1.0653,
"step": 376
},
{
"epoch": 0.5677710843373494,
"grad_norm": 1.229011464951799,
"learning_rate": 4.506138392857143e-05,
"loss": 1.1313,
"step": 377
},
{
"epoch": 0.5692771084337349,
"grad_norm": 0.46505211936133195,
"learning_rate": 4.503348214285715e-05,
"loss": 1.0073,
"step": 378
},
{
"epoch": 0.5707831325301205,
"grad_norm": 0.7945202800005255,
"learning_rate": 4.500558035714286e-05,
"loss": 1.068,
"step": 379
},
{
"epoch": 0.572289156626506,
"grad_norm": 0.7474226327444002,
"learning_rate": 4.497767857142857e-05,
"loss": 1.0441,
"step": 380
},
{
"epoch": 0.5737951807228916,
"grad_norm": 0.5844298039779413,
"learning_rate": 4.494977678571429e-05,
"loss": 0.9666,
"step": 381
},
{
"epoch": 0.5753012048192772,
"grad_norm": 0.8765269881568523,
"learning_rate": 4.4921875e-05,
"loss": 1.1449,
"step": 382
},
{
"epoch": 0.5768072289156626,
"grad_norm": 0.49837813850168466,
"learning_rate": 4.4893973214285716e-05,
"loss": 1.0007,
"step": 383
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.6216738163324567,
"learning_rate": 4.486607142857143e-05,
"loss": 1.1564,
"step": 384
},
{
"epoch": 0.5798192771084337,
"grad_norm": 0.585019009519289,
"learning_rate": 4.483816964285715e-05,
"loss": 0.9969,
"step": 385
},
{
"epoch": 0.5813253012048193,
"grad_norm": 0.447535769396093,
"learning_rate": 4.481026785714286e-05,
"loss": 1.1458,
"step": 386
},
{
"epoch": 0.5828313253012049,
"grad_norm": 0.6321230400943506,
"learning_rate": 4.4782366071428574e-05,
"loss": 1.1423,
"step": 387
},
{
"epoch": 0.5843373493975904,
"grad_norm": 0.5752618871504956,
"learning_rate": 4.4754464285714284e-05,
"loss": 1.0281,
"step": 388
},
{
"epoch": 0.5858433734939759,
"grad_norm": 0.48889366921660804,
"learning_rate": 4.47265625e-05,
"loss": 1.0911,
"step": 389
},
{
"epoch": 0.5873493975903614,
"grad_norm": 0.5883448833648237,
"learning_rate": 4.469866071428572e-05,
"loss": 0.9658,
"step": 390
},
{
"epoch": 0.588855421686747,
"grad_norm": 0.5682377283162737,
"learning_rate": 4.467075892857143e-05,
"loss": 1.091,
"step": 391
},
{
"epoch": 0.5903614457831325,
"grad_norm": 0.45325949959594397,
"learning_rate": 4.464285714285715e-05,
"loss": 1.0412,
"step": 392
},
{
"epoch": 0.5918674698795181,
"grad_norm": 0.8283776199862782,
"learning_rate": 4.461495535714286e-05,
"loss": 1.1257,
"step": 393
},
{
"epoch": 0.5933734939759037,
"grad_norm": 0.6057079618921248,
"learning_rate": 4.458705357142857e-05,
"loss": 1.0283,
"step": 394
},
{
"epoch": 0.5948795180722891,
"grad_norm": 1.7245860410960632,
"learning_rate": 4.4559151785714285e-05,
"loss": 0.9688,
"step": 395
},
{
"epoch": 0.5963855421686747,
"grad_norm": 0.6754412678482318,
"learning_rate": 4.453125e-05,
"loss": 1.0835,
"step": 396
},
{
"epoch": 0.5978915662650602,
"grad_norm": 0.9707457629018403,
"learning_rate": 4.450334821428572e-05,
"loss": 1.0772,
"step": 397
},
{
"epoch": 0.5993975903614458,
"grad_norm": 0.7922097512203519,
"learning_rate": 4.4475446428571434e-05,
"loss": 1.0734,
"step": 398
},
{
"epoch": 0.6009036144578314,
"grad_norm": 0.9823309026201563,
"learning_rate": 4.4447544642857144e-05,
"loss": 1.0064,
"step": 399
},
{
"epoch": 0.6024096385542169,
"grad_norm": 1.193757095606662,
"learning_rate": 4.4419642857142854e-05,
"loss": 1.0973,
"step": 400
},
{
"epoch": 0.6039156626506024,
"grad_norm": 0.8798802181039858,
"learning_rate": 4.439174107142857e-05,
"loss": 1.0251,
"step": 401
},
{
"epoch": 0.6054216867469879,
"grad_norm": 1.4376950776492325,
"learning_rate": 4.4363839285714286e-05,
"loss": 1.0005,
"step": 402
},
{
"epoch": 0.6069277108433735,
"grad_norm": 0.6951608792696263,
"learning_rate": 4.43359375e-05,
"loss": 1.0636,
"step": 403
},
{
"epoch": 0.608433734939759,
"grad_norm": 1.3599368232030171,
"learning_rate": 4.430803571428572e-05,
"loss": 0.9548,
"step": 404
},
{
"epoch": 0.6099397590361446,
"grad_norm": 0.743167840372275,
"learning_rate": 4.4280133928571436e-05,
"loss": 1.0488,
"step": 405
},
{
"epoch": 0.6114457831325302,
"grad_norm": 0.6600759894508788,
"learning_rate": 4.4252232142857145e-05,
"loss": 1.0062,
"step": 406
},
{
"epoch": 0.6129518072289156,
"grad_norm": 1.0981361259623472,
"learning_rate": 4.4224330357142855e-05,
"loss": 1.0507,
"step": 407
},
{
"epoch": 0.6144578313253012,
"grad_norm": 0.6213758277314005,
"learning_rate": 4.419642857142857e-05,
"loss": 1.0434,
"step": 408
},
{
"epoch": 0.6159638554216867,
"grad_norm": 2.0016126399185383,
"learning_rate": 4.416852678571429e-05,
"loss": 0.9981,
"step": 409
},
{
"epoch": 0.6174698795180723,
"grad_norm": 0.9275092114284722,
"learning_rate": 4.4140625000000004e-05,
"loss": 1.0843,
"step": 410
},
{
"epoch": 0.6189759036144579,
"grad_norm": 2.1148510979024993,
"learning_rate": 4.411272321428572e-05,
"loss": 0.958,
"step": 411
},
{
"epoch": 0.6204819277108434,
"grad_norm": 1.4638962200590835,
"learning_rate": 4.408482142857143e-05,
"loss": 1.1924,
"step": 412
},
{
"epoch": 0.6219879518072289,
"grad_norm": 1.1185691869270866,
"learning_rate": 4.4056919642857146e-05,
"loss": 1.079,
"step": 413
},
{
"epoch": 0.6234939759036144,
"grad_norm": 1.3289424792770768,
"learning_rate": 4.4029017857142856e-05,
"loss": 1.062,
"step": 414
},
{
"epoch": 0.625,
"grad_norm": 1.0781944816305413,
"learning_rate": 4.400111607142857e-05,
"loss": 1.0422,
"step": 415
},
{
"epoch": 0.6265060240963856,
"grad_norm": 1.1988686516574925,
"learning_rate": 4.397321428571429e-05,
"loss": 1.0695,
"step": 416
},
{
"epoch": 0.6280120481927711,
"grad_norm": 1.4828154593514484,
"learning_rate": 4.3945312500000005e-05,
"loss": 0.9709,
"step": 417
},
{
"epoch": 0.6295180722891566,
"grad_norm": 1.486560731791325,
"learning_rate": 4.3917410714285715e-05,
"loss": 1.0716,
"step": 418
},
{
"epoch": 0.6310240963855421,
"grad_norm": 1.2158112051346686,
"learning_rate": 4.388950892857143e-05,
"loss": 0.9942,
"step": 419
},
{
"epoch": 0.6325301204819277,
"grad_norm": 1.2438499991299516,
"learning_rate": 4.386160714285715e-05,
"loss": 1.1067,
"step": 420
},
{
"epoch": 0.6340361445783133,
"grad_norm": 1.3048474834428594,
"learning_rate": 4.383370535714286e-05,
"loss": 0.9891,
"step": 421
},
{
"epoch": 0.6355421686746988,
"grad_norm": 0.9494210418100297,
"learning_rate": 4.3805803571428574e-05,
"loss": 0.9584,
"step": 422
},
{
"epoch": 0.6370481927710844,
"grad_norm": 1.7877658499010463,
"learning_rate": 4.377790178571429e-05,
"loss": 1.0928,
"step": 423
},
{
"epoch": 0.6385542168674698,
"grad_norm": 1.157593994161981,
"learning_rate": 4.375e-05,
"loss": 1.0758,
"step": 424
},
{
"epoch": 0.6400602409638554,
"grad_norm": 1.4700580698947876,
"learning_rate": 4.3722098214285716e-05,
"loss": 1.0765,
"step": 425
},
{
"epoch": 0.641566265060241,
"grad_norm": 1.6172229938690263,
"learning_rate": 4.369419642857143e-05,
"loss": 1.1141,
"step": 426
},
{
"epoch": 0.6430722891566265,
"grad_norm": 1.186922768657512,
"learning_rate": 4.366629464285715e-05,
"loss": 1.0818,
"step": 427
},
{
"epoch": 0.6445783132530121,
"grad_norm": 0.9000625293138284,
"learning_rate": 4.363839285714286e-05,
"loss": 1.0091,
"step": 428
},
{
"epoch": 0.6460843373493976,
"grad_norm": 1.5329538781006897,
"learning_rate": 4.3610491071428575e-05,
"loss": 1.0598,
"step": 429
},
{
"epoch": 0.6475903614457831,
"grad_norm": 1.145271874414727,
"learning_rate": 4.3582589285714284e-05,
"loss": 1.0717,
"step": 430
},
{
"epoch": 0.6490963855421686,
"grad_norm": 2.122861754241977,
"learning_rate": 4.35546875e-05,
"loss": 1.0814,
"step": 431
},
{
"epoch": 0.6506024096385542,
"grad_norm": 1.7614466759807201,
"learning_rate": 4.352678571428572e-05,
"loss": 1.0076,
"step": 432
},
{
"epoch": 0.6521084337349398,
"grad_norm": 0.7529157498533703,
"learning_rate": 4.3498883928571434e-05,
"loss": 1.0871,
"step": 433
},
{
"epoch": 0.6536144578313253,
"grad_norm": 2.168442340549844,
"learning_rate": 4.347098214285715e-05,
"loss": 0.9343,
"step": 434
},
{
"epoch": 0.6551204819277109,
"grad_norm": 0.9457789149026808,
"learning_rate": 4.344308035714286e-05,
"loss": 1.1188,
"step": 435
},
{
"epoch": 0.6566265060240963,
"grad_norm": 1.7463067966041532,
"learning_rate": 4.341517857142857e-05,
"loss": 1.0036,
"step": 436
},
{
"epoch": 0.6581325301204819,
"grad_norm": 2.039092530607203,
"learning_rate": 4.3387276785714286e-05,
"loss": 1.0014,
"step": 437
},
{
"epoch": 0.6596385542168675,
"grad_norm": 0.9761634186974393,
"learning_rate": 4.3359375e-05,
"loss": 1.0091,
"step": 438
},
{
"epoch": 0.661144578313253,
"grad_norm": 1.9207662275749011,
"learning_rate": 4.333147321428572e-05,
"loss": 1.1039,
"step": 439
},
{
"epoch": 0.6626506024096386,
"grad_norm": 1.397212702360919,
"learning_rate": 4.3303571428571435e-05,
"loss": 1.1315,
"step": 440
},
{
"epoch": 0.6641566265060241,
"grad_norm": 1.3107767098478034,
"learning_rate": 4.3275669642857144e-05,
"loss": 0.9561,
"step": 441
},
{
"epoch": 0.6656626506024096,
"grad_norm": 1.3429399999326865,
"learning_rate": 4.3247767857142854e-05,
"loss": 0.9758,
"step": 442
},
{
"epoch": 0.6671686746987951,
"grad_norm": 0.8608423294203231,
"learning_rate": 4.321986607142857e-05,
"loss": 1.1276,
"step": 443
},
{
"epoch": 0.6686746987951807,
"grad_norm": 1.0188145996023645,
"learning_rate": 4.319196428571429e-05,
"loss": 0.914,
"step": 444
},
{
"epoch": 0.6701807228915663,
"grad_norm": 0.8770241086418853,
"learning_rate": 4.31640625e-05,
"loss": 1.0692,
"step": 445
},
{
"epoch": 0.6716867469879518,
"grad_norm": 0.7890132956189565,
"learning_rate": 4.313616071428572e-05,
"loss": 1.0889,
"step": 446
},
{
"epoch": 0.6731927710843374,
"grad_norm": 0.7733500297805268,
"learning_rate": 4.310825892857143e-05,
"loss": 1.1,
"step": 447
},
{
"epoch": 0.6746987951807228,
"grad_norm": 1.1068288092889933,
"learning_rate": 4.3080357142857145e-05,
"loss": 1.0012,
"step": 448
},
{
"epoch": 0.6762048192771084,
"grad_norm": 0.6682318177779804,
"learning_rate": 4.3052455357142855e-05,
"loss": 1.0595,
"step": 449
},
{
"epoch": 0.677710843373494,
"grad_norm": 1.0377171596437371,
"learning_rate": 4.302455357142857e-05,
"loss": 0.9763,
"step": 450
},
{
"epoch": 0.6792168674698795,
"grad_norm": 0.5850270076129606,
"learning_rate": 4.299665178571429e-05,
"loss": 1.16,
"step": 451
},
{
"epoch": 0.6807228915662651,
"grad_norm": 1.1243459812134664,
"learning_rate": 4.2968750000000004e-05,
"loss": 1.0413,
"step": 452
},
{
"epoch": 0.6822289156626506,
"grad_norm": 1.2069244736981295,
"learning_rate": 4.2940848214285714e-05,
"loss": 1.0474,
"step": 453
},
{
"epoch": 0.6837349397590361,
"grad_norm": 0.5907377229988777,
"learning_rate": 4.291294642857143e-05,
"loss": 1.0469,
"step": 454
},
{
"epoch": 0.6852409638554217,
"grad_norm": 0.8972447130430958,
"learning_rate": 4.288504464285715e-05,
"loss": 1.0284,
"step": 455
},
{
"epoch": 0.6867469879518072,
"grad_norm": 0.6819388664485988,
"learning_rate": 4.2857142857142856e-05,
"loss": 1.0549,
"step": 456
},
{
"epoch": 0.6882530120481928,
"grad_norm": 0.7519592144563017,
"learning_rate": 4.282924107142857e-05,
"loss": 1.0682,
"step": 457
},
{
"epoch": 0.6897590361445783,
"grad_norm": 0.992607516447039,
"learning_rate": 4.280133928571429e-05,
"loss": 0.9229,
"step": 458
},
{
"epoch": 0.6912650602409639,
"grad_norm": 0.7955272701154339,
"learning_rate": 4.27734375e-05,
"loss": 0.9537,
"step": 459
},
{
"epoch": 0.6927710843373494,
"grad_norm": 0.7355671544476373,
"learning_rate": 4.2745535714285715e-05,
"loss": 1.1429,
"step": 460
},
{
"epoch": 0.6942771084337349,
"grad_norm": 0.6657688051425265,
"learning_rate": 4.271763392857143e-05,
"loss": 1.074,
"step": 461
},
{
"epoch": 0.6957831325301205,
"grad_norm": 1.1755497391860503,
"learning_rate": 4.268973214285715e-05,
"loss": 0.9946,
"step": 462
},
{
"epoch": 0.697289156626506,
"grad_norm": 0.5278686978148386,
"learning_rate": 4.266183035714286e-05,
"loss": 1.0636,
"step": 463
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.6033555383053728,
"learning_rate": 4.2633928571428574e-05,
"loss": 1.1309,
"step": 464
},
{
"epoch": 0.7003012048192772,
"grad_norm": 0.5648585620681138,
"learning_rate": 4.260602678571429e-05,
"loss": 0.9817,
"step": 465
},
{
"epoch": 0.7018072289156626,
"grad_norm": 0.8366231563071388,
"learning_rate": 4.2578125e-05,
"loss": 1.0416,
"step": 466
},
{
"epoch": 0.7033132530120482,
"grad_norm": 0.7773833211774298,
"learning_rate": 4.2550223214285716e-05,
"loss": 1.0436,
"step": 467
},
{
"epoch": 0.7048192771084337,
"grad_norm": 4.183966329685267,
"learning_rate": 4.252232142857143e-05,
"loss": 1.2348,
"step": 468
},
{
"epoch": 0.7063253012048193,
"grad_norm": 1.2910525281911964,
"learning_rate": 4.249441964285715e-05,
"loss": 1.0763,
"step": 469
},
{
"epoch": 0.7078313253012049,
"grad_norm": 0.5490842324511688,
"learning_rate": 4.246651785714286e-05,
"loss": 1.0901,
"step": 470
},
{
"epoch": 0.7093373493975904,
"grad_norm": 0.9632001556441507,
"learning_rate": 4.2438616071428575e-05,
"loss": 1.0656,
"step": 471
},
{
"epoch": 0.7108433734939759,
"grad_norm": 0.47132505448452205,
"learning_rate": 4.2410714285714285e-05,
"loss": 0.99,
"step": 472
},
{
"epoch": 0.7123493975903614,
"grad_norm": 0.9511883587293538,
"learning_rate": 4.23828125e-05,
"loss": 1.1328,
"step": 473
},
{
"epoch": 0.713855421686747,
"grad_norm": 0.6093692989449008,
"learning_rate": 4.235491071428572e-05,
"loss": 1.0063,
"step": 474
},
{
"epoch": 0.7153614457831325,
"grad_norm": 0.5139370335207126,
"learning_rate": 4.2327008928571434e-05,
"loss": 0.9604,
"step": 475
},
{
"epoch": 0.7168674698795181,
"grad_norm": 0.8130126772990096,
"learning_rate": 4.229910714285715e-05,
"loss": 1.1156,
"step": 476
},
{
"epoch": 0.7183734939759037,
"grad_norm": 0.7681738922447233,
"learning_rate": 4.227120535714286e-05,
"loss": 1.0167,
"step": 477
},
{
"epoch": 0.7198795180722891,
"grad_norm": 0.5072386035606629,
"learning_rate": 4.224330357142857e-05,
"loss": 0.9847,
"step": 478
},
{
"epoch": 0.7213855421686747,
"grad_norm": 0.7159194292764187,
"learning_rate": 4.2215401785714286e-05,
"loss": 1.0858,
"step": 479
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.4799309012819447,
"learning_rate": 4.21875e-05,
"loss": 1.0682,
"step": 480
},
{
"epoch": 0.7243975903614458,
"grad_norm": 0.4591800343573713,
"learning_rate": 4.215959821428572e-05,
"loss": 1.1973,
"step": 481
},
{
"epoch": 0.7259036144578314,
"grad_norm": 0.5704159830373629,
"learning_rate": 4.2131696428571435e-05,
"loss": 0.9583,
"step": 482
},
{
"epoch": 0.7274096385542169,
"grad_norm": 0.5963481173178118,
"learning_rate": 4.2103794642857145e-05,
"loss": 1.0664,
"step": 483
},
{
"epoch": 0.7289156626506024,
"grad_norm": 0.5340394776943623,
"learning_rate": 4.2075892857142854e-05,
"loss": 1.0156,
"step": 484
},
{
"epoch": 0.7304216867469879,
"grad_norm": 0.5620406504345488,
"learning_rate": 4.204799107142857e-05,
"loss": 1.0417,
"step": 485
},
{
"epoch": 0.7319277108433735,
"grad_norm": 0.5691400180400379,
"learning_rate": 4.202008928571429e-05,
"loss": 1.0677,
"step": 486
},
{
"epoch": 0.733433734939759,
"grad_norm": 0.4754430941444271,
"learning_rate": 4.1992187500000003e-05,
"loss": 1.0723,
"step": 487
},
{
"epoch": 0.7349397590361446,
"grad_norm": 0.5383333179764404,
"learning_rate": 4.196428571428572e-05,
"loss": 0.8904,
"step": 488
},
{
"epoch": 0.7364457831325302,
"grad_norm": 0.5848750819218848,
"learning_rate": 4.193638392857143e-05,
"loss": 1.033,
"step": 489
},
{
"epoch": 0.7379518072289156,
"grad_norm": 0.5217749461149191,
"learning_rate": 4.1908482142857146e-05,
"loss": 1.0952,
"step": 490
},
{
"epoch": 0.7394578313253012,
"grad_norm": 0.5240119307028889,
"learning_rate": 4.1880580357142855e-05,
"loss": 1.1312,
"step": 491
},
{
"epoch": 0.7409638554216867,
"grad_norm": 0.661539498098112,
"learning_rate": 4.185267857142857e-05,
"loss": 0.9749,
"step": 492
},
{
"epoch": 0.7424698795180723,
"grad_norm": 0.7390671972107995,
"learning_rate": 4.182477678571429e-05,
"loss": 1.0223,
"step": 493
},
{
"epoch": 0.7439759036144579,
"grad_norm": 0.5232681511419042,
"learning_rate": 4.1796875000000005e-05,
"loss": 1.0679,
"step": 494
},
{
"epoch": 0.7454819277108434,
"grad_norm": 0.6732474259531636,
"learning_rate": 4.1768973214285714e-05,
"loss": 0.9715,
"step": 495
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.48511395106017224,
"learning_rate": 4.174107142857143e-05,
"loss": 1.0589,
"step": 496
},
{
"epoch": 0.7484939759036144,
"grad_norm": 0.5806246193363507,
"learning_rate": 4.171316964285715e-05,
"loss": 1.1199,
"step": 497
},
{
"epoch": 0.75,
"grad_norm": 0.5491686188695876,
"learning_rate": 4.1685267857142857e-05,
"loss": 1.0185,
"step": 498
},
{
"epoch": 0.7515060240963856,
"grad_norm": 0.5046604098611595,
"learning_rate": 4.165736607142857e-05,
"loss": 1.0998,
"step": 499
},
{
"epoch": 0.7530120481927711,
"grad_norm": 0.5300954916585534,
"learning_rate": 4.162946428571429e-05,
"loss": 1.0748,
"step": 500
},
{
"epoch": 0.7545180722891566,
"grad_norm": 0.46710221799044943,
"learning_rate": 4.16015625e-05,
"loss": 1.0668,
"step": 501
},
{
"epoch": 0.7560240963855421,
"grad_norm": 0.4427131777159438,
"learning_rate": 4.1573660714285715e-05,
"loss": 0.9988,
"step": 502
},
{
"epoch": 0.7575301204819277,
"grad_norm": 0.632772523394511,
"learning_rate": 4.154575892857143e-05,
"loss": 1.0915,
"step": 503
},
{
"epoch": 0.7590361445783133,
"grad_norm": 0.5784350289319655,
"learning_rate": 4.151785714285715e-05,
"loss": 0.9337,
"step": 504
},
{
"epoch": 0.7605421686746988,
"grad_norm": 0.45390410381807517,
"learning_rate": 4.148995535714286e-05,
"loss": 1.0911,
"step": 505
},
{
"epoch": 0.7620481927710844,
"grad_norm": 0.5926009510936472,
"learning_rate": 4.1462053571428574e-05,
"loss": 1.0088,
"step": 506
},
{
"epoch": 0.7635542168674698,
"grad_norm": 0.8087973510816183,
"learning_rate": 4.1434151785714284e-05,
"loss": 1.0127,
"step": 507
},
{
"epoch": 0.7650602409638554,
"grad_norm": 0.4027623331214715,
"learning_rate": 4.140625e-05,
"loss": 0.9812,
"step": 508
},
{
"epoch": 0.766566265060241,
"grad_norm": 0.5467449250418355,
"learning_rate": 4.1378348214285717e-05,
"loss": 1.0894,
"step": 509
},
{
"epoch": 0.7680722891566265,
"grad_norm": 0.5073960592394419,
"learning_rate": 4.135044642857143e-05,
"loss": 0.9381,
"step": 510
},
{
"epoch": 0.7695783132530121,
"grad_norm": 0.6134124152805418,
"learning_rate": 4.132254464285715e-05,
"loss": 0.9819,
"step": 511
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.42844936197577005,
"learning_rate": 4.129464285714286e-05,
"loss": 1.0166,
"step": 512
},
{
"epoch": 0.7725903614457831,
"grad_norm": 0.44531267474047137,
"learning_rate": 4.126674107142857e-05,
"loss": 1.1284,
"step": 513
},
{
"epoch": 0.7740963855421686,
"grad_norm": 0.48513626927453324,
"learning_rate": 4.1238839285714285e-05,
"loss": 0.9945,
"step": 514
},
{
"epoch": 0.7756024096385542,
"grad_norm": 0.5224731115221803,
"learning_rate": 4.12109375e-05,
"loss": 1.1963,
"step": 515
},
{
"epoch": 0.7771084337349398,
"grad_norm": 0.47365577634975636,
"learning_rate": 4.118303571428572e-05,
"loss": 1.0418,
"step": 516
},
{
"epoch": 0.7786144578313253,
"grad_norm": 0.4935060074038172,
"learning_rate": 4.1155133928571434e-05,
"loss": 0.96,
"step": 517
},
{
"epoch": 0.7801204819277109,
"grad_norm": 0.4610733719249603,
"learning_rate": 4.112723214285715e-05,
"loss": 0.9765,
"step": 518
},
{
"epoch": 0.7816265060240963,
"grad_norm": 1.5586020551222097,
"learning_rate": 4.109933035714285e-05,
"loss": 1.1135,
"step": 519
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.5352142658065009,
"learning_rate": 4.107142857142857e-05,
"loss": 1.0045,
"step": 520
},
{
"epoch": 0.7846385542168675,
"grad_norm": 0.637429045325593,
"learning_rate": 4.1043526785714286e-05,
"loss": 1.0388,
"step": 521
},
{
"epoch": 0.786144578313253,
"grad_norm": 0.48611194053154283,
"learning_rate": 4.1015625e-05,
"loss": 1.1529,
"step": 522
},
{
"epoch": 0.7876506024096386,
"grad_norm": 20.76088456281926,
"learning_rate": 4.098772321428572e-05,
"loss": 1.2982,
"step": 523
},
{
"epoch": 0.7891566265060241,
"grad_norm": 1.2347760461276898,
"learning_rate": 4.0959821428571435e-05,
"loss": 1.0949,
"step": 524
},
{
"epoch": 0.7906626506024096,
"grad_norm": 0.9500565519144629,
"learning_rate": 4.0931919642857145e-05,
"loss": 1.0016,
"step": 525
},
{
"epoch": 0.7921686746987951,
"grad_norm": 0.8294636756936213,
"learning_rate": 4.0904017857142855e-05,
"loss": 1.0741,
"step": 526
},
{
"epoch": 0.7936746987951807,
"grad_norm": 1.1486545878471635,
"learning_rate": 4.087611607142857e-05,
"loss": 0.9655,
"step": 527
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.5759270867694973,
"learning_rate": 4.084821428571429e-05,
"loss": 0.9398,
"step": 528
},
{
"epoch": 0.7966867469879518,
"grad_norm": 0.8883457780354299,
"learning_rate": 4.0820312500000004e-05,
"loss": 1.0148,
"step": 529
},
{
"epoch": 0.7981927710843374,
"grad_norm": 0.9406074304915966,
"learning_rate": 4.079241071428572e-05,
"loss": 1.0884,
"step": 530
},
{
"epoch": 0.7996987951807228,
"grad_norm": 0.6075598707396435,
"learning_rate": 4.076450892857143e-05,
"loss": 1.0232,
"step": 531
},
{
"epoch": 0.8012048192771084,
"grad_norm": 0.6421253790134748,
"learning_rate": 4.0736607142857146e-05,
"loss": 1.1187,
"step": 532
},
{
"epoch": 0.802710843373494,
"grad_norm": 0.7311161897026253,
"learning_rate": 4.0708705357142856e-05,
"loss": 1.1312,
"step": 533
},
{
"epoch": 0.8042168674698795,
"grad_norm": 0.6576520727703772,
"learning_rate": 4.068080357142857e-05,
"loss": 0.9948,
"step": 534
},
{
"epoch": 0.8057228915662651,
"grad_norm": 0.6023751711855272,
"learning_rate": 4.065290178571429e-05,
"loss": 0.9705,
"step": 535
},
{
"epoch": 0.8072289156626506,
"grad_norm": 0.5829853717860507,
"learning_rate": 4.0625000000000005e-05,
"loss": 1.0883,
"step": 536
},
{
"epoch": 0.8087349397590361,
"grad_norm": 0.7805220413032683,
"learning_rate": 4.0597098214285715e-05,
"loss": 1.0616,
"step": 537
},
{
"epoch": 0.8102409638554217,
"grad_norm": 0.644067786282687,
"learning_rate": 4.056919642857143e-05,
"loss": 1.0326,
"step": 538
},
{
"epoch": 0.8117469879518072,
"grad_norm": 0.5697753129586425,
"learning_rate": 4.054129464285715e-05,
"loss": 1.0215,
"step": 539
},
{
"epoch": 0.8132530120481928,
"grad_norm": 0.5001581961171637,
"learning_rate": 4.051339285714286e-05,
"loss": 1.1289,
"step": 540
},
{
"epoch": 0.8147590361445783,
"grad_norm": 0.7292306308043551,
"learning_rate": 4.048549107142857e-05,
"loss": 1.0372,
"step": 541
},
{
"epoch": 0.8162650602409639,
"grad_norm": 0.8743206083159424,
"learning_rate": 4.045758928571429e-05,
"loss": 1.1881,
"step": 542
},
{
"epoch": 0.8177710843373494,
"grad_norm": 0.6887135216150085,
"learning_rate": 4.04296875e-05,
"loss": 0.977,
"step": 543
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.5226816094136586,
"learning_rate": 4.0401785714285716e-05,
"loss": 1.0242,
"step": 544
},
{
"epoch": 0.8207831325301205,
"grad_norm": 0.633806653685841,
"learning_rate": 4.037388392857143e-05,
"loss": 1.1074,
"step": 545
},
{
"epoch": 0.822289156626506,
"grad_norm": 0.49700254765408847,
"learning_rate": 4.034598214285715e-05,
"loss": 1.072,
"step": 546
},
{
"epoch": 0.8237951807228916,
"grad_norm": 0.7488895277921449,
"learning_rate": 4.031808035714286e-05,
"loss": 1.0307,
"step": 547
},
{
"epoch": 0.8253012048192772,
"grad_norm": 0.46150420474832204,
"learning_rate": 4.0290178571428574e-05,
"loss": 1.0372,
"step": 548
},
{
"epoch": 0.8268072289156626,
"grad_norm": 0.5765382468808435,
"learning_rate": 4.0262276785714284e-05,
"loss": 1.0506,
"step": 549
},
{
"epoch": 0.8283132530120482,
"grad_norm": 0.8643902519824668,
"learning_rate": 4.0234375e-05,
"loss": 0.9508,
"step": 550
},
{
"epoch": 0.8298192771084337,
"grad_norm": 0.6750943201178752,
"learning_rate": 4.020647321428572e-05,
"loss": 0.9645,
"step": 551
},
{
"epoch": 0.8313253012048193,
"grad_norm": 0.5774107056894972,
"learning_rate": 4.017857142857143e-05,
"loss": 0.9554,
"step": 552
},
{
"epoch": 0.8328313253012049,
"grad_norm": 1.0549727459569322,
"learning_rate": 4.015066964285715e-05,
"loss": 0.9805,
"step": 553
},
{
"epoch": 0.8343373493975904,
"grad_norm": 0.7549284446975768,
"learning_rate": 4.012276785714286e-05,
"loss": 0.9376,
"step": 554
},
{
"epoch": 0.8358433734939759,
"grad_norm": 0.6473641823391572,
"learning_rate": 4.009486607142857e-05,
"loss": 1.0528,
"step": 555
},
{
"epoch": 0.8373493975903614,
"grad_norm": 1.238967433030929,
"learning_rate": 4.0066964285714285e-05,
"loss": 0.9671,
"step": 556
},
{
"epoch": 0.838855421686747,
"grad_norm": 0.4671487931730317,
"learning_rate": 4.00390625e-05,
"loss": 0.9557,
"step": 557
},
{
"epoch": 0.8403614457831325,
"grad_norm": 1.495622523855259,
"learning_rate": 4.001116071428572e-05,
"loss": 1.089,
"step": 558
},
{
"epoch": 0.8418674698795181,
"grad_norm": 0.6642043030760867,
"learning_rate": 3.9983258928571434e-05,
"loss": 1.0534,
"step": 559
},
{
"epoch": 0.8433734939759037,
"grad_norm": 1.1516161598577168,
"learning_rate": 3.9955357142857144e-05,
"loss": 1.0327,
"step": 560
},
{
"epoch": 0.8448795180722891,
"grad_norm": 0.9229303300247966,
"learning_rate": 3.9927455357142854e-05,
"loss": 0.9651,
"step": 561
},
{
"epoch": 0.8463855421686747,
"grad_norm": 0.652629015945715,
"learning_rate": 3.989955357142857e-05,
"loss": 1.0095,
"step": 562
},
{
"epoch": 0.8478915662650602,
"grad_norm": 1.1083982205777,
"learning_rate": 3.9871651785714286e-05,
"loss": 0.9651,
"step": 563
},
{
"epoch": 0.8493975903614458,
"grad_norm": 0.65107622455566,
"learning_rate": 3.984375e-05,
"loss": 0.9875,
"step": 564
},
{
"epoch": 0.8509036144578314,
"grad_norm": 1.3799884071167927,
"learning_rate": 3.981584821428572e-05,
"loss": 1.069,
"step": 565
},
{
"epoch": 0.8524096385542169,
"grad_norm": 0.7973365525408832,
"learning_rate": 3.978794642857143e-05,
"loss": 1.0449,
"step": 566
},
{
"epoch": 0.8539156626506024,
"grad_norm": 1.537298614493782,
"learning_rate": 3.9760044642857145e-05,
"loss": 1.0266,
"step": 567
},
{
"epoch": 0.8554216867469879,
"grad_norm": 1.0735134647726037,
"learning_rate": 3.9732142857142855e-05,
"loss": 0.9769,
"step": 568
},
{
"epoch": 0.8569277108433735,
"grad_norm": 1.219210497636576,
"learning_rate": 3.970424107142857e-05,
"loss": 1.0725,
"step": 569
},
{
"epoch": 0.858433734939759,
"grad_norm": 0.8250382790058974,
"learning_rate": 3.967633928571429e-05,
"loss": 1.0464,
"step": 570
},
{
"epoch": 0.8599397590361446,
"grad_norm": 1.39320314301614,
"learning_rate": 3.9648437500000004e-05,
"loss": 1.0705,
"step": 571
},
{
"epoch": 0.8614457831325302,
"grad_norm": 0.7700524945575854,
"learning_rate": 3.9620535714285714e-05,
"loss": 1.0897,
"step": 572
},
{
"epoch": 0.8629518072289156,
"grad_norm": 1.705675566032994,
"learning_rate": 3.959263392857143e-05,
"loss": 1.0193,
"step": 573
},
{
"epoch": 0.8644578313253012,
"grad_norm": 0.8977750576677003,
"learning_rate": 3.9564732142857146e-05,
"loss": 0.9754,
"step": 574
},
{
"epoch": 0.8659638554216867,
"grad_norm": 1.4877461425780285,
"learning_rate": 3.9536830357142856e-05,
"loss": 1.0329,
"step": 575
},
{
"epoch": 0.8674698795180723,
"grad_norm": 1.015998208595534,
"learning_rate": 3.950892857142857e-05,
"loss": 1.03,
"step": 576
},
{
"epoch": 0.8689759036144579,
"grad_norm": 1.362180323797355,
"learning_rate": 3.948102678571429e-05,
"loss": 1.1001,
"step": 577
},
{
"epoch": 0.8704819277108434,
"grad_norm": 1.2012064102023352,
"learning_rate": 3.9453125000000005e-05,
"loss": 1.0655,
"step": 578
},
{
"epoch": 0.8719879518072289,
"grad_norm": 1.260040819534742,
"learning_rate": 3.9425223214285715e-05,
"loss": 1.056,
"step": 579
},
{
"epoch": 0.8734939759036144,
"grad_norm": 0.9384819635438446,
"learning_rate": 3.939732142857143e-05,
"loss": 0.9372,
"step": 580
},
{
"epoch": 0.875,
"grad_norm": 1.3559049743907958,
"learning_rate": 3.936941964285715e-05,
"loss": 1.0869,
"step": 581
},
{
"epoch": 0.8765060240963856,
"grad_norm": 1.165145587749377,
"learning_rate": 3.934151785714286e-05,
"loss": 0.927,
"step": 582
},
{
"epoch": 0.8780120481927711,
"grad_norm": 1.3233612469600329,
"learning_rate": 3.9313616071428574e-05,
"loss": 1.0089,
"step": 583
},
{
"epoch": 0.8795180722891566,
"grad_norm": 1.225173244597109,
"learning_rate": 3.928571428571429e-05,
"loss": 0.9703,
"step": 584
},
{
"epoch": 0.8810240963855421,
"grad_norm": 1.1869317467043707,
"learning_rate": 3.92578125e-05,
"loss": 0.9897,
"step": 585
},
{
"epoch": 0.8825301204819277,
"grad_norm": 0.9384009909978221,
"learning_rate": 3.9229910714285716e-05,
"loss": 1.1028,
"step": 586
},
{
"epoch": 0.8840361445783133,
"grad_norm": 1.3562039144032472,
"learning_rate": 3.920200892857143e-05,
"loss": 0.9317,
"step": 587
},
{
"epoch": 0.8855421686746988,
"grad_norm": 1.0705358733120784,
"learning_rate": 3.917410714285715e-05,
"loss": 1.1164,
"step": 588
},
{
"epoch": 0.8870481927710844,
"grad_norm": 1.6987950004107686,
"learning_rate": 3.914620535714286e-05,
"loss": 1.0568,
"step": 589
},
{
"epoch": 0.8885542168674698,
"grad_norm": 1.6540634277991682,
"learning_rate": 3.9118303571428575e-05,
"loss": 0.9835,
"step": 590
},
{
"epoch": 0.8900602409638554,
"grad_norm": 1.0302922417595635,
"learning_rate": 3.9090401785714284e-05,
"loss": 1.0497,
"step": 591
},
{
"epoch": 0.891566265060241,
"grad_norm": 1.381470034807906,
"learning_rate": 3.90625e-05,
"loss": 0.9792,
"step": 592
},
{
"epoch": 0.8930722891566265,
"grad_norm": 0.9166752816116456,
"learning_rate": 3.903459821428572e-05,
"loss": 0.9833,
"step": 593
},
{
"epoch": 0.8945783132530121,
"grad_norm": 0.9676162358689105,
"learning_rate": 3.9006696428571434e-05,
"loss": 0.9873,
"step": 594
},
{
"epoch": 0.8960843373493976,
"grad_norm": 1.1807099261460772,
"learning_rate": 3.897879464285715e-05,
"loss": 1.0436,
"step": 595
},
{
"epoch": 0.8975903614457831,
"grad_norm": 0.5822772311920681,
"learning_rate": 3.895089285714286e-05,
"loss": 0.9877,
"step": 596
},
{
"epoch": 0.8990963855421686,
"grad_norm": 1.0348609684669337,
"learning_rate": 3.892299107142857e-05,
"loss": 1.0069,
"step": 597
},
{
"epoch": 0.9006024096385542,
"grad_norm": 0.5178551932810507,
"learning_rate": 3.8895089285714286e-05,
"loss": 0.9055,
"step": 598
},
{
"epoch": 0.9021084337349398,
"grad_norm": 0.8361060948986553,
"learning_rate": 3.88671875e-05,
"loss": 0.9925,
"step": 599
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.7897096855178408,
"learning_rate": 3.883928571428572e-05,
"loss": 1.0538,
"step": 600
},
{
"epoch": 0.9051204819277109,
"grad_norm": 0.6229183804412646,
"learning_rate": 3.8811383928571435e-05,
"loss": 1.0924,
"step": 601
},
{
"epoch": 0.9066265060240963,
"grad_norm": 1.0992736521527116,
"learning_rate": 3.8783482142857144e-05,
"loss": 1.0554,
"step": 602
},
{
"epoch": 0.9081325301204819,
"grad_norm": 0.49215047188612737,
"learning_rate": 3.8755580357142854e-05,
"loss": 0.899,
"step": 603
},
{
"epoch": 0.9096385542168675,
"grad_norm": 0.8409280586500487,
"learning_rate": 3.872767857142857e-05,
"loss": 1.1456,
"step": 604
},
{
"epoch": 0.911144578313253,
"grad_norm": 0.8642297554545003,
"learning_rate": 3.869977678571429e-05,
"loss": 1.0845,
"step": 605
},
{
"epoch": 0.9126506024096386,
"grad_norm": 0.5905437221547382,
"learning_rate": 3.8671875e-05,
"loss": 1.0217,
"step": 606
},
{
"epoch": 0.9141566265060241,
"grad_norm": 1.2288004661293166,
"learning_rate": 3.864397321428572e-05,
"loss": 0.9663,
"step": 607
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.6791954246114843,
"learning_rate": 3.861607142857143e-05,
"loss": 1.0332,
"step": 608
},
{
"epoch": 0.9171686746987951,
"grad_norm": 1.4986476204928367,
"learning_rate": 3.8588169642857146e-05,
"loss": 0.9711,
"step": 609
},
{
"epoch": 0.9186746987951807,
"grad_norm": 0.7115724933255649,
"learning_rate": 3.8560267857142855e-05,
"loss": 1.0656,
"step": 610
},
{
"epoch": 0.9201807228915663,
"grad_norm": 1.37942683324599,
"learning_rate": 3.853236607142857e-05,
"loss": 0.9916,
"step": 611
},
{
"epoch": 0.9216867469879518,
"grad_norm": 0.9361426644764892,
"learning_rate": 3.850446428571429e-05,
"loss": 1.0162,
"step": 612
},
{
"epoch": 0.9231927710843374,
"grad_norm": 1.323292489239318,
"learning_rate": 3.8476562500000004e-05,
"loss": 1.1794,
"step": 613
},
{
"epoch": 0.9246987951807228,
"grad_norm": 1.3520129579088174,
"learning_rate": 3.8448660714285714e-05,
"loss": 0.9864,
"step": 614
},
{
"epoch": 0.9262048192771084,
"grad_norm": 0.8562885977282053,
"learning_rate": 3.842075892857143e-05,
"loss": 1.1206,
"step": 615
},
{
"epoch": 0.927710843373494,
"grad_norm": 1.1681053935029966,
"learning_rate": 3.839285714285715e-05,
"loss": 1.034,
"step": 616
},
{
"epoch": 0.9292168674698795,
"grad_norm": 1.21135102246154,
"learning_rate": 3.8364955357142856e-05,
"loss": 1.0853,
"step": 617
},
{
"epoch": 0.9307228915662651,
"grad_norm": 1.0809756749602952,
"learning_rate": 3.833705357142857e-05,
"loss": 1.0011,
"step": 618
},
{
"epoch": 0.9322289156626506,
"grad_norm": 1.4392527616430273,
"learning_rate": 3.830915178571429e-05,
"loss": 1.0623,
"step": 619
},
{
"epoch": 0.9337349397590361,
"grad_norm": 1.7238067680214808,
"learning_rate": 3.828125e-05,
"loss": 1.1108,
"step": 620
},
{
"epoch": 0.9352409638554217,
"grad_norm": 0.7141003372341443,
"learning_rate": 3.8253348214285715e-05,
"loss": 1.0355,
"step": 621
},
{
"epoch": 0.9367469879518072,
"grad_norm": 1.920317249732109,
"learning_rate": 3.822544642857143e-05,
"loss": 0.9502,
"step": 622
},
{
"epoch": 0.9382530120481928,
"grad_norm": 1.1325336267739634,
"learning_rate": 3.819754464285715e-05,
"loss": 0.9222,
"step": 623
},
{
"epoch": 0.9397590361445783,
"grad_norm": 3.10987872504335,
"learning_rate": 3.816964285714286e-05,
"loss": 1.0379,
"step": 624
},
{
"epoch": 0.9412650602409639,
"grad_norm": 3.6788884400538637,
"learning_rate": 3.8141741071428574e-05,
"loss": 1.09,
"step": 625
},
{
"epoch": 0.9427710843373494,
"grad_norm": 1.2926824850036187,
"learning_rate": 3.8113839285714284e-05,
"loss": 0.9803,
"step": 626
},
{
"epoch": 0.9442771084337349,
"grad_norm": 2.2419070455527796,
"learning_rate": 3.80859375e-05,
"loss": 1.021,
"step": 627
},
{
"epoch": 0.9457831325301205,
"grad_norm": 3.1403698877141184,
"learning_rate": 3.8058035714285716e-05,
"loss": 0.9841,
"step": 628
},
{
"epoch": 0.947289156626506,
"grad_norm": 2.4012589320381093,
"learning_rate": 3.803013392857143e-05,
"loss": 0.9949,
"step": 629
},
{
"epoch": 0.9487951807228916,
"grad_norm": 0.8572740486776923,
"learning_rate": 3.800223214285715e-05,
"loss": 0.9546,
"step": 630
},
{
"epoch": 0.9503012048192772,
"grad_norm": 2.8050610790356454,
"learning_rate": 3.797433035714286e-05,
"loss": 1.0013,
"step": 631
},
{
"epoch": 0.9518072289156626,
"grad_norm": 1.908643222221255,
"learning_rate": 3.794642857142857e-05,
"loss": 0.9094,
"step": 632
},
{
"epoch": 0.9533132530120482,
"grad_norm": 1.417179819921587,
"learning_rate": 3.7918526785714285e-05,
"loss": 0.9683,
"step": 633
},
{
"epoch": 0.9548192771084337,
"grad_norm": 1.235393191077872,
"learning_rate": 3.7890625e-05,
"loss": 0.992,
"step": 634
},
{
"epoch": 0.9563253012048193,
"grad_norm": 1.0649448743880061,
"learning_rate": 3.786272321428572e-05,
"loss": 0.8832,
"step": 635
},
{
"epoch": 0.9578313253012049,
"grad_norm": 1.0727342323258373,
"learning_rate": 3.7834821428571434e-05,
"loss": 1.0219,
"step": 636
},
{
"epoch": 0.9593373493975904,
"grad_norm": 1.2025606190297995,
"learning_rate": 3.780691964285715e-05,
"loss": 1.1049,
"step": 637
},
{
"epoch": 0.9608433734939759,
"grad_norm": 0.9318123100082958,
"learning_rate": 3.777901785714286e-05,
"loss": 1.0263,
"step": 638
},
{
"epoch": 0.9623493975903614,
"grad_norm": 1.1779792080497802,
"learning_rate": 3.775111607142857e-05,
"loss": 1.0431,
"step": 639
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.9994615515678256,
"learning_rate": 3.7723214285714286e-05,
"loss": 1.0465,
"step": 640
},
{
"epoch": 0.9653614457831325,
"grad_norm": 0.9165007797860708,
"learning_rate": 3.76953125e-05,
"loss": 0.9464,
"step": 641
},
{
"epoch": 0.9668674698795181,
"grad_norm": 0.7981390720918065,
"learning_rate": 3.766741071428572e-05,
"loss": 1.065,
"step": 642
},
{
"epoch": 0.9683734939759037,
"grad_norm": 1.9041625079598417,
"learning_rate": 3.7639508928571435e-05,
"loss": 0.9869,
"step": 643
},
{
"epoch": 0.9698795180722891,
"grad_norm": 0.9532779308544986,
"learning_rate": 3.7611607142857145e-05,
"loss": 0.991,
"step": 644
},
{
"epoch": 0.9713855421686747,
"grad_norm": 1.5198136415744088,
"learning_rate": 3.7583705357142854e-05,
"loss": 1.0553,
"step": 645
},
{
"epoch": 0.9728915662650602,
"grad_norm": 1.6286733585303488,
"learning_rate": 3.755580357142857e-05,
"loss": 1.0343,
"step": 646
},
{
"epoch": 0.9743975903614458,
"grad_norm": 0.5816056387849801,
"learning_rate": 3.752790178571429e-05,
"loss": 1.0607,
"step": 647
},
{
"epoch": 0.9759036144578314,
"grad_norm": 1.91401415037052,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.0298,
"step": 648
},
{
"epoch": 0.9774096385542169,
"grad_norm": 0.7183439846786953,
"learning_rate": 3.747209821428572e-05,
"loss": 1.1132,
"step": 649
},
{
"epoch": 0.9789156626506024,
"grad_norm": 1.940353735391463,
"learning_rate": 3.744419642857143e-05,
"loss": 0.9719,
"step": 650
},
{
"epoch": 0.9804216867469879,
"grad_norm": 1.5707680071972856,
"learning_rate": 3.7416294642857146e-05,
"loss": 0.9433,
"step": 651
},
{
"epoch": 0.9819277108433735,
"grad_norm": 0.5326076013688031,
"learning_rate": 3.7388392857142856e-05,
"loss": 0.9367,
"step": 652
},
{
"epoch": 0.983433734939759,
"grad_norm": 2.2947178660586034,
"learning_rate": 3.736049107142857e-05,
"loss": 1.0369,
"step": 653
},
{
"epoch": 0.9849397590361446,
"grad_norm": 1.063260754589839,
"learning_rate": 3.733258928571429e-05,
"loss": 1.0589,
"step": 654
},
{
"epoch": 0.9864457831325302,
"grad_norm": 1.1962228182983154,
"learning_rate": 3.7304687500000005e-05,
"loss": 0.9458,
"step": 655
},
{
"epoch": 0.9879518072289156,
"grad_norm": 1.3479135652682828,
"learning_rate": 3.7276785714285714e-05,
"loss": 0.9639,
"step": 656
},
{
"epoch": 0.9894578313253012,
"grad_norm": 0.7159274053881269,
"learning_rate": 3.724888392857143e-05,
"loss": 0.998,
"step": 657
},
{
"epoch": 0.9909638554216867,
"grad_norm": 1.2351089779284292,
"learning_rate": 3.722098214285715e-05,
"loss": 1.1012,
"step": 658
},
{
"epoch": 0.9924698795180723,
"grad_norm": 0.7220335940987308,
"learning_rate": 3.719308035714286e-05,
"loss": 0.9845,
"step": 659
},
{
"epoch": 0.9939759036144579,
"grad_norm": 0.9034768041976976,
"learning_rate": 3.716517857142857e-05,
"loss": 0.9912,
"step": 660
},
{
"epoch": 0.9954819277108434,
"grad_norm": 0.6052170398560012,
"learning_rate": 3.713727678571429e-05,
"loss": 0.9337,
"step": 661
},
{
"epoch": 0.9969879518072289,
"grad_norm": 0.8268905369091016,
"learning_rate": 3.7109375e-05,
"loss": 0.9683,
"step": 662
},
{
"epoch": 0.9984939759036144,
"grad_norm": 1.0448488879804698,
"learning_rate": 3.7081473214285715e-05,
"loss": 0.9812,
"step": 663
},
{
"epoch": 1.0,
"grad_norm": 0.5009508545664141,
"learning_rate": 3.705357142857143e-05,
"loss": 0.923,
"step": 664
},
{
"epoch": 1.0015060240963856,
"grad_norm": 0.6108014692526499,
"learning_rate": 3.702566964285715e-05,
"loss": 0.9726,
"step": 665
},
{
"epoch": 1.0030120481927711,
"grad_norm": 0.48109124880455584,
"learning_rate": 3.699776785714286e-05,
"loss": 0.8847,
"step": 666
},
{
"epoch": 1.0045180722891567,
"grad_norm": 0.6506513045651271,
"learning_rate": 3.6969866071428574e-05,
"loss": 0.8475,
"step": 667
},
{
"epoch": 1.0060240963855422,
"grad_norm": 0.5063784409636892,
"learning_rate": 3.6941964285714284e-05,
"loss": 0.9114,
"step": 668
},
{
"epoch": 1.0075301204819278,
"grad_norm": 0.7200470780800002,
"learning_rate": 3.69140625e-05,
"loss": 0.8494,
"step": 669
},
{
"epoch": 1.0090361445783131,
"grad_norm": 0.41637884985774193,
"learning_rate": 3.688616071428572e-05,
"loss": 0.9025,
"step": 670
},
{
"epoch": 1.0105421686746987,
"grad_norm": 0.5920244474223786,
"learning_rate": 3.685825892857143e-05,
"loss": 0.7915,
"step": 671
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.4567531321037605,
"learning_rate": 3.683035714285715e-05,
"loss": 0.8841,
"step": 672
},
{
"epoch": 1.0135542168674698,
"grad_norm": 0.5688846409177291,
"learning_rate": 3.680245535714286e-05,
"loss": 0.7995,
"step": 673
},
{
"epoch": 1.0150602409638554,
"grad_norm": 0.4817264641859637,
"learning_rate": 3.677455357142857e-05,
"loss": 0.8177,
"step": 674
},
{
"epoch": 1.016566265060241,
"grad_norm": 0.41393014874379624,
"learning_rate": 3.6746651785714285e-05,
"loss": 0.9008,
"step": 675
},
{
"epoch": 1.0180722891566265,
"grad_norm": 0.5634388402317265,
"learning_rate": 3.671875e-05,
"loss": 0.941,
"step": 676
},
{
"epoch": 1.019578313253012,
"grad_norm": 9.314612742945025,
"learning_rate": 3.669084821428572e-05,
"loss": 1.0404,
"step": 677
},
{
"epoch": 1.0210843373493976,
"grad_norm": 0.6458667506911293,
"learning_rate": 3.6662946428571434e-05,
"loss": 0.9458,
"step": 678
},
{
"epoch": 1.0225903614457832,
"grad_norm": 0.5056638784949958,
"learning_rate": 3.6635044642857144e-05,
"loss": 0.921,
"step": 679
},
{
"epoch": 1.0240963855421688,
"grad_norm": 0.46503276361705503,
"learning_rate": 3.6607142857142853e-05,
"loss": 0.9047,
"step": 680
},
{
"epoch": 1.0256024096385543,
"grad_norm": 0.46994327620701,
"learning_rate": 3.657924107142857e-05,
"loss": 0.9267,
"step": 681
},
{
"epoch": 1.0271084337349397,
"grad_norm": 0.48315388900563083,
"learning_rate": 3.6551339285714286e-05,
"loss": 0.9298,
"step": 682
},
{
"epoch": 1.0286144578313252,
"grad_norm": 0.5784174315698658,
"learning_rate": 3.65234375e-05,
"loss": 0.8905,
"step": 683
},
{
"epoch": 1.0301204819277108,
"grad_norm": 0.4659736522900751,
"learning_rate": 3.649553571428572e-05,
"loss": 0.8784,
"step": 684
},
{
"epoch": 1.0316265060240963,
"grad_norm": 0.5855253051495973,
"learning_rate": 3.646763392857143e-05,
"loss": 0.9221,
"step": 685
},
{
"epoch": 1.033132530120482,
"grad_norm": 0.4136018470376596,
"learning_rate": 3.6439732142857145e-05,
"loss": 0.8519,
"step": 686
},
{
"epoch": 1.0346385542168675,
"grad_norm": 0.784160377686168,
"learning_rate": 3.6411830357142855e-05,
"loss": 0.7476,
"step": 687
},
{
"epoch": 1.036144578313253,
"grad_norm": 0.9081785784599531,
"learning_rate": 3.638392857142857e-05,
"loss": 0.8251,
"step": 688
},
{
"epoch": 1.0376506024096386,
"grad_norm": 0.7983702618088612,
"learning_rate": 3.635602678571429e-05,
"loss": 0.8871,
"step": 689
},
{
"epoch": 1.0391566265060241,
"grad_norm": 0.5786323001777359,
"learning_rate": 3.6328125000000004e-05,
"loss": 0.9322,
"step": 690
},
{
"epoch": 1.0406626506024097,
"grad_norm": 0.9648492235159225,
"learning_rate": 3.630022321428572e-05,
"loss": 0.9359,
"step": 691
},
{
"epoch": 1.0421686746987953,
"grad_norm": 0.4873229563675746,
"learning_rate": 3.627232142857143e-05,
"loss": 0.9355,
"step": 692
},
{
"epoch": 1.0436746987951808,
"grad_norm": 0.8761989592870119,
"learning_rate": 3.6244419642857146e-05,
"loss": 0.902,
"step": 693
},
{
"epoch": 1.0451807228915662,
"grad_norm": 0.5651764300585157,
"learning_rate": 3.6216517857142856e-05,
"loss": 0.9976,
"step": 694
},
{
"epoch": 1.0466867469879517,
"grad_norm": 0.9731934056831608,
"learning_rate": 3.618861607142857e-05,
"loss": 0.8314,
"step": 695
},
{
"epoch": 1.0481927710843373,
"grad_norm": 0.6845031495318143,
"learning_rate": 3.616071428571429e-05,
"loss": 0.9277,
"step": 696
},
{
"epoch": 1.0496987951807228,
"grad_norm": 0.9211398922606056,
"learning_rate": 3.6132812500000005e-05,
"loss": 0.851,
"step": 697
},
{
"epoch": 1.0512048192771084,
"grad_norm": 1.0826734834534257,
"learning_rate": 3.6104910714285715e-05,
"loss": 0.9021,
"step": 698
},
{
"epoch": 1.052710843373494,
"grad_norm": 0.6287612910088898,
"learning_rate": 3.607700892857143e-05,
"loss": 0.9298,
"step": 699
},
{
"epoch": 1.0542168674698795,
"grad_norm": 1.2836339354636723,
"learning_rate": 3.604910714285715e-05,
"loss": 0.8586,
"step": 700
},
{
"epoch": 1.055722891566265,
"grad_norm": 0.6603261987597695,
"learning_rate": 3.602120535714286e-05,
"loss": 0.8144,
"step": 701
},
{
"epoch": 1.0572289156626506,
"grad_norm": 1.5151521196596673,
"learning_rate": 3.599330357142857e-05,
"loss": 0.8665,
"step": 702
},
{
"epoch": 1.0587349397590362,
"grad_norm": 1.0387703910744868,
"learning_rate": 3.596540178571429e-05,
"loss": 0.9203,
"step": 703
},
{
"epoch": 1.0602409638554218,
"grad_norm": 1.0650833425124084,
"learning_rate": 3.59375e-05,
"loss": 0.9702,
"step": 704
},
{
"epoch": 1.0617469879518073,
"grad_norm": 1.3468863288922117,
"learning_rate": 3.5909598214285716e-05,
"loss": 0.8848,
"step": 705
},
{
"epoch": 1.0632530120481927,
"grad_norm": 0.9234850808465509,
"learning_rate": 3.588169642857143e-05,
"loss": 0.9705,
"step": 706
},
{
"epoch": 1.0647590361445782,
"grad_norm": 0.8749303857203196,
"learning_rate": 3.585379464285715e-05,
"loss": 0.8679,
"step": 707
},
{
"epoch": 1.0662650602409638,
"grad_norm": 1.3833806213067452,
"learning_rate": 3.582589285714286e-05,
"loss": 0.8436,
"step": 708
},
{
"epoch": 1.0677710843373494,
"grad_norm": 0.4993947993242181,
"learning_rate": 3.5797991071428575e-05,
"loss": 0.9428,
"step": 709
},
{
"epoch": 1.069277108433735,
"grad_norm": 0.6742861600580248,
"learning_rate": 3.5770089285714284e-05,
"loss": 0.9374,
"step": 710
},
{
"epoch": 1.0707831325301205,
"grad_norm": 0.7823099201740765,
"learning_rate": 3.57421875e-05,
"loss": 0.865,
"step": 711
},
{
"epoch": 1.072289156626506,
"grad_norm": 0.9538973558879908,
"learning_rate": 3.571428571428572e-05,
"loss": 0.9264,
"step": 712
},
{
"epoch": 1.0737951807228916,
"grad_norm": 0.592149810282526,
"learning_rate": 3.568638392857143e-05,
"loss": 0.9228,
"step": 713
},
{
"epoch": 1.0753012048192772,
"grad_norm": 0.6802561888412879,
"learning_rate": 3.565848214285715e-05,
"loss": 0.9009,
"step": 714
},
{
"epoch": 1.0768072289156627,
"grad_norm": 0.4202644425444327,
"learning_rate": 3.563058035714286e-05,
"loss": 0.8966,
"step": 715
},
{
"epoch": 1.0783132530120483,
"grad_norm": 0.5357027909635481,
"learning_rate": 3.560267857142857e-05,
"loss": 0.8921,
"step": 716
},
{
"epoch": 1.0798192771084336,
"grad_norm": 0.45679254583102713,
"learning_rate": 3.5574776785714285e-05,
"loss": 0.8754,
"step": 717
},
{
"epoch": 1.0813253012048192,
"grad_norm": 0.49442997546840867,
"learning_rate": 3.5546875e-05,
"loss": 0.8642,
"step": 718
},
{
"epoch": 1.0828313253012047,
"grad_norm": 0.6941702796563687,
"learning_rate": 3.551897321428572e-05,
"loss": 0.9021,
"step": 719
},
{
"epoch": 1.0843373493975903,
"grad_norm": 0.6232817497598714,
"learning_rate": 3.5491071428571435e-05,
"loss": 0.9065,
"step": 720
},
{
"epoch": 1.0858433734939759,
"grad_norm": 0.814486459511498,
"learning_rate": 3.5463169642857144e-05,
"loss": 0.9199,
"step": 721
},
{
"epoch": 1.0873493975903614,
"grad_norm": 1.0286183194511116,
"learning_rate": 3.5435267857142854e-05,
"loss": 0.8921,
"step": 722
},
{
"epoch": 1.088855421686747,
"grad_norm": 0.6993708802422818,
"learning_rate": 3.540736607142857e-05,
"loss": 0.8697,
"step": 723
},
{
"epoch": 1.0903614457831325,
"grad_norm": 1.2299282659131794,
"learning_rate": 3.5379464285714287e-05,
"loss": 0.8559,
"step": 724
},
{
"epoch": 1.091867469879518,
"grad_norm": 0.7702122966071391,
"learning_rate": 3.53515625e-05,
"loss": 0.891,
"step": 725
},
{
"epoch": 1.0933734939759037,
"grad_norm": 0.7168219298970882,
"learning_rate": 3.532366071428572e-05,
"loss": 0.9401,
"step": 726
},
{
"epoch": 1.0948795180722892,
"grad_norm": 0.7494806025809874,
"learning_rate": 3.529575892857143e-05,
"loss": 0.9422,
"step": 727
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.8086149759158916,
"learning_rate": 3.5267857142857145e-05,
"loss": 0.8695,
"step": 728
},
{
"epoch": 1.0978915662650603,
"grad_norm": 0.4634158642289007,
"learning_rate": 3.5239955357142855e-05,
"loss": 0.9594,
"step": 729
},
{
"epoch": 1.0993975903614457,
"grad_norm": 0.7235744336795877,
"learning_rate": 3.521205357142857e-05,
"loss": 0.946,
"step": 730
},
{
"epoch": 1.1009036144578312,
"grad_norm": 0.5410896638395775,
"learning_rate": 3.518415178571429e-05,
"loss": 1.0219,
"step": 731
},
{
"epoch": 1.1024096385542168,
"grad_norm": 0.9124377164283959,
"learning_rate": 3.5156250000000004e-05,
"loss": 0.8357,
"step": 732
},
{
"epoch": 1.1039156626506024,
"grad_norm": 0.6503918923801927,
"learning_rate": 3.5128348214285714e-05,
"loss": 0.8927,
"step": 733
},
{
"epoch": 1.105421686746988,
"grad_norm": 1.293316609049965,
"learning_rate": 3.510044642857143e-05,
"loss": 0.9097,
"step": 734
},
{
"epoch": 1.1069277108433735,
"grad_norm": 0.8331710726675682,
"learning_rate": 3.5072544642857147e-05,
"loss": 1.0453,
"step": 735
},
{
"epoch": 1.108433734939759,
"grad_norm": 1.4502754413437113,
"learning_rate": 3.5044642857142856e-05,
"loss": 0.9436,
"step": 736
},
{
"epoch": 1.1099397590361446,
"grad_norm": 1.1502969070207132,
"learning_rate": 3.501674107142857e-05,
"loss": 0.9504,
"step": 737
},
{
"epoch": 1.1114457831325302,
"grad_norm": 1.119023615539657,
"learning_rate": 3.498883928571429e-05,
"loss": 0.9267,
"step": 738
},
{
"epoch": 1.1129518072289157,
"grad_norm": 1.0204636417593693,
"learning_rate": 3.49609375e-05,
"loss": 0.8519,
"step": 739
},
{
"epoch": 1.1144578313253013,
"grad_norm": 1.1980278084790463,
"learning_rate": 3.4933035714285715e-05,
"loss": 0.8781,
"step": 740
},
{
"epoch": 1.1159638554216866,
"grad_norm": 1.0589446814404644,
"learning_rate": 3.490513392857143e-05,
"loss": 0.9517,
"step": 741
},
{
"epoch": 1.1174698795180722,
"grad_norm": 0.9538924117326562,
"learning_rate": 3.487723214285715e-05,
"loss": 0.9771,
"step": 742
},
{
"epoch": 1.1189759036144578,
"grad_norm": 0.7900490886841242,
"learning_rate": 3.484933035714286e-05,
"loss": 0.8833,
"step": 743
},
{
"epoch": 1.1204819277108433,
"grad_norm": 0.8709184164106646,
"learning_rate": 3.4821428571428574e-05,
"loss": 1.0126,
"step": 744
},
{
"epoch": 1.1219879518072289,
"grad_norm": 0.86575556172183,
"learning_rate": 3.479352678571428e-05,
"loss": 0.842,
"step": 745
},
{
"epoch": 1.1234939759036144,
"grad_norm": 0.7651299220416493,
"learning_rate": 3.4765625e-05,
"loss": 0.8375,
"step": 746
},
{
"epoch": 1.125,
"grad_norm": 0.7414145687635941,
"learning_rate": 3.4737723214285716e-05,
"loss": 0.9172,
"step": 747
},
{
"epoch": 1.1265060240963856,
"grad_norm": 0.6735819231389837,
"learning_rate": 3.470982142857143e-05,
"loss": 1.0393,
"step": 748
},
{
"epoch": 1.1280120481927711,
"grad_norm": 0.75577145913509,
"learning_rate": 3.468191964285715e-05,
"loss": 0.8946,
"step": 749
},
{
"epoch": 1.1295180722891567,
"grad_norm": 0.8952374237580829,
"learning_rate": 3.465401785714286e-05,
"loss": 0.9027,
"step": 750
},
{
"epoch": 1.1310240963855422,
"grad_norm": 0.6374599180740311,
"learning_rate": 3.4626116071428575e-05,
"loss": 0.8637,
"step": 751
},
{
"epoch": 1.1325301204819278,
"grad_norm": 0.9441595371336349,
"learning_rate": 3.4598214285714284e-05,
"loss": 0.9052,
"step": 752
},
{
"epoch": 1.1340361445783134,
"grad_norm": 0.7237508068411852,
"learning_rate": 3.45703125e-05,
"loss": 0.9686,
"step": 753
},
{
"epoch": 1.1355421686746987,
"grad_norm": 0.9860823064741998,
"learning_rate": 3.454241071428572e-05,
"loss": 0.9242,
"step": 754
},
{
"epoch": 1.1370481927710843,
"grad_norm": 0.5628224504874209,
"learning_rate": 3.4514508928571434e-05,
"loss": 0.8561,
"step": 755
},
{
"epoch": 1.1385542168674698,
"grad_norm": 0.7725612386594292,
"learning_rate": 3.448660714285715e-05,
"loss": 0.8721,
"step": 756
},
{
"epoch": 1.1400602409638554,
"grad_norm": 0.5222060820793952,
"learning_rate": 3.445870535714286e-05,
"loss": 0.7821,
"step": 757
},
{
"epoch": 1.141566265060241,
"grad_norm": 0.5716537762598741,
"learning_rate": 3.443080357142857e-05,
"loss": 0.8668,
"step": 758
},
{
"epoch": 1.1430722891566265,
"grad_norm": 0.7028308364211386,
"learning_rate": 3.4402901785714286e-05,
"loss": 0.9267,
"step": 759
},
{
"epoch": 1.144578313253012,
"grad_norm": 0.9101534854345079,
"learning_rate": 3.4375e-05,
"loss": 0.9177,
"step": 760
},
{
"epoch": 1.1460843373493976,
"grad_norm": 0.4499479429671646,
"learning_rate": 3.434709821428572e-05,
"loss": 0.9537,
"step": 761
},
{
"epoch": 1.1475903614457832,
"grad_norm": 0.6452729090442565,
"learning_rate": 3.4319196428571435e-05,
"loss": 0.8526,
"step": 762
},
{
"epoch": 1.1490963855421688,
"grad_norm": 0.4570956203625393,
"learning_rate": 3.4291294642857144e-05,
"loss": 0.9189,
"step": 763
},
{
"epoch": 1.1506024096385543,
"grad_norm": 0.8031430609389195,
"learning_rate": 3.4263392857142854e-05,
"loss": 0.8587,
"step": 764
},
{
"epoch": 1.1521084337349397,
"grad_norm": 0.553997415804624,
"learning_rate": 3.423549107142857e-05,
"loss": 0.9465,
"step": 765
},
{
"epoch": 1.1536144578313252,
"grad_norm": 0.5636584612109936,
"learning_rate": 3.420758928571429e-05,
"loss": 0.8867,
"step": 766
},
{
"epoch": 1.1551204819277108,
"grad_norm": 0.5603222832855538,
"learning_rate": 3.41796875e-05,
"loss": 0.9134,
"step": 767
},
{
"epoch": 1.1566265060240963,
"grad_norm": 0.4877892627016596,
"learning_rate": 3.415178571428572e-05,
"loss": 0.901,
"step": 768
},
{
"epoch": 1.158132530120482,
"grad_norm": 0.6316467876774854,
"learning_rate": 3.412388392857143e-05,
"loss": 0.8681,
"step": 769
},
{
"epoch": 1.1596385542168675,
"grad_norm": 0.45313432981008994,
"learning_rate": 3.4095982142857146e-05,
"loss": 0.8244,
"step": 770
},
{
"epoch": 1.161144578313253,
"grad_norm": 0.4765936427101797,
"learning_rate": 3.4068080357142855e-05,
"loss": 0.8626,
"step": 771
},
{
"epoch": 1.1626506024096386,
"grad_norm": 0.6739732704749145,
"learning_rate": 3.404017857142857e-05,
"loss": 0.8752,
"step": 772
},
{
"epoch": 1.1641566265060241,
"grad_norm": 0.4956774379084542,
"learning_rate": 3.401227678571429e-05,
"loss": 0.9259,
"step": 773
},
{
"epoch": 1.1656626506024097,
"grad_norm": 0.6142850591776837,
"learning_rate": 3.3984375000000004e-05,
"loss": 0.9067,
"step": 774
},
{
"epoch": 1.1671686746987953,
"grad_norm": 0.5637976629450164,
"learning_rate": 3.3956473214285714e-05,
"loss": 0.8407,
"step": 775
},
{
"epoch": 1.1686746987951806,
"grad_norm": 0.4763989461010731,
"learning_rate": 3.392857142857143e-05,
"loss": 0.8233,
"step": 776
},
{
"epoch": 1.1701807228915664,
"grad_norm": 0.5663989823695585,
"learning_rate": 3.390066964285715e-05,
"loss": 0.9751,
"step": 777
},
{
"epoch": 1.1716867469879517,
"grad_norm": 0.47631480639379525,
"learning_rate": 3.3872767857142856e-05,
"loss": 0.9523,
"step": 778
},
{
"epoch": 1.1731927710843373,
"grad_norm": 0.5547204992599337,
"learning_rate": 3.384486607142857e-05,
"loss": 0.9624,
"step": 779
},
{
"epoch": 1.1746987951807228,
"grad_norm": 0.41725057316685243,
"learning_rate": 3.381696428571429e-05,
"loss": 0.9577,
"step": 780
},
{
"epoch": 1.1762048192771084,
"grad_norm": 0.7407876910860511,
"learning_rate": 3.37890625e-05,
"loss": 0.8844,
"step": 781
},
{
"epoch": 1.177710843373494,
"grad_norm": 0.7385893367291911,
"learning_rate": 3.3761160714285715e-05,
"loss": 0.8198,
"step": 782
},
{
"epoch": 1.1792168674698795,
"grad_norm": 0.4134331990574501,
"learning_rate": 3.373325892857143e-05,
"loss": 0.8712,
"step": 783
},
{
"epoch": 1.180722891566265,
"grad_norm": 0.6938781420406752,
"learning_rate": 3.370535714285715e-05,
"loss": 0.9357,
"step": 784
},
{
"epoch": 1.1822289156626506,
"grad_norm": 0.6459919993160489,
"learning_rate": 3.367745535714286e-05,
"loss": 0.9595,
"step": 785
},
{
"epoch": 1.1837349397590362,
"grad_norm": 0.8136341825262028,
"learning_rate": 3.3649553571428574e-05,
"loss": 0.8401,
"step": 786
},
{
"epoch": 1.1852409638554218,
"grad_norm": 0.4856243432969622,
"learning_rate": 3.3621651785714284e-05,
"loss": 0.9315,
"step": 787
},
{
"epoch": 1.1867469879518073,
"grad_norm": 0.6874454418294755,
"learning_rate": 3.359375e-05,
"loss": 0.9184,
"step": 788
},
{
"epoch": 1.1882530120481927,
"grad_norm": 0.5663982154342588,
"learning_rate": 3.3565848214285716e-05,
"loss": 0.8274,
"step": 789
},
{
"epoch": 1.1897590361445782,
"grad_norm": 0.8274558013914739,
"learning_rate": 3.353794642857143e-05,
"loss": 0.9551,
"step": 790
},
{
"epoch": 1.1912650602409638,
"grad_norm": 1.753425145215422,
"learning_rate": 3.351004464285715e-05,
"loss": 0.9781,
"step": 791
},
{
"epoch": 1.1927710843373494,
"grad_norm": 0.594784071592936,
"learning_rate": 3.348214285714286e-05,
"loss": 0.9015,
"step": 792
},
{
"epoch": 1.194277108433735,
"grad_norm": 0.4608926277056844,
"learning_rate": 3.345424107142857e-05,
"loss": 0.9163,
"step": 793
},
{
"epoch": 1.1957831325301205,
"grad_norm": 0.5614325905914386,
"learning_rate": 3.3426339285714285e-05,
"loss": 0.8854,
"step": 794
},
{
"epoch": 1.197289156626506,
"grad_norm": 0.43422993049663355,
"learning_rate": 3.33984375e-05,
"loss": 0.8762,
"step": 795
},
{
"epoch": 1.1987951807228916,
"grad_norm": 0.8064586826203364,
"learning_rate": 3.337053571428572e-05,
"loss": 0.9196,
"step": 796
},
{
"epoch": 1.2003012048192772,
"grad_norm": 0.6357384216371462,
"learning_rate": 3.3342633928571434e-05,
"loss": 0.9594,
"step": 797
},
{
"epoch": 1.2018072289156627,
"grad_norm": 0.4980958732431626,
"learning_rate": 3.3314732142857144e-05,
"loss": 0.9601,
"step": 798
},
{
"epoch": 1.2033132530120483,
"grad_norm": 0.49112594448829,
"learning_rate": 3.328683035714285e-05,
"loss": 0.9574,
"step": 799
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.6434503140112771,
"learning_rate": 3.325892857142857e-05,
"loss": 0.8582,
"step": 800
},
{
"epoch": 1.2063253012048194,
"grad_norm": 0.6651994326677865,
"learning_rate": 3.3231026785714286e-05,
"loss": 0.9533,
"step": 801
},
{
"epoch": 1.2078313253012047,
"grad_norm": 0.5612159878089357,
"learning_rate": 3.3203125e-05,
"loss": 0.9169,
"step": 802
},
{
"epoch": 1.2093373493975903,
"grad_norm": 0.6496841475677518,
"learning_rate": 3.317522321428572e-05,
"loss": 0.8281,
"step": 803
},
{
"epoch": 1.2108433734939759,
"grad_norm": 0.7661426827982452,
"learning_rate": 3.314732142857143e-05,
"loss": 0.8758,
"step": 804
},
{
"epoch": 1.2123493975903614,
"grad_norm": 1.0466441579702943,
"learning_rate": 3.3119419642857145e-05,
"loss": 0.9221,
"step": 805
},
{
"epoch": 1.213855421686747,
"grad_norm": 18.152755303097027,
"learning_rate": 3.3091517857142854e-05,
"loss": 1.2501,
"step": 806
},
{
"epoch": 1.2153614457831325,
"grad_norm": 0.4787709989547289,
"learning_rate": 3.306361607142857e-05,
"loss": 0.8441,
"step": 807
},
{
"epoch": 1.216867469879518,
"grad_norm": 1.0236041372644524,
"learning_rate": 3.303571428571429e-05,
"loss": 0.9011,
"step": 808
},
{
"epoch": 1.2183734939759037,
"grad_norm": 0.4860730014988597,
"learning_rate": 3.3007812500000004e-05,
"loss": 0.9861,
"step": 809
},
{
"epoch": 1.2198795180722892,
"grad_norm": 0.615297577156611,
"learning_rate": 3.297991071428572e-05,
"loss": 0.8624,
"step": 810
},
{
"epoch": 1.2213855421686748,
"grad_norm": 0.5350584379936432,
"learning_rate": 3.295200892857143e-05,
"loss": 0.9349,
"step": 811
},
{
"epoch": 1.2228915662650603,
"grad_norm": 0.5616315067396345,
"learning_rate": 3.2924107142857146e-05,
"loss": 0.9328,
"step": 812
},
{
"epoch": 1.2243975903614457,
"grad_norm": 0.5163456805933699,
"learning_rate": 3.2896205357142856e-05,
"loss": 0.8537,
"step": 813
},
{
"epoch": 1.2259036144578312,
"grad_norm": 0.4727273598631116,
"learning_rate": 3.286830357142857e-05,
"loss": 0.8442,
"step": 814
},
{
"epoch": 1.2274096385542168,
"grad_norm": 0.5929025121948194,
"learning_rate": 3.284040178571429e-05,
"loss": 0.8618,
"step": 815
},
{
"epoch": 1.2289156626506024,
"grad_norm": 0.8148387041210994,
"learning_rate": 3.2812500000000005e-05,
"loss": 0.8374,
"step": 816
},
{
"epoch": 1.230421686746988,
"grad_norm": 0.6908310925370548,
"learning_rate": 3.2784598214285714e-05,
"loss": 0.9044,
"step": 817
},
{
"epoch": 1.2319277108433735,
"grad_norm": 0.43222684841829623,
"learning_rate": 3.275669642857143e-05,
"loss": 0.934,
"step": 818
},
{
"epoch": 1.233433734939759,
"grad_norm": 0.9010902072544995,
"learning_rate": 3.272879464285715e-05,
"loss": 0.8887,
"step": 819
},
{
"epoch": 1.2349397590361446,
"grad_norm": 0.4687954767525252,
"learning_rate": 3.270089285714286e-05,
"loss": 1.0474,
"step": 820
},
{
"epoch": 1.2364457831325302,
"grad_norm": 1.8617835475525193,
"learning_rate": 3.267299107142857e-05,
"loss": 0.9299,
"step": 821
},
{
"epoch": 1.2379518072289157,
"grad_norm": 0.6160113502627966,
"learning_rate": 3.264508928571429e-05,
"loss": 0.9279,
"step": 822
},
{
"epoch": 1.2394578313253013,
"grad_norm": 0.7259158545954387,
"learning_rate": 3.26171875e-05,
"loss": 0.8902,
"step": 823
},
{
"epoch": 1.2409638554216866,
"grad_norm": 0.694722771327242,
"learning_rate": 3.2589285714285716e-05,
"loss": 0.952,
"step": 824
},
{
"epoch": 1.2424698795180722,
"grad_norm": 0.9480083874033397,
"learning_rate": 3.256138392857143e-05,
"loss": 0.9277,
"step": 825
},
{
"epoch": 1.2439759036144578,
"grad_norm": 0.6147362652958437,
"learning_rate": 3.253348214285715e-05,
"loss": 0.9694,
"step": 826
},
{
"epoch": 1.2454819277108433,
"grad_norm": 0.8604900840312624,
"learning_rate": 3.250558035714286e-05,
"loss": 0.8973,
"step": 827
},
{
"epoch": 1.2469879518072289,
"grad_norm": 0.4476056693342411,
"learning_rate": 3.2477678571428574e-05,
"loss": 0.9275,
"step": 828
},
{
"epoch": 1.2484939759036144,
"grad_norm": 0.9858482633159954,
"learning_rate": 3.2449776785714284e-05,
"loss": 0.8754,
"step": 829
},
{
"epoch": 1.25,
"grad_norm": 0.4756258041730579,
"learning_rate": 3.2421875e-05,
"loss": 0.912,
"step": 830
},
{
"epoch": 1.2515060240963856,
"grad_norm": 0.7451693911115064,
"learning_rate": 3.239397321428572e-05,
"loss": 0.9422,
"step": 831
},
{
"epoch": 1.2530120481927711,
"grad_norm": 0.6333122806348873,
"learning_rate": 3.236607142857143e-05,
"loss": 0.8307,
"step": 832
},
{
"epoch": 1.2545180722891567,
"grad_norm": 0.7860563162682318,
"learning_rate": 3.233816964285715e-05,
"loss": 0.8725,
"step": 833
},
{
"epoch": 1.2560240963855422,
"grad_norm": 0.6481387964076182,
"learning_rate": 3.231026785714286e-05,
"loss": 0.9562,
"step": 834
},
{
"epoch": 1.2575301204819276,
"grad_norm": 0.970604305621364,
"learning_rate": 3.228236607142857e-05,
"loss": 0.9525,
"step": 835
},
{
"epoch": 1.2590361445783134,
"grad_norm": 0.48975095013129233,
"learning_rate": 3.2254464285714285e-05,
"loss": 1.0172,
"step": 836
},
{
"epoch": 1.2605421686746987,
"grad_norm": 0.8592465168883675,
"learning_rate": 3.22265625e-05,
"loss": 0.9827,
"step": 837
},
{
"epoch": 1.2620481927710843,
"grad_norm": 0.38633909877745226,
"learning_rate": 3.219866071428572e-05,
"loss": 0.9515,
"step": 838
},
{
"epoch": 1.2635542168674698,
"grad_norm": 0.8996143752723128,
"learning_rate": 3.2170758928571434e-05,
"loss": 0.9291,
"step": 839
},
{
"epoch": 1.2650602409638554,
"grad_norm": 0.38970743167927957,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.9359,
"step": 840
},
{
"epoch": 1.266566265060241,
"grad_norm": 0.8410327812415249,
"learning_rate": 3.2114955357142854e-05,
"loss": 0.8945,
"step": 841
},
{
"epoch": 1.2680722891566265,
"grad_norm": 0.42982273459588355,
"learning_rate": 3.208705357142857e-05,
"loss": 0.9092,
"step": 842
},
{
"epoch": 1.269578313253012,
"grad_norm": 0.7362470213580851,
"learning_rate": 3.2059151785714286e-05,
"loss": 0.8137,
"step": 843
},
{
"epoch": 1.2710843373493976,
"grad_norm": 0.4236106585932027,
"learning_rate": 3.203125e-05,
"loss": 0.8731,
"step": 844
},
{
"epoch": 1.2725903614457832,
"grad_norm": 0.7898705332594714,
"learning_rate": 3.200334821428572e-05,
"loss": 0.9029,
"step": 845
},
{
"epoch": 1.2740963855421688,
"grad_norm": 0.4889329314457672,
"learning_rate": 3.197544642857143e-05,
"loss": 0.8103,
"step": 846
},
{
"epoch": 1.2756024096385543,
"grad_norm": 0.625007382525353,
"learning_rate": 3.1947544642857145e-05,
"loss": 0.9777,
"step": 847
},
{
"epoch": 1.2771084337349397,
"grad_norm": 0.4402843531183987,
"learning_rate": 3.1919642857142855e-05,
"loss": 0.8573,
"step": 848
},
{
"epoch": 1.2786144578313254,
"grad_norm": 0.7629686792283262,
"learning_rate": 3.189174107142857e-05,
"loss": 0.9272,
"step": 849
},
{
"epoch": 1.2801204819277108,
"grad_norm": 0.3988022070110374,
"learning_rate": 3.186383928571429e-05,
"loss": 0.8051,
"step": 850
},
{
"epoch": 1.2816265060240963,
"grad_norm": 0.6190256863871609,
"learning_rate": 3.1835937500000004e-05,
"loss": 0.9126,
"step": 851
},
{
"epoch": 1.283132530120482,
"grad_norm": 0.425636529174002,
"learning_rate": 3.1808035714285713e-05,
"loss": 0.928,
"step": 852
},
{
"epoch": 1.2846385542168675,
"grad_norm": 0.6325601591650077,
"learning_rate": 3.178013392857143e-05,
"loss": 0.9913,
"step": 853
},
{
"epoch": 1.286144578313253,
"grad_norm": 0.40946899064758685,
"learning_rate": 3.1752232142857146e-05,
"loss": 0.9162,
"step": 854
},
{
"epoch": 1.2876506024096386,
"grad_norm": 0.4792819937930962,
"learning_rate": 3.1724330357142856e-05,
"loss": 0.9717,
"step": 855
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.3624656563477281,
"learning_rate": 3.169642857142857e-05,
"loss": 0.9462,
"step": 856
},
{
"epoch": 1.2906626506024097,
"grad_norm": 0.42166692776454073,
"learning_rate": 3.166852678571429e-05,
"loss": 0.8704,
"step": 857
},
{
"epoch": 1.2921686746987953,
"grad_norm": 0.5207360129322047,
"learning_rate": 3.1640625e-05,
"loss": 0.893,
"step": 858
},
{
"epoch": 1.2936746987951806,
"grad_norm": 0.5096525186194942,
"learning_rate": 3.1612723214285715e-05,
"loss": 0.869,
"step": 859
},
{
"epoch": 1.2951807228915664,
"grad_norm": 0.511383407689634,
"learning_rate": 3.158482142857143e-05,
"loss": 0.8532,
"step": 860
},
{
"epoch": 1.2966867469879517,
"grad_norm": 0.7498125062253638,
"learning_rate": 3.155691964285715e-05,
"loss": 0.8454,
"step": 861
},
{
"epoch": 1.2981927710843373,
"grad_norm": 0.5956457900030582,
"learning_rate": 3.152901785714286e-05,
"loss": 1.0556,
"step": 862
},
{
"epoch": 1.2996987951807228,
"grad_norm": 0.6921899337376264,
"learning_rate": 3.1501116071428573e-05,
"loss": 0.9557,
"step": 863
},
{
"epoch": 1.3012048192771084,
"grad_norm": 0.5895718832427568,
"learning_rate": 3.147321428571428e-05,
"loss": 0.8755,
"step": 864
},
{
"epoch": 1.302710843373494,
"grad_norm": 0.5914604177695594,
"learning_rate": 3.14453125e-05,
"loss": 0.8999,
"step": 865
},
{
"epoch": 1.3042168674698795,
"grad_norm": 0.49780731750593815,
"learning_rate": 3.1417410714285716e-05,
"loss": 0.8079,
"step": 866
},
{
"epoch": 1.305722891566265,
"grad_norm": 0.4171585296833949,
"learning_rate": 3.138950892857143e-05,
"loss": 0.9796,
"step": 867
},
{
"epoch": 1.3072289156626506,
"grad_norm": 0.44034325878896186,
"learning_rate": 3.136160714285715e-05,
"loss": 1.0129,
"step": 868
},
{
"epoch": 1.3087349397590362,
"grad_norm": 0.44314468918720434,
"learning_rate": 3.133370535714286e-05,
"loss": 0.908,
"step": 869
},
{
"epoch": 1.3102409638554218,
"grad_norm": 0.5559106124842806,
"learning_rate": 3.1305803571428575e-05,
"loss": 0.912,
"step": 870
},
{
"epoch": 1.3117469879518073,
"grad_norm": 0.44198143900393866,
"learning_rate": 3.1277901785714284e-05,
"loss": 0.8263,
"step": 871
},
{
"epoch": 1.3132530120481927,
"grad_norm": 0.6103804800880671,
"learning_rate": 3.125e-05,
"loss": 0.9674,
"step": 872
},
{
"epoch": 1.3147590361445782,
"grad_norm": 0.4525034816661817,
"learning_rate": 3.122209821428572e-05,
"loss": 0.8639,
"step": 873
},
{
"epoch": 1.3162650602409638,
"grad_norm": 0.498303581581871,
"learning_rate": 3.1194196428571433e-05,
"loss": 0.9663,
"step": 874
},
{
"epoch": 1.3177710843373494,
"grad_norm": 0.5425137199224408,
"learning_rate": 3.116629464285715e-05,
"loss": 0.9069,
"step": 875
},
{
"epoch": 1.319277108433735,
"grad_norm": 0.42927260839025866,
"learning_rate": 3.113839285714286e-05,
"loss": 0.8466,
"step": 876
},
{
"epoch": 1.3207831325301205,
"grad_norm": 0.41884136601417665,
"learning_rate": 3.111049107142857e-05,
"loss": 0.9327,
"step": 877
},
{
"epoch": 1.322289156626506,
"grad_norm": 0.37576319682402065,
"learning_rate": 3.1082589285714285e-05,
"loss": 0.839,
"step": 878
},
{
"epoch": 1.3237951807228916,
"grad_norm": 0.4528884655129762,
"learning_rate": 3.10546875e-05,
"loss": 0.9614,
"step": 879
},
{
"epoch": 1.3253012048192772,
"grad_norm": 0.387633508365781,
"learning_rate": 3.102678571428572e-05,
"loss": 0.8761,
"step": 880
},
{
"epoch": 1.3268072289156627,
"grad_norm": 0.37834818891681754,
"learning_rate": 3.0998883928571435e-05,
"loss": 0.8533,
"step": 881
},
{
"epoch": 1.3283132530120483,
"grad_norm": 0.3558288553052607,
"learning_rate": 3.0970982142857144e-05,
"loss": 0.8872,
"step": 882
},
{
"epoch": 1.3298192771084336,
"grad_norm": 0.393810584470361,
"learning_rate": 3.0943080357142854e-05,
"loss": 0.8402,
"step": 883
},
{
"epoch": 1.3313253012048194,
"grad_norm": 16.795980485641497,
"learning_rate": 3.091517857142857e-05,
"loss": 1.7433,
"step": 884
},
{
"epoch": 1.3328313253012047,
"grad_norm": 0.507699801281132,
"learning_rate": 3.088727678571429e-05,
"loss": 0.9552,
"step": 885
},
{
"epoch": 1.3343373493975903,
"grad_norm": 0.5301985771272634,
"learning_rate": 3.0859375e-05,
"loss": 0.8839,
"step": 886
},
{
"epoch": 1.3358433734939759,
"grad_norm": 0.4108504323373235,
"learning_rate": 3.083147321428572e-05,
"loss": 0.8945,
"step": 887
},
{
"epoch": 1.3373493975903614,
"grad_norm": 0.4261620705812752,
"learning_rate": 3.080357142857143e-05,
"loss": 0.8308,
"step": 888
},
{
"epoch": 1.338855421686747,
"grad_norm": 0.4414241351641769,
"learning_rate": 3.0775669642857145e-05,
"loss": 0.8948,
"step": 889
},
{
"epoch": 1.3403614457831325,
"grad_norm": 0.4788576783276288,
"learning_rate": 3.0747767857142855e-05,
"loss": 0.8658,
"step": 890
},
{
"epoch": 1.341867469879518,
"grad_norm": 0.38817925357290234,
"learning_rate": 3.071986607142857e-05,
"loss": 0.9047,
"step": 891
},
{
"epoch": 1.3433734939759037,
"grad_norm": 0.4916048233483969,
"learning_rate": 3.069196428571429e-05,
"loss": 0.9246,
"step": 892
},
{
"epoch": 1.3448795180722892,
"grad_norm": 0.42602511591273085,
"learning_rate": 3.0664062500000004e-05,
"loss": 0.9201,
"step": 893
},
{
"epoch": 1.3463855421686746,
"grad_norm": 0.47129859838933036,
"learning_rate": 3.0636160714285714e-05,
"loss": 0.8395,
"step": 894
},
{
"epoch": 1.3478915662650603,
"grad_norm": 0.535316216113868,
"learning_rate": 3.060825892857143e-05,
"loss": 0.9449,
"step": 895
},
{
"epoch": 1.3493975903614457,
"grad_norm": 0.40112369314925556,
"learning_rate": 3.0580357142857147e-05,
"loss": 0.943,
"step": 896
},
{
"epoch": 1.3509036144578312,
"grad_norm": 0.681946841730207,
"learning_rate": 3.0552455357142856e-05,
"loss": 0.9255,
"step": 897
},
{
"epoch": 1.3524096385542168,
"grad_norm": 0.4126644062790991,
"learning_rate": 3.052455357142857e-05,
"loss": 0.8996,
"step": 898
},
{
"epoch": 1.3539156626506024,
"grad_norm": 0.45835871983819493,
"learning_rate": 3.0496651785714286e-05,
"loss": 0.7082,
"step": 899
},
{
"epoch": 1.355421686746988,
"grad_norm": 0.44990265028570336,
"learning_rate": 3.0468750000000002e-05,
"loss": 0.8755,
"step": 900
},
{
"epoch": 1.3569277108433735,
"grad_norm": 0.4587545686731054,
"learning_rate": 3.0440848214285715e-05,
"loss": 0.9152,
"step": 901
},
{
"epoch": 1.358433734939759,
"grad_norm": 0.46881115556908187,
"learning_rate": 3.041294642857143e-05,
"loss": 0.9064,
"step": 902
},
{
"epoch": 1.3599397590361446,
"grad_norm": 0.43874739109934835,
"learning_rate": 3.0385044642857148e-05,
"loss": 0.9135,
"step": 903
},
{
"epoch": 1.3614457831325302,
"grad_norm": 0.4295568498942259,
"learning_rate": 3.0357142857142857e-05,
"loss": 0.904,
"step": 904
},
{
"epoch": 1.3629518072289157,
"grad_norm": 0.43180452504935435,
"learning_rate": 3.032924107142857e-05,
"loss": 0.9336,
"step": 905
},
{
"epoch": 1.3644578313253013,
"grad_norm": 0.40663774586950147,
"learning_rate": 3.0301339285714287e-05,
"loss": 0.9037,
"step": 906
},
{
"epoch": 1.3659638554216866,
"grad_norm": 0.40248236277136,
"learning_rate": 3.02734375e-05,
"loss": 0.9044,
"step": 907
},
{
"epoch": 1.3674698795180724,
"grad_norm": 0.4752065910980524,
"learning_rate": 3.0245535714285716e-05,
"loss": 0.8568,
"step": 908
},
{
"epoch": 1.3689759036144578,
"grad_norm": 0.4202067499061365,
"learning_rate": 3.0217633928571433e-05,
"loss": 0.9557,
"step": 909
},
{
"epoch": 1.3704819277108433,
"grad_norm": 0.48057568453940086,
"learning_rate": 3.0189732142857146e-05,
"loss": 0.9031,
"step": 910
},
{
"epoch": 1.3719879518072289,
"grad_norm": 0.41435379001504125,
"learning_rate": 3.0161830357142855e-05,
"loss": 0.8241,
"step": 911
},
{
"epoch": 1.3734939759036144,
"grad_norm": 0.48992183825525126,
"learning_rate": 3.013392857142857e-05,
"loss": 0.8183,
"step": 912
},
{
"epoch": 1.375,
"grad_norm": 0.4534849819849003,
"learning_rate": 3.0106026785714288e-05,
"loss": 0.8807,
"step": 913
},
{
"epoch": 1.3765060240963856,
"grad_norm": 0.5981801322734634,
"learning_rate": 3.0078125e-05,
"loss": 0.9341,
"step": 914
},
{
"epoch": 1.3780120481927711,
"grad_norm": 0.3432122885958966,
"learning_rate": 3.0050223214285717e-05,
"loss": 0.9449,
"step": 915
},
{
"epoch": 1.3795180722891567,
"grad_norm": 0.39529737308209306,
"learning_rate": 3.002232142857143e-05,
"loss": 0.9597,
"step": 916
},
{
"epoch": 1.3810240963855422,
"grad_norm": 0.4754670733632709,
"learning_rate": 2.9994419642857147e-05,
"loss": 0.9381,
"step": 917
},
{
"epoch": 1.3825301204819276,
"grad_norm": 0.354701686868473,
"learning_rate": 2.9966517857142856e-05,
"loss": 0.8241,
"step": 918
},
{
"epoch": 1.3840361445783134,
"grad_norm": 0.4646504667817198,
"learning_rate": 2.9938616071428573e-05,
"loss": 0.893,
"step": 919
},
{
"epoch": 1.3855421686746987,
"grad_norm": 0.40307076471520065,
"learning_rate": 2.9910714285714286e-05,
"loss": 0.8357,
"step": 920
},
{
"epoch": 1.3870481927710843,
"grad_norm": 0.38695844998925766,
"learning_rate": 2.9882812500000002e-05,
"loss": 1.0317,
"step": 921
},
{
"epoch": 1.3885542168674698,
"grad_norm": 0.4424516996273697,
"learning_rate": 2.9854910714285715e-05,
"loss": 0.9349,
"step": 922
},
{
"epoch": 1.3900602409638554,
"grad_norm": 0.4296662542305843,
"learning_rate": 2.982700892857143e-05,
"loss": 0.9133,
"step": 923
},
{
"epoch": 1.391566265060241,
"grad_norm": 0.39583114987361917,
"learning_rate": 2.9799107142857148e-05,
"loss": 0.8541,
"step": 924
},
{
"epoch": 1.3930722891566265,
"grad_norm": 0.3716252912468994,
"learning_rate": 2.9771205357142858e-05,
"loss": 0.8552,
"step": 925
},
{
"epoch": 1.394578313253012,
"grad_norm": 0.46409285705623116,
"learning_rate": 2.974330357142857e-05,
"loss": 0.8815,
"step": 926
},
{
"epoch": 1.3960843373493976,
"grad_norm": 0.46679229577123127,
"learning_rate": 2.9715401785714287e-05,
"loss": 0.9119,
"step": 927
},
{
"epoch": 1.3975903614457832,
"grad_norm": 0.37324705107023204,
"learning_rate": 2.96875e-05,
"loss": 0.858,
"step": 928
},
{
"epoch": 1.3990963855421688,
"grad_norm": 0.6055787303176817,
"learning_rate": 2.9659598214285716e-05,
"loss": 0.879,
"step": 929
},
{
"epoch": 1.4006024096385543,
"grad_norm": 0.44231258660837997,
"learning_rate": 2.9631696428571433e-05,
"loss": 0.8918,
"step": 930
},
{
"epoch": 1.4021084337349397,
"grad_norm": 0.5216836981885677,
"learning_rate": 2.9603794642857146e-05,
"loss": 0.8983,
"step": 931
},
{
"epoch": 1.4036144578313254,
"grad_norm": 0.45039712128934334,
"learning_rate": 2.9575892857142855e-05,
"loss": 0.866,
"step": 932
},
{
"epoch": 1.4051204819277108,
"grad_norm": 0.43138599222949137,
"learning_rate": 2.9547991071428572e-05,
"loss": 0.9119,
"step": 933
},
{
"epoch": 1.4066265060240963,
"grad_norm": 0.44689025069904836,
"learning_rate": 2.9520089285714285e-05,
"loss": 0.9448,
"step": 934
},
{
"epoch": 1.408132530120482,
"grad_norm": 0.5515134809856107,
"learning_rate": 2.94921875e-05,
"loss": 0.9571,
"step": 935
},
{
"epoch": 1.4096385542168675,
"grad_norm": 0.44054087149528043,
"learning_rate": 2.9464285714285718e-05,
"loss": 0.9321,
"step": 936
},
{
"epoch": 1.411144578313253,
"grad_norm": 0.4302396074014632,
"learning_rate": 2.943638392857143e-05,
"loss": 0.915,
"step": 937
},
{
"epoch": 1.4126506024096386,
"grad_norm": 0.3705013577833683,
"learning_rate": 2.9408482142857147e-05,
"loss": 0.9078,
"step": 938
},
{
"epoch": 1.4141566265060241,
"grad_norm": 0.43752985808952727,
"learning_rate": 2.9380580357142857e-05,
"loss": 0.8134,
"step": 939
},
{
"epoch": 1.4156626506024097,
"grad_norm": 0.40165269513129087,
"learning_rate": 2.9352678571428573e-05,
"loss": 0.9359,
"step": 940
},
{
"epoch": 1.4171686746987953,
"grad_norm": 0.44638924492138565,
"learning_rate": 2.9324776785714286e-05,
"loss": 0.9178,
"step": 941
},
{
"epoch": 1.4186746987951806,
"grad_norm": 0.3858729382551046,
"learning_rate": 2.9296875000000002e-05,
"loss": 0.9488,
"step": 942
},
{
"epoch": 1.4201807228915664,
"grad_norm": 0.4235418849665121,
"learning_rate": 2.9268973214285715e-05,
"loss": 0.9584,
"step": 943
},
{
"epoch": 1.4216867469879517,
"grad_norm": 0.40935038101707705,
"learning_rate": 2.9241071428571432e-05,
"loss": 0.8791,
"step": 944
},
{
"epoch": 1.4231927710843373,
"grad_norm": 0.5114282733382471,
"learning_rate": 2.9213169642857148e-05,
"loss": 0.8905,
"step": 945
},
{
"epoch": 1.4246987951807228,
"grad_norm": 0.4119119373713173,
"learning_rate": 2.9185267857142858e-05,
"loss": 0.9075,
"step": 946
},
{
"epoch": 1.4262048192771084,
"grad_norm": 0.5142876058325453,
"learning_rate": 2.915736607142857e-05,
"loss": 0.9086,
"step": 947
},
{
"epoch": 1.427710843373494,
"grad_norm": 0.4273910035400046,
"learning_rate": 2.9129464285714287e-05,
"loss": 0.8474,
"step": 948
},
{
"epoch": 1.4292168674698795,
"grad_norm": 0.508469931678086,
"learning_rate": 2.91015625e-05,
"loss": 0.8644,
"step": 949
},
{
"epoch": 1.430722891566265,
"grad_norm": 0.3769076026021688,
"learning_rate": 2.9073660714285716e-05,
"loss": 0.8895,
"step": 950
},
{
"epoch": 1.4322289156626506,
"grad_norm": 0.43350827923442914,
"learning_rate": 2.9045758928571433e-05,
"loss": 0.9456,
"step": 951
},
{
"epoch": 1.4337349397590362,
"grad_norm": 0.38175759650148644,
"learning_rate": 2.9017857142857146e-05,
"loss": 0.8666,
"step": 952
},
{
"epoch": 1.4352409638554218,
"grad_norm": 0.49558119407682405,
"learning_rate": 2.8989955357142855e-05,
"loss": 0.9223,
"step": 953
},
{
"epoch": 1.4367469879518073,
"grad_norm": 0.4791778060312613,
"learning_rate": 2.8962053571428572e-05,
"loss": 0.8662,
"step": 954
},
{
"epoch": 1.4382530120481927,
"grad_norm": 0.4205517575204398,
"learning_rate": 2.8934151785714285e-05,
"loss": 0.8864,
"step": 955
},
{
"epoch": 1.4397590361445782,
"grad_norm": 0.4889673844916552,
"learning_rate": 2.890625e-05,
"loss": 0.8954,
"step": 956
},
{
"epoch": 1.4412650602409638,
"grad_norm": 0.3942749337229581,
"learning_rate": 2.8878348214285718e-05,
"loss": 0.9153,
"step": 957
},
{
"epoch": 1.4427710843373494,
"grad_norm": 0.6125185023981995,
"learning_rate": 2.885044642857143e-05,
"loss": 0.8448,
"step": 958
},
{
"epoch": 1.444277108433735,
"grad_norm": 5.64863717426843,
"learning_rate": 2.8822544642857147e-05,
"loss": 1.2668,
"step": 959
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.8404161340936614,
"learning_rate": 2.8794642857142857e-05,
"loss": 0.9226,
"step": 960
},
{
"epoch": 1.447289156626506,
"grad_norm": 0.4413000679839806,
"learning_rate": 2.876674107142857e-05,
"loss": 0.8919,
"step": 961
},
{
"epoch": 1.4487951807228916,
"grad_norm": 0.7160076374188957,
"learning_rate": 2.8738839285714286e-05,
"loss": 0.8588,
"step": 962
},
{
"epoch": 1.4503012048192772,
"grad_norm": 0.6886098912461931,
"learning_rate": 2.8710937500000002e-05,
"loss": 0.871,
"step": 963
},
{
"epoch": 1.4518072289156627,
"grad_norm": 0.5805795691792662,
"learning_rate": 2.8683035714285715e-05,
"loss": 1.0005,
"step": 964
},
{
"epoch": 1.4533132530120483,
"grad_norm": 0.6021271632952364,
"learning_rate": 2.8655133928571432e-05,
"loss": 0.8929,
"step": 965
},
{
"epoch": 1.4548192771084336,
"grad_norm": 0.6163237282072056,
"learning_rate": 2.8627232142857148e-05,
"loss": 0.8744,
"step": 966
},
{
"epoch": 1.4563253012048194,
"grad_norm": 0.8369685656279915,
"learning_rate": 2.8599330357142854e-05,
"loss": 0.8855,
"step": 967
},
{
"epoch": 1.4578313253012047,
"grad_norm": 0.45203308825392746,
"learning_rate": 2.857142857142857e-05,
"loss": 0.9074,
"step": 968
},
{
"epoch": 1.4593373493975903,
"grad_norm": 0.8296567908671902,
"learning_rate": 2.8543526785714287e-05,
"loss": 0.9263,
"step": 969
},
{
"epoch": 1.4608433734939759,
"grad_norm": 0.5360431068928597,
"learning_rate": 2.8515625e-05,
"loss": 0.901,
"step": 970
},
{
"epoch": 1.4623493975903614,
"grad_norm": 0.5597430667006373,
"learning_rate": 2.8487723214285717e-05,
"loss": 0.8531,
"step": 971
},
{
"epoch": 1.463855421686747,
"grad_norm": 0.4704336467239508,
"learning_rate": 2.8459821428571433e-05,
"loss": 0.7662,
"step": 972
},
{
"epoch": 1.4653614457831325,
"grad_norm": 0.6638563308526126,
"learning_rate": 2.8431919642857146e-05,
"loss": 0.896,
"step": 973
},
{
"epoch": 1.466867469879518,
"grad_norm": 0.49416570561268547,
"learning_rate": 2.8404017857142856e-05,
"loss": 1.0146,
"step": 974
},
{
"epoch": 1.4683734939759037,
"grad_norm": 0.5921378569755934,
"learning_rate": 2.8376116071428572e-05,
"loss": 0.8941,
"step": 975
},
{
"epoch": 1.4698795180722892,
"grad_norm": 0.5233538565652254,
"learning_rate": 2.8348214285714285e-05,
"loss": 0.9178,
"step": 976
},
{
"epoch": 1.4713855421686746,
"grad_norm": 0.5225045066976453,
"learning_rate": 2.83203125e-05,
"loss": 0.8585,
"step": 977
},
{
"epoch": 1.4728915662650603,
"grad_norm": 0.4429380390456755,
"learning_rate": 2.8292410714285718e-05,
"loss": 0.8866,
"step": 978
},
{
"epoch": 1.4743975903614457,
"grad_norm": 0.5352845381764894,
"learning_rate": 2.826450892857143e-05,
"loss": 0.9015,
"step": 979
},
{
"epoch": 1.4759036144578312,
"grad_norm": 0.45847867846721085,
"learning_rate": 2.8236607142857147e-05,
"loss": 0.8403,
"step": 980
},
{
"epoch": 1.4774096385542168,
"grad_norm": 0.4483896618488256,
"learning_rate": 2.8208705357142857e-05,
"loss": 0.9063,
"step": 981
},
{
"epoch": 1.4789156626506024,
"grad_norm": 0.5447575650467709,
"learning_rate": 2.818080357142857e-05,
"loss": 1.0174,
"step": 982
},
{
"epoch": 1.480421686746988,
"grad_norm": 0.4612032662256187,
"learning_rate": 2.8152901785714286e-05,
"loss": 0.82,
"step": 983
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.515486539846895,
"learning_rate": 2.8125000000000003e-05,
"loss": 0.8925,
"step": 984
},
{
"epoch": 1.483433734939759,
"grad_norm": 0.4519574930979925,
"learning_rate": 2.8097098214285716e-05,
"loss": 0.9567,
"step": 985
},
{
"epoch": 1.4849397590361446,
"grad_norm": 0.5086924383145075,
"learning_rate": 2.8069196428571432e-05,
"loss": 0.9037,
"step": 986
},
{
"epoch": 1.4864457831325302,
"grad_norm": 0.5178992094320662,
"learning_rate": 2.8041294642857145e-05,
"loss": 0.9038,
"step": 987
},
{
"epoch": 1.4879518072289157,
"grad_norm": 0.5041369962905347,
"learning_rate": 2.8013392857142855e-05,
"loss": 0.9132,
"step": 988
},
{
"epoch": 1.4894578313253013,
"grad_norm": 0.6959413181570508,
"learning_rate": 2.798549107142857e-05,
"loss": 0.8232,
"step": 989
},
{
"epoch": 1.4909638554216866,
"grad_norm": 0.5489099521501453,
"learning_rate": 2.7957589285714287e-05,
"loss": 0.9571,
"step": 990
},
{
"epoch": 1.4924698795180724,
"grad_norm": 0.6536002850117123,
"learning_rate": 2.79296875e-05,
"loss": 0.9144,
"step": 991
},
{
"epoch": 1.4939759036144578,
"grad_norm": 0.5039034474264213,
"learning_rate": 2.7901785714285717e-05,
"loss": 0.9283,
"step": 992
},
{
"epoch": 1.4954819277108433,
"grad_norm": 0.872821550686971,
"learning_rate": 2.787388392857143e-05,
"loss": 0.8567,
"step": 993
},
{
"epoch": 1.4969879518072289,
"grad_norm": 0.733000574607516,
"learning_rate": 2.7845982142857146e-05,
"loss": 0.8719,
"step": 994
},
{
"epoch": 1.4984939759036144,
"grad_norm": 1.095577854607601,
"learning_rate": 2.7818080357142856e-05,
"loss": 0.9303,
"step": 995
},
{
"epoch": 1.5,
"grad_norm": 0.5564833254227586,
"learning_rate": 2.7790178571428572e-05,
"loss": 0.8541,
"step": 996
},
{
"epoch": 1.5015060240963856,
"grad_norm": 0.7970598996504878,
"learning_rate": 2.7762276785714285e-05,
"loss": 0.8351,
"step": 997
},
{
"epoch": 1.5030120481927711,
"grad_norm": 0.6549045841862853,
"learning_rate": 2.7734375e-05,
"loss": 0.8689,
"step": 998
},
{
"epoch": 1.5045180722891565,
"grad_norm": 0.5681432676198283,
"learning_rate": 2.7706473214285718e-05,
"loss": 0.8468,
"step": 999
},
{
"epoch": 1.5060240963855422,
"grad_norm": 0.5198610149820071,
"learning_rate": 2.767857142857143e-05,
"loss": 0.9135,
"step": 1000
},
{
"epoch": 1.5075301204819276,
"grad_norm": 0.4286227471606881,
"learning_rate": 2.7650669642857147e-05,
"loss": 0.9154,
"step": 1001
},
{
"epoch": 1.5090361445783134,
"grad_norm": 0.45213438219612373,
"learning_rate": 2.7622767857142857e-05,
"loss": 1.0552,
"step": 1002
},
{
"epoch": 1.5105421686746987,
"grad_norm": 0.4466567702725702,
"learning_rate": 2.759486607142857e-05,
"loss": 0.9207,
"step": 1003
},
{
"epoch": 1.5120481927710845,
"grad_norm": 0.4781696876914203,
"learning_rate": 2.7566964285714286e-05,
"loss": 0.8698,
"step": 1004
},
{
"epoch": 1.5135542168674698,
"grad_norm": 0.39400250706951795,
"learning_rate": 2.7539062500000003e-05,
"loss": 0.8552,
"step": 1005
},
{
"epoch": 1.5150602409638554,
"grad_norm": 0.41646484774735887,
"learning_rate": 2.7511160714285716e-05,
"loss": 0.8364,
"step": 1006
},
{
"epoch": 1.516566265060241,
"grad_norm": 0.47039327185060265,
"learning_rate": 2.7483258928571432e-05,
"loss": 0.8218,
"step": 1007
},
{
"epoch": 1.5180722891566265,
"grad_norm": 0.4654067313493103,
"learning_rate": 2.7455357142857145e-05,
"loss": 0.9374,
"step": 1008
},
{
"epoch": 1.519578313253012,
"grad_norm": 0.6297256206630292,
"learning_rate": 2.7427455357142855e-05,
"loss": 0.855,
"step": 1009
},
{
"epoch": 1.5210843373493976,
"grad_norm": 1.0080774986704135,
"learning_rate": 2.739955357142857e-05,
"loss": 0.9028,
"step": 1010
},
{
"epoch": 1.5225903614457832,
"grad_norm": 0.5187847383580094,
"learning_rate": 2.7371651785714288e-05,
"loss": 0.8409,
"step": 1011
},
{
"epoch": 1.5240963855421685,
"grad_norm": 0.449765870728605,
"learning_rate": 2.734375e-05,
"loss": 0.8711,
"step": 1012
},
{
"epoch": 1.5256024096385543,
"grad_norm": 0.47409835405594225,
"learning_rate": 2.7315848214285717e-05,
"loss": 0.9095,
"step": 1013
},
{
"epoch": 1.5271084337349397,
"grad_norm": 0.5880034935399832,
"learning_rate": 2.728794642857143e-05,
"loss": 0.9548,
"step": 1014
},
{
"epoch": 1.5286144578313254,
"grad_norm": 0.5220251015788524,
"learning_rate": 2.7260044642857146e-05,
"loss": 0.8551,
"step": 1015
},
{
"epoch": 1.5301204819277108,
"grad_norm": 0.4774411463626857,
"learning_rate": 2.7232142857142856e-05,
"loss": 0.9206,
"step": 1016
},
{
"epoch": 1.5316265060240963,
"grad_norm": 0.5489884470247157,
"learning_rate": 2.7204241071428572e-05,
"loss": 0.8259,
"step": 1017
},
{
"epoch": 1.533132530120482,
"grad_norm": 0.5086757946209051,
"learning_rate": 2.7176339285714285e-05,
"loss": 0.8289,
"step": 1018
},
{
"epoch": 1.5346385542168675,
"grad_norm": 0.5784371239216568,
"learning_rate": 2.7148437500000002e-05,
"loss": 0.941,
"step": 1019
},
{
"epoch": 1.536144578313253,
"grad_norm": 0.46741896878153755,
"learning_rate": 2.7120535714285715e-05,
"loss": 0.8084,
"step": 1020
},
{
"epoch": 1.5376506024096386,
"grad_norm": 0.7177481385712936,
"learning_rate": 2.709263392857143e-05,
"loss": 1.0161,
"step": 1021
},
{
"epoch": 1.5391566265060241,
"grad_norm": 0.49994447714150714,
"learning_rate": 2.7064732142857148e-05,
"loss": 0.8637,
"step": 1022
},
{
"epoch": 1.5406626506024095,
"grad_norm": 0.6287430493098758,
"learning_rate": 2.7036830357142857e-05,
"loss": 0.8143,
"step": 1023
},
{
"epoch": 1.5421686746987953,
"grad_norm": 0.3920548388750148,
"learning_rate": 2.700892857142857e-05,
"loss": 0.8837,
"step": 1024
},
{
"epoch": 1.5436746987951806,
"grad_norm": 0.6764540424051386,
"learning_rate": 2.6981026785714287e-05,
"loss": 0.8354,
"step": 1025
},
{
"epoch": 1.5451807228915664,
"grad_norm": 0.4701722364908312,
"learning_rate": 2.6953125000000003e-05,
"loss": 0.8649,
"step": 1026
},
{
"epoch": 1.5466867469879517,
"grad_norm": 0.5935049344421184,
"learning_rate": 2.6925223214285716e-05,
"loss": 0.8252,
"step": 1027
},
{
"epoch": 1.5481927710843375,
"grad_norm": 0.545118305218684,
"learning_rate": 2.6897321428571432e-05,
"loss": 0.8371,
"step": 1028
},
{
"epoch": 1.5496987951807228,
"grad_norm": 0.5292176024357023,
"learning_rate": 2.6869419642857145e-05,
"loss": 0.9568,
"step": 1029
},
{
"epoch": 1.5512048192771084,
"grad_norm": 0.48211286898454325,
"learning_rate": 2.6841517857142855e-05,
"loss": 0.8817,
"step": 1030
},
{
"epoch": 1.552710843373494,
"grad_norm": 0.6036687048133429,
"learning_rate": 2.681361607142857e-05,
"loss": 0.8412,
"step": 1031
},
{
"epoch": 1.5542168674698795,
"grad_norm": 0.4708685354243883,
"learning_rate": 2.6785714285714288e-05,
"loss": 0.8624,
"step": 1032
},
{
"epoch": 1.555722891566265,
"grad_norm": 0.5082277431951896,
"learning_rate": 2.67578125e-05,
"loss": 0.8902,
"step": 1033
},
{
"epoch": 1.5572289156626506,
"grad_norm": 0.41639515172248037,
"learning_rate": 2.6729910714285717e-05,
"loss": 0.9796,
"step": 1034
},
{
"epoch": 1.5587349397590362,
"grad_norm": 0.4890056323597087,
"learning_rate": 2.670200892857143e-05,
"loss": 0.7832,
"step": 1035
},
{
"epoch": 1.5602409638554215,
"grad_norm": 0.4276124434575915,
"learning_rate": 2.6674107142857147e-05,
"loss": 0.8327,
"step": 1036
},
{
"epoch": 1.5617469879518073,
"grad_norm": 0.4875966693879269,
"learning_rate": 2.6646205357142856e-05,
"loss": 0.8438,
"step": 1037
},
{
"epoch": 1.5632530120481927,
"grad_norm": 0.4725349010575883,
"learning_rate": 2.6618303571428573e-05,
"loss": 0.8615,
"step": 1038
},
{
"epoch": 1.5647590361445785,
"grad_norm": 0.6074670439413593,
"learning_rate": 2.6590401785714286e-05,
"loss": 0.8177,
"step": 1039
},
{
"epoch": 1.5662650602409638,
"grad_norm": 0.39430296974887447,
"learning_rate": 2.6562500000000002e-05,
"loss": 0.9546,
"step": 1040
},
{
"epoch": 1.5677710843373494,
"grad_norm": 0.5441456169202094,
"learning_rate": 2.6534598214285715e-05,
"loss": 0.8679,
"step": 1041
},
{
"epoch": 1.569277108433735,
"grad_norm": 0.358508883257567,
"learning_rate": 2.650669642857143e-05,
"loss": 0.7898,
"step": 1042
},
{
"epoch": 1.5707831325301205,
"grad_norm": 0.3762515748818302,
"learning_rate": 2.6478794642857148e-05,
"loss": 0.8059,
"step": 1043
},
{
"epoch": 1.572289156626506,
"grad_norm": 0.5600450043477502,
"learning_rate": 2.6450892857142857e-05,
"loss": 0.8669,
"step": 1044
},
{
"epoch": 1.5737951807228916,
"grad_norm": 16.209210109537786,
"learning_rate": 2.642299107142857e-05,
"loss": 0.9118,
"step": 1045
},
{
"epoch": 1.5753012048192772,
"grad_norm": 0.8245789403780897,
"learning_rate": 2.6395089285714287e-05,
"loss": 0.8334,
"step": 1046
},
{
"epoch": 1.5768072289156625,
"grad_norm": 0.44476760145948024,
"learning_rate": 2.63671875e-05,
"loss": 0.9057,
"step": 1047
},
{
"epoch": 1.5783132530120483,
"grad_norm": 0.8906977482828659,
"learning_rate": 2.6339285714285716e-05,
"loss": 0.9876,
"step": 1048
},
{
"epoch": 1.5798192771084336,
"grad_norm": 0.3995912086248366,
"learning_rate": 2.6311383928571432e-05,
"loss": 0.8815,
"step": 1049
},
{
"epoch": 1.5813253012048194,
"grad_norm": 0.7047183950339773,
"learning_rate": 2.6283482142857145e-05,
"loss": 0.8627,
"step": 1050
},
{
"epoch": 1.5828313253012047,
"grad_norm": 0.4350293302104783,
"learning_rate": 2.6255580357142855e-05,
"loss": 0.9496,
"step": 1051
},
{
"epoch": 1.5843373493975905,
"grad_norm": 0.6121972284851375,
"learning_rate": 2.622767857142857e-05,
"loss": 0.9315,
"step": 1052
},
{
"epoch": 1.5858433734939759,
"grad_norm": 0.4729430107404573,
"learning_rate": 2.6199776785714284e-05,
"loss": 0.8751,
"step": 1053
},
{
"epoch": 1.5873493975903614,
"grad_norm": 2.386770897995437,
"learning_rate": 2.6171875e-05,
"loss": 0.9184,
"step": 1054
},
{
"epoch": 1.588855421686747,
"grad_norm": 0.8831646708929439,
"learning_rate": 2.6143973214285717e-05,
"loss": 0.8509,
"step": 1055
},
{
"epoch": 1.5903614457831325,
"grad_norm": 0.7235782847166342,
"learning_rate": 2.611607142857143e-05,
"loss": 0.8447,
"step": 1056
},
{
"epoch": 1.591867469879518,
"grad_norm": 0.9124163331531953,
"learning_rate": 2.6088169642857147e-05,
"loss": 0.8459,
"step": 1057
},
{
"epoch": 1.5933734939759037,
"grad_norm": 0.5516253955906611,
"learning_rate": 2.6060267857142856e-05,
"loss": 0.8241,
"step": 1058
},
{
"epoch": 1.5948795180722892,
"grad_norm": 0.8846864142818344,
"learning_rate": 2.6032366071428573e-05,
"loss": 0.9648,
"step": 1059
},
{
"epoch": 1.5963855421686746,
"grad_norm": 0.5839211289775349,
"learning_rate": 2.6004464285714286e-05,
"loss": 0.8851,
"step": 1060
},
{
"epoch": 1.5978915662650603,
"grad_norm": 0.8077824232419398,
"learning_rate": 2.5976562500000002e-05,
"loss": 0.9407,
"step": 1061
},
{
"epoch": 1.5993975903614457,
"grad_norm": 0.5136089141652534,
"learning_rate": 2.5948660714285715e-05,
"loss": 0.8141,
"step": 1062
},
{
"epoch": 1.6009036144578315,
"grad_norm": 15.63050599847379,
"learning_rate": 2.592075892857143e-05,
"loss": 1.4084,
"step": 1063
},
{
"epoch": 1.6024096385542168,
"grad_norm": 0.7466960868740622,
"learning_rate": 2.5892857142857148e-05,
"loss": 0.8348,
"step": 1064
},
{
"epoch": 1.6039156626506024,
"grad_norm": 0.4341231244020683,
"learning_rate": 2.5864955357142857e-05,
"loss": 0.8573,
"step": 1065
},
{
"epoch": 1.605421686746988,
"grad_norm": 0.727077717382004,
"learning_rate": 2.583705357142857e-05,
"loss": 0.8246,
"step": 1066
},
{
"epoch": 1.6069277108433735,
"grad_norm": 0.5386450432861674,
"learning_rate": 2.5809151785714287e-05,
"loss": 0.9585,
"step": 1067
},
{
"epoch": 1.608433734939759,
"grad_norm": 0.6504714580080353,
"learning_rate": 2.578125e-05,
"loss": 0.8749,
"step": 1068
},
{
"epoch": 1.6099397590361446,
"grad_norm": 0.6088390872207317,
"learning_rate": 2.5753348214285716e-05,
"loss": 0.9327,
"step": 1069
},
{
"epoch": 1.6114457831325302,
"grad_norm": 0.5109625334350382,
"learning_rate": 2.5725446428571433e-05,
"loss": 0.8977,
"step": 1070
},
{
"epoch": 1.6129518072289155,
"grad_norm": 0.48705336887176803,
"learning_rate": 2.5697544642857146e-05,
"loss": 0.8916,
"step": 1071
},
{
"epoch": 1.6144578313253013,
"grad_norm": 0.6793312644482432,
"learning_rate": 2.5669642857142855e-05,
"loss": 0.8786,
"step": 1072
},
{
"epoch": 1.6159638554216866,
"grad_norm": 0.4698018735544544,
"learning_rate": 2.564174107142857e-05,
"loss": 0.9074,
"step": 1073
},
{
"epoch": 1.6174698795180724,
"grad_norm": 0.5705506461964672,
"learning_rate": 2.5613839285714285e-05,
"loss": 0.7961,
"step": 1074
},
{
"epoch": 1.6189759036144578,
"grad_norm": 0.5178852517247916,
"learning_rate": 2.55859375e-05,
"loss": 0.8614,
"step": 1075
},
{
"epoch": 1.6204819277108435,
"grad_norm": 0.7310792259857241,
"learning_rate": 2.5558035714285717e-05,
"loss": 0.9301,
"step": 1076
},
{
"epoch": 1.6219879518072289,
"grad_norm": 0.5190994712012568,
"learning_rate": 2.553013392857143e-05,
"loss": 0.9372,
"step": 1077
},
{
"epoch": 1.6234939759036144,
"grad_norm": 0.7951723141808771,
"learning_rate": 2.5502232142857147e-05,
"loss": 0.8946,
"step": 1078
},
{
"epoch": 1.625,
"grad_norm": 0.7794492806585496,
"learning_rate": 2.5474330357142856e-05,
"loss": 0.8432,
"step": 1079
},
{
"epoch": 1.6265060240963856,
"grad_norm": 0.6707838061340728,
"learning_rate": 2.544642857142857e-05,
"loss": 0.971,
"step": 1080
},
{
"epoch": 1.6280120481927711,
"grad_norm": 0.8834110189662773,
"learning_rate": 2.5418526785714286e-05,
"loss": 0.8979,
"step": 1081
},
{
"epoch": 1.6295180722891565,
"grad_norm": 0.6014786930546973,
"learning_rate": 2.5390625000000002e-05,
"loss": 0.8954,
"step": 1082
},
{
"epoch": 1.6310240963855422,
"grad_norm": 0.8540760877617614,
"learning_rate": 2.5362723214285715e-05,
"loss": 0.8548,
"step": 1083
},
{
"epoch": 1.6325301204819276,
"grad_norm": 0.5040331857999337,
"learning_rate": 2.533482142857143e-05,
"loss": 0.9351,
"step": 1084
},
{
"epoch": 1.6340361445783134,
"grad_norm": 1.0265522392715367,
"learning_rate": 2.5306919642857148e-05,
"loss": 0.8747,
"step": 1085
},
{
"epoch": 1.6355421686746987,
"grad_norm": 0.4218069447272765,
"learning_rate": 2.5279017857142858e-05,
"loss": 0.7964,
"step": 1086
},
{
"epoch": 1.6370481927710845,
"grad_norm": 0.9069849259697897,
"learning_rate": 2.525111607142857e-05,
"loss": 0.8784,
"step": 1087
},
{
"epoch": 1.6385542168674698,
"grad_norm": 0.5545891560849115,
"learning_rate": 2.5223214285714287e-05,
"loss": 0.8462,
"step": 1088
},
{
"epoch": 1.6400602409638554,
"grad_norm": 1.126477471558439,
"learning_rate": 2.51953125e-05,
"loss": 0.8734,
"step": 1089
},
{
"epoch": 1.641566265060241,
"grad_norm": 0.6957228414264802,
"learning_rate": 2.5167410714285716e-05,
"loss": 0.8246,
"step": 1090
},
{
"epoch": 1.6430722891566265,
"grad_norm": 1.4171979334183584,
"learning_rate": 2.5139508928571433e-05,
"loss": 0.889,
"step": 1091
},
{
"epoch": 1.644578313253012,
"grad_norm": 0.9831143469531066,
"learning_rate": 2.5111607142857146e-05,
"loss": 0.9333,
"step": 1092
},
{
"epoch": 1.6460843373493976,
"grad_norm": 0.944246143147534,
"learning_rate": 2.5083705357142855e-05,
"loss": 0.8715,
"step": 1093
},
{
"epoch": 1.6475903614457832,
"grad_norm": 1.2471206081577508,
"learning_rate": 2.5055803571428572e-05,
"loss": 0.7923,
"step": 1094
},
{
"epoch": 1.6490963855421685,
"grad_norm": 0.5349074428489512,
"learning_rate": 2.5027901785714285e-05,
"loss": 0.9104,
"step": 1095
},
{
"epoch": 1.6506024096385543,
"grad_norm": 1.0426515589882186,
"learning_rate": 2.5e-05,
"loss": 0.8367,
"step": 1096
},
{
"epoch": 1.6521084337349397,
"grad_norm": 0.5852550163241762,
"learning_rate": 2.4972098214285718e-05,
"loss": 0.9166,
"step": 1097
},
{
"epoch": 1.6536144578313254,
"grad_norm": 1.2841749652741983,
"learning_rate": 2.4944196428571427e-05,
"loss": 0.8525,
"step": 1098
},
{
"epoch": 1.6551204819277108,
"grad_norm": 0.8828118027062222,
"learning_rate": 2.4916294642857144e-05,
"loss": 0.8077,
"step": 1099
},
{
"epoch": 1.6566265060240963,
"grad_norm": 0.7990379061511043,
"learning_rate": 2.488839285714286e-05,
"loss": 0.9245,
"step": 1100
},
{
"epoch": 1.658132530120482,
"grad_norm": 1.0404691951944989,
"learning_rate": 2.4860491071428573e-05,
"loss": 0.786,
"step": 1101
},
{
"epoch": 1.6596385542168675,
"grad_norm": 2.4511140342568556,
"learning_rate": 2.4832589285714286e-05,
"loss": 0.9759,
"step": 1102
},
{
"epoch": 1.661144578313253,
"grad_norm": 1.566106141066553,
"learning_rate": 2.4804687500000002e-05,
"loss": 0.9199,
"step": 1103
},
{
"epoch": 1.6626506024096386,
"grad_norm": 0.9189347877238775,
"learning_rate": 2.4776785714285715e-05,
"loss": 0.7623,
"step": 1104
},
{
"epoch": 1.6641566265060241,
"grad_norm": 1.2922694093765588,
"learning_rate": 2.474888392857143e-05,
"loss": 0.8958,
"step": 1105
},
{
"epoch": 1.6656626506024095,
"grad_norm": 1.1521882549415206,
"learning_rate": 2.4720982142857145e-05,
"loss": 0.7848,
"step": 1106
},
{
"epoch": 1.6671686746987953,
"grad_norm": 0.49105081466568046,
"learning_rate": 2.4693080357142858e-05,
"loss": 0.9014,
"step": 1107
},
{
"epoch": 1.6686746987951806,
"grad_norm": 1.021364190774778,
"learning_rate": 2.4665178571428574e-05,
"loss": 0.9674,
"step": 1108
},
{
"epoch": 1.6701807228915664,
"grad_norm": 0.39713372043418954,
"learning_rate": 2.4637276785714287e-05,
"loss": 0.8371,
"step": 1109
},
{
"epoch": 1.6716867469879517,
"grad_norm": 0.880796407191392,
"learning_rate": 2.4609375e-05,
"loss": 0.8427,
"step": 1110
},
{
"epoch": 1.6731927710843375,
"grad_norm": 0.43777351705020745,
"learning_rate": 2.4581473214285717e-05,
"loss": 0.8861,
"step": 1111
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.9033630496631916,
"learning_rate": 2.455357142857143e-05,
"loss": 0.9055,
"step": 1112
},
{
"epoch": 1.6762048192771084,
"grad_norm": 0.5418185266054305,
"learning_rate": 2.4525669642857143e-05,
"loss": 0.8593,
"step": 1113
},
{
"epoch": 1.677710843373494,
"grad_norm": 0.8281163828338365,
"learning_rate": 2.449776785714286e-05,
"loss": 0.9075,
"step": 1114
},
{
"epoch": 1.6792168674698795,
"grad_norm": 0.7553700063230477,
"learning_rate": 2.4469866071428575e-05,
"loss": 0.8485,
"step": 1115
},
{
"epoch": 1.680722891566265,
"grad_norm": 0.6331198031487005,
"learning_rate": 2.4441964285714285e-05,
"loss": 0.8484,
"step": 1116
},
{
"epoch": 1.6822289156626506,
"grad_norm": 0.6889778842730878,
"learning_rate": 2.44140625e-05,
"loss": 0.8534,
"step": 1117
},
{
"epoch": 1.6837349397590362,
"grad_norm": 0.5135180763345606,
"learning_rate": 2.4386160714285718e-05,
"loss": 0.8931,
"step": 1118
},
{
"epoch": 1.6852409638554215,
"grad_norm": 0.7409248265527695,
"learning_rate": 2.4358258928571427e-05,
"loss": 0.9164,
"step": 1119
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.46994945955734424,
"learning_rate": 2.4330357142857144e-05,
"loss": 0.8619,
"step": 1120
},
{
"epoch": 1.6882530120481927,
"grad_norm": 0.7037554041916976,
"learning_rate": 2.430245535714286e-05,
"loss": 0.871,
"step": 1121
},
{
"epoch": 1.6897590361445785,
"grad_norm": 0.5357304166539075,
"learning_rate": 2.4274553571428573e-05,
"loss": 0.8691,
"step": 1122
},
{
"epoch": 1.6912650602409638,
"grad_norm": 0.7575261173085044,
"learning_rate": 2.4246651785714286e-05,
"loss": 0.9031,
"step": 1123
},
{
"epoch": 1.6927710843373494,
"grad_norm": 0.7476767229435017,
"learning_rate": 2.4218750000000003e-05,
"loss": 0.8515,
"step": 1124
},
{
"epoch": 1.694277108433735,
"grad_norm": 0.6778365982017799,
"learning_rate": 2.4190848214285716e-05,
"loss": 0.7263,
"step": 1125
},
{
"epoch": 1.6957831325301205,
"grad_norm": 0.8360614439997173,
"learning_rate": 2.416294642857143e-05,
"loss": 0.9608,
"step": 1126
},
{
"epoch": 1.697289156626506,
"grad_norm": 0.551963799065818,
"learning_rate": 2.4135044642857145e-05,
"loss": 0.8879,
"step": 1127
},
{
"epoch": 1.6987951807228916,
"grad_norm": 0.7658015878545192,
"learning_rate": 2.4107142857142858e-05,
"loss": 0.8899,
"step": 1128
},
{
"epoch": 1.7003012048192772,
"grad_norm": 0.40770587670016156,
"learning_rate": 2.4079241071428574e-05,
"loss": 0.9541,
"step": 1129
},
{
"epoch": 1.7018072289156625,
"grad_norm": 0.6472839612468347,
"learning_rate": 2.4051339285714287e-05,
"loss": 0.8801,
"step": 1130
},
{
"epoch": 1.7033132530120483,
"grad_norm": 0.429044091541502,
"learning_rate": 2.40234375e-05,
"loss": 0.8922,
"step": 1131
},
{
"epoch": 1.7048192771084336,
"grad_norm": 0.7029579106656687,
"learning_rate": 2.3995535714285717e-05,
"loss": 0.92,
"step": 1132
},
{
"epoch": 1.7063253012048194,
"grad_norm": 0.6263261771747816,
"learning_rate": 2.396763392857143e-05,
"loss": 0.8415,
"step": 1133
},
{
"epoch": 1.7078313253012047,
"grad_norm": 0.7503608910203321,
"learning_rate": 2.3939732142857143e-05,
"loss": 0.7998,
"step": 1134
},
{
"epoch": 1.7093373493975905,
"grad_norm": 0.6449649861016904,
"learning_rate": 2.391183035714286e-05,
"loss": 0.9381,
"step": 1135
},
{
"epoch": 1.7108433734939759,
"grad_norm": 0.518245381377626,
"learning_rate": 2.3883928571428572e-05,
"loss": 0.9294,
"step": 1136
},
{
"epoch": 1.7123493975903614,
"grad_norm": 0.648406053826853,
"learning_rate": 2.3856026785714285e-05,
"loss": 0.9205,
"step": 1137
},
{
"epoch": 1.713855421686747,
"grad_norm": 0.3911163463164327,
"learning_rate": 2.3828125e-05,
"loss": 0.9143,
"step": 1138
},
{
"epoch": 1.7153614457831325,
"grad_norm": 0.5423540252865888,
"learning_rate": 2.3800223214285718e-05,
"loss": 0.8771,
"step": 1139
},
{
"epoch": 1.716867469879518,
"grad_norm": 0.3885816076500336,
"learning_rate": 2.3772321428571428e-05,
"loss": 0.8558,
"step": 1140
},
{
"epoch": 1.7183734939759037,
"grad_norm": 0.4170280249079803,
"learning_rate": 2.3744419642857144e-05,
"loss": 0.8528,
"step": 1141
},
{
"epoch": 1.7198795180722892,
"grad_norm": 0.4152071903657909,
"learning_rate": 2.371651785714286e-05,
"loss": 0.8671,
"step": 1142
},
{
"epoch": 1.7213855421686746,
"grad_norm": 0.39474714716059084,
"learning_rate": 2.3688616071428573e-05,
"loss": 0.9109,
"step": 1143
},
{
"epoch": 1.7228915662650603,
"grad_norm": 0.3707507315413714,
"learning_rate": 2.3660714285714286e-05,
"loss": 0.8666,
"step": 1144
},
{
"epoch": 1.7243975903614457,
"grad_norm": 0.4266395661053771,
"learning_rate": 2.3632812500000003e-05,
"loss": 0.8211,
"step": 1145
},
{
"epoch": 1.7259036144578315,
"grad_norm": 0.47063781140188166,
"learning_rate": 2.3604910714285716e-05,
"loss": 0.8609,
"step": 1146
},
{
"epoch": 1.7274096385542168,
"grad_norm": 0.45024887632094757,
"learning_rate": 2.357700892857143e-05,
"loss": 0.8811,
"step": 1147
},
{
"epoch": 1.7289156626506024,
"grad_norm": 0.4790059949669508,
"learning_rate": 2.3549107142857145e-05,
"loss": 0.942,
"step": 1148
},
{
"epoch": 1.730421686746988,
"grad_norm": 0.3553309777536489,
"learning_rate": 2.3521205357142858e-05,
"loss": 0.8743,
"step": 1149
},
{
"epoch": 1.7319277108433735,
"grad_norm": 0.5274125186592818,
"learning_rate": 2.3493303571428574e-05,
"loss": 0.7704,
"step": 1150
},
{
"epoch": 1.733433734939759,
"grad_norm": 0.43261906364919867,
"learning_rate": 2.3465401785714287e-05,
"loss": 0.9138,
"step": 1151
},
{
"epoch": 1.7349397590361446,
"grad_norm": 0.653110685438967,
"learning_rate": 2.34375e-05,
"loss": 0.8418,
"step": 1152
},
{
"epoch": 1.7364457831325302,
"grad_norm": 0.5359506662334108,
"learning_rate": 2.3409598214285717e-05,
"loss": 0.8662,
"step": 1153
},
{
"epoch": 1.7379518072289155,
"grad_norm": 0.6643758042833765,
"learning_rate": 2.338169642857143e-05,
"loss": 0.8548,
"step": 1154
},
{
"epoch": 1.7394578313253013,
"grad_norm": 0.5355004065059475,
"learning_rate": 2.3353794642857143e-05,
"loss": 0.9029,
"step": 1155
},
{
"epoch": 1.7409638554216866,
"grad_norm": 0.4767330708254114,
"learning_rate": 2.332589285714286e-05,
"loss": 0.9254,
"step": 1156
},
{
"epoch": 1.7424698795180724,
"grad_norm": 0.5348063162622679,
"learning_rate": 2.3297991071428572e-05,
"loss": 0.8945,
"step": 1157
},
{
"epoch": 1.7439759036144578,
"grad_norm": 0.5029629065593897,
"learning_rate": 2.3270089285714285e-05,
"loss": 0.8276,
"step": 1158
},
{
"epoch": 1.7454819277108435,
"grad_norm": 0.670622804378133,
"learning_rate": 2.32421875e-05,
"loss": 0.8748,
"step": 1159
},
{
"epoch": 1.7469879518072289,
"grad_norm": 0.41487334156708916,
"learning_rate": 2.3214285714285715e-05,
"loss": 0.8299,
"step": 1160
},
{
"epoch": 1.7484939759036144,
"grad_norm": 0.6215766008707231,
"learning_rate": 2.3186383928571428e-05,
"loss": 0.8587,
"step": 1161
},
{
"epoch": 1.75,
"grad_norm": 0.37005229516348453,
"learning_rate": 2.3158482142857144e-05,
"loss": 0.8827,
"step": 1162
},
{
"epoch": 1.7515060240963856,
"grad_norm": 0.601524553304628,
"learning_rate": 2.3130580357142857e-05,
"loss": 0.9173,
"step": 1163
},
{
"epoch": 1.7530120481927711,
"grad_norm": 0.49760183476218545,
"learning_rate": 2.3102678571428573e-05,
"loss": 0.8551,
"step": 1164
},
{
"epoch": 1.7545180722891565,
"grad_norm": 0.49273447990399655,
"learning_rate": 2.3074776785714286e-05,
"loss": 0.9015,
"step": 1165
},
{
"epoch": 1.7560240963855422,
"grad_norm": 0.4136666630533229,
"learning_rate": 2.3046875e-05,
"loss": 0.9869,
"step": 1166
},
{
"epoch": 1.7575301204819276,
"grad_norm": 0.7709640381400192,
"learning_rate": 2.3018973214285716e-05,
"loss": 0.8721,
"step": 1167
},
{
"epoch": 1.7590361445783134,
"grad_norm": 0.37140731844814445,
"learning_rate": 2.299107142857143e-05,
"loss": 0.8829,
"step": 1168
},
{
"epoch": 1.7605421686746987,
"grad_norm": 0.6031319607510395,
"learning_rate": 2.2963169642857145e-05,
"loss": 0.8864,
"step": 1169
},
{
"epoch": 1.7620481927710845,
"grad_norm": 0.43834734848724594,
"learning_rate": 2.2935267857142858e-05,
"loss": 0.9325,
"step": 1170
},
{
"epoch": 1.7635542168674698,
"grad_norm": 0.5075632864944567,
"learning_rate": 2.2907366071428575e-05,
"loss": 0.8632,
"step": 1171
},
{
"epoch": 1.7650602409638554,
"grad_norm": 0.37935778044305524,
"learning_rate": 2.2879464285714288e-05,
"loss": 0.8555,
"step": 1172
},
{
"epoch": 1.766566265060241,
"grad_norm": 0.42918722851753877,
"learning_rate": 2.28515625e-05,
"loss": 0.9493,
"step": 1173
},
{
"epoch": 1.7680722891566265,
"grad_norm": 0.42990181277527434,
"learning_rate": 2.2823660714285717e-05,
"loss": 0.9117,
"step": 1174
},
{
"epoch": 1.769578313253012,
"grad_norm": 0.57611012626393,
"learning_rate": 2.279575892857143e-05,
"loss": 0.9196,
"step": 1175
},
{
"epoch": 1.7710843373493976,
"grad_norm": 0.39484852274105237,
"learning_rate": 2.2767857142857143e-05,
"loss": 0.9619,
"step": 1176
},
{
"epoch": 1.7725903614457832,
"grad_norm": 0.4364554717477179,
"learning_rate": 2.273995535714286e-05,
"loss": 0.8332,
"step": 1177
},
{
"epoch": 1.7740963855421685,
"grad_norm": 0.44687320358594623,
"learning_rate": 2.2712053571428572e-05,
"loss": 0.8664,
"step": 1178
},
{
"epoch": 1.7756024096385543,
"grad_norm": 0.4355803895438567,
"learning_rate": 2.2684151785714285e-05,
"loss": 0.9295,
"step": 1179
},
{
"epoch": 1.7771084337349397,
"grad_norm": 0.4264727826213937,
"learning_rate": 2.2656250000000002e-05,
"loss": 0.9485,
"step": 1180
},
{
"epoch": 1.7786144578313254,
"grad_norm": 0.4220939425258434,
"learning_rate": 2.2628348214285715e-05,
"loss": 0.8971,
"step": 1181
},
{
"epoch": 1.7801204819277108,
"grad_norm": 0.3743712763900201,
"learning_rate": 2.2600446428571428e-05,
"loss": 0.8768,
"step": 1182
},
{
"epoch": 1.7816265060240963,
"grad_norm": 0.37172612482630585,
"learning_rate": 2.2572544642857144e-05,
"loss": 0.879,
"step": 1183
},
{
"epoch": 1.783132530120482,
"grad_norm": 0.35650784428629834,
"learning_rate": 2.2544642857142857e-05,
"loss": 0.8168,
"step": 1184
},
{
"epoch": 1.7846385542168675,
"grad_norm": 1.296099308475157,
"learning_rate": 2.2516741071428574e-05,
"loss": 0.8715,
"step": 1185
},
{
"epoch": 1.786144578313253,
"grad_norm": 0.4057169370483399,
"learning_rate": 2.2488839285714287e-05,
"loss": 0.8506,
"step": 1186
},
{
"epoch": 1.7876506024096386,
"grad_norm": 0.3727660219889766,
"learning_rate": 2.24609375e-05,
"loss": 0.8516,
"step": 1187
},
{
"epoch": 1.7891566265060241,
"grad_norm": 0.39331291245426225,
"learning_rate": 2.2433035714285716e-05,
"loss": 0.9187,
"step": 1188
},
{
"epoch": 1.7906626506024095,
"grad_norm": 0.43628962564299745,
"learning_rate": 2.240513392857143e-05,
"loss": 0.746,
"step": 1189
},
{
"epoch": 1.7921686746987953,
"grad_norm": 0.34873175548859475,
"learning_rate": 2.2377232142857142e-05,
"loss": 0.9133,
"step": 1190
},
{
"epoch": 1.7936746987951806,
"grad_norm": 0.48913886108825744,
"learning_rate": 2.234933035714286e-05,
"loss": 0.8278,
"step": 1191
},
{
"epoch": 1.7951807228915664,
"grad_norm": 0.37283517212000034,
"learning_rate": 2.2321428571428575e-05,
"loss": 0.7914,
"step": 1192
},
{
"epoch": 1.7966867469879517,
"grad_norm": 0.42574970105878374,
"learning_rate": 2.2293526785714284e-05,
"loss": 0.9606,
"step": 1193
},
{
"epoch": 1.7981927710843375,
"grad_norm": 0.45615903441789923,
"learning_rate": 2.2265625e-05,
"loss": 0.8552,
"step": 1194
},
{
"epoch": 1.7996987951807228,
"grad_norm": 0.3523152673600134,
"learning_rate": 2.2237723214285717e-05,
"loss": 0.7769,
"step": 1195
},
{
"epoch": 1.8012048192771084,
"grad_norm": 0.4377380606676607,
"learning_rate": 2.2209821428571427e-05,
"loss": 0.8392,
"step": 1196
},
{
"epoch": 1.802710843373494,
"grad_norm": 0.36103913345649585,
"learning_rate": 2.2181919642857143e-05,
"loss": 0.9377,
"step": 1197
},
{
"epoch": 1.8042168674698795,
"grad_norm": 0.3789994448388259,
"learning_rate": 2.215401785714286e-05,
"loss": 0.7739,
"step": 1198
},
{
"epoch": 1.805722891566265,
"grad_norm": 0.45666886588011224,
"learning_rate": 2.2126116071428573e-05,
"loss": 0.8825,
"step": 1199
},
{
"epoch": 1.8072289156626506,
"grad_norm": 0.4369048519548364,
"learning_rate": 2.2098214285714286e-05,
"loss": 0.792,
"step": 1200
},
{
"epoch": 1.8087349397590362,
"grad_norm": 0.3703901620887643,
"learning_rate": 2.2070312500000002e-05,
"loss": 1.0138,
"step": 1201
},
{
"epoch": 1.8102409638554215,
"grad_norm": 0.5153154277635835,
"learning_rate": 2.2042410714285715e-05,
"loss": 0.8877,
"step": 1202
},
{
"epoch": 1.8117469879518073,
"grad_norm": 0.4117527486029097,
"learning_rate": 2.2014508928571428e-05,
"loss": 0.864,
"step": 1203
},
{
"epoch": 1.8132530120481927,
"grad_norm": 0.5370753083061592,
"learning_rate": 2.1986607142857144e-05,
"loss": 0.7491,
"step": 1204
},
{
"epoch": 1.8147590361445785,
"grad_norm": 0.40506792727469476,
"learning_rate": 2.1958705357142857e-05,
"loss": 0.852,
"step": 1205
},
{
"epoch": 1.8162650602409638,
"grad_norm": 0.4633663449143497,
"learning_rate": 2.1930803571428574e-05,
"loss": 0.8451,
"step": 1206
},
{
"epoch": 1.8177710843373494,
"grad_norm": 0.4919814791892918,
"learning_rate": 2.1902901785714287e-05,
"loss": 0.8725,
"step": 1207
},
{
"epoch": 1.819277108433735,
"grad_norm": 0.42137635617028457,
"learning_rate": 2.1875e-05,
"loss": 0.8602,
"step": 1208
},
{
"epoch": 1.8207831325301205,
"grad_norm": 0.5085803789684036,
"learning_rate": 2.1847098214285716e-05,
"loss": 0.8566,
"step": 1209
},
{
"epoch": 1.822289156626506,
"grad_norm": 0.40247273449736626,
"learning_rate": 2.181919642857143e-05,
"loss": 0.9855,
"step": 1210
},
{
"epoch": 1.8237951807228916,
"grad_norm": 0.5802511472165098,
"learning_rate": 2.1791294642857142e-05,
"loss": 0.8369,
"step": 1211
},
{
"epoch": 1.8253012048192772,
"grad_norm": 0.4067527885415625,
"learning_rate": 2.176339285714286e-05,
"loss": 0.8585,
"step": 1212
},
{
"epoch": 1.8268072289156625,
"grad_norm": 0.5310373943609165,
"learning_rate": 2.1735491071428575e-05,
"loss": 0.8547,
"step": 1213
},
{
"epoch": 1.8283132530120483,
"grad_norm": 0.3824614812529985,
"learning_rate": 2.1707589285714285e-05,
"loss": 0.9207,
"step": 1214
},
{
"epoch": 1.8298192771084336,
"grad_norm": 0.4688348275065373,
"learning_rate": 2.16796875e-05,
"loss": 0.8747,
"step": 1215
},
{
"epoch": 1.8313253012048194,
"grad_norm": 0.3605304958140787,
"learning_rate": 2.1651785714285717e-05,
"loss": 0.8762,
"step": 1216
},
{
"epoch": 1.8328313253012047,
"grad_norm": 0.4284443428488817,
"learning_rate": 2.1623883928571427e-05,
"loss": 0.8907,
"step": 1217
},
{
"epoch": 1.8343373493975905,
"grad_norm": 0.45226400512709514,
"learning_rate": 2.1595982142857143e-05,
"loss": 0.894,
"step": 1218
},
{
"epoch": 1.8358433734939759,
"grad_norm": 0.323676781419377,
"learning_rate": 2.156808035714286e-05,
"loss": 0.8417,
"step": 1219
},
{
"epoch": 1.8373493975903614,
"grad_norm": 0.4276587392407293,
"learning_rate": 2.1540178571428573e-05,
"loss": 0.8389,
"step": 1220
},
{
"epoch": 1.838855421686747,
"grad_norm": 0.42456391708790264,
"learning_rate": 2.1512276785714286e-05,
"loss": 0.8524,
"step": 1221
},
{
"epoch": 1.8403614457831325,
"grad_norm": 0.7850074422223948,
"learning_rate": 2.1484375000000002e-05,
"loss": 0.8754,
"step": 1222
},
{
"epoch": 1.841867469879518,
"grad_norm": 0.4640737736119975,
"learning_rate": 2.1456473214285715e-05,
"loss": 0.9282,
"step": 1223
},
{
"epoch": 1.8433734939759037,
"grad_norm": 0.3585983116850509,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.9389,
"step": 1224
},
{
"epoch": 1.8448795180722892,
"grad_norm": 0.44747295897650574,
"learning_rate": 2.1400669642857145e-05,
"loss": 0.8108,
"step": 1225
},
{
"epoch": 1.8463855421686746,
"grad_norm": 0.3529359646839571,
"learning_rate": 2.1372767857142858e-05,
"loss": 0.8901,
"step": 1226
},
{
"epoch": 1.8478915662650603,
"grad_norm": 0.3852830913108626,
"learning_rate": 2.1344866071428574e-05,
"loss": 0.8874,
"step": 1227
},
{
"epoch": 1.8493975903614457,
"grad_norm": 0.3464315482990844,
"learning_rate": 2.1316964285714287e-05,
"loss": 0.9072,
"step": 1228
},
{
"epoch": 1.8509036144578315,
"grad_norm": 0.4011789468409193,
"learning_rate": 2.12890625e-05,
"loss": 0.8593,
"step": 1229
},
{
"epoch": 1.8524096385542168,
"grad_norm": 0.3947518366597008,
"learning_rate": 2.1261160714285716e-05,
"loss": 0.8345,
"step": 1230
},
{
"epoch": 1.8539156626506024,
"grad_norm": 0.665410917945062,
"learning_rate": 2.123325892857143e-05,
"loss": 0.8534,
"step": 1231
},
{
"epoch": 1.855421686746988,
"grad_norm": 0.4802638256383104,
"learning_rate": 2.1205357142857142e-05,
"loss": 0.7968,
"step": 1232
},
{
"epoch": 1.8569277108433735,
"grad_norm": 0.3803607598090677,
"learning_rate": 2.117745535714286e-05,
"loss": 0.8624,
"step": 1233
},
{
"epoch": 1.858433734939759,
"grad_norm": 0.5759009821621237,
"learning_rate": 2.1149553571428575e-05,
"loss": 0.9244,
"step": 1234
},
{
"epoch": 1.8599397590361446,
"grad_norm": 0.4130173719255352,
"learning_rate": 2.1121651785714285e-05,
"loss": 0.9339,
"step": 1235
},
{
"epoch": 1.8614457831325302,
"grad_norm": 0.6183080668913786,
"learning_rate": 2.109375e-05,
"loss": 0.9796,
"step": 1236
},
{
"epoch": 1.8629518072289155,
"grad_norm": 0.403554124818301,
"learning_rate": 2.1065848214285718e-05,
"loss": 0.7715,
"step": 1237
},
{
"epoch": 1.8644578313253013,
"grad_norm": 0.9548159726360518,
"learning_rate": 2.1037946428571427e-05,
"loss": 0.8788,
"step": 1238
},
{
"epoch": 1.8659638554216866,
"grad_norm": 0.3530584700087297,
"learning_rate": 2.1010044642857144e-05,
"loss": 0.8584,
"step": 1239
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.7495389471772913,
"learning_rate": 2.098214285714286e-05,
"loss": 0.8442,
"step": 1240
},
{
"epoch": 1.8689759036144578,
"grad_norm": 0.6821998896745344,
"learning_rate": 2.0954241071428573e-05,
"loss": 0.8546,
"step": 1241
},
{
"epoch": 1.8704819277108435,
"grad_norm": 0.47479887802226356,
"learning_rate": 2.0926339285714286e-05,
"loss": 0.9334,
"step": 1242
},
{
"epoch": 1.8719879518072289,
"grad_norm": 0.5603048620977122,
"learning_rate": 2.0898437500000002e-05,
"loss": 0.8423,
"step": 1243
},
{
"epoch": 1.8734939759036144,
"grad_norm": 0.39388646214201617,
"learning_rate": 2.0870535714285715e-05,
"loss": 0.8927,
"step": 1244
},
{
"epoch": 1.875,
"grad_norm": 0.4607169838560337,
"learning_rate": 2.0842633928571428e-05,
"loss": 0.8001,
"step": 1245
},
{
"epoch": 1.8765060240963856,
"grad_norm": 0.5569354825890832,
"learning_rate": 2.0814732142857145e-05,
"loss": 0.9392,
"step": 1246
},
{
"epoch": 1.8780120481927711,
"grad_norm": 0.43577830730470063,
"learning_rate": 2.0786830357142858e-05,
"loss": 0.9214,
"step": 1247
},
{
"epoch": 1.8795180722891565,
"grad_norm": 0.44562052553074394,
"learning_rate": 2.0758928571428574e-05,
"loss": 0.8198,
"step": 1248
},
{
"epoch": 1.8810240963855422,
"grad_norm": 0.4025769661860242,
"learning_rate": 2.0731026785714287e-05,
"loss": 0.8114,
"step": 1249
},
{
"epoch": 1.8825301204819276,
"grad_norm": 0.4797517480130995,
"learning_rate": 2.0703125e-05,
"loss": 0.8653,
"step": 1250
},
{
"epoch": 1.8840361445783134,
"grad_norm": 6.637770259712631,
"learning_rate": 2.0675223214285716e-05,
"loss": 1.1432,
"step": 1251
},
{
"epoch": 1.8855421686746987,
"grad_norm": 0.469987011003461,
"learning_rate": 2.064732142857143e-05,
"loss": 0.8394,
"step": 1252
},
{
"epoch": 1.8870481927710845,
"grad_norm": 0.4466450566360911,
"learning_rate": 2.0619419642857142e-05,
"loss": 0.8566,
"step": 1253
},
{
"epoch": 1.8885542168674698,
"grad_norm": 0.38088356126480577,
"learning_rate": 2.059151785714286e-05,
"loss": 0.8228,
"step": 1254
},
{
"epoch": 1.8900602409638554,
"grad_norm": 0.3631229556411153,
"learning_rate": 2.0563616071428575e-05,
"loss": 0.9498,
"step": 1255
},
{
"epoch": 1.891566265060241,
"grad_norm": 0.3993298380354046,
"learning_rate": 2.0535714285714285e-05,
"loss": 0.8492,
"step": 1256
},
{
"epoch": 1.8930722891566265,
"grad_norm": 0.40840340740312214,
"learning_rate": 2.05078125e-05,
"loss": 0.8524,
"step": 1257
},
{
"epoch": 1.894578313253012,
"grad_norm": 0.4612638073125458,
"learning_rate": 2.0479910714285718e-05,
"loss": 0.8666,
"step": 1258
},
{
"epoch": 1.8960843373493976,
"grad_norm": 0.5083920050245206,
"learning_rate": 2.0452008928571427e-05,
"loss": 0.9183,
"step": 1259
},
{
"epoch": 1.8975903614457832,
"grad_norm": 0.422016812147062,
"learning_rate": 2.0424107142857144e-05,
"loss": 0.8921,
"step": 1260
},
{
"epoch": 1.8990963855421685,
"grad_norm": 0.45752369706471674,
"learning_rate": 2.039620535714286e-05,
"loss": 0.9183,
"step": 1261
},
{
"epoch": 1.9006024096385543,
"grad_norm": 2.855018690075539,
"learning_rate": 2.0368303571428573e-05,
"loss": 0.8848,
"step": 1262
},
{
"epoch": 1.9021084337349397,
"grad_norm": 0.4639541572398917,
"learning_rate": 2.0340401785714286e-05,
"loss": 0.9341,
"step": 1263
},
{
"epoch": 1.9036144578313254,
"grad_norm": 0.4321145036292446,
"learning_rate": 2.0312500000000002e-05,
"loss": 0.82,
"step": 1264
},
{
"epoch": 1.9051204819277108,
"grad_norm": 0.4401457739860857,
"learning_rate": 2.0284598214285715e-05,
"loss": 0.9836,
"step": 1265
},
{
"epoch": 1.9066265060240963,
"grad_norm": 0.45590209389797237,
"learning_rate": 2.025669642857143e-05,
"loss": 0.8318,
"step": 1266
},
{
"epoch": 1.908132530120482,
"grad_norm": 0.3560469694781191,
"learning_rate": 2.0228794642857145e-05,
"loss": 0.9077,
"step": 1267
},
{
"epoch": 1.9096385542168675,
"grad_norm": 0.37827882094451176,
"learning_rate": 2.0200892857142858e-05,
"loss": 0.9629,
"step": 1268
},
{
"epoch": 1.911144578313253,
"grad_norm": 0.3713252422880374,
"learning_rate": 2.0172991071428574e-05,
"loss": 0.8183,
"step": 1269
},
{
"epoch": 1.9126506024096386,
"grad_norm": 0.33883448325551585,
"learning_rate": 2.0145089285714287e-05,
"loss": 0.8105,
"step": 1270
},
{
"epoch": 1.9141566265060241,
"grad_norm": 0.35590457815199456,
"learning_rate": 2.01171875e-05,
"loss": 0.7834,
"step": 1271
},
{
"epoch": 1.9156626506024095,
"grad_norm": 0.5578784489374652,
"learning_rate": 2.0089285714285717e-05,
"loss": 0.8759,
"step": 1272
},
{
"epoch": 1.9171686746987953,
"grad_norm": 0.40926383677816613,
"learning_rate": 2.006138392857143e-05,
"loss": 0.886,
"step": 1273
},
{
"epoch": 1.9186746987951806,
"grad_norm": 0.486194173387504,
"learning_rate": 2.0033482142857143e-05,
"loss": 0.9347,
"step": 1274
},
{
"epoch": 1.9201807228915664,
"grad_norm": 0.4136427713437559,
"learning_rate": 2.000558035714286e-05,
"loss": 0.9332,
"step": 1275
},
{
"epoch": 1.9216867469879517,
"grad_norm": 0.3483855660385954,
"learning_rate": 1.9977678571428572e-05,
"loss": 0.9359,
"step": 1276
},
{
"epoch": 1.9231927710843375,
"grad_norm": 0.437143609492026,
"learning_rate": 1.9949776785714285e-05,
"loss": 0.8524,
"step": 1277
},
{
"epoch": 1.9246987951807228,
"grad_norm": 0.3839105041725241,
"learning_rate": 1.9921875e-05,
"loss": 0.917,
"step": 1278
},
{
"epoch": 1.9262048192771084,
"grad_norm": 0.44255535746860586,
"learning_rate": 1.9893973214285714e-05,
"loss": 1.0097,
"step": 1279
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.40492062330103035,
"learning_rate": 1.9866071428571427e-05,
"loss": 0.8993,
"step": 1280
},
{
"epoch": 1.9292168674698795,
"grad_norm": 0.3477269035558404,
"learning_rate": 1.9838169642857144e-05,
"loss": 0.8446,
"step": 1281
},
{
"epoch": 1.930722891566265,
"grad_norm": 0.35901069911086986,
"learning_rate": 1.9810267857142857e-05,
"loss": 0.9402,
"step": 1282
},
{
"epoch": 1.9322289156626506,
"grad_norm": 0.3925820811568655,
"learning_rate": 1.9782366071428573e-05,
"loss": 0.8543,
"step": 1283
},
{
"epoch": 1.9337349397590362,
"grad_norm": 0.44410281224026454,
"learning_rate": 1.9754464285714286e-05,
"loss": 0.8916,
"step": 1284
},
{
"epoch": 1.9352409638554215,
"grad_norm": 0.4528830549358855,
"learning_rate": 1.9726562500000003e-05,
"loss": 0.9176,
"step": 1285
},
{
"epoch": 1.9367469879518073,
"grad_norm": 0.4251187019338651,
"learning_rate": 1.9698660714285716e-05,
"loss": 0.8343,
"step": 1286
},
{
"epoch": 1.9382530120481927,
"grad_norm": 0.9134414518004109,
"learning_rate": 1.967075892857143e-05,
"loss": 0.9058,
"step": 1287
},
{
"epoch": 1.9397590361445785,
"grad_norm": 0.4645213676150715,
"learning_rate": 1.9642857142857145e-05,
"loss": 0.8748,
"step": 1288
},
{
"epoch": 1.9412650602409638,
"grad_norm": 0.36520678033321824,
"learning_rate": 1.9614955357142858e-05,
"loss": 0.8423,
"step": 1289
},
{
"epoch": 1.9427710843373494,
"grad_norm": 0.5627155074171815,
"learning_rate": 1.9587053571428574e-05,
"loss": 0.9502,
"step": 1290
},
{
"epoch": 1.944277108433735,
"grad_norm": 0.43588759649566605,
"learning_rate": 1.9559151785714287e-05,
"loss": 0.9533,
"step": 1291
},
{
"epoch": 1.9457831325301205,
"grad_norm": 0.4492804069126713,
"learning_rate": 1.953125e-05,
"loss": 0.8347,
"step": 1292
},
{
"epoch": 1.947289156626506,
"grad_norm": 0.4774656152831537,
"learning_rate": 1.9503348214285717e-05,
"loss": 0.854,
"step": 1293
},
{
"epoch": 1.9487951807228916,
"grad_norm": 0.3636554052712092,
"learning_rate": 1.947544642857143e-05,
"loss": 0.938,
"step": 1294
},
{
"epoch": 1.9503012048192772,
"grad_norm": 0.428297806360514,
"learning_rate": 1.9447544642857143e-05,
"loss": 0.8964,
"step": 1295
},
{
"epoch": 1.9518072289156625,
"grad_norm": 0.39153956216387253,
"learning_rate": 1.941964285714286e-05,
"loss": 0.8541,
"step": 1296
},
{
"epoch": 1.9533132530120483,
"grad_norm": 0.332920802684014,
"learning_rate": 1.9391741071428572e-05,
"loss": 0.8538,
"step": 1297
},
{
"epoch": 1.9548192771084336,
"grad_norm": 0.3537239044824588,
"learning_rate": 1.9363839285714285e-05,
"loss": 0.8111,
"step": 1298
},
{
"epoch": 1.9563253012048194,
"grad_norm": 0.3910323433826671,
"learning_rate": 1.93359375e-05,
"loss": 0.7748,
"step": 1299
},
{
"epoch": 1.9578313253012047,
"grad_norm": 0.37260307966315526,
"learning_rate": 1.9308035714285715e-05,
"loss": 0.8392,
"step": 1300
},
{
"epoch": 1.9593373493975905,
"grad_norm": 0.3466814569395361,
"learning_rate": 1.9280133928571428e-05,
"loss": 0.8919,
"step": 1301
},
{
"epoch": 1.9608433734939759,
"grad_norm": 0.3668127686697282,
"learning_rate": 1.9252232142857144e-05,
"loss": 0.9278,
"step": 1302
},
{
"epoch": 1.9623493975903614,
"grad_norm": 0.3940255424369653,
"learning_rate": 1.9224330357142857e-05,
"loss": 0.855,
"step": 1303
},
{
"epoch": 1.963855421686747,
"grad_norm": 0.34885469264598656,
"learning_rate": 1.9196428571428573e-05,
"loss": 0.9353,
"step": 1304
},
{
"epoch": 1.9653614457831325,
"grad_norm": 0.3334308112535604,
"learning_rate": 1.9168526785714286e-05,
"loss": 0.9215,
"step": 1305
},
{
"epoch": 1.966867469879518,
"grad_norm": 0.3765065029636254,
"learning_rate": 1.9140625e-05,
"loss": 0.9731,
"step": 1306
},
{
"epoch": 1.9683734939759037,
"grad_norm": 0.36181841484585925,
"learning_rate": 1.9112723214285716e-05,
"loss": 0.9216,
"step": 1307
},
{
"epoch": 1.9698795180722892,
"grad_norm": 0.32612366899867334,
"learning_rate": 1.908482142857143e-05,
"loss": 0.8867,
"step": 1308
},
{
"epoch": 1.9713855421686746,
"grad_norm": 0.3705611280482762,
"learning_rate": 1.9056919642857142e-05,
"loss": 0.911,
"step": 1309
},
{
"epoch": 1.9728915662650603,
"grad_norm": 0.3232502381675511,
"learning_rate": 1.9029017857142858e-05,
"loss": 0.8989,
"step": 1310
},
{
"epoch": 1.9743975903614457,
"grad_norm": 0.8443694301319985,
"learning_rate": 1.9001116071428575e-05,
"loss": 0.8978,
"step": 1311
},
{
"epoch": 1.9759036144578315,
"grad_norm": 0.4540515753709519,
"learning_rate": 1.8973214285714284e-05,
"loss": 0.8448,
"step": 1312
},
{
"epoch": 1.9774096385542168,
"grad_norm": 0.3696027893273105,
"learning_rate": 1.89453125e-05,
"loss": 0.8359,
"step": 1313
},
{
"epoch": 1.9789156626506024,
"grad_norm": 0.35571569865318226,
"learning_rate": 1.8917410714285717e-05,
"loss": 0.862,
"step": 1314
},
{
"epoch": 1.980421686746988,
"grad_norm": 0.31180543442817893,
"learning_rate": 1.888950892857143e-05,
"loss": 0.8617,
"step": 1315
},
{
"epoch": 1.9819277108433735,
"grad_norm": 0.36160408239255354,
"learning_rate": 1.8861607142857143e-05,
"loss": 0.8913,
"step": 1316
},
{
"epoch": 1.983433734939759,
"grad_norm": 0.45790777153260115,
"learning_rate": 1.883370535714286e-05,
"loss": 0.8879,
"step": 1317
},
{
"epoch": 1.9849397590361446,
"grad_norm": 0.496836432274577,
"learning_rate": 1.8805803571428572e-05,
"loss": 0.869,
"step": 1318
},
{
"epoch": 1.9864457831325302,
"grad_norm": 0.3626950062319982,
"learning_rate": 1.8777901785714285e-05,
"loss": 0.8908,
"step": 1319
},
{
"epoch": 1.9879518072289155,
"grad_norm": 0.4427359957060849,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.8673,
"step": 1320
},
{
"epoch": 1.9894578313253013,
"grad_norm": 0.42629369958281044,
"learning_rate": 1.8722098214285715e-05,
"loss": 0.9288,
"step": 1321
},
{
"epoch": 1.9909638554216866,
"grad_norm": 0.4316198651007131,
"learning_rate": 1.8694196428571428e-05,
"loss": 0.8668,
"step": 1322
},
{
"epoch": 1.9924698795180724,
"grad_norm": 0.3788207632419721,
"learning_rate": 1.8666294642857144e-05,
"loss": 0.984,
"step": 1323
},
{
"epoch": 1.9939759036144578,
"grad_norm": 0.3812915905222573,
"learning_rate": 1.8638392857142857e-05,
"loss": 0.9209,
"step": 1324
},
{
"epoch": 1.9954819277108435,
"grad_norm": 0.35102950154747864,
"learning_rate": 1.8610491071428574e-05,
"loss": 0.8559,
"step": 1325
},
{
"epoch": 1.9969879518072289,
"grad_norm": 0.3121508096951349,
"learning_rate": 1.8582589285714287e-05,
"loss": 0.882,
"step": 1326
},
{
"epoch": 1.9984939759036144,
"grad_norm": 0.43980176631894125,
"learning_rate": 1.85546875e-05,
"loss": 1.0333,
"step": 1327
},
{
"epoch": 2.0,
"grad_norm": 0.3639881171930256,
"learning_rate": 1.8526785714285716e-05,
"loss": 0.8306,
"step": 1328
},
{
"epoch": 2.0015060240963853,
"grad_norm": 0.4645419957682661,
"learning_rate": 1.849888392857143e-05,
"loss": 0.7573,
"step": 1329
},
{
"epoch": 2.003012048192771,
"grad_norm": 0.3991769959264662,
"learning_rate": 1.8470982142857142e-05,
"loss": 0.7975,
"step": 1330
},
{
"epoch": 2.0045180722891565,
"grad_norm": 0.366644177641209,
"learning_rate": 1.844308035714286e-05,
"loss": 0.7799,
"step": 1331
},
{
"epoch": 2.0060240963855422,
"grad_norm": 0.460572017760071,
"learning_rate": 1.8415178571428575e-05,
"loss": 0.7055,
"step": 1332
},
{
"epoch": 2.0075301204819276,
"grad_norm": 0.43649135576103276,
"learning_rate": 1.8387276785714284e-05,
"loss": 0.7187,
"step": 1333
},
{
"epoch": 2.0090361445783134,
"grad_norm": 1.5677143195685717,
"learning_rate": 1.8359375e-05,
"loss": 0.7939,
"step": 1334
},
{
"epoch": 2.0105421686746987,
"grad_norm": 0.4892665120388365,
"learning_rate": 1.8331473214285717e-05,
"loss": 0.7309,
"step": 1335
},
{
"epoch": 2.0120481927710845,
"grad_norm": 0.4642370069684976,
"learning_rate": 1.8303571428571427e-05,
"loss": 0.8,
"step": 1336
},
{
"epoch": 2.01355421686747,
"grad_norm": 0.6818301999537285,
"learning_rate": 1.8275669642857143e-05,
"loss": 0.6766,
"step": 1337
},
{
"epoch": 2.0150602409638556,
"grad_norm": 0.4337687681212051,
"learning_rate": 1.824776785714286e-05,
"loss": 0.8006,
"step": 1338
},
{
"epoch": 2.016566265060241,
"grad_norm": 0.45583287566271075,
"learning_rate": 1.8219866071428573e-05,
"loss": 0.7657,
"step": 1339
},
{
"epoch": 2.0180722891566263,
"grad_norm": 0.5968682463705136,
"learning_rate": 1.8191964285714286e-05,
"loss": 0.8185,
"step": 1340
},
{
"epoch": 2.019578313253012,
"grad_norm": 0.41808321844989643,
"learning_rate": 1.8164062500000002e-05,
"loss": 0.7072,
"step": 1341
},
{
"epoch": 2.0210843373493974,
"grad_norm": 0.46643525754868204,
"learning_rate": 1.8136160714285715e-05,
"loss": 0.7267,
"step": 1342
},
{
"epoch": 2.022590361445783,
"grad_norm": 0.3876279819071906,
"learning_rate": 1.8108258928571428e-05,
"loss": 0.7417,
"step": 1343
},
{
"epoch": 2.0240963855421685,
"grad_norm": 0.4098094575732391,
"learning_rate": 1.8080357142857144e-05,
"loss": 0.7083,
"step": 1344
},
{
"epoch": 2.0256024096385543,
"grad_norm": 0.49406976276463177,
"learning_rate": 1.8052455357142857e-05,
"loss": 0.8178,
"step": 1345
},
{
"epoch": 2.0271084337349397,
"grad_norm": 0.45369411013371175,
"learning_rate": 1.8024553571428574e-05,
"loss": 0.7517,
"step": 1346
},
{
"epoch": 2.0286144578313254,
"grad_norm": 0.33950542385105087,
"learning_rate": 1.7996651785714287e-05,
"loss": 0.7702,
"step": 1347
},
{
"epoch": 2.0301204819277108,
"grad_norm": 0.5260328792053571,
"learning_rate": 1.796875e-05,
"loss": 0.7149,
"step": 1348
},
{
"epoch": 2.0316265060240966,
"grad_norm": 0.39030208535546845,
"learning_rate": 1.7940848214285716e-05,
"loss": 0.7894,
"step": 1349
},
{
"epoch": 2.033132530120482,
"grad_norm": 0.415210832549199,
"learning_rate": 1.791294642857143e-05,
"loss": 0.744,
"step": 1350
},
{
"epoch": 2.0346385542168677,
"grad_norm": 0.46336490743463876,
"learning_rate": 1.7885044642857142e-05,
"loss": 0.7955,
"step": 1351
},
{
"epoch": 2.036144578313253,
"grad_norm": 0.39514711622942733,
"learning_rate": 1.785714285714286e-05,
"loss": 0.6959,
"step": 1352
},
{
"epoch": 2.0376506024096384,
"grad_norm": 0.362320780601732,
"learning_rate": 1.7829241071428575e-05,
"loss": 0.7671,
"step": 1353
},
{
"epoch": 2.039156626506024,
"grad_norm": 0.6758194108847355,
"learning_rate": 1.7801339285714284e-05,
"loss": 0.7043,
"step": 1354
},
{
"epoch": 2.0406626506024095,
"grad_norm": 0.3575986667122882,
"learning_rate": 1.77734375e-05,
"loss": 0.6845,
"step": 1355
},
{
"epoch": 2.0421686746987953,
"grad_norm": 0.38028183575537383,
"learning_rate": 1.7745535714285717e-05,
"loss": 0.7155,
"step": 1356
},
{
"epoch": 2.0436746987951806,
"grad_norm": 0.38077365274183594,
"learning_rate": 1.7717633928571427e-05,
"loss": 0.771,
"step": 1357
},
{
"epoch": 2.0451807228915664,
"grad_norm": 0.33288640240919065,
"learning_rate": 1.7689732142857143e-05,
"loss": 0.7881,
"step": 1358
},
{
"epoch": 2.0466867469879517,
"grad_norm": 0.38075008941069827,
"learning_rate": 1.766183035714286e-05,
"loss": 0.804,
"step": 1359
},
{
"epoch": 2.0481927710843375,
"grad_norm": 0.3426384122315898,
"learning_rate": 1.7633928571428573e-05,
"loss": 0.7245,
"step": 1360
},
{
"epoch": 2.049698795180723,
"grad_norm": 0.3234425184935498,
"learning_rate": 1.7606026785714286e-05,
"loss": 0.7127,
"step": 1361
},
{
"epoch": 2.0512048192771086,
"grad_norm": 0.3415220519117308,
"learning_rate": 1.7578125000000002e-05,
"loss": 0.8097,
"step": 1362
},
{
"epoch": 2.052710843373494,
"grad_norm": 0.30201764745575643,
"learning_rate": 1.7550223214285715e-05,
"loss": 0.7515,
"step": 1363
},
{
"epoch": 2.0542168674698793,
"grad_norm": 0.45086145913840586,
"learning_rate": 1.7522321428571428e-05,
"loss": 0.7597,
"step": 1364
},
{
"epoch": 2.055722891566265,
"grad_norm": 2.201751277313472,
"learning_rate": 1.7494419642857144e-05,
"loss": 0.7647,
"step": 1365
},
{
"epoch": 2.0572289156626504,
"grad_norm": 0.4144888920502803,
"learning_rate": 1.7466517857142857e-05,
"loss": 0.8357,
"step": 1366
},
{
"epoch": 2.058734939759036,
"grad_norm": 0.43518660160553496,
"learning_rate": 1.7438616071428574e-05,
"loss": 0.719,
"step": 1367
},
{
"epoch": 2.0602409638554215,
"grad_norm": 0.468138277260558,
"learning_rate": 1.7410714285714287e-05,
"loss": 0.8172,
"step": 1368
},
{
"epoch": 2.0617469879518073,
"grad_norm": 0.41405090777040815,
"learning_rate": 1.73828125e-05,
"loss": 0.7224,
"step": 1369
},
{
"epoch": 2.0632530120481927,
"grad_norm": 0.4790785643336795,
"learning_rate": 1.7354910714285716e-05,
"loss": 0.8008,
"step": 1370
},
{
"epoch": 2.0647590361445785,
"grad_norm": 0.34582355702719386,
"learning_rate": 1.732700892857143e-05,
"loss": 0.7541,
"step": 1371
},
{
"epoch": 2.066265060240964,
"grad_norm": 0.36152945746865667,
"learning_rate": 1.7299107142857142e-05,
"loss": 0.7268,
"step": 1372
},
{
"epoch": 2.0677710843373496,
"grad_norm": 5.576082478496776,
"learning_rate": 1.727120535714286e-05,
"loss": 0.9278,
"step": 1373
},
{
"epoch": 2.069277108433735,
"grad_norm": 0.41868069740108393,
"learning_rate": 1.7243303571428575e-05,
"loss": 0.8436,
"step": 1374
},
{
"epoch": 2.0707831325301207,
"grad_norm": 0.3647904972558811,
"learning_rate": 1.7215401785714285e-05,
"loss": 0.7964,
"step": 1375
},
{
"epoch": 2.072289156626506,
"grad_norm": 0.516012245346367,
"learning_rate": 1.71875e-05,
"loss": 0.765,
"step": 1376
},
{
"epoch": 2.0737951807228914,
"grad_norm": 0.3592428556181413,
"learning_rate": 1.7159598214285717e-05,
"loss": 0.7335,
"step": 1377
},
{
"epoch": 2.075301204819277,
"grad_norm": 0.36092554523752485,
"learning_rate": 1.7131696428571427e-05,
"loss": 0.7949,
"step": 1378
},
{
"epoch": 2.0768072289156625,
"grad_norm": 0.2876660554940345,
"learning_rate": 1.7103794642857143e-05,
"loss": 0.6423,
"step": 1379
},
{
"epoch": 2.0783132530120483,
"grad_norm": 0.4223033831891601,
"learning_rate": 1.707589285714286e-05,
"loss": 0.7569,
"step": 1380
},
{
"epoch": 2.0798192771084336,
"grad_norm": 0.4173530939927145,
"learning_rate": 1.7047991071428573e-05,
"loss": 0.7801,
"step": 1381
},
{
"epoch": 2.0813253012048194,
"grad_norm": 0.3642914088410953,
"learning_rate": 1.7020089285714286e-05,
"loss": 0.8204,
"step": 1382
},
{
"epoch": 2.0828313253012047,
"grad_norm": 0.45066842005845775,
"learning_rate": 1.6992187500000002e-05,
"loss": 0.7172,
"step": 1383
},
{
"epoch": 2.0843373493975905,
"grad_norm": 0.3616712702809069,
"learning_rate": 1.6964285714285715e-05,
"loss": 0.7609,
"step": 1384
},
{
"epoch": 2.085843373493976,
"grad_norm": 0.40530917169528374,
"learning_rate": 1.6936383928571428e-05,
"loss": 0.7043,
"step": 1385
},
{
"epoch": 2.0873493975903616,
"grad_norm": 0.3605786894238744,
"learning_rate": 1.6908482142857145e-05,
"loss": 0.6784,
"step": 1386
},
{
"epoch": 2.088855421686747,
"grad_norm": 0.3335858581718125,
"learning_rate": 1.6880580357142858e-05,
"loss": 0.7826,
"step": 1387
},
{
"epoch": 2.0903614457831323,
"grad_norm": 0.3595279698443346,
"learning_rate": 1.6852678571428574e-05,
"loss": 0.6945,
"step": 1388
},
{
"epoch": 2.091867469879518,
"grad_norm": 0.3093524461602205,
"learning_rate": 1.6824776785714287e-05,
"loss": 0.7806,
"step": 1389
},
{
"epoch": 2.0933734939759034,
"grad_norm": 0.44207557611221115,
"learning_rate": 1.6796875e-05,
"loss": 0.6671,
"step": 1390
},
{
"epoch": 2.0948795180722892,
"grad_norm": 0.3345810949981571,
"learning_rate": 1.6768973214285716e-05,
"loss": 0.753,
"step": 1391
},
{
"epoch": 2.0963855421686746,
"grad_norm": 0.34931514402833846,
"learning_rate": 1.674107142857143e-05,
"loss": 0.6678,
"step": 1392
},
{
"epoch": 2.0978915662650603,
"grad_norm": 0.3574941441120308,
"learning_rate": 1.6713169642857142e-05,
"loss": 0.7697,
"step": 1393
},
{
"epoch": 2.0993975903614457,
"grad_norm": 0.3693009973013364,
"learning_rate": 1.668526785714286e-05,
"loss": 0.7327,
"step": 1394
},
{
"epoch": 2.1009036144578315,
"grad_norm": 0.3948135424608381,
"learning_rate": 1.6657366071428572e-05,
"loss": 0.7514,
"step": 1395
},
{
"epoch": 2.102409638554217,
"grad_norm": 0.38265698108087787,
"learning_rate": 1.6629464285714285e-05,
"loss": 0.7406,
"step": 1396
},
{
"epoch": 2.1039156626506026,
"grad_norm": 0.3804612049367573,
"learning_rate": 1.66015625e-05,
"loss": 0.7637,
"step": 1397
},
{
"epoch": 2.105421686746988,
"grad_norm": 0.45971678867779936,
"learning_rate": 1.6573660714285714e-05,
"loss": 0.7348,
"step": 1398
},
{
"epoch": 2.1069277108433733,
"grad_norm": 0.3891415095099081,
"learning_rate": 1.6545758928571427e-05,
"loss": 0.8473,
"step": 1399
},
{
"epoch": 2.108433734939759,
"grad_norm": 0.40613457880910886,
"learning_rate": 1.6517857142857144e-05,
"loss": 0.7956,
"step": 1400
},
{
"epoch": 2.1099397590361444,
"grad_norm": 0.38826514142509766,
"learning_rate": 1.648995535714286e-05,
"loss": 0.7493,
"step": 1401
},
{
"epoch": 2.11144578313253,
"grad_norm": 0.40876079230980183,
"learning_rate": 1.6462053571428573e-05,
"loss": 0.7084,
"step": 1402
},
{
"epoch": 2.1129518072289155,
"grad_norm": 0.33775044409923294,
"learning_rate": 1.6434151785714286e-05,
"loss": 0.8349,
"step": 1403
},
{
"epoch": 2.1144578313253013,
"grad_norm": 0.3625455106378626,
"learning_rate": 1.6406250000000002e-05,
"loss": 0.8021,
"step": 1404
},
{
"epoch": 2.1159638554216866,
"grad_norm": 0.35565670020506474,
"learning_rate": 1.6378348214285715e-05,
"loss": 0.7967,
"step": 1405
},
{
"epoch": 2.1174698795180724,
"grad_norm": 0.3344657596508849,
"learning_rate": 1.635044642857143e-05,
"loss": 0.6726,
"step": 1406
},
{
"epoch": 2.1189759036144578,
"grad_norm": 0.3577356458506431,
"learning_rate": 1.6322544642857145e-05,
"loss": 0.7324,
"step": 1407
},
{
"epoch": 2.1204819277108435,
"grad_norm": 0.4621713522030514,
"learning_rate": 1.6294642857142858e-05,
"loss": 0.7955,
"step": 1408
},
{
"epoch": 2.121987951807229,
"grad_norm": 0.38173804451631965,
"learning_rate": 1.6266741071428574e-05,
"loss": 0.8104,
"step": 1409
},
{
"epoch": 2.1234939759036147,
"grad_norm": 0.3240358047388974,
"learning_rate": 1.6238839285714287e-05,
"loss": 0.7611,
"step": 1410
},
{
"epoch": 2.125,
"grad_norm": 0.4094057184613191,
"learning_rate": 1.62109375e-05,
"loss": 0.8551,
"step": 1411
},
{
"epoch": 2.1265060240963853,
"grad_norm": 0.4331967583145405,
"learning_rate": 1.6183035714285717e-05,
"loss": 0.7724,
"step": 1412
},
{
"epoch": 2.128012048192771,
"grad_norm": 0.4349497533625537,
"learning_rate": 1.615513392857143e-05,
"loss": 0.7672,
"step": 1413
},
{
"epoch": 2.1295180722891565,
"grad_norm": 0.36647812022046955,
"learning_rate": 1.6127232142857143e-05,
"loss": 0.8496,
"step": 1414
},
{
"epoch": 2.1310240963855422,
"grad_norm": 0.45869182040822,
"learning_rate": 1.609933035714286e-05,
"loss": 0.8237,
"step": 1415
},
{
"epoch": 2.1325301204819276,
"grad_norm": 0.4031142026932879,
"learning_rate": 1.6071428571428572e-05,
"loss": 0.7475,
"step": 1416
},
{
"epoch": 2.1340361445783134,
"grad_norm": 0.3521215581010213,
"learning_rate": 1.6043526785714285e-05,
"loss": 0.7108,
"step": 1417
},
{
"epoch": 2.1355421686746987,
"grad_norm": 0.4484599295501349,
"learning_rate": 1.6015625e-05,
"loss": 0.7478,
"step": 1418
},
{
"epoch": 2.1370481927710845,
"grad_norm": 0.32907755017475787,
"learning_rate": 1.5987723214285714e-05,
"loss": 0.7376,
"step": 1419
},
{
"epoch": 2.13855421686747,
"grad_norm": 0.5947662686588567,
"learning_rate": 1.5959821428571427e-05,
"loss": 0.7178,
"step": 1420
},
{
"epoch": 2.1400602409638556,
"grad_norm": 0.345645876211955,
"learning_rate": 1.5931919642857144e-05,
"loss": 0.6904,
"step": 1421
},
{
"epoch": 2.141566265060241,
"grad_norm": 0.34696081709722676,
"learning_rate": 1.5904017857142857e-05,
"loss": 0.7697,
"step": 1422
},
{
"epoch": 2.1430722891566267,
"grad_norm": 0.36557333518396634,
"learning_rate": 1.5876116071428573e-05,
"loss": 0.7379,
"step": 1423
},
{
"epoch": 2.144578313253012,
"grad_norm": 0.3303570392411989,
"learning_rate": 1.5848214285714286e-05,
"loss": 0.6624,
"step": 1424
},
{
"epoch": 2.1460843373493974,
"grad_norm": 0.32650789147224313,
"learning_rate": 1.58203125e-05,
"loss": 0.8062,
"step": 1425
},
{
"epoch": 2.147590361445783,
"grad_norm": 0.31323031203465923,
"learning_rate": 1.5792410714285716e-05,
"loss": 0.7013,
"step": 1426
},
{
"epoch": 2.1490963855421685,
"grad_norm": 0.3102361773365546,
"learning_rate": 1.576450892857143e-05,
"loss": 0.8351,
"step": 1427
},
{
"epoch": 2.1506024096385543,
"grad_norm": 0.3284729582294777,
"learning_rate": 1.573660714285714e-05,
"loss": 0.7476,
"step": 1428
},
{
"epoch": 2.1521084337349397,
"grad_norm": 0.3245267868353776,
"learning_rate": 1.5708705357142858e-05,
"loss": 0.7416,
"step": 1429
},
{
"epoch": 2.1536144578313254,
"grad_norm": 0.3621532354825919,
"learning_rate": 1.5680803571428574e-05,
"loss": 0.8161,
"step": 1430
},
{
"epoch": 2.1551204819277108,
"grad_norm": 0.31798550935686776,
"learning_rate": 1.5652901785714287e-05,
"loss": 0.6925,
"step": 1431
},
{
"epoch": 2.1566265060240966,
"grad_norm": 0.30365964969105036,
"learning_rate": 1.5625e-05,
"loss": 0.7943,
"step": 1432
},
{
"epoch": 2.158132530120482,
"grad_norm": 0.33661767287720845,
"learning_rate": 1.5597098214285717e-05,
"loss": 0.722,
"step": 1433
},
{
"epoch": 2.1596385542168672,
"grad_norm": 0.3024257708102083,
"learning_rate": 1.556919642857143e-05,
"loss": 0.8094,
"step": 1434
},
{
"epoch": 2.161144578313253,
"grad_norm": 0.3886734909263864,
"learning_rate": 1.5541294642857143e-05,
"loss": 0.7583,
"step": 1435
},
{
"epoch": 2.1626506024096384,
"grad_norm": 0.3316832869220419,
"learning_rate": 1.551339285714286e-05,
"loss": 0.8357,
"step": 1436
},
{
"epoch": 2.164156626506024,
"grad_norm": 0.3214310144558677,
"learning_rate": 1.5485491071428572e-05,
"loss": 0.7312,
"step": 1437
},
{
"epoch": 2.1656626506024095,
"grad_norm": 0.31458944600983796,
"learning_rate": 1.5457589285714285e-05,
"loss": 0.6734,
"step": 1438
},
{
"epoch": 2.1671686746987953,
"grad_norm": 0.3691240001857765,
"learning_rate": 1.54296875e-05,
"loss": 0.7418,
"step": 1439
},
{
"epoch": 2.1686746987951806,
"grad_norm": 0.37900091234913547,
"learning_rate": 1.5401785714285715e-05,
"loss": 0.7634,
"step": 1440
},
{
"epoch": 2.1701807228915664,
"grad_norm": 0.3892665931923318,
"learning_rate": 1.5373883928571428e-05,
"loss": 0.6819,
"step": 1441
},
{
"epoch": 2.1716867469879517,
"grad_norm": 0.5170319100346973,
"learning_rate": 1.5345982142857144e-05,
"loss": 0.7684,
"step": 1442
},
{
"epoch": 2.1731927710843375,
"grad_norm": 0.37576405701592275,
"learning_rate": 1.5318080357142857e-05,
"loss": 0.73,
"step": 1443
},
{
"epoch": 2.174698795180723,
"grad_norm": 0.4906713795886261,
"learning_rate": 1.5290178571428573e-05,
"loss": 0.7135,
"step": 1444
},
{
"epoch": 2.1762048192771086,
"grad_norm": 0.3384509794384884,
"learning_rate": 1.5262276785714286e-05,
"loss": 0.7305,
"step": 1445
},
{
"epoch": 2.177710843373494,
"grad_norm": 0.38880149604824804,
"learning_rate": 1.5234375000000001e-05,
"loss": 0.7131,
"step": 1446
},
{
"epoch": 2.1792168674698793,
"grad_norm": 0.46609194524790026,
"learning_rate": 1.5206473214285716e-05,
"loss": 0.7895,
"step": 1447
},
{
"epoch": 2.180722891566265,
"grad_norm": 0.42144982010934895,
"learning_rate": 1.5178571428571429e-05,
"loss": 0.8333,
"step": 1448
},
{
"epoch": 2.1822289156626504,
"grad_norm": 0.43233800009331064,
"learning_rate": 1.5150669642857143e-05,
"loss": 0.7834,
"step": 1449
},
{
"epoch": 2.183734939759036,
"grad_norm": 0.39046796371611303,
"learning_rate": 1.5122767857142858e-05,
"loss": 0.7225,
"step": 1450
},
{
"epoch": 2.1852409638554215,
"grad_norm": 0.35599971361170696,
"learning_rate": 1.5094866071428573e-05,
"loss": 0.8478,
"step": 1451
},
{
"epoch": 2.1867469879518073,
"grad_norm": 0.376894104515715,
"learning_rate": 1.5066964285714286e-05,
"loss": 0.7243,
"step": 1452
},
{
"epoch": 2.1882530120481927,
"grad_norm": 0.33451908144810605,
"learning_rate": 1.50390625e-05,
"loss": 0.7442,
"step": 1453
},
{
"epoch": 2.1897590361445785,
"grad_norm": 0.4517295720656928,
"learning_rate": 1.5011160714285715e-05,
"loss": 0.7234,
"step": 1454
},
{
"epoch": 2.191265060240964,
"grad_norm": 0.3999828153267003,
"learning_rate": 1.4983258928571428e-05,
"loss": 0.7536,
"step": 1455
},
{
"epoch": 2.1927710843373496,
"grad_norm": 0.39686587388349964,
"learning_rate": 1.4955357142857143e-05,
"loss": 0.7198,
"step": 1456
},
{
"epoch": 2.194277108433735,
"grad_norm": 0.4773228010706459,
"learning_rate": 1.4927455357142858e-05,
"loss": 0.721,
"step": 1457
},
{
"epoch": 2.1957831325301207,
"grad_norm": 0.311235739186363,
"learning_rate": 1.4899553571428574e-05,
"loss": 0.7655,
"step": 1458
},
{
"epoch": 2.197289156626506,
"grad_norm": 0.39917764017614926,
"learning_rate": 1.4871651785714285e-05,
"loss": 0.6966,
"step": 1459
},
{
"epoch": 2.1987951807228914,
"grad_norm": 0.38865401937076227,
"learning_rate": 1.484375e-05,
"loss": 0.8043,
"step": 1460
},
{
"epoch": 2.200301204819277,
"grad_norm": 0.4135066027653089,
"learning_rate": 1.4815848214285716e-05,
"loss": 0.751,
"step": 1461
},
{
"epoch": 2.2018072289156625,
"grad_norm": 0.4280038092482014,
"learning_rate": 1.4787946428571428e-05,
"loss": 0.8172,
"step": 1462
},
{
"epoch": 2.2033132530120483,
"grad_norm": 0.31875548970442735,
"learning_rate": 1.4760044642857142e-05,
"loss": 0.7899,
"step": 1463
},
{
"epoch": 2.2048192771084336,
"grad_norm": 0.36114725627517974,
"learning_rate": 1.4732142857142859e-05,
"loss": 0.7191,
"step": 1464
},
{
"epoch": 2.2063253012048194,
"grad_norm": 0.44930215922386957,
"learning_rate": 1.4704241071428573e-05,
"loss": 0.6902,
"step": 1465
},
{
"epoch": 2.2078313253012047,
"grad_norm": 5.629659541218995,
"learning_rate": 1.4676339285714286e-05,
"loss": 0.9258,
"step": 1466
},
{
"epoch": 2.2093373493975905,
"grad_norm": 0.6065400351049115,
"learning_rate": 1.4648437500000001e-05,
"loss": 0.7278,
"step": 1467
},
{
"epoch": 2.210843373493976,
"grad_norm": 0.5315992751585906,
"learning_rate": 1.4620535714285716e-05,
"loss": 0.759,
"step": 1468
},
{
"epoch": 2.2123493975903616,
"grad_norm": 0.4095278741080125,
"learning_rate": 1.4592633928571429e-05,
"loss": 0.7673,
"step": 1469
},
{
"epoch": 2.213855421686747,
"grad_norm": 4.426838580875431,
"learning_rate": 1.4564732142857144e-05,
"loss": 0.9324,
"step": 1470
},
{
"epoch": 2.2153614457831328,
"grad_norm": 0.7868496470106048,
"learning_rate": 1.4536830357142858e-05,
"loss": 0.711,
"step": 1471
},
{
"epoch": 2.216867469879518,
"grad_norm": 0.38722904076610354,
"learning_rate": 1.4508928571428573e-05,
"loss": 0.7714,
"step": 1472
},
{
"epoch": 2.2183734939759034,
"grad_norm": 0.5901853931313702,
"learning_rate": 1.4481026785714286e-05,
"loss": 0.7157,
"step": 1473
},
{
"epoch": 2.2198795180722892,
"grad_norm": 0.5014487266760705,
"learning_rate": 1.4453125e-05,
"loss": 0.7762,
"step": 1474
},
{
"epoch": 2.2213855421686746,
"grad_norm": 0.4466679619214497,
"learning_rate": 1.4425223214285715e-05,
"loss": 0.7586,
"step": 1475
},
{
"epoch": 2.2228915662650603,
"grad_norm": 0.49424688849988746,
"learning_rate": 1.4397321428571428e-05,
"loss": 0.7683,
"step": 1476
},
{
"epoch": 2.2243975903614457,
"grad_norm": 0.3997938450461635,
"learning_rate": 1.4369419642857143e-05,
"loss": 0.7261,
"step": 1477
},
{
"epoch": 2.2259036144578315,
"grad_norm": 0.48977189481603206,
"learning_rate": 1.4341517857142858e-05,
"loss": 0.7346,
"step": 1478
},
{
"epoch": 2.227409638554217,
"grad_norm": 4.009904485792324,
"learning_rate": 1.4313616071428574e-05,
"loss": 0.8032,
"step": 1479
},
{
"epoch": 2.2289156626506026,
"grad_norm": 0.394660280245848,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.718,
"step": 1480
},
{
"epoch": 2.230421686746988,
"grad_norm": 0.44155577113233757,
"learning_rate": 1.42578125e-05,
"loss": 0.7882,
"step": 1481
},
{
"epoch": 2.2319277108433733,
"grad_norm": 0.4700845307258182,
"learning_rate": 1.4229910714285717e-05,
"loss": 0.8706,
"step": 1482
},
{
"epoch": 2.233433734939759,
"grad_norm": 0.36500043709587043,
"learning_rate": 1.4202008928571428e-05,
"loss": 0.7573,
"step": 1483
},
{
"epoch": 2.2349397590361444,
"grad_norm": 0.35204631333776126,
"learning_rate": 1.4174107142857143e-05,
"loss": 0.7185,
"step": 1484
},
{
"epoch": 2.23644578313253,
"grad_norm": 0.3894148206826439,
"learning_rate": 1.4146205357142859e-05,
"loss": 0.7481,
"step": 1485
},
{
"epoch": 2.2379518072289155,
"grad_norm": 0.4905843605260157,
"learning_rate": 1.4118303571428574e-05,
"loss": 0.7253,
"step": 1486
},
{
"epoch": 2.2394578313253013,
"grad_norm": 0.40699885906197847,
"learning_rate": 1.4090401785714285e-05,
"loss": 0.7234,
"step": 1487
},
{
"epoch": 2.2409638554216866,
"grad_norm": 0.34639617536160733,
"learning_rate": 1.4062500000000001e-05,
"loss": 0.7285,
"step": 1488
},
{
"epoch": 2.2424698795180724,
"grad_norm": 0.38831073635710883,
"learning_rate": 1.4034598214285716e-05,
"loss": 0.7088,
"step": 1489
},
{
"epoch": 2.2439759036144578,
"grad_norm": 0.34833864297874506,
"learning_rate": 1.4006696428571427e-05,
"loss": 0.6875,
"step": 1490
},
{
"epoch": 2.2454819277108435,
"grad_norm": 0.45125460276534163,
"learning_rate": 1.3978794642857144e-05,
"loss": 0.7775,
"step": 1491
},
{
"epoch": 2.246987951807229,
"grad_norm": 0.3412153391063464,
"learning_rate": 1.3950892857142858e-05,
"loss": 0.7194,
"step": 1492
},
{
"epoch": 2.2484939759036147,
"grad_norm": 0.3434777799997792,
"learning_rate": 1.3922991071428573e-05,
"loss": 0.7948,
"step": 1493
},
{
"epoch": 2.25,
"grad_norm": 0.3315516658118742,
"learning_rate": 1.3895089285714286e-05,
"loss": 0.7815,
"step": 1494
},
{
"epoch": 2.2515060240963853,
"grad_norm": 0.3778204995685785,
"learning_rate": 1.38671875e-05,
"loss": 0.8173,
"step": 1495
},
{
"epoch": 2.253012048192771,
"grad_norm": 0.31282356182336674,
"learning_rate": 1.3839285714285715e-05,
"loss": 0.7457,
"step": 1496
},
{
"epoch": 2.2545180722891565,
"grad_norm": 0.3201952407776396,
"learning_rate": 1.3811383928571428e-05,
"loss": 0.7644,
"step": 1497
},
{
"epoch": 2.2560240963855422,
"grad_norm": 0.34649909979641264,
"learning_rate": 1.3783482142857143e-05,
"loss": 0.7604,
"step": 1498
},
{
"epoch": 2.2575301204819276,
"grad_norm": 0.31791074081886034,
"learning_rate": 1.3755580357142858e-05,
"loss": 0.7239,
"step": 1499
},
{
"epoch": 2.2590361445783134,
"grad_norm": 0.3520427559817463,
"learning_rate": 1.3727678571428573e-05,
"loss": 0.7628,
"step": 1500
},
{
"epoch": 2.2605421686746987,
"grad_norm": 0.37048507674464376,
"learning_rate": 1.3699776785714286e-05,
"loss": 0.7818,
"step": 1501
},
{
"epoch": 2.2620481927710845,
"grad_norm": 0.33395191795406953,
"learning_rate": 1.3671875e-05,
"loss": 0.8464,
"step": 1502
},
{
"epoch": 2.26355421686747,
"grad_norm": 0.33671498400440064,
"learning_rate": 1.3643973214285715e-05,
"loss": 0.8762,
"step": 1503
},
{
"epoch": 2.2650602409638556,
"grad_norm": 0.3916141098381724,
"learning_rate": 1.3616071428571428e-05,
"loss": 0.7531,
"step": 1504
},
{
"epoch": 2.266566265060241,
"grad_norm": 0.3304283919090669,
"learning_rate": 1.3588169642857143e-05,
"loss": 0.7849,
"step": 1505
},
{
"epoch": 2.2680722891566267,
"grad_norm": 0.32638210603988604,
"learning_rate": 1.3560267857142857e-05,
"loss": 0.7481,
"step": 1506
},
{
"epoch": 2.269578313253012,
"grad_norm": 0.32657651141560134,
"learning_rate": 1.3532366071428574e-05,
"loss": 0.7003,
"step": 1507
},
{
"epoch": 2.2710843373493974,
"grad_norm": 0.3287406545492075,
"learning_rate": 1.3504464285714285e-05,
"loss": 0.7745,
"step": 1508
},
{
"epoch": 2.272590361445783,
"grad_norm": 0.3040287332094913,
"learning_rate": 1.3476562500000001e-05,
"loss": 0.7823,
"step": 1509
},
{
"epoch": 2.2740963855421685,
"grad_norm": 0.3204669639788259,
"learning_rate": 1.3448660714285716e-05,
"loss": 0.7355,
"step": 1510
},
{
"epoch": 2.2756024096385543,
"grad_norm": 0.44714640355767765,
"learning_rate": 1.3420758928571427e-05,
"loss": 0.7645,
"step": 1511
},
{
"epoch": 2.2771084337349397,
"grad_norm": 0.29290394290760374,
"learning_rate": 1.3392857142857144e-05,
"loss": 0.7299,
"step": 1512
},
{
"epoch": 2.2786144578313254,
"grad_norm": 0.35137233976556037,
"learning_rate": 1.3364955357142859e-05,
"loss": 0.7447,
"step": 1513
},
{
"epoch": 2.2801204819277108,
"grad_norm": 0.3494034458248596,
"learning_rate": 1.3337053571428573e-05,
"loss": 0.7942,
"step": 1514
},
{
"epoch": 2.2816265060240966,
"grad_norm": 0.3019653242019043,
"learning_rate": 1.3309151785714286e-05,
"loss": 0.7091,
"step": 1515
},
{
"epoch": 2.283132530120482,
"grad_norm": 0.34801799749492796,
"learning_rate": 1.3281250000000001e-05,
"loss": 0.7618,
"step": 1516
},
{
"epoch": 2.2846385542168672,
"grad_norm": 0.34559036231199464,
"learning_rate": 1.3253348214285716e-05,
"loss": 0.7138,
"step": 1517
},
{
"epoch": 2.286144578313253,
"grad_norm": 0.6098079262189459,
"learning_rate": 1.3225446428571429e-05,
"loss": 0.7797,
"step": 1518
},
{
"epoch": 2.287650602409639,
"grad_norm": 0.34363701224742343,
"learning_rate": 1.3197544642857143e-05,
"loss": 0.7293,
"step": 1519
},
{
"epoch": 2.289156626506024,
"grad_norm": 0.3353648488153242,
"learning_rate": 1.3169642857142858e-05,
"loss": 0.807,
"step": 1520
},
{
"epoch": 2.2906626506024095,
"grad_norm": 0.33628133497739626,
"learning_rate": 1.3141741071428573e-05,
"loss": 0.8546,
"step": 1521
},
{
"epoch": 2.2921686746987953,
"grad_norm": 0.3806016308955366,
"learning_rate": 1.3113839285714286e-05,
"loss": 0.8023,
"step": 1522
},
{
"epoch": 2.2936746987951806,
"grad_norm": 0.3705354512353976,
"learning_rate": 1.30859375e-05,
"loss": 0.7666,
"step": 1523
},
{
"epoch": 2.2951807228915664,
"grad_norm": 0.32779701425936303,
"learning_rate": 1.3058035714285715e-05,
"loss": 0.8191,
"step": 1524
},
{
"epoch": 2.2966867469879517,
"grad_norm": 0.3687256323588038,
"learning_rate": 1.3030133928571428e-05,
"loss": 0.7877,
"step": 1525
},
{
"epoch": 2.2981927710843375,
"grad_norm": 0.40558806435165456,
"learning_rate": 1.3002232142857143e-05,
"loss": 0.7996,
"step": 1526
},
{
"epoch": 2.299698795180723,
"grad_norm": 0.37421589263740757,
"learning_rate": 1.2974330357142858e-05,
"loss": 0.7893,
"step": 1527
},
{
"epoch": 2.3012048192771086,
"grad_norm": 0.3378217568813387,
"learning_rate": 1.2946428571428574e-05,
"loss": 0.7833,
"step": 1528
},
{
"epoch": 2.302710843373494,
"grad_norm": 0.3820747457855776,
"learning_rate": 1.2918526785714285e-05,
"loss": 0.7101,
"step": 1529
},
{
"epoch": 2.3042168674698793,
"grad_norm": 0.3550534218167412,
"learning_rate": 1.2890625e-05,
"loss": 0.7825,
"step": 1530
},
{
"epoch": 2.305722891566265,
"grad_norm": 0.33153684215253115,
"learning_rate": 1.2862723214285716e-05,
"loss": 0.8278,
"step": 1531
},
{
"epoch": 2.3072289156626504,
"grad_norm": 0.5235387685373346,
"learning_rate": 1.2834821428571428e-05,
"loss": 0.7424,
"step": 1532
},
{
"epoch": 2.308734939759036,
"grad_norm": 0.30023751601381604,
"learning_rate": 1.2806919642857142e-05,
"loss": 0.7967,
"step": 1533
},
{
"epoch": 2.3102409638554215,
"grad_norm": 0.33791465392398207,
"learning_rate": 1.2779017857142859e-05,
"loss": 0.7612,
"step": 1534
},
{
"epoch": 2.3117469879518073,
"grad_norm": 0.5877012442882222,
"learning_rate": 1.2751116071428573e-05,
"loss": 0.7334,
"step": 1535
},
{
"epoch": 2.3132530120481927,
"grad_norm": 0.32441322865683986,
"learning_rate": 1.2723214285714285e-05,
"loss": 0.7203,
"step": 1536
},
{
"epoch": 2.3147590361445785,
"grad_norm": 0.38708922657560385,
"learning_rate": 1.2695312500000001e-05,
"loss": 0.7145,
"step": 1537
},
{
"epoch": 2.316265060240964,
"grad_norm": 0.3670600283794749,
"learning_rate": 1.2667410714285716e-05,
"loss": 0.7315,
"step": 1538
},
{
"epoch": 2.3177710843373496,
"grad_norm": 0.30999772452723007,
"learning_rate": 1.2639508928571429e-05,
"loss": 0.7198,
"step": 1539
},
{
"epoch": 2.319277108433735,
"grad_norm": 0.4114193275077201,
"learning_rate": 1.2611607142857144e-05,
"loss": 0.6656,
"step": 1540
},
{
"epoch": 2.3207831325301207,
"grad_norm": 0.36950164328726537,
"learning_rate": 1.2583705357142858e-05,
"loss": 0.7513,
"step": 1541
},
{
"epoch": 2.322289156626506,
"grad_norm": 0.32797870683087715,
"learning_rate": 1.2555803571428573e-05,
"loss": 0.7354,
"step": 1542
},
{
"epoch": 2.3237951807228914,
"grad_norm": 0.3381588459848259,
"learning_rate": 1.2527901785714286e-05,
"loss": 0.7692,
"step": 1543
},
{
"epoch": 2.325301204819277,
"grad_norm": 0.3488294126059973,
"learning_rate": 1.25e-05,
"loss": 0.6647,
"step": 1544
},
{
"epoch": 2.3268072289156625,
"grad_norm": 0.33652825348547083,
"learning_rate": 1.2472098214285714e-05,
"loss": 0.7183,
"step": 1545
},
{
"epoch": 2.3283132530120483,
"grad_norm": 0.35366727400277786,
"learning_rate": 1.244419642857143e-05,
"loss": 0.6547,
"step": 1546
},
{
"epoch": 2.3298192771084336,
"grad_norm": 0.3116422606174144,
"learning_rate": 1.2416294642857143e-05,
"loss": 0.7862,
"step": 1547
},
{
"epoch": 2.3313253012048194,
"grad_norm": 0.31662707539133983,
"learning_rate": 1.2388392857142858e-05,
"loss": 0.7954,
"step": 1548
},
{
"epoch": 2.3328313253012047,
"grad_norm": 0.3520072226113262,
"learning_rate": 1.2360491071428572e-05,
"loss": 0.7636,
"step": 1549
},
{
"epoch": 2.3343373493975905,
"grad_norm": 0.37957257710829045,
"learning_rate": 1.2332589285714287e-05,
"loss": 0.8053,
"step": 1550
},
{
"epoch": 2.335843373493976,
"grad_norm": 0.39294241122126156,
"learning_rate": 1.23046875e-05,
"loss": 0.8411,
"step": 1551
},
{
"epoch": 2.337349397590361,
"grad_norm": 0.3817941578460477,
"learning_rate": 1.2276785714285715e-05,
"loss": 0.7594,
"step": 1552
},
{
"epoch": 2.338855421686747,
"grad_norm": 0.3266523024320165,
"learning_rate": 1.224888392857143e-05,
"loss": 0.7624,
"step": 1553
},
{
"epoch": 2.3403614457831328,
"grad_norm": 0.3187437277319167,
"learning_rate": 1.2220982142857142e-05,
"loss": 0.7042,
"step": 1554
},
{
"epoch": 2.341867469879518,
"grad_norm": 0.3388167308705055,
"learning_rate": 1.2193080357142859e-05,
"loss": 0.7086,
"step": 1555
},
{
"epoch": 2.3433734939759034,
"grad_norm": 0.3268282422744848,
"learning_rate": 1.2165178571428572e-05,
"loss": 0.6961,
"step": 1556
},
{
"epoch": 2.3448795180722892,
"grad_norm": 0.31557933473966115,
"learning_rate": 1.2137276785714287e-05,
"loss": 0.7257,
"step": 1557
},
{
"epoch": 2.3463855421686746,
"grad_norm": 0.37828668508509394,
"learning_rate": 1.2109375000000001e-05,
"loss": 0.687,
"step": 1558
},
{
"epoch": 2.3478915662650603,
"grad_norm": 0.32628719383748184,
"learning_rate": 1.2081473214285714e-05,
"loss": 0.729,
"step": 1559
},
{
"epoch": 2.3493975903614457,
"grad_norm": 0.35601393534592257,
"learning_rate": 1.2053571428571429e-05,
"loss": 0.7199,
"step": 1560
},
{
"epoch": 2.3509036144578315,
"grad_norm": 0.4147488869105199,
"learning_rate": 1.2025669642857144e-05,
"loss": 0.7867,
"step": 1561
},
{
"epoch": 2.352409638554217,
"grad_norm": 0.3228752972973316,
"learning_rate": 1.1997767857142858e-05,
"loss": 0.7421,
"step": 1562
},
{
"epoch": 2.3539156626506026,
"grad_norm": 0.3816049275147419,
"learning_rate": 1.1969866071428571e-05,
"loss": 0.8204,
"step": 1563
},
{
"epoch": 2.355421686746988,
"grad_norm": 0.38238943332817554,
"learning_rate": 1.1941964285714286e-05,
"loss": 0.7683,
"step": 1564
},
{
"epoch": 2.3569277108433733,
"grad_norm": 0.3883181445207651,
"learning_rate": 1.19140625e-05,
"loss": 0.6902,
"step": 1565
},
{
"epoch": 2.358433734939759,
"grad_norm": 0.31459464899698264,
"learning_rate": 1.1886160714285714e-05,
"loss": 0.689,
"step": 1566
},
{
"epoch": 2.3599397590361444,
"grad_norm": 0.45655037390674574,
"learning_rate": 1.185825892857143e-05,
"loss": 0.7705,
"step": 1567
},
{
"epoch": 2.36144578313253,
"grad_norm": 0.3348764051495036,
"learning_rate": 1.1830357142857143e-05,
"loss": 0.79,
"step": 1568
},
{
"epoch": 2.3629518072289155,
"grad_norm": 0.47390562940406716,
"learning_rate": 1.1802455357142858e-05,
"loss": 0.8226,
"step": 1569
},
{
"epoch": 2.3644578313253013,
"grad_norm": 0.334088553713862,
"learning_rate": 1.1774553571428573e-05,
"loss": 0.7485,
"step": 1570
},
{
"epoch": 2.3659638554216866,
"grad_norm": 0.3119151636701632,
"learning_rate": 1.1746651785714287e-05,
"loss": 0.7696,
"step": 1571
},
{
"epoch": 2.3674698795180724,
"grad_norm": 0.45614326541727057,
"learning_rate": 1.171875e-05,
"loss": 0.805,
"step": 1572
},
{
"epoch": 2.3689759036144578,
"grad_norm": 0.4141650115812532,
"learning_rate": 1.1690848214285715e-05,
"loss": 0.7471,
"step": 1573
},
{
"epoch": 2.3704819277108435,
"grad_norm": 0.3406866374371924,
"learning_rate": 1.166294642857143e-05,
"loss": 0.7218,
"step": 1574
},
{
"epoch": 2.371987951807229,
"grad_norm": 0.3472080815866802,
"learning_rate": 1.1635044642857143e-05,
"loss": 0.7655,
"step": 1575
},
{
"epoch": 2.3734939759036147,
"grad_norm": 0.4181126564299595,
"learning_rate": 1.1607142857142857e-05,
"loss": 0.72,
"step": 1576
},
{
"epoch": 2.375,
"grad_norm": 0.3985159369714137,
"learning_rate": 1.1579241071428572e-05,
"loss": 0.7865,
"step": 1577
},
{
"epoch": 2.3765060240963853,
"grad_norm": 0.3326108070777843,
"learning_rate": 1.1551339285714287e-05,
"loss": 0.721,
"step": 1578
},
{
"epoch": 2.378012048192771,
"grad_norm": 0.30327944101361815,
"learning_rate": 1.15234375e-05,
"loss": 0.651,
"step": 1579
},
{
"epoch": 2.3795180722891565,
"grad_norm": 0.29869785963887485,
"learning_rate": 1.1495535714285714e-05,
"loss": 0.7196,
"step": 1580
},
{
"epoch": 2.3810240963855422,
"grad_norm": 2.8070582817998306,
"learning_rate": 1.1467633928571429e-05,
"loss": 0.7505,
"step": 1581
},
{
"epoch": 2.3825301204819276,
"grad_norm": 0.3002452373708965,
"learning_rate": 1.1439732142857144e-05,
"loss": 0.715,
"step": 1582
},
{
"epoch": 2.3840361445783134,
"grad_norm": 0.34518288105059947,
"learning_rate": 1.1411830357142859e-05,
"loss": 0.7036,
"step": 1583
},
{
"epoch": 2.3855421686746987,
"grad_norm": 0.3212164749795799,
"learning_rate": 1.1383928571428572e-05,
"loss": 0.8252,
"step": 1584
},
{
"epoch": 2.3870481927710845,
"grad_norm": 0.356356053947901,
"learning_rate": 1.1356026785714286e-05,
"loss": 0.8325,
"step": 1585
},
{
"epoch": 2.38855421686747,
"grad_norm": 0.302258849662692,
"learning_rate": 1.1328125000000001e-05,
"loss": 0.6998,
"step": 1586
},
{
"epoch": 2.3900602409638556,
"grad_norm": 0.3084371779989265,
"learning_rate": 1.1300223214285714e-05,
"loss": 0.6782,
"step": 1587
},
{
"epoch": 2.391566265060241,
"grad_norm": 0.42507923753805554,
"learning_rate": 1.1272321428571429e-05,
"loss": 0.748,
"step": 1588
},
{
"epoch": 2.3930722891566267,
"grad_norm": 0.3593574839433531,
"learning_rate": 1.1244419642857143e-05,
"loss": 0.7505,
"step": 1589
},
{
"epoch": 2.394578313253012,
"grad_norm": 0.32150968977368144,
"learning_rate": 1.1216517857142858e-05,
"loss": 0.6844,
"step": 1590
},
{
"epoch": 2.3960843373493974,
"grad_norm": 0.3830557852788486,
"learning_rate": 1.1188616071428571e-05,
"loss": 0.7307,
"step": 1591
},
{
"epoch": 2.397590361445783,
"grad_norm": 0.32883487560918984,
"learning_rate": 1.1160714285714287e-05,
"loss": 0.8465,
"step": 1592
},
{
"epoch": 2.3990963855421685,
"grad_norm": 0.3176558879818026,
"learning_rate": 1.11328125e-05,
"loss": 0.7233,
"step": 1593
},
{
"epoch": 2.4006024096385543,
"grad_norm": 0.35517175293561243,
"learning_rate": 1.1104910714285713e-05,
"loss": 0.7711,
"step": 1594
},
{
"epoch": 2.4021084337349397,
"grad_norm": 0.4161855943582698,
"learning_rate": 1.107700892857143e-05,
"loss": 0.7482,
"step": 1595
},
{
"epoch": 2.4036144578313254,
"grad_norm": 0.317577741346565,
"learning_rate": 1.1049107142857143e-05,
"loss": 0.8201,
"step": 1596
},
{
"epoch": 2.4051204819277108,
"grad_norm": 0.5283223690946734,
"learning_rate": 1.1021205357142857e-05,
"loss": 0.8123,
"step": 1597
},
{
"epoch": 2.4066265060240966,
"grad_norm": 0.32526314756031255,
"learning_rate": 1.0993303571428572e-05,
"loss": 0.7833,
"step": 1598
},
{
"epoch": 2.408132530120482,
"grad_norm": 0.278710304248553,
"learning_rate": 1.0965401785714287e-05,
"loss": 0.6619,
"step": 1599
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.3663596585520872,
"learning_rate": 1.09375e-05,
"loss": 0.7412,
"step": 1600
},
{
"epoch": 2.411144578313253,
"grad_norm": 9.099862641821197,
"learning_rate": 1.0909598214285715e-05,
"loss": 0.8334,
"step": 1601
},
{
"epoch": 2.412650602409639,
"grad_norm": 0.3754383671622475,
"learning_rate": 1.088169642857143e-05,
"loss": 0.7517,
"step": 1602
},
{
"epoch": 2.414156626506024,
"grad_norm": 0.3308813829704266,
"learning_rate": 1.0853794642857142e-05,
"loss": 0.767,
"step": 1603
},
{
"epoch": 2.4156626506024095,
"grad_norm": 0.4908829443861295,
"learning_rate": 1.0825892857142859e-05,
"loss": 0.7642,
"step": 1604
},
{
"epoch": 2.4171686746987953,
"grad_norm": 0.47736764239809126,
"learning_rate": 1.0797991071428572e-05,
"loss": 0.8113,
"step": 1605
},
{
"epoch": 2.4186746987951806,
"grad_norm": 0.35380880278609955,
"learning_rate": 1.0770089285714286e-05,
"loss": 0.7407,
"step": 1606
},
{
"epoch": 2.4201807228915664,
"grad_norm": 0.40944850266191524,
"learning_rate": 1.0742187500000001e-05,
"loss": 0.8732,
"step": 1607
},
{
"epoch": 2.4216867469879517,
"grad_norm": 0.4523343590986343,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.786,
"step": 1608
},
{
"epoch": 2.4231927710843375,
"grad_norm": 0.33711424675896884,
"learning_rate": 1.0686383928571429e-05,
"loss": 0.7704,
"step": 1609
},
{
"epoch": 2.424698795180723,
"grad_norm": 1.2527101170341979,
"learning_rate": 1.0658482142857143e-05,
"loss": 0.8738,
"step": 1610
},
{
"epoch": 2.4262048192771086,
"grad_norm": 0.352217672675291,
"learning_rate": 1.0630580357142858e-05,
"loss": 0.823,
"step": 1611
},
{
"epoch": 2.427710843373494,
"grad_norm": 0.33148829607882463,
"learning_rate": 1.0602678571428571e-05,
"loss": 0.6878,
"step": 1612
},
{
"epoch": 2.4292168674698793,
"grad_norm": 0.7705081488609036,
"learning_rate": 1.0574776785714288e-05,
"loss": 0.7366,
"step": 1613
},
{
"epoch": 2.430722891566265,
"grad_norm": 0.3096865611107542,
"learning_rate": 1.0546875e-05,
"loss": 0.7126,
"step": 1614
},
{
"epoch": 2.4322289156626504,
"grad_norm": 0.3646637609001635,
"learning_rate": 1.0518973214285714e-05,
"loss": 0.7024,
"step": 1615
},
{
"epoch": 2.433734939759036,
"grad_norm": 0.3212814165072044,
"learning_rate": 1.049107142857143e-05,
"loss": 0.6839,
"step": 1616
},
{
"epoch": 2.4352409638554215,
"grad_norm": 0.36745537360626596,
"learning_rate": 1.0463169642857143e-05,
"loss": 0.7812,
"step": 1617
},
{
"epoch": 2.4367469879518073,
"grad_norm": 0.326006879973065,
"learning_rate": 1.0435267857142858e-05,
"loss": 0.7005,
"step": 1618
},
{
"epoch": 2.4382530120481927,
"grad_norm": 0.32195385100982427,
"learning_rate": 1.0407366071428572e-05,
"loss": 0.7676,
"step": 1619
},
{
"epoch": 2.4397590361445785,
"grad_norm": 0.3057148822743455,
"learning_rate": 1.0379464285714287e-05,
"loss": 0.7359,
"step": 1620
},
{
"epoch": 2.441265060240964,
"grad_norm": 0.35698563805633854,
"learning_rate": 1.03515625e-05,
"loss": 0.8196,
"step": 1621
},
{
"epoch": 2.4427710843373496,
"grad_norm": 0.35576744946089167,
"learning_rate": 1.0323660714285715e-05,
"loss": 0.6722,
"step": 1622
},
{
"epoch": 2.444277108433735,
"grad_norm": 0.32006447986520775,
"learning_rate": 1.029575892857143e-05,
"loss": 0.7664,
"step": 1623
},
{
"epoch": 2.4457831325301207,
"grad_norm": 0.39629472779127706,
"learning_rate": 1.0267857142857142e-05,
"loss": 0.8362,
"step": 1624
},
{
"epoch": 2.447289156626506,
"grad_norm": 0.3431479732778465,
"learning_rate": 1.0239955357142859e-05,
"loss": 0.6791,
"step": 1625
},
{
"epoch": 2.4487951807228914,
"grad_norm": 0.36829125465274487,
"learning_rate": 1.0212053571428572e-05,
"loss": 0.8223,
"step": 1626
},
{
"epoch": 2.450301204819277,
"grad_norm": 0.30497956492071304,
"learning_rate": 1.0184151785714287e-05,
"loss": 0.7243,
"step": 1627
},
{
"epoch": 2.4518072289156625,
"grad_norm": 0.37729459411051475,
"learning_rate": 1.0156250000000001e-05,
"loss": 0.6752,
"step": 1628
},
{
"epoch": 2.4533132530120483,
"grad_norm": 0.3836694007357503,
"learning_rate": 1.0128348214285714e-05,
"loss": 0.6971,
"step": 1629
},
{
"epoch": 2.4548192771084336,
"grad_norm": 0.31722623796920624,
"learning_rate": 1.0100446428571429e-05,
"loss": 0.7387,
"step": 1630
},
{
"epoch": 2.4563253012048194,
"grad_norm": 0.3121767082504369,
"learning_rate": 1.0072544642857144e-05,
"loss": 0.7319,
"step": 1631
},
{
"epoch": 2.4578313253012047,
"grad_norm": 0.3780022777526052,
"learning_rate": 1.0044642857142858e-05,
"loss": 0.7069,
"step": 1632
},
{
"epoch": 2.4593373493975905,
"grad_norm": 0.34563955691316695,
"learning_rate": 1.0016741071428571e-05,
"loss": 0.7257,
"step": 1633
},
{
"epoch": 2.460843373493976,
"grad_norm": 0.4170413829593214,
"learning_rate": 9.988839285714286e-06,
"loss": 0.7894,
"step": 1634
},
{
"epoch": 2.462349397590361,
"grad_norm": 0.3608347388605557,
"learning_rate": 9.9609375e-06,
"loss": 0.8056,
"step": 1635
},
{
"epoch": 2.463855421686747,
"grad_norm": 0.3658191559519562,
"learning_rate": 9.933035714285714e-06,
"loss": 0.7564,
"step": 1636
},
{
"epoch": 2.4653614457831328,
"grad_norm": 0.3496037556674547,
"learning_rate": 9.905133928571428e-06,
"loss": 0.7171,
"step": 1637
},
{
"epoch": 2.466867469879518,
"grad_norm": 0.34926407735281834,
"learning_rate": 9.877232142857143e-06,
"loss": 0.7095,
"step": 1638
},
{
"epoch": 2.4683734939759034,
"grad_norm": 0.33297389299600105,
"learning_rate": 9.849330357142858e-06,
"loss": 0.7933,
"step": 1639
},
{
"epoch": 2.4698795180722892,
"grad_norm": 0.4008270518972094,
"learning_rate": 9.821428571428573e-06,
"loss": 0.7837,
"step": 1640
},
{
"epoch": 2.4713855421686746,
"grad_norm": 0.32786172209321857,
"learning_rate": 9.793526785714287e-06,
"loss": 0.801,
"step": 1641
},
{
"epoch": 2.4728915662650603,
"grad_norm": 0.33162283318794417,
"learning_rate": 9.765625e-06,
"loss": 0.7046,
"step": 1642
},
{
"epoch": 2.4743975903614457,
"grad_norm": 0.32794247295603285,
"learning_rate": 9.737723214285715e-06,
"loss": 0.732,
"step": 1643
},
{
"epoch": 2.4759036144578315,
"grad_norm": 0.30922403203379856,
"learning_rate": 9.70982142857143e-06,
"loss": 0.6706,
"step": 1644
},
{
"epoch": 2.477409638554217,
"grad_norm": 0.3275014819455298,
"learning_rate": 9.681919642857143e-06,
"loss": 0.743,
"step": 1645
},
{
"epoch": 2.4789156626506026,
"grad_norm": 0.3293056468465578,
"learning_rate": 9.654017857142857e-06,
"loss": 0.812,
"step": 1646
},
{
"epoch": 2.480421686746988,
"grad_norm": 0.4518667482647438,
"learning_rate": 9.626116071428572e-06,
"loss": 0.7656,
"step": 1647
},
{
"epoch": 2.4819277108433733,
"grad_norm": 0.2750064612288585,
"learning_rate": 9.598214285714287e-06,
"loss": 0.6894,
"step": 1648
},
{
"epoch": 2.483433734939759,
"grad_norm": 0.3220729496862736,
"learning_rate": 9.5703125e-06,
"loss": 0.7604,
"step": 1649
},
{
"epoch": 2.4849397590361444,
"grad_norm": 0.9653347432397725,
"learning_rate": 9.542410714285714e-06,
"loss": 0.7702,
"step": 1650
},
{
"epoch": 2.48644578313253,
"grad_norm": 0.3200061045834666,
"learning_rate": 9.514508928571429e-06,
"loss": 0.7086,
"step": 1651
},
{
"epoch": 2.4879518072289155,
"grad_norm": 0.3384418591621976,
"learning_rate": 9.486607142857142e-06,
"loss": 0.7965,
"step": 1652
},
{
"epoch": 2.4894578313253013,
"grad_norm": 0.30560538155243794,
"learning_rate": 9.458705357142858e-06,
"loss": 0.7264,
"step": 1653
},
{
"epoch": 2.4909638554216866,
"grad_norm": 0.28316436547607876,
"learning_rate": 9.430803571428571e-06,
"loss": 0.7678,
"step": 1654
},
{
"epoch": 2.4924698795180724,
"grad_norm": 0.3431959840639018,
"learning_rate": 9.402901785714286e-06,
"loss": 0.7688,
"step": 1655
},
{
"epoch": 2.4939759036144578,
"grad_norm": 0.3634651952724355,
"learning_rate": 9.375000000000001e-06,
"loss": 0.7449,
"step": 1656
},
{
"epoch": 2.4954819277108435,
"grad_norm": 0.3512078563200498,
"learning_rate": 9.347098214285714e-06,
"loss": 0.8026,
"step": 1657
},
{
"epoch": 2.496987951807229,
"grad_norm": 0.33971691197904924,
"learning_rate": 9.319196428571429e-06,
"loss": 0.8172,
"step": 1658
},
{
"epoch": 2.4984939759036147,
"grad_norm": 0.3214519519095423,
"learning_rate": 9.291294642857143e-06,
"loss": 0.7766,
"step": 1659
},
{
"epoch": 2.5,
"grad_norm": 0.3300704483930006,
"learning_rate": 9.263392857142858e-06,
"loss": 0.7675,
"step": 1660
},
{
"epoch": 2.5015060240963853,
"grad_norm": 0.3194051415655414,
"learning_rate": 9.235491071428571e-06,
"loss": 0.7773,
"step": 1661
},
{
"epoch": 2.503012048192771,
"grad_norm": 0.3275955587302349,
"learning_rate": 9.207589285714287e-06,
"loss": 0.7106,
"step": 1662
},
{
"epoch": 2.5045180722891565,
"grad_norm": 0.4028080584103782,
"learning_rate": 9.1796875e-06,
"loss": 0.7358,
"step": 1663
},
{
"epoch": 2.5060240963855422,
"grad_norm": 0.312909517020488,
"learning_rate": 9.151785714285713e-06,
"loss": 0.7022,
"step": 1664
},
{
"epoch": 2.5075301204819276,
"grad_norm": 0.295421568719493,
"learning_rate": 9.12388392857143e-06,
"loss": 0.7185,
"step": 1665
},
{
"epoch": 2.5090361445783134,
"grad_norm": 0.29750441602981054,
"learning_rate": 9.095982142857143e-06,
"loss": 0.7018,
"step": 1666
},
{
"epoch": 2.5105421686746987,
"grad_norm": 0.3632913023086794,
"learning_rate": 9.068080357142857e-06,
"loss": 0.7654,
"step": 1667
},
{
"epoch": 2.5120481927710845,
"grad_norm": 0.3213212393458334,
"learning_rate": 9.040178571428572e-06,
"loss": 0.8426,
"step": 1668
},
{
"epoch": 2.51355421686747,
"grad_norm": 0.43756566715206985,
"learning_rate": 9.012276785714287e-06,
"loss": 0.7618,
"step": 1669
},
{
"epoch": 2.515060240963855,
"grad_norm": 0.38614648579149197,
"learning_rate": 8.984375e-06,
"loss": 0.6978,
"step": 1670
},
{
"epoch": 2.516566265060241,
"grad_norm": 0.31857899188793437,
"learning_rate": 8.956473214285715e-06,
"loss": 0.7781,
"step": 1671
},
{
"epoch": 2.5180722891566267,
"grad_norm": 0.3492619415176146,
"learning_rate": 8.92857142857143e-06,
"loss": 0.7265,
"step": 1672
},
{
"epoch": 2.519578313253012,
"grad_norm": 0.40997150176397573,
"learning_rate": 8.900669642857142e-06,
"loss": 0.8332,
"step": 1673
},
{
"epoch": 2.5210843373493974,
"grad_norm": 0.30423155481950254,
"learning_rate": 8.872767857142859e-06,
"loss": 0.7473,
"step": 1674
},
{
"epoch": 2.522590361445783,
"grad_norm": 0.3150534403421142,
"learning_rate": 8.844866071428572e-06,
"loss": 0.7063,
"step": 1675
},
{
"epoch": 2.5240963855421685,
"grad_norm": 0.2827213507142706,
"learning_rate": 8.816964285714286e-06,
"loss": 0.7127,
"step": 1676
},
{
"epoch": 2.5256024096385543,
"grad_norm": 0.44323766157120825,
"learning_rate": 8.789062500000001e-06,
"loss": 0.8124,
"step": 1677
},
{
"epoch": 2.5271084337349397,
"grad_norm": 0.31143912418563424,
"learning_rate": 8.761160714285714e-06,
"loss": 0.7436,
"step": 1678
},
{
"epoch": 2.5286144578313254,
"grad_norm": 0.2852586752507426,
"learning_rate": 8.733258928571429e-06,
"loss": 0.7121,
"step": 1679
},
{
"epoch": 2.5301204819277108,
"grad_norm": 0.290852277121653,
"learning_rate": 8.705357142857143e-06,
"loss": 0.7092,
"step": 1680
},
{
"epoch": 2.5316265060240966,
"grad_norm": 0.34747534919721623,
"learning_rate": 8.677455357142858e-06,
"loss": 0.725,
"step": 1681
},
{
"epoch": 2.533132530120482,
"grad_norm": 0.2885485377803353,
"learning_rate": 8.649553571428571e-06,
"loss": 0.7582,
"step": 1682
},
{
"epoch": 2.5346385542168672,
"grad_norm": 0.2944768735900468,
"learning_rate": 8.621651785714288e-06,
"loss": 0.7344,
"step": 1683
},
{
"epoch": 2.536144578313253,
"grad_norm": 0.33640698865737795,
"learning_rate": 8.59375e-06,
"loss": 0.7754,
"step": 1684
},
{
"epoch": 2.537650602409639,
"grad_norm": 0.29309966282394656,
"learning_rate": 8.565848214285714e-06,
"loss": 0.7488,
"step": 1685
},
{
"epoch": 2.539156626506024,
"grad_norm": 0.3112044530213108,
"learning_rate": 8.53794642857143e-06,
"loss": 0.7877,
"step": 1686
},
{
"epoch": 2.5406626506024095,
"grad_norm": 0.31462286822432445,
"learning_rate": 8.510044642857143e-06,
"loss": 0.7369,
"step": 1687
},
{
"epoch": 2.5421686746987953,
"grad_norm": 0.3018236341853575,
"learning_rate": 8.482142857142858e-06,
"loss": 0.7689,
"step": 1688
},
{
"epoch": 2.5436746987951806,
"grad_norm": 0.3290912878735477,
"learning_rate": 8.454241071428572e-06,
"loss": 0.8191,
"step": 1689
},
{
"epoch": 2.5451807228915664,
"grad_norm": 0.30159760746277026,
"learning_rate": 8.426339285714287e-06,
"loss": 0.6907,
"step": 1690
},
{
"epoch": 2.5466867469879517,
"grad_norm": 0.27511792008401753,
"learning_rate": 8.3984375e-06,
"loss": 0.6875,
"step": 1691
},
{
"epoch": 2.5481927710843375,
"grad_norm": 0.3193212987781086,
"learning_rate": 8.370535714285715e-06,
"loss": 0.7151,
"step": 1692
},
{
"epoch": 2.549698795180723,
"grad_norm": 0.30484346892105973,
"learning_rate": 8.34263392857143e-06,
"loss": 0.787,
"step": 1693
},
{
"epoch": 2.5512048192771086,
"grad_norm": 0.35002067683836413,
"learning_rate": 8.314732142857142e-06,
"loss": 0.7047,
"step": 1694
},
{
"epoch": 2.552710843373494,
"grad_norm": 0.30055272545090506,
"learning_rate": 8.286830357142857e-06,
"loss": 0.7525,
"step": 1695
},
{
"epoch": 2.5542168674698793,
"grad_norm": 0.3097319886280592,
"learning_rate": 8.258928571428572e-06,
"loss": 0.7091,
"step": 1696
},
{
"epoch": 2.555722891566265,
"grad_norm": 0.3113709834816135,
"learning_rate": 8.231026785714286e-06,
"loss": 0.7108,
"step": 1697
},
{
"epoch": 2.557228915662651,
"grad_norm": 0.3404911531317668,
"learning_rate": 8.203125000000001e-06,
"loss": 0.6913,
"step": 1698
},
{
"epoch": 2.558734939759036,
"grad_norm": 0.29629324537287216,
"learning_rate": 8.175223214285714e-06,
"loss": 0.7747,
"step": 1699
},
{
"epoch": 2.5602409638554215,
"grad_norm": 0.36363402506892634,
"learning_rate": 8.147321428571429e-06,
"loss": 0.789,
"step": 1700
},
{
"epoch": 2.5617469879518073,
"grad_norm": 0.3351018299822603,
"learning_rate": 8.119419642857144e-06,
"loss": 0.7463,
"step": 1701
},
{
"epoch": 2.5632530120481927,
"grad_norm": 0.34151895194478593,
"learning_rate": 8.091517857142858e-06,
"loss": 0.7768,
"step": 1702
},
{
"epoch": 2.5647590361445785,
"grad_norm": 0.45820963661807423,
"learning_rate": 8.063616071428571e-06,
"loss": 0.7407,
"step": 1703
},
{
"epoch": 2.566265060240964,
"grad_norm": 0.3352161151097203,
"learning_rate": 8.035714285714286e-06,
"loss": 0.6755,
"step": 1704
},
{
"epoch": 2.567771084337349,
"grad_norm": 0.3169084299120574,
"learning_rate": 8.0078125e-06,
"loss": 0.7471,
"step": 1705
},
{
"epoch": 2.569277108433735,
"grad_norm": 0.3355797013952085,
"learning_rate": 7.979910714285714e-06,
"loss": 0.7526,
"step": 1706
},
{
"epoch": 2.5707831325301207,
"grad_norm": 0.3508763844828514,
"learning_rate": 7.952008928571428e-06,
"loss": 0.7552,
"step": 1707
},
{
"epoch": 2.572289156626506,
"grad_norm": 0.30185750269095435,
"learning_rate": 7.924107142857143e-06,
"loss": 0.7402,
"step": 1708
},
{
"epoch": 2.5737951807228914,
"grad_norm": 0.3088190422650214,
"learning_rate": 7.896205357142858e-06,
"loss": 0.6934,
"step": 1709
},
{
"epoch": 2.575301204819277,
"grad_norm": 0.2979001535960527,
"learning_rate": 7.86830357142857e-06,
"loss": 0.7637,
"step": 1710
},
{
"epoch": 2.5768072289156625,
"grad_norm": 0.33104826957943423,
"learning_rate": 7.840401785714287e-06,
"loss": 0.7175,
"step": 1711
},
{
"epoch": 2.5783132530120483,
"grad_norm": 0.2827750019155564,
"learning_rate": 7.8125e-06,
"loss": 0.7752,
"step": 1712
},
{
"epoch": 2.5798192771084336,
"grad_norm": 0.34451558868137294,
"learning_rate": 7.784598214285715e-06,
"loss": 0.8112,
"step": 1713
},
{
"epoch": 2.5813253012048194,
"grad_norm": 0.32888771692530944,
"learning_rate": 7.75669642857143e-06,
"loss": 0.8763,
"step": 1714
},
{
"epoch": 2.5828313253012047,
"grad_norm": 0.3358824838925111,
"learning_rate": 7.728794642857143e-06,
"loss": 0.7125,
"step": 1715
},
{
"epoch": 2.5843373493975905,
"grad_norm": 0.30772555796997797,
"learning_rate": 7.700892857142857e-06,
"loss": 0.82,
"step": 1716
},
{
"epoch": 2.585843373493976,
"grad_norm": 0.30537826650342315,
"learning_rate": 7.672991071428572e-06,
"loss": 0.7098,
"step": 1717
},
{
"epoch": 2.587349397590361,
"grad_norm": 0.3364116449898356,
"learning_rate": 7.645089285714287e-06,
"loss": 0.7483,
"step": 1718
},
{
"epoch": 2.588855421686747,
"grad_norm": 0.2999840346490302,
"learning_rate": 7.6171875000000005e-06,
"loss": 0.7386,
"step": 1719
},
{
"epoch": 2.5903614457831328,
"grad_norm": 0.2921003345567464,
"learning_rate": 7.589285714285714e-06,
"loss": 0.7663,
"step": 1720
},
{
"epoch": 2.591867469879518,
"grad_norm": 0.37700574559408767,
"learning_rate": 7.561383928571429e-06,
"loss": 0.7188,
"step": 1721
},
{
"epoch": 2.5933734939759034,
"grad_norm": 0.2891593821679184,
"learning_rate": 7.533482142857143e-06,
"loss": 0.7339,
"step": 1722
},
{
"epoch": 2.5948795180722892,
"grad_norm": 0.32695522353971695,
"learning_rate": 7.505580357142858e-06,
"loss": 0.7667,
"step": 1723
},
{
"epoch": 2.5963855421686746,
"grad_norm": 4.615977334434396,
"learning_rate": 7.4776785714285714e-06,
"loss": 0.7141,
"step": 1724
},
{
"epoch": 2.5978915662650603,
"grad_norm": 0.3576400319102837,
"learning_rate": 7.449776785714287e-06,
"loss": 0.7666,
"step": 1725
},
{
"epoch": 2.5993975903614457,
"grad_norm": 0.3277305801199659,
"learning_rate": 7.421875e-06,
"loss": 0.7593,
"step": 1726
},
{
"epoch": 2.6009036144578315,
"grad_norm": 0.3152495627726194,
"learning_rate": 7.393973214285714e-06,
"loss": 0.7403,
"step": 1727
},
{
"epoch": 2.602409638554217,
"grad_norm": 0.4057139805318214,
"learning_rate": 7.366071428571429e-06,
"loss": 0.7845,
"step": 1728
},
{
"epoch": 2.6039156626506026,
"grad_norm": 0.38133151780374375,
"learning_rate": 7.338169642857143e-06,
"loss": 0.7778,
"step": 1729
},
{
"epoch": 2.605421686746988,
"grad_norm": 0.32065648128633994,
"learning_rate": 7.310267857142858e-06,
"loss": 0.8046,
"step": 1730
},
{
"epoch": 2.6069277108433733,
"grad_norm": 0.34158436417603294,
"learning_rate": 7.282366071428572e-06,
"loss": 0.806,
"step": 1731
},
{
"epoch": 2.608433734939759,
"grad_norm": 0.320836935814278,
"learning_rate": 7.2544642857142865e-06,
"loss": 0.751,
"step": 1732
},
{
"epoch": 2.609939759036145,
"grad_norm": 0.2645577351183864,
"learning_rate": 7.2265625e-06,
"loss": 0.7146,
"step": 1733
},
{
"epoch": 2.61144578313253,
"grad_norm": 0.2939622691935842,
"learning_rate": 7.198660714285714e-06,
"loss": 0.6934,
"step": 1734
},
{
"epoch": 2.6129518072289155,
"grad_norm": 0.3085976645029874,
"learning_rate": 7.170758928571429e-06,
"loss": 0.7562,
"step": 1735
},
{
"epoch": 2.6144578313253013,
"grad_norm": 0.3133660819464677,
"learning_rate": 7.142857142857143e-06,
"loss": 0.8088,
"step": 1736
},
{
"epoch": 2.6159638554216866,
"grad_norm": 0.3120826312173088,
"learning_rate": 7.114955357142858e-06,
"loss": 0.749,
"step": 1737
},
{
"epoch": 2.6174698795180724,
"grad_norm": 0.290831760335641,
"learning_rate": 7.087053571428571e-06,
"loss": 0.7393,
"step": 1738
},
{
"epoch": 2.6189759036144578,
"grad_norm": 0.28095556888751433,
"learning_rate": 7.059151785714287e-06,
"loss": 0.7492,
"step": 1739
},
{
"epoch": 2.6204819277108435,
"grad_norm": 0.2910932600971462,
"learning_rate": 7.031250000000001e-06,
"loss": 0.7129,
"step": 1740
},
{
"epoch": 2.621987951807229,
"grad_norm": 0.3165228918670804,
"learning_rate": 7.003348214285714e-06,
"loss": 0.7368,
"step": 1741
},
{
"epoch": 2.6234939759036147,
"grad_norm": 0.33275859079245845,
"learning_rate": 6.975446428571429e-06,
"loss": 0.7515,
"step": 1742
},
{
"epoch": 2.625,
"grad_norm": 0.30619504473855724,
"learning_rate": 6.947544642857143e-06,
"loss": 0.7651,
"step": 1743
},
{
"epoch": 2.6265060240963853,
"grad_norm": 0.3333497956490508,
"learning_rate": 6.919642857142858e-06,
"loss": 0.7306,
"step": 1744
},
{
"epoch": 2.628012048192771,
"grad_norm": 0.8631720657888876,
"learning_rate": 6.891741071428572e-06,
"loss": 0.7551,
"step": 1745
},
{
"epoch": 2.6295180722891565,
"grad_norm": 0.2732178228066102,
"learning_rate": 6.863839285714286e-06,
"loss": 0.6302,
"step": 1746
},
{
"epoch": 2.6310240963855422,
"grad_norm": 0.2979283591631788,
"learning_rate": 6.8359375e-06,
"loss": 0.7421,
"step": 1747
},
{
"epoch": 2.6325301204819276,
"grad_norm": 0.30638502864668016,
"learning_rate": 6.808035714285714e-06,
"loss": 0.7219,
"step": 1748
},
{
"epoch": 2.6340361445783134,
"grad_norm": 0.40995615030901184,
"learning_rate": 6.780133928571429e-06,
"loss": 0.7497,
"step": 1749
},
{
"epoch": 2.6355421686746987,
"grad_norm": 0.3201702809504483,
"learning_rate": 6.7522321428571425e-06,
"loss": 0.8091,
"step": 1750
},
{
"epoch": 2.6370481927710845,
"grad_norm": 0.2901248588152903,
"learning_rate": 6.724330357142858e-06,
"loss": 0.7105,
"step": 1751
},
{
"epoch": 2.63855421686747,
"grad_norm": 0.28558652457934336,
"learning_rate": 6.696428571428572e-06,
"loss": 0.7081,
"step": 1752
},
{
"epoch": 2.640060240963855,
"grad_norm": 0.2880499393809584,
"learning_rate": 6.668526785714287e-06,
"loss": 0.7669,
"step": 1753
},
{
"epoch": 2.641566265060241,
"grad_norm": 0.3246271815796135,
"learning_rate": 6.6406250000000005e-06,
"loss": 0.773,
"step": 1754
},
{
"epoch": 2.6430722891566267,
"grad_norm": 0.3787360307050243,
"learning_rate": 6.612723214285714e-06,
"loss": 0.7447,
"step": 1755
},
{
"epoch": 2.644578313253012,
"grad_norm": 0.34004245406795175,
"learning_rate": 6.584821428571429e-06,
"loss": 0.8278,
"step": 1756
},
{
"epoch": 2.6460843373493974,
"grad_norm": 0.3284632860091845,
"learning_rate": 6.556919642857143e-06,
"loss": 0.7699,
"step": 1757
},
{
"epoch": 2.647590361445783,
"grad_norm": 0.3289158419870179,
"learning_rate": 6.5290178571428576e-06,
"loss": 0.7581,
"step": 1758
},
{
"epoch": 2.6490963855421685,
"grad_norm": 0.4542885676143434,
"learning_rate": 6.501116071428571e-06,
"loss": 0.748,
"step": 1759
},
{
"epoch": 2.6506024096385543,
"grad_norm": 0.3309405821918433,
"learning_rate": 6.473214285714287e-06,
"loss": 0.7074,
"step": 1760
},
{
"epoch": 2.6521084337349397,
"grad_norm": 0.3356756904002276,
"learning_rate": 6.4453125e-06,
"loss": 0.7391,
"step": 1761
},
{
"epoch": 2.6536144578313254,
"grad_norm": 1.3821862415310182,
"learning_rate": 6.417410714285714e-06,
"loss": 0.6628,
"step": 1762
},
{
"epoch": 2.6551204819277108,
"grad_norm": 0.2836718119591368,
"learning_rate": 6.389508928571429e-06,
"loss": 0.7403,
"step": 1763
},
{
"epoch": 2.6566265060240966,
"grad_norm": 0.2868740652496764,
"learning_rate": 6.361607142857142e-06,
"loss": 0.7205,
"step": 1764
},
{
"epoch": 2.658132530120482,
"grad_norm": 0.2840147028136461,
"learning_rate": 6.333705357142858e-06,
"loss": 0.7454,
"step": 1765
},
{
"epoch": 2.6596385542168672,
"grad_norm": 0.3079527368987645,
"learning_rate": 6.305803571428572e-06,
"loss": 0.7861,
"step": 1766
},
{
"epoch": 2.661144578313253,
"grad_norm": 0.2790278662949671,
"learning_rate": 6.2779017857142864e-06,
"loss": 0.6726,
"step": 1767
},
{
"epoch": 2.662650602409639,
"grad_norm": 0.27569819576844196,
"learning_rate": 6.25e-06,
"loss": 0.7081,
"step": 1768
},
{
"epoch": 2.664156626506024,
"grad_norm": 0.27956412459666335,
"learning_rate": 6.222098214285715e-06,
"loss": 0.7551,
"step": 1769
},
{
"epoch": 2.6656626506024095,
"grad_norm": 0.3162529723805227,
"learning_rate": 6.194196428571429e-06,
"loss": 0.7643,
"step": 1770
},
{
"epoch": 2.6671686746987953,
"grad_norm": 0.31282453907161467,
"learning_rate": 6.1662946428571435e-06,
"loss": 0.7968,
"step": 1771
},
{
"epoch": 2.6686746987951806,
"grad_norm": 0.31646843921869117,
"learning_rate": 6.138392857142857e-06,
"loss": 0.7425,
"step": 1772
},
{
"epoch": 2.6701807228915664,
"grad_norm": 0.3278769674964476,
"learning_rate": 6.110491071428571e-06,
"loss": 0.71,
"step": 1773
},
{
"epoch": 2.6716867469879517,
"grad_norm": 0.2766595955868983,
"learning_rate": 6.082589285714286e-06,
"loss": 0.7164,
"step": 1774
},
{
"epoch": 2.6731927710843375,
"grad_norm": 0.28936544890761035,
"learning_rate": 6.054687500000001e-06,
"loss": 0.731,
"step": 1775
},
{
"epoch": 2.674698795180723,
"grad_norm": 0.2892549009960483,
"learning_rate": 6.0267857142857145e-06,
"loss": 0.7213,
"step": 1776
},
{
"epoch": 2.6762048192771086,
"grad_norm": 0.7303475774736463,
"learning_rate": 5.998883928571429e-06,
"loss": 0.7696,
"step": 1777
},
{
"epoch": 2.677710843373494,
"grad_norm": 0.35090700181292234,
"learning_rate": 5.970982142857143e-06,
"loss": 0.804,
"step": 1778
},
{
"epoch": 2.6792168674698793,
"grad_norm": 0.33520248357214366,
"learning_rate": 5.943080357142857e-06,
"loss": 0.7619,
"step": 1779
},
{
"epoch": 2.680722891566265,
"grad_norm": 0.324198342217403,
"learning_rate": 5.9151785714285716e-06,
"loss": 0.7704,
"step": 1780
},
{
"epoch": 2.682228915662651,
"grad_norm": 0.35488474729216524,
"learning_rate": 5.887276785714286e-06,
"loss": 0.7829,
"step": 1781
},
{
"epoch": 2.683734939759036,
"grad_norm": 0.3344748119857392,
"learning_rate": 5.859375e-06,
"loss": 0.7479,
"step": 1782
},
{
"epoch": 2.6852409638554215,
"grad_norm": 0.28681873592590684,
"learning_rate": 5.831473214285715e-06,
"loss": 0.7406,
"step": 1783
},
{
"epoch": 2.6867469879518073,
"grad_norm": 0.2887266170114813,
"learning_rate": 5.803571428571429e-06,
"loss": 0.7467,
"step": 1784
},
{
"epoch": 2.6882530120481927,
"grad_norm": 0.8614487109247648,
"learning_rate": 5.775669642857143e-06,
"loss": 0.7208,
"step": 1785
},
{
"epoch": 2.6897590361445785,
"grad_norm": 0.34175724858171014,
"learning_rate": 5.747767857142857e-06,
"loss": 0.6993,
"step": 1786
},
{
"epoch": 2.691265060240964,
"grad_norm": 0.30958641538805864,
"learning_rate": 5.719866071428572e-06,
"loss": 0.6918,
"step": 1787
},
{
"epoch": 2.692771084337349,
"grad_norm": 0.32163703545485206,
"learning_rate": 5.691964285714286e-06,
"loss": 0.752,
"step": 1788
},
{
"epoch": 2.694277108433735,
"grad_norm": 0.3116012210013593,
"learning_rate": 5.6640625000000005e-06,
"loss": 0.7635,
"step": 1789
},
{
"epoch": 2.6957831325301207,
"grad_norm": 0.3359119225928835,
"learning_rate": 5.636160714285714e-06,
"loss": 0.7017,
"step": 1790
},
{
"epoch": 2.697289156626506,
"grad_norm": 0.29462321692762805,
"learning_rate": 5.608258928571429e-06,
"loss": 0.6825,
"step": 1791
},
{
"epoch": 2.6987951807228914,
"grad_norm": 0.8425904584454552,
"learning_rate": 5.580357142857144e-06,
"loss": 0.7454,
"step": 1792
},
{
"epoch": 2.700301204819277,
"grad_norm": 0.3679685879887224,
"learning_rate": 5.552455357142857e-06,
"loss": 0.7755,
"step": 1793
},
{
"epoch": 2.7018072289156625,
"grad_norm": 0.30176317658315494,
"learning_rate": 5.524553571428571e-06,
"loss": 0.772,
"step": 1794
},
{
"epoch": 2.7033132530120483,
"grad_norm": 0.342918240740829,
"learning_rate": 5.496651785714286e-06,
"loss": 0.7331,
"step": 1795
},
{
"epoch": 2.7048192771084336,
"grad_norm": 0.32795774755255214,
"learning_rate": 5.46875e-06,
"loss": 0.8177,
"step": 1796
},
{
"epoch": 2.7063253012048194,
"grad_norm": 0.293863359761839,
"learning_rate": 5.440848214285715e-06,
"loss": 0.6931,
"step": 1797
},
{
"epoch": 2.7078313253012047,
"grad_norm": 0.3277936763288901,
"learning_rate": 5.412946428571429e-06,
"loss": 0.7723,
"step": 1798
},
{
"epoch": 2.7093373493975905,
"grad_norm": 0.34816737440405526,
"learning_rate": 5.385044642857143e-06,
"loss": 0.7113,
"step": 1799
},
{
"epoch": 2.710843373493976,
"grad_norm": 0.30627797121168604,
"learning_rate": 5.357142857142857e-06,
"loss": 0.7738,
"step": 1800
},
{
"epoch": 2.712349397590361,
"grad_norm": 0.3002590808272766,
"learning_rate": 5.329241071428572e-06,
"loss": 0.7773,
"step": 1801
},
{
"epoch": 2.713855421686747,
"grad_norm": 0.2980043893823824,
"learning_rate": 5.301339285714286e-06,
"loss": 0.7365,
"step": 1802
},
{
"epoch": 2.7153614457831328,
"grad_norm": 0.3022858303556268,
"learning_rate": 5.2734375e-06,
"loss": 0.7455,
"step": 1803
},
{
"epoch": 2.716867469879518,
"grad_norm": 0.27048619159829523,
"learning_rate": 5.245535714285715e-06,
"loss": 0.654,
"step": 1804
},
{
"epoch": 2.7183734939759034,
"grad_norm": 0.3198220531778734,
"learning_rate": 5.217633928571429e-06,
"loss": 0.7499,
"step": 1805
},
{
"epoch": 2.7198795180722892,
"grad_norm": 1.3714789798650127,
"learning_rate": 5.1897321428571435e-06,
"loss": 0.7805,
"step": 1806
},
{
"epoch": 2.7213855421686746,
"grad_norm": 0.36148685284609405,
"learning_rate": 5.161830357142857e-06,
"loss": 0.8323,
"step": 1807
},
{
"epoch": 2.7228915662650603,
"grad_norm": 0.2925238727326717,
"learning_rate": 5.133928571428571e-06,
"loss": 0.756,
"step": 1808
},
{
"epoch": 2.7243975903614457,
"grad_norm": 0.3590779084763921,
"learning_rate": 5.106026785714286e-06,
"loss": 0.7196,
"step": 1809
},
{
"epoch": 2.7259036144578315,
"grad_norm": 0.3002194432429927,
"learning_rate": 5.078125000000001e-06,
"loss": 0.7618,
"step": 1810
},
{
"epoch": 2.727409638554217,
"grad_norm": 0.4393829774226643,
"learning_rate": 5.0502232142857145e-06,
"loss": 0.7096,
"step": 1811
},
{
"epoch": 2.7289156626506026,
"grad_norm": 0.3017728591582575,
"learning_rate": 5.022321428571429e-06,
"loss": 0.717,
"step": 1812
},
{
"epoch": 2.730421686746988,
"grad_norm": 0.32193074597724014,
"learning_rate": 4.994419642857143e-06,
"loss": 0.7289,
"step": 1813
},
{
"epoch": 2.7319277108433733,
"grad_norm": 0.28872388326630216,
"learning_rate": 4.966517857142857e-06,
"loss": 0.7229,
"step": 1814
},
{
"epoch": 2.733433734939759,
"grad_norm": 0.3989555049359659,
"learning_rate": 4.9386160714285716e-06,
"loss": 0.7467,
"step": 1815
},
{
"epoch": 2.734939759036145,
"grad_norm": 0.2837428392594192,
"learning_rate": 4.910714285714286e-06,
"loss": 0.697,
"step": 1816
},
{
"epoch": 2.73644578313253,
"grad_norm": 0.2886560428527395,
"learning_rate": 4.8828125e-06,
"loss": 0.7762,
"step": 1817
},
{
"epoch": 2.7379518072289155,
"grad_norm": 0.31400513767956945,
"learning_rate": 4.854910714285715e-06,
"loss": 0.7722,
"step": 1818
},
{
"epoch": 2.7394578313253013,
"grad_norm": 0.3117213132631473,
"learning_rate": 4.827008928571429e-06,
"loss": 0.6635,
"step": 1819
},
{
"epoch": 2.7409638554216866,
"grad_norm": 0.2890138534448533,
"learning_rate": 4.799107142857143e-06,
"loss": 0.6778,
"step": 1820
},
{
"epoch": 2.7424698795180724,
"grad_norm": 0.2838821479509516,
"learning_rate": 4.771205357142857e-06,
"loss": 0.7101,
"step": 1821
},
{
"epoch": 2.7439759036144578,
"grad_norm": 0.28251707803539966,
"learning_rate": 4.743303571428571e-06,
"loss": 0.6826,
"step": 1822
},
{
"epoch": 2.7454819277108435,
"grad_norm": 0.2818004032242758,
"learning_rate": 4.715401785714286e-06,
"loss": 0.7251,
"step": 1823
},
{
"epoch": 2.746987951807229,
"grad_norm": 0.3109687875915881,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.7665,
"step": 1824
},
{
"epoch": 2.7484939759036147,
"grad_norm": 0.29591002251996784,
"learning_rate": 4.659598214285714e-06,
"loss": 0.7636,
"step": 1825
},
{
"epoch": 2.75,
"grad_norm": 0.2856289072717422,
"learning_rate": 4.631696428571429e-06,
"loss": 0.7786,
"step": 1826
},
{
"epoch": 2.7515060240963853,
"grad_norm": 0.3141918846889473,
"learning_rate": 4.603794642857144e-06,
"loss": 0.7682,
"step": 1827
},
{
"epoch": 2.753012048192771,
"grad_norm": 0.3463559100992555,
"learning_rate": 4.575892857142857e-06,
"loss": 0.8205,
"step": 1828
},
{
"epoch": 2.7545180722891565,
"grad_norm": 0.3001393370777785,
"learning_rate": 4.547991071428571e-06,
"loss": 0.8053,
"step": 1829
},
{
"epoch": 2.7560240963855422,
"grad_norm": 0.29718636191980297,
"learning_rate": 4.520089285714286e-06,
"loss": 0.7273,
"step": 1830
},
{
"epoch": 2.7575301204819276,
"grad_norm": 0.2918620456876534,
"learning_rate": 4.4921875e-06,
"loss": 0.7757,
"step": 1831
},
{
"epoch": 2.7590361445783134,
"grad_norm": 0.29972840789057525,
"learning_rate": 4.464285714285715e-06,
"loss": 0.7284,
"step": 1832
},
{
"epoch": 2.7605421686746987,
"grad_norm": 0.2847024079832293,
"learning_rate": 4.436383928571429e-06,
"loss": 0.7394,
"step": 1833
},
{
"epoch": 2.7620481927710845,
"grad_norm": 0.30072487312796725,
"learning_rate": 4.408482142857143e-06,
"loss": 0.8278,
"step": 1834
},
{
"epoch": 2.76355421686747,
"grad_norm": 0.2790568445128912,
"learning_rate": 4.380580357142857e-06,
"loss": 0.7259,
"step": 1835
},
{
"epoch": 2.765060240963855,
"grad_norm": 0.2846409815384662,
"learning_rate": 4.352678571428572e-06,
"loss": 0.7831,
"step": 1836
},
{
"epoch": 2.766566265060241,
"grad_norm": 0.2918921690138049,
"learning_rate": 4.3247767857142856e-06,
"loss": 0.7669,
"step": 1837
},
{
"epoch": 2.7680722891566267,
"grad_norm": 0.31859135295259267,
"learning_rate": 4.296875e-06,
"loss": 0.7654,
"step": 1838
},
{
"epoch": 2.769578313253012,
"grad_norm": 0.28812343764677306,
"learning_rate": 4.268973214285715e-06,
"loss": 0.72,
"step": 1839
},
{
"epoch": 2.7710843373493974,
"grad_norm": 0.2897701597670063,
"learning_rate": 4.241071428571429e-06,
"loss": 0.6553,
"step": 1840
},
{
"epoch": 2.772590361445783,
"grad_norm": 0.3182428437315208,
"learning_rate": 4.2131696428571435e-06,
"loss": 0.7753,
"step": 1841
},
{
"epoch": 2.7740963855421685,
"grad_norm": 0.29225269878494925,
"learning_rate": 4.185267857142857e-06,
"loss": 0.79,
"step": 1842
},
{
"epoch": 2.7756024096385543,
"grad_norm": 0.32180829859504595,
"learning_rate": 4.157366071428571e-06,
"loss": 0.7169,
"step": 1843
},
{
"epoch": 2.7771084337349397,
"grad_norm": 0.26290022410963065,
"learning_rate": 4.129464285714286e-06,
"loss": 0.7274,
"step": 1844
},
{
"epoch": 2.7786144578313254,
"grad_norm": 0.29892657079479723,
"learning_rate": 4.101562500000001e-06,
"loss": 0.7556,
"step": 1845
},
{
"epoch": 2.7801204819277108,
"grad_norm": 0.29600236822343073,
"learning_rate": 4.0736607142857144e-06,
"loss": 0.7593,
"step": 1846
},
{
"epoch": 2.7816265060240966,
"grad_norm": 0.2785999824878323,
"learning_rate": 4.045758928571429e-06,
"loss": 0.7595,
"step": 1847
},
{
"epoch": 2.783132530120482,
"grad_norm": 0.3027459070981101,
"learning_rate": 4.017857142857143e-06,
"loss": 0.7178,
"step": 1848
},
{
"epoch": 2.7846385542168672,
"grad_norm": 0.30603121805610306,
"learning_rate": 3.989955357142857e-06,
"loss": 0.7205,
"step": 1849
},
{
"epoch": 2.786144578313253,
"grad_norm": 0.29468597803045465,
"learning_rate": 3.9620535714285715e-06,
"loss": 0.7774,
"step": 1850
},
{
"epoch": 2.787650602409639,
"grad_norm": 0.720466719902226,
"learning_rate": 3.934151785714285e-06,
"loss": 0.8633,
"step": 1851
},
{
"epoch": 2.789156626506024,
"grad_norm": 0.3579853298835368,
"learning_rate": 3.90625e-06,
"loss": 0.77,
"step": 1852
},
{
"epoch": 2.7906626506024095,
"grad_norm": 0.2887832305698688,
"learning_rate": 3.878348214285715e-06,
"loss": 0.6968,
"step": 1853
},
{
"epoch": 2.7921686746987953,
"grad_norm": 0.28061994255931466,
"learning_rate": 3.850446428571429e-06,
"loss": 0.6743,
"step": 1854
},
{
"epoch": 2.7936746987951806,
"grad_norm": 0.27489293538232473,
"learning_rate": 3.822544642857143e-06,
"loss": 0.7139,
"step": 1855
},
{
"epoch": 2.7951807228915664,
"grad_norm": 0.39398310546797993,
"learning_rate": 3.794642857142857e-06,
"loss": 0.6827,
"step": 1856
},
{
"epoch": 2.7966867469879517,
"grad_norm": 0.2811543383525772,
"learning_rate": 3.7667410714285714e-06,
"loss": 0.6448,
"step": 1857
},
{
"epoch": 2.7981927710843375,
"grad_norm": 0.29335743981006307,
"learning_rate": 3.7388392857142857e-06,
"loss": 0.7945,
"step": 1858
},
{
"epoch": 2.799698795180723,
"grad_norm": 0.28589285906809897,
"learning_rate": 3.7109375e-06,
"loss": 0.7289,
"step": 1859
},
{
"epoch": 2.8012048192771086,
"grad_norm": 0.277648959894417,
"learning_rate": 3.6830357142857147e-06,
"loss": 0.7412,
"step": 1860
},
{
"epoch": 2.802710843373494,
"grad_norm": 0.3201561620524433,
"learning_rate": 3.655133928571429e-06,
"loss": 0.7009,
"step": 1861
},
{
"epoch": 2.8042168674698793,
"grad_norm": 0.33360988034040734,
"learning_rate": 3.6272321428571432e-06,
"loss": 0.7709,
"step": 1862
},
{
"epoch": 2.805722891566265,
"grad_norm": 0.30520858038407805,
"learning_rate": 3.599330357142857e-06,
"loss": 0.7165,
"step": 1863
},
{
"epoch": 2.807228915662651,
"grad_norm": 0.2911169684542639,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.7598,
"step": 1864
},
{
"epoch": 2.808734939759036,
"grad_norm": 0.9411944444151761,
"learning_rate": 3.5435267857142856e-06,
"loss": 0.7633,
"step": 1865
},
{
"epoch": 2.8102409638554215,
"grad_norm": 0.3772441731605911,
"learning_rate": 3.5156250000000003e-06,
"loss": 0.7564,
"step": 1866
},
{
"epoch": 2.8117469879518073,
"grad_norm": 0.29726261895641076,
"learning_rate": 3.4877232142857146e-06,
"loss": 0.7702,
"step": 1867
},
{
"epoch": 2.8132530120481927,
"grad_norm": 0.30853582388798567,
"learning_rate": 3.459821428571429e-06,
"loss": 0.806,
"step": 1868
},
{
"epoch": 2.8147590361445785,
"grad_norm": 0.29016333273204,
"learning_rate": 3.431919642857143e-06,
"loss": 0.7482,
"step": 1869
},
{
"epoch": 2.816265060240964,
"grad_norm": 0.29103802697699344,
"learning_rate": 3.404017857142857e-06,
"loss": 0.7584,
"step": 1870
},
{
"epoch": 2.817771084337349,
"grad_norm": 0.32989495872725566,
"learning_rate": 3.3761160714285713e-06,
"loss": 0.7245,
"step": 1871
},
{
"epoch": 2.819277108433735,
"grad_norm": 0.29428189409450783,
"learning_rate": 3.348214285714286e-06,
"loss": 0.7402,
"step": 1872
},
{
"epoch": 2.8207831325301207,
"grad_norm": 0.2923119051513408,
"learning_rate": 3.3203125000000002e-06,
"loss": 0.7359,
"step": 1873
},
{
"epoch": 2.822289156626506,
"grad_norm": 0.31331439206390427,
"learning_rate": 3.2924107142857145e-06,
"loss": 0.7621,
"step": 1874
},
{
"epoch": 2.8237951807228914,
"grad_norm": 0.300143125183693,
"learning_rate": 3.2645089285714288e-06,
"loss": 0.8172,
"step": 1875
},
{
"epoch": 2.825301204819277,
"grad_norm": 0.3676603314896176,
"learning_rate": 3.2366071428571435e-06,
"loss": 0.7648,
"step": 1876
},
{
"epoch": 2.8268072289156625,
"grad_norm": 0.2724689316576843,
"learning_rate": 3.208705357142857e-06,
"loss": 0.7012,
"step": 1877
},
{
"epoch": 2.8283132530120483,
"grad_norm": 0.310509776398765,
"learning_rate": 3.180803571428571e-06,
"loss": 0.8045,
"step": 1878
},
{
"epoch": 2.8298192771084336,
"grad_norm": 0.2858320851928093,
"learning_rate": 3.152901785714286e-06,
"loss": 0.7463,
"step": 1879
},
{
"epoch": 2.8313253012048194,
"grad_norm": 0.2812001856918453,
"learning_rate": 3.125e-06,
"loss": 0.7572,
"step": 1880
},
{
"epoch": 2.8328313253012047,
"grad_norm": 0.2753278308275173,
"learning_rate": 3.0970982142857144e-06,
"loss": 0.7919,
"step": 1881
},
{
"epoch": 2.8343373493975905,
"grad_norm": 0.2920512474700927,
"learning_rate": 3.0691964285714287e-06,
"loss": 0.7859,
"step": 1882
},
{
"epoch": 2.835843373493976,
"grad_norm": 3.659231766168921,
"learning_rate": 3.041294642857143e-06,
"loss": 0.8445,
"step": 1883
},
{
"epoch": 2.837349397590361,
"grad_norm": 0.2712121435641824,
"learning_rate": 3.0133928571428572e-06,
"loss": 0.751,
"step": 1884
},
{
"epoch": 2.838855421686747,
"grad_norm": 0.31294800874927753,
"learning_rate": 2.9854910714285715e-06,
"loss": 0.7136,
"step": 1885
},
{
"epoch": 2.8403614457831328,
"grad_norm": 0.3276982648277128,
"learning_rate": 2.9575892857142858e-06,
"loss": 0.7662,
"step": 1886
},
{
"epoch": 2.841867469879518,
"grad_norm": 0.2997042134972521,
"learning_rate": 2.9296875e-06,
"loss": 0.746,
"step": 1887
},
{
"epoch": 2.8433734939759034,
"grad_norm": 0.2653916799759019,
"learning_rate": 2.9017857142857143e-06,
"loss": 0.6621,
"step": 1888
},
{
"epoch": 2.8448795180722892,
"grad_norm": 0.29798621810138914,
"learning_rate": 2.8738839285714286e-06,
"loss": 0.7444,
"step": 1889
},
{
"epoch": 2.8463855421686746,
"grad_norm": 0.30934582746592326,
"learning_rate": 2.845982142857143e-06,
"loss": 0.7616,
"step": 1890
},
{
"epoch": 2.8478915662650603,
"grad_norm": 0.28853264933063194,
"learning_rate": 2.818080357142857e-06,
"loss": 0.7552,
"step": 1891
},
{
"epoch": 2.8493975903614457,
"grad_norm": 0.2925190688211709,
"learning_rate": 2.790178571428572e-06,
"loss": 0.7576,
"step": 1892
},
{
"epoch": 2.8509036144578315,
"grad_norm": 0.28958377921866213,
"learning_rate": 2.7622767857142857e-06,
"loss": 0.7214,
"step": 1893
},
{
"epoch": 2.852409638554217,
"grad_norm": 0.30791392475195667,
"learning_rate": 2.734375e-06,
"loss": 0.7899,
"step": 1894
},
{
"epoch": 2.8539156626506026,
"grad_norm": 0.28363265721815545,
"learning_rate": 2.7064732142857147e-06,
"loss": 0.743,
"step": 1895
},
{
"epoch": 2.855421686746988,
"grad_norm": 0.2746011031902342,
"learning_rate": 2.6785714285714285e-06,
"loss": 0.7582,
"step": 1896
},
{
"epoch": 2.8569277108433733,
"grad_norm": 0.4354239478315636,
"learning_rate": 2.650669642857143e-06,
"loss": 0.8054,
"step": 1897
},
{
"epoch": 2.858433734939759,
"grad_norm": 0.27755531390514193,
"learning_rate": 2.6227678571428575e-06,
"loss": 0.7162,
"step": 1898
},
{
"epoch": 2.859939759036145,
"grad_norm": 0.31739389137642543,
"learning_rate": 2.5948660714285718e-06,
"loss": 0.7831,
"step": 1899
},
{
"epoch": 2.86144578313253,
"grad_norm": 0.2877187908458202,
"learning_rate": 2.5669642857142856e-06,
"loss": 0.7222,
"step": 1900
},
{
"epoch": 2.8629518072289155,
"grad_norm": 0.27811069674017586,
"learning_rate": 2.5390625000000003e-06,
"loss": 0.7769,
"step": 1901
},
{
"epoch": 2.8644578313253013,
"grad_norm": 0.29517335209299167,
"learning_rate": 2.5111607142857146e-06,
"loss": 0.72,
"step": 1902
},
{
"epoch": 2.8659638554216866,
"grad_norm": 0.5895082211728702,
"learning_rate": 2.4832589285714284e-06,
"loss": 0.7877,
"step": 1903
},
{
"epoch": 2.8674698795180724,
"grad_norm": 0.3017527360373152,
"learning_rate": 2.455357142857143e-06,
"loss": 0.754,
"step": 1904
},
{
"epoch": 2.8689759036144578,
"grad_norm": 0.3020871890290279,
"learning_rate": 2.4274553571428574e-06,
"loss": 0.768,
"step": 1905
},
{
"epoch": 2.8704819277108435,
"grad_norm": 0.27145009897189104,
"learning_rate": 2.3995535714285717e-06,
"loss": 0.7196,
"step": 1906
},
{
"epoch": 2.871987951807229,
"grad_norm": 0.2938966933645326,
"learning_rate": 2.3716517857142855e-06,
"loss": 0.7465,
"step": 1907
},
{
"epoch": 2.8734939759036147,
"grad_norm": 0.3291871922581937,
"learning_rate": 2.3437500000000002e-06,
"loss": 0.695,
"step": 1908
},
{
"epoch": 2.875,
"grad_norm": 0.29359359603993,
"learning_rate": 2.3158482142857145e-06,
"loss": 0.7131,
"step": 1909
},
{
"epoch": 2.8765060240963853,
"grad_norm": 0.31072192255011544,
"learning_rate": 2.2879464285714283e-06,
"loss": 0.756,
"step": 1910
},
{
"epoch": 2.878012048192771,
"grad_norm": 0.2967924432080697,
"learning_rate": 2.260044642857143e-06,
"loss": 0.7741,
"step": 1911
},
{
"epoch": 2.8795180722891565,
"grad_norm": 0.3557670496190678,
"learning_rate": 2.2321428571428573e-06,
"loss": 0.7631,
"step": 1912
},
{
"epoch": 2.8810240963855422,
"grad_norm": 0.31254839796755324,
"learning_rate": 2.2042410714285716e-06,
"loss": 0.7576,
"step": 1913
},
{
"epoch": 2.8825301204819276,
"grad_norm": 0.3077806685698394,
"learning_rate": 2.176339285714286e-06,
"loss": 0.8085,
"step": 1914
},
{
"epoch": 2.8840361445783134,
"grad_norm": 0.2866823006051055,
"learning_rate": 2.1484375e-06,
"loss": 0.7868,
"step": 1915
},
{
"epoch": 2.8855421686746987,
"grad_norm": 0.3155772094140096,
"learning_rate": 2.1205357142857144e-06,
"loss": 0.7165,
"step": 1916
},
{
"epoch": 2.8870481927710845,
"grad_norm": 0.3010287908071899,
"learning_rate": 2.0926339285714287e-06,
"loss": 0.7067,
"step": 1917
},
{
"epoch": 2.88855421686747,
"grad_norm": 0.282822990439801,
"learning_rate": 2.064732142857143e-06,
"loss": 0.6914,
"step": 1918
},
{
"epoch": 2.890060240963855,
"grad_norm": 0.4224286171276179,
"learning_rate": 2.0368303571428572e-06,
"loss": 0.8104,
"step": 1919
},
{
"epoch": 2.891566265060241,
"grad_norm": 0.28319248388182466,
"learning_rate": 2.0089285714285715e-06,
"loss": 0.7673,
"step": 1920
},
{
"epoch": 2.8930722891566267,
"grad_norm": 0.31646975205676803,
"learning_rate": 1.9810267857142858e-06,
"loss": 0.7211,
"step": 1921
},
{
"epoch": 2.894578313253012,
"grad_norm": 0.2835675299229189,
"learning_rate": 1.953125e-06,
"loss": 0.7181,
"step": 1922
},
{
"epoch": 2.8960843373493974,
"grad_norm": 0.2882055390592135,
"learning_rate": 1.9252232142857143e-06,
"loss": 0.7141,
"step": 1923
},
{
"epoch": 2.897590361445783,
"grad_norm": 0.3346438957348171,
"learning_rate": 1.8973214285714286e-06,
"loss": 0.7383,
"step": 1924
},
{
"epoch": 2.8990963855421685,
"grad_norm": 0.29787971155623943,
"learning_rate": 1.8694196428571429e-06,
"loss": 0.7121,
"step": 1925
},
{
"epoch": 2.9006024096385543,
"grad_norm": 0.28321608791481456,
"learning_rate": 1.8415178571428573e-06,
"loss": 0.7681,
"step": 1926
},
{
"epoch": 2.9021084337349397,
"grad_norm": 0.281455085466644,
"learning_rate": 1.8136160714285716e-06,
"loss": 0.8073,
"step": 1927
},
{
"epoch": 2.9036144578313254,
"grad_norm": 0.2891014705244002,
"learning_rate": 1.7857142857142857e-06,
"loss": 0.7696,
"step": 1928
},
{
"epoch": 2.9051204819277108,
"grad_norm": 0.286423507464086,
"learning_rate": 1.7578125000000002e-06,
"loss": 0.7354,
"step": 1929
},
{
"epoch": 2.9066265060240966,
"grad_norm": 0.27264166420951463,
"learning_rate": 1.7299107142857144e-06,
"loss": 0.6998,
"step": 1930
},
{
"epoch": 2.908132530120482,
"grad_norm": 0.3257255066423078,
"learning_rate": 1.7020089285714285e-06,
"loss": 0.8231,
"step": 1931
},
{
"epoch": 2.9096385542168672,
"grad_norm": 0.3467231042637017,
"learning_rate": 1.674107142857143e-06,
"loss": 0.7553,
"step": 1932
},
{
"epoch": 2.911144578313253,
"grad_norm": 0.27173782716896655,
"learning_rate": 1.6462053571428573e-06,
"loss": 0.7032,
"step": 1933
},
{
"epoch": 2.912650602409639,
"grad_norm": 0.3490545650945632,
"learning_rate": 1.6183035714285717e-06,
"loss": 0.7347,
"step": 1934
},
{
"epoch": 2.914156626506024,
"grad_norm": 0.25752020615528276,
"learning_rate": 1.5904017857142856e-06,
"loss": 0.6845,
"step": 1935
},
{
"epoch": 2.9156626506024095,
"grad_norm": 0.2836075770435027,
"learning_rate": 1.5625e-06,
"loss": 0.7646,
"step": 1936
},
{
"epoch": 2.9171686746987953,
"grad_norm": 0.27537557617799796,
"learning_rate": 1.5345982142857143e-06,
"loss": 0.6949,
"step": 1937
},
{
"epoch": 2.9186746987951806,
"grad_norm": 0.2841046340229543,
"learning_rate": 1.5066964285714286e-06,
"loss": 0.8221,
"step": 1938
},
{
"epoch": 2.9201807228915664,
"grad_norm": 0.29679182605450916,
"learning_rate": 1.4787946428571429e-06,
"loss": 0.7675,
"step": 1939
},
{
"epoch": 2.9216867469879517,
"grad_norm": 0.2686251234725454,
"learning_rate": 1.4508928571428572e-06,
"loss": 0.765,
"step": 1940
},
{
"epoch": 2.9231927710843375,
"grad_norm": 0.2726348854457834,
"learning_rate": 1.4229910714285714e-06,
"loss": 0.7187,
"step": 1941
},
{
"epoch": 2.924698795180723,
"grad_norm": 0.2658162415389458,
"learning_rate": 1.395089285714286e-06,
"loss": 0.6842,
"step": 1942
},
{
"epoch": 2.9262048192771086,
"grad_norm": 0.25894486210748174,
"learning_rate": 1.3671875e-06,
"loss": 0.7254,
"step": 1943
},
{
"epoch": 2.927710843373494,
"grad_norm": 0.30389598460624445,
"learning_rate": 1.3392857142857143e-06,
"loss": 0.7894,
"step": 1944
},
{
"epoch": 2.9292168674698793,
"grad_norm": 0.3093627640244388,
"learning_rate": 1.3113839285714287e-06,
"loss": 0.6892,
"step": 1945
},
{
"epoch": 2.930722891566265,
"grad_norm": 0.3003090390154035,
"learning_rate": 1.2834821428571428e-06,
"loss": 0.807,
"step": 1946
},
{
"epoch": 2.932228915662651,
"grad_norm": 0.2947620182108699,
"learning_rate": 1.2555803571428573e-06,
"loss": 0.6971,
"step": 1947
},
{
"epoch": 2.933734939759036,
"grad_norm": 0.30018352491939837,
"learning_rate": 1.2276785714285716e-06,
"loss": 0.826,
"step": 1948
},
{
"epoch": 2.9352409638554215,
"grad_norm": 0.29760905773166585,
"learning_rate": 1.1997767857142858e-06,
"loss": 0.7329,
"step": 1949
},
{
"epoch": 2.9367469879518073,
"grad_norm": 0.2768555144730212,
"learning_rate": 1.1718750000000001e-06,
"loss": 0.7563,
"step": 1950
},
{
"epoch": 2.9382530120481927,
"grad_norm": 0.27849852133162567,
"learning_rate": 1.1439732142857142e-06,
"loss": 0.7177,
"step": 1951
},
{
"epoch": 2.9397590361445785,
"grad_norm": 0.8135785010770303,
"learning_rate": 1.1160714285714287e-06,
"loss": 0.8123,
"step": 1952
},
{
"epoch": 2.941265060240964,
"grad_norm": 0.3831821163339208,
"learning_rate": 1.088169642857143e-06,
"loss": 0.7684,
"step": 1953
},
{
"epoch": 2.942771084337349,
"grad_norm": 0.27702905205061307,
"learning_rate": 1.0602678571428572e-06,
"loss": 0.7138,
"step": 1954
},
{
"epoch": 2.944277108433735,
"grad_norm": 0.25535735912526225,
"learning_rate": 1.0323660714285715e-06,
"loss": 0.6811,
"step": 1955
},
{
"epoch": 2.9457831325301207,
"grad_norm": 0.2856123673357833,
"learning_rate": 1.0044642857142857e-06,
"loss": 0.7771,
"step": 1956
},
{
"epoch": 2.947289156626506,
"grad_norm": 0.30038409519307724,
"learning_rate": 9.765625e-07,
"loss": 0.7175,
"step": 1957
},
{
"epoch": 2.9487951807228914,
"grad_norm": 0.4141262510310498,
"learning_rate": 9.486607142857143e-07,
"loss": 0.8161,
"step": 1958
},
{
"epoch": 2.950301204819277,
"grad_norm": 0.280304507565235,
"learning_rate": 9.207589285714287e-07,
"loss": 0.7696,
"step": 1959
},
{
"epoch": 2.9518072289156625,
"grad_norm": 0.2861561614487602,
"learning_rate": 8.928571428571428e-07,
"loss": 0.7923,
"step": 1960
},
{
"epoch": 2.9533132530120483,
"grad_norm": 0.2835788478787062,
"learning_rate": 8.649553571428572e-07,
"loss": 0.8431,
"step": 1961
},
{
"epoch": 2.9548192771084336,
"grad_norm": 0.3154664609367005,
"learning_rate": 8.370535714285715e-07,
"loss": 0.772,
"step": 1962
},
{
"epoch": 2.9563253012048194,
"grad_norm": 0.29306616569175803,
"learning_rate": 8.091517857142859e-07,
"loss": 0.7964,
"step": 1963
},
{
"epoch": 2.9578313253012047,
"grad_norm": 1.819652743361417,
"learning_rate": 7.8125e-07,
"loss": 0.6886,
"step": 1964
},
{
"epoch": 2.9593373493975905,
"grad_norm": 0.2875085122515832,
"learning_rate": 7.533482142857143e-07,
"loss": 0.778,
"step": 1965
},
{
"epoch": 2.960843373493976,
"grad_norm": 0.2751727687994609,
"learning_rate": 7.254464285714286e-07,
"loss": 0.7512,
"step": 1966
},
{
"epoch": 2.962349397590361,
"grad_norm": 0.2800672547752561,
"learning_rate": 6.97544642857143e-07,
"loss": 0.7691,
"step": 1967
},
{
"epoch": 2.963855421686747,
"grad_norm": 0.26420185686650816,
"learning_rate": 6.696428571428571e-07,
"loss": 0.7012,
"step": 1968
},
{
"epoch": 2.9653614457831328,
"grad_norm": 0.27483679447506965,
"learning_rate": 6.417410714285714e-07,
"loss": 0.7983,
"step": 1969
},
{
"epoch": 2.966867469879518,
"grad_norm": 0.32034396483164435,
"learning_rate": 6.138392857142858e-07,
"loss": 0.7663,
"step": 1970
},
{
"epoch": 2.9683734939759034,
"grad_norm": 0.29226222021438575,
"learning_rate": 5.859375000000001e-07,
"loss": 0.7024,
"step": 1971
},
{
"epoch": 2.9698795180722892,
"grad_norm": 0.2829008645709252,
"learning_rate": 5.580357142857143e-07,
"loss": 0.7304,
"step": 1972
},
{
"epoch": 2.9713855421686746,
"grad_norm": 0.29884308825110284,
"learning_rate": 5.301339285714286e-07,
"loss": 0.8206,
"step": 1973
},
{
"epoch": 2.9728915662650603,
"grad_norm": 0.31738723625885645,
"learning_rate": 5.022321428571429e-07,
"loss": 0.8205,
"step": 1974
},
{
"epoch": 2.9743975903614457,
"grad_norm": 0.28238477012873703,
"learning_rate": 4.7433035714285715e-07,
"loss": 0.7227,
"step": 1975
},
{
"epoch": 2.9759036144578315,
"grad_norm": 0.28560853787026175,
"learning_rate": 4.464285714285714e-07,
"loss": 0.7709,
"step": 1976
},
{
"epoch": 2.977409638554217,
"grad_norm": 0.2848947727442523,
"learning_rate": 4.1852678571428575e-07,
"loss": 0.7393,
"step": 1977
},
{
"epoch": 2.9789156626506026,
"grad_norm": 0.26955375500337003,
"learning_rate": 3.90625e-07,
"loss": 0.7544,
"step": 1978
},
{
"epoch": 2.980421686746988,
"grad_norm": 0.3045732926762347,
"learning_rate": 3.627232142857143e-07,
"loss": 0.7344,
"step": 1979
},
{
"epoch": 2.9819277108433733,
"grad_norm": 0.25837533799337203,
"learning_rate": 3.3482142857142856e-07,
"loss": 0.7037,
"step": 1980
},
{
"epoch": 2.983433734939759,
"grad_norm": 0.38317576424191563,
"learning_rate": 3.069196428571429e-07,
"loss": 0.689,
"step": 1981
},
{
"epoch": 2.984939759036145,
"grad_norm": 0.30484577173749194,
"learning_rate": 2.7901785714285716e-07,
"loss": 0.7688,
"step": 1982
},
{
"epoch": 2.98644578313253,
"grad_norm": 0.2931332783055906,
"learning_rate": 2.5111607142857144e-07,
"loss": 0.7264,
"step": 1983
},
{
"epoch": 2.9879518072289155,
"grad_norm": 0.28390313193734124,
"learning_rate": 2.232142857142857e-07,
"loss": 0.7865,
"step": 1984
},
{
"epoch": 2.9894578313253013,
"grad_norm": 0.3076581154256268,
"learning_rate": 1.953125e-07,
"loss": 0.7508,
"step": 1985
},
{
"epoch": 2.9909638554216866,
"grad_norm": 0.27449497039862664,
"learning_rate": 1.6741071428571428e-07,
"loss": 0.7499,
"step": 1986
},
{
"epoch": 2.9924698795180724,
"grad_norm": 0.2697959851824814,
"learning_rate": 1.3950892857142858e-07,
"loss": 0.7841,
"step": 1987
},
{
"epoch": 2.9939759036144578,
"grad_norm": 0.2804462283355088,
"learning_rate": 1.1160714285714285e-07,
"loss": 0.7486,
"step": 1988
},
{
"epoch": 2.9954819277108435,
"grad_norm": 0.32675633721133424,
"learning_rate": 8.370535714285714e-08,
"loss": 0.8623,
"step": 1989
},
{
"epoch": 2.996987951807229,
"grad_norm": 0.30464261688709765,
"learning_rate": 5.580357142857143e-08,
"loss": 0.8101,
"step": 1990
},
{
"epoch": 2.9984939759036147,
"grad_norm": 0.262168865720023,
"learning_rate": 2.7901785714285714e-08,
"loss": 0.6706,
"step": 1991
},
{
"epoch": 3.0,
"grad_norm": 0.2592255486898374,
"learning_rate": 0.0,
"loss": 0.6769,
"step": 1992
},
{
"epoch": 3.0,
"step": 1992,
"total_flos": 1.6885256323774546e+18,
"train_loss": 1.0046712136172866,
"train_runtime": 115762.6071,
"train_samples_per_second": 0.275,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 1992,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6885256323774546e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}