{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9970958373668926,
"eval_steps": 400,
"global_step": 1935,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003097773475314618,
"grad_norm": 25.331384658813477,
"learning_rate": 4.99741468459152e-05,
"loss": 7.6182,
"step": 2
},
{
"epoch": 0.006195546950629236,
"grad_norm": 23.0921688079834,
"learning_rate": 4.994829369183041e-05,
"loss": 6.4939,
"step": 4
},
{
"epoch": 0.009293320425943852,
"grad_norm": 15.174758911132812,
"learning_rate": 4.9896587383660806e-05,
"loss": 5.8833,
"step": 6
},
{
"epoch": 0.012391093901258471,
"grad_norm": 13.869624137878418,
"learning_rate": 4.984488107549121e-05,
"loss": 5.1932,
"step": 8
},
{
"epoch": 0.015488867376573089,
"grad_norm": 8.06086254119873,
"learning_rate": 4.9793174767321616e-05,
"loss": 4.6732,
"step": 10
},
{
"epoch": 0.018586640851887704,
"grad_norm": 7.566930294036865,
"learning_rate": 4.974146845915202e-05,
"loss": 4.2104,
"step": 12
},
{
"epoch": 0.021684414327202323,
"grad_norm": 7.272712230682373,
"learning_rate": 4.968976215098242e-05,
"loss": 3.9414,
"step": 14
},
{
"epoch": 0.024782187802516942,
"grad_norm": 5.09606409072876,
"learning_rate": 4.9638055842812824e-05,
"loss": 3.7442,
"step": 16
},
{
"epoch": 0.027879961277831558,
"grad_norm": 4.3098273277282715,
"learning_rate": 4.958634953464323e-05,
"loss": 3.4832,
"step": 18
},
{
"epoch": 0.030977734753146177,
"grad_norm": 4.051149368286133,
"learning_rate": 4.9534643226473634e-05,
"loss": 3.2805,
"step": 20
},
{
"epoch": 0.034075508228460796,
"grad_norm": 3.865760326385498,
"learning_rate": 4.948293691830403e-05,
"loss": 3.0996,
"step": 22
},
{
"epoch": 0.03717328170377541,
"grad_norm": 3.1818125247955322,
"learning_rate": 4.943123061013444e-05,
"loss": 2.9279,
"step": 24
},
{
"epoch": 0.04027105517909003,
"grad_norm": 2.931936502456665,
"learning_rate": 4.937952430196484e-05,
"loss": 2.748,
"step": 26
},
{
"epoch": 0.04336882865440465,
"grad_norm": 3.031834125518799,
"learning_rate": 4.932781799379524e-05,
"loss": 2.6392,
"step": 28
},
{
"epoch": 0.046466602129719266,
"grad_norm": 2.5047967433929443,
"learning_rate": 4.9276111685625646e-05,
"loss": 2.4719,
"step": 30
},
{
"epoch": 0.049564375605033885,
"grad_norm": 2.0622289180755615,
"learning_rate": 4.922440537745605e-05,
"loss": 2.33,
"step": 32
},
{
"epoch": 0.0526621490803485,
"grad_norm": 1.961929202079773,
"learning_rate": 4.9172699069286456e-05,
"loss": 2.2305,
"step": 34
},
{
"epoch": 0.055759922555663116,
"grad_norm": 2.2184677124023438,
"learning_rate": 4.9120992761116855e-05,
"loss": 2.1613,
"step": 36
},
{
"epoch": 0.058857696030977735,
"grad_norm": 1.6713696718215942,
"learning_rate": 4.906928645294726e-05,
"loss": 2.1162,
"step": 38
},
{
"epoch": 0.061955469506292354,
"grad_norm": 1.4867547750473022,
"learning_rate": 4.9017580144777665e-05,
"loss": 2.0524,
"step": 40
},
{
"epoch": 0.06505324298160697,
"grad_norm": 1.3332496881484985,
"learning_rate": 4.896587383660807e-05,
"loss": 2.0135,
"step": 42
},
{
"epoch": 0.06815101645692159,
"grad_norm": 1.1923631429672241,
"learning_rate": 4.891416752843847e-05,
"loss": 1.947,
"step": 44
},
{
"epoch": 0.07124878993223621,
"grad_norm": 1.1316183805465698,
"learning_rate": 4.886246122026887e-05,
"loss": 1.9088,
"step": 46
},
{
"epoch": 0.07434656340755082,
"grad_norm": 1.0954861640930176,
"learning_rate": 4.881075491209928e-05,
"loss": 1.8897,
"step": 48
},
{
"epoch": 0.07744433688286544,
"grad_norm": 1.076762318611145,
"learning_rate": 4.8759048603929683e-05,
"loss": 1.8766,
"step": 50
},
{
"epoch": 0.08054211035818006,
"grad_norm": 0.9524347186088562,
"learning_rate": 4.870734229576008e-05,
"loss": 1.8171,
"step": 52
},
{
"epoch": 0.08363988383349467,
"grad_norm": 1.0217920541763306,
"learning_rate": 4.865563598759049e-05,
"loss": 1.8053,
"step": 54
},
{
"epoch": 0.0867376573088093,
"grad_norm": 0.9322136044502258,
"learning_rate": 4.860392967942089e-05,
"loss": 1.7908,
"step": 56
},
{
"epoch": 0.08983543078412391,
"grad_norm": 1.0043178796768188,
"learning_rate": 4.855222337125129e-05,
"loss": 1.7848,
"step": 58
},
{
"epoch": 0.09293320425943853,
"grad_norm": 0.8848307132720947,
"learning_rate": 4.8500517063081695e-05,
"loss": 1.753,
"step": 60
},
{
"epoch": 0.09603097773475315,
"grad_norm": 0.8497102856636047,
"learning_rate": 4.84488107549121e-05,
"loss": 1.7496,
"step": 62
},
{
"epoch": 0.09912875121006777,
"grad_norm": 0.8451825380325317,
"learning_rate": 4.8397104446742505e-05,
"loss": 1.7324,
"step": 64
},
{
"epoch": 0.10222652468538238,
"grad_norm": 0.8965175747871399,
"learning_rate": 4.8345398138572904e-05,
"loss": 1.7452,
"step": 66
},
{
"epoch": 0.105324298160697,
"grad_norm": 0.8017619252204895,
"learning_rate": 4.829369183040331e-05,
"loss": 1.7167,
"step": 68
},
{
"epoch": 0.10842207163601161,
"grad_norm": 0.8092736601829529,
"learning_rate": 4.8241985522233714e-05,
"loss": 1.7169,
"step": 70
},
{
"epoch": 0.11151984511132623,
"grad_norm": 0.7606578469276428,
"learning_rate": 4.819027921406412e-05,
"loss": 1.7131,
"step": 72
},
{
"epoch": 0.11461761858664085,
"grad_norm": 0.8365290760993958,
"learning_rate": 4.813857290589452e-05,
"loss": 1.707,
"step": 74
},
{
"epoch": 0.11771539206195547,
"grad_norm": 0.7831740975379944,
"learning_rate": 4.808686659772492e-05,
"loss": 1.7044,
"step": 76
},
{
"epoch": 0.12081316553727009,
"grad_norm": 0.7931554913520813,
"learning_rate": 4.803516028955533e-05,
"loss": 1.6685,
"step": 78
},
{
"epoch": 0.12391093901258471,
"grad_norm": 0.7500888109207153,
"learning_rate": 4.7983453981385726e-05,
"loss": 1.6628,
"step": 80
},
{
"epoch": 0.12700871248789933,
"grad_norm": 0.6964195966720581,
"learning_rate": 4.793174767321613e-05,
"loss": 1.6568,
"step": 82
},
{
"epoch": 0.13010648596321395,
"grad_norm": 0.8971940875053406,
"learning_rate": 4.788004136504654e-05,
"loss": 1.6734,
"step": 84
},
{
"epoch": 0.13320425943852857,
"grad_norm": 0.7129445672035217,
"learning_rate": 4.782833505687694e-05,
"loss": 1.628,
"step": 86
},
{
"epoch": 0.13630203291384319,
"grad_norm": 0.8455966114997864,
"learning_rate": 4.7776628748707346e-05,
"loss": 1.6518,
"step": 88
},
{
"epoch": 0.1393998063891578,
"grad_norm": 0.7180489301681519,
"learning_rate": 4.772492244053775e-05,
"loss": 1.6516,
"step": 90
},
{
"epoch": 0.14249757986447242,
"grad_norm": 0.7145109176635742,
"learning_rate": 4.7673216132368156e-05,
"loss": 1.6319,
"step": 92
},
{
"epoch": 0.14559535333978701,
"grad_norm": 0.8121469020843506,
"learning_rate": 4.7621509824198554e-05,
"loss": 1.6627,
"step": 94
},
{
"epoch": 0.14869312681510163,
"grad_norm": 0.7496806383132935,
"learning_rate": 4.756980351602896e-05,
"loss": 1.6319,
"step": 96
},
{
"epoch": 0.15179090029041625,
"grad_norm": 0.7465183734893799,
"learning_rate": 4.7518097207859365e-05,
"loss": 1.6222,
"step": 98
},
{
"epoch": 0.15488867376573087,
"grad_norm": 0.6814384460449219,
"learning_rate": 4.746639089968977e-05,
"loss": 1.6235,
"step": 100
},
{
"epoch": 0.1579864472410455,
"grad_norm": 0.7421935796737671,
"learning_rate": 4.741468459152017e-05,
"loss": 1.6075,
"step": 102
},
{
"epoch": 0.1610842207163601,
"grad_norm": 0.7680038809776306,
"learning_rate": 4.736297828335057e-05,
"loss": 1.6302,
"step": 104
},
{
"epoch": 0.16418199419167473,
"grad_norm": 0.6994872093200684,
"learning_rate": 4.731127197518098e-05,
"loss": 1.6262,
"step": 106
},
{
"epoch": 0.16727976766698935,
"grad_norm": 0.7842795848846436,
"learning_rate": 4.7259565667011376e-05,
"loss": 1.6095,
"step": 108
},
{
"epoch": 0.17037754114230397,
"grad_norm": 0.9475075006484985,
"learning_rate": 4.720785935884178e-05,
"loss": 1.6044,
"step": 110
},
{
"epoch": 0.1734753146176186,
"grad_norm": 0.7090861201286316,
"learning_rate": 4.7156153050672187e-05,
"loss": 1.6177,
"step": 112
},
{
"epoch": 0.1765730880929332,
"grad_norm": 0.70943683385849,
"learning_rate": 4.710444674250259e-05,
"loss": 1.6166,
"step": 114
},
{
"epoch": 0.17967086156824782,
"grad_norm": 0.7159855365753174,
"learning_rate": 4.705274043433299e-05,
"loss": 1.6084,
"step": 116
},
{
"epoch": 0.18276863504356244,
"grad_norm": 0.7541035413742065,
"learning_rate": 4.7001034126163395e-05,
"loss": 1.5959,
"step": 118
},
{
"epoch": 0.18586640851887706,
"grad_norm": 0.6943245530128479,
"learning_rate": 4.69493278179938e-05,
"loss": 1.5732,
"step": 120
},
{
"epoch": 0.18896418199419168,
"grad_norm": 0.6735599637031555,
"learning_rate": 4.6897621509824205e-05,
"loss": 1.582,
"step": 122
},
{
"epoch": 0.1920619554695063,
"grad_norm": 0.6189006567001343,
"learning_rate": 4.6845915201654603e-05,
"loss": 1.5789,
"step": 124
},
{
"epoch": 0.19515972894482092,
"grad_norm": 0.6355108022689819,
"learning_rate": 4.679420889348501e-05,
"loss": 1.5852,
"step": 126
},
{
"epoch": 0.19825750242013554,
"grad_norm": 0.7827373743057251,
"learning_rate": 4.6742502585315414e-05,
"loss": 1.5868,
"step": 128
},
{
"epoch": 0.20135527589545016,
"grad_norm": 0.7391951680183411,
"learning_rate": 4.669079627714581e-05,
"loss": 1.5543,
"step": 130
},
{
"epoch": 0.20445304937076475,
"grad_norm": 0.659102201461792,
"learning_rate": 4.663908996897622e-05,
"loss": 1.5933,
"step": 132
},
{
"epoch": 0.20755082284607937,
"grad_norm": 0.7462721467018127,
"learning_rate": 4.658738366080662e-05,
"loss": 1.5757,
"step": 134
},
{
"epoch": 0.210648596321394,
"grad_norm": 0.6847939491271973,
"learning_rate": 4.653567735263703e-05,
"loss": 1.5737,
"step": 136
},
{
"epoch": 0.2137463697967086,
"grad_norm": 0.7970624566078186,
"learning_rate": 4.6483971044467425e-05,
"loss": 1.5801,
"step": 138
},
{
"epoch": 0.21684414327202323,
"grad_norm": 0.6875075101852417,
"learning_rate": 4.643226473629783e-05,
"loss": 1.5971,
"step": 140
},
{
"epoch": 0.21994191674733785,
"grad_norm": 0.708543062210083,
"learning_rate": 4.6380558428128236e-05,
"loss": 1.5693,
"step": 142
},
{
"epoch": 0.22303969022265246,
"grad_norm": 0.6866245865821838,
"learning_rate": 4.632885211995864e-05,
"loss": 1.5673,
"step": 144
},
{
"epoch": 0.22613746369796708,
"grad_norm": 0.7913303375244141,
"learning_rate": 4.627714581178904e-05,
"loss": 1.5746,
"step": 146
},
{
"epoch": 0.2292352371732817,
"grad_norm": 0.7373950481414795,
"learning_rate": 4.6225439503619444e-05,
"loss": 1.5535,
"step": 148
},
{
"epoch": 0.23233301064859632,
"grad_norm": 0.6925334334373474,
"learning_rate": 4.617373319544985e-05,
"loss": 1.5516,
"step": 150
},
{
"epoch": 0.23543078412391094,
"grad_norm": 1.0897713899612427,
"learning_rate": 4.6122026887280254e-05,
"loss": 1.5388,
"step": 152
},
{
"epoch": 0.23852855759922556,
"grad_norm": 0.7056815028190613,
"learning_rate": 4.607032057911065e-05,
"loss": 1.5398,
"step": 154
},
{
"epoch": 0.24162633107454018,
"grad_norm": 0.7982754111289978,
"learning_rate": 4.601861427094106e-05,
"loss": 1.5655,
"step": 156
},
{
"epoch": 0.2447241045498548,
"grad_norm": 0.6022890210151672,
"learning_rate": 4.596690796277146e-05,
"loss": 1.5439,
"step": 158
},
{
"epoch": 0.24782187802516942,
"grad_norm": 0.7785722017288208,
"learning_rate": 4.591520165460186e-05,
"loss": 1.5396,
"step": 160
},
{
"epoch": 0.250919651500484,
"grad_norm": 0.8552553653717041,
"learning_rate": 4.5863495346432266e-05,
"loss": 1.5387,
"step": 162
},
{
"epoch": 0.25401742497579866,
"grad_norm": 0.7920766472816467,
"learning_rate": 4.581178903826267e-05,
"loss": 1.5309,
"step": 164
},
{
"epoch": 0.25711519845111325,
"grad_norm": 0.6684227585792542,
"learning_rate": 4.5760082730093076e-05,
"loss": 1.5686,
"step": 166
},
{
"epoch": 0.2602129719264279,
"grad_norm": 0.8071036338806152,
"learning_rate": 4.5708376421923474e-05,
"loss": 1.546,
"step": 168
},
{
"epoch": 0.2633107454017425,
"grad_norm": 0.7032163739204407,
"learning_rate": 4.565667011375388e-05,
"loss": 1.5496,
"step": 170
},
{
"epoch": 0.26640851887705713,
"grad_norm": 0.7134162783622742,
"learning_rate": 4.5604963805584284e-05,
"loss": 1.5376,
"step": 172
},
{
"epoch": 0.2695062923523717,
"grad_norm": 0.6842672228813171,
"learning_rate": 4.555325749741469e-05,
"loss": 1.5381,
"step": 174
},
{
"epoch": 0.27260406582768637,
"grad_norm": 0.8091727495193481,
"learning_rate": 4.550155118924509e-05,
"loss": 1.5487,
"step": 176
},
{
"epoch": 0.27570183930300096,
"grad_norm": 0.8914667963981628,
"learning_rate": 4.544984488107549e-05,
"loss": 1.5411,
"step": 178
},
{
"epoch": 0.2787996127783156,
"grad_norm": 0.676179826259613,
"learning_rate": 4.53981385729059e-05,
"loss": 1.5694,
"step": 180
},
{
"epoch": 0.2818973862536302,
"grad_norm": 0.9339081645011902,
"learning_rate": 4.5346432264736296e-05,
"loss": 1.5377,
"step": 182
},
{
"epoch": 0.28499515972894485,
"grad_norm": 0.7921097874641418,
"learning_rate": 4.52947259565667e-05,
"loss": 1.5275,
"step": 184
},
{
"epoch": 0.28809293320425944,
"grad_norm": 0.7746542692184448,
"learning_rate": 4.5243019648397106e-05,
"loss": 1.5319,
"step": 186
},
{
"epoch": 0.29119070667957403,
"grad_norm": 0.9051031470298767,
"learning_rate": 4.519131334022751e-05,
"loss": 1.546,
"step": 188
},
{
"epoch": 0.2942884801548887,
"grad_norm": 0.9229975342750549,
"learning_rate": 4.513960703205791e-05,
"loss": 1.5389,
"step": 190
},
{
"epoch": 0.29738625363020327,
"grad_norm": 0.7976638674736023,
"learning_rate": 4.5087900723888315e-05,
"loss": 1.5345,
"step": 192
},
{
"epoch": 0.3004840271055179,
"grad_norm": 0.6807886362075806,
"learning_rate": 4.503619441571872e-05,
"loss": 1.5139,
"step": 194
},
{
"epoch": 0.3035818005808325,
"grad_norm": 0.7215619683265686,
"learning_rate": 4.4984488107549125e-05,
"loss": 1.5431,
"step": 196
},
{
"epoch": 0.30667957405614715,
"grad_norm": 0.6181749105453491,
"learning_rate": 4.493278179937952e-05,
"loss": 1.5301,
"step": 198
},
{
"epoch": 0.30977734753146174,
"grad_norm": 0.7625611424446106,
"learning_rate": 4.488107549120993e-05,
"loss": 1.5185,
"step": 200
},
{
"epoch": 0.3128751210067764,
"grad_norm": 0.8464534878730774,
"learning_rate": 4.4829369183040333e-05,
"loss": 1.5293,
"step": 202
},
{
"epoch": 0.315972894482091,
"grad_norm": 1.0860360860824585,
"learning_rate": 4.477766287487073e-05,
"loss": 1.5289,
"step": 204
},
{
"epoch": 0.31907066795740563,
"grad_norm": 0.6855882406234741,
"learning_rate": 4.472595656670114e-05,
"loss": 1.513,
"step": 206
},
{
"epoch": 0.3221684414327202,
"grad_norm": 0.6964966654777527,
"learning_rate": 4.467425025853154e-05,
"loss": 1.5256,
"step": 208
},
{
"epoch": 0.32526621490803487,
"grad_norm": 0.9584571719169617,
"learning_rate": 4.462254395036195e-05,
"loss": 1.5307,
"step": 210
},
{
"epoch": 0.32836398838334946,
"grad_norm": 0.8495706915855408,
"learning_rate": 4.4570837642192345e-05,
"loss": 1.517,
"step": 212
},
{
"epoch": 0.3314617618586641,
"grad_norm": 0.7504775524139404,
"learning_rate": 4.451913133402275e-05,
"loss": 1.5006,
"step": 214
},
{
"epoch": 0.3345595353339787,
"grad_norm": 0.8532968759536743,
"learning_rate": 4.4467425025853155e-05,
"loss": 1.5357,
"step": 216
},
{
"epoch": 0.33765730880929334,
"grad_norm": 1.1161147356033325,
"learning_rate": 4.441571871768356e-05,
"loss": 1.5194,
"step": 218
},
{
"epoch": 0.34075508228460794,
"grad_norm": 0.9745946526527405,
"learning_rate": 4.436401240951396e-05,
"loss": 1.5254,
"step": 220
},
{
"epoch": 0.3438528557599226,
"grad_norm": 1.0027096271514893,
"learning_rate": 4.4312306101344364e-05,
"loss": 1.5295,
"step": 222
},
{
"epoch": 0.3469506292352372,
"grad_norm": 0.9923873543739319,
"learning_rate": 4.426059979317477e-05,
"loss": 1.517,
"step": 224
},
{
"epoch": 0.35004840271055176,
"grad_norm": 1.348768711090088,
"learning_rate": 4.420889348500517e-05,
"loss": 1.535,
"step": 226
},
{
"epoch": 0.3531461761858664,
"grad_norm": 0.808506190776825,
"learning_rate": 4.415718717683557e-05,
"loss": 1.5003,
"step": 228
},
{
"epoch": 0.356243949661181,
"grad_norm": 0.9981195330619812,
"learning_rate": 4.410548086866598e-05,
"loss": 1.491,
"step": 230
},
{
"epoch": 0.35934172313649565,
"grad_norm": 0.75667804479599,
"learning_rate": 4.405377456049638e-05,
"loss": 1.5006,
"step": 232
},
{
"epoch": 0.36243949661181024,
"grad_norm": 0.6836598515510559,
"learning_rate": 4.400206825232678e-05,
"loss": 1.5165,
"step": 234
},
{
"epoch": 0.3655372700871249,
"grad_norm": 0.817143976688385,
"learning_rate": 4.3950361944157186e-05,
"loss": 1.4953,
"step": 236
},
{
"epoch": 0.3686350435624395,
"grad_norm": 0.9403025507926941,
"learning_rate": 4.389865563598759e-05,
"loss": 1.5093,
"step": 238
},
{
"epoch": 0.3717328170377541,
"grad_norm": 0.8944310545921326,
"learning_rate": 4.3846949327817996e-05,
"loss": 1.4989,
"step": 240
},
{
"epoch": 0.3748305905130687,
"grad_norm": 0.8741413950920105,
"learning_rate": 4.3795243019648394e-05,
"loss": 1.5084,
"step": 242
},
{
"epoch": 0.37792836398838336,
"grad_norm": 0.7388432621955872,
"learning_rate": 4.3743536711478806e-05,
"loss": 1.4904,
"step": 244
},
{
"epoch": 0.38102613746369796,
"grad_norm": 1.061517596244812,
"learning_rate": 4.369183040330921e-05,
"loss": 1.4917,
"step": 246
},
{
"epoch": 0.3841239109390126,
"grad_norm": 0.9416221380233765,
"learning_rate": 4.364012409513961e-05,
"loss": 1.4798,
"step": 248
},
{
"epoch": 0.3872216844143272,
"grad_norm": 0.9135381579399109,
"learning_rate": 4.3588417786970015e-05,
"loss": 1.4923,
"step": 250
},
{
"epoch": 0.39031945788964184,
"grad_norm": 0.7566073536872864,
"learning_rate": 4.353671147880042e-05,
"loss": 1.5062,
"step": 252
},
{
"epoch": 0.39341723136495643,
"grad_norm": 0.8875766396522522,
"learning_rate": 4.3485005170630825e-05,
"loss": 1.5138,
"step": 254
},
{
"epoch": 0.3965150048402711,
"grad_norm": 0.9574587345123291,
"learning_rate": 4.343329886246122e-05,
"loss": 1.5071,
"step": 256
},
{
"epoch": 0.39961277831558567,
"grad_norm": 0.8392723202705383,
"learning_rate": 4.338159255429163e-05,
"loss": 1.4951,
"step": 258
},
{
"epoch": 0.4027105517909003,
"grad_norm": 1.0290050506591797,
"learning_rate": 4.332988624612203e-05,
"loss": 1.5016,
"step": 260
},
{
"epoch": 0.4058083252662149,
"grad_norm": 0.8251259922981262,
"learning_rate": 4.327817993795243e-05,
"loss": 1.5002,
"step": 262
},
{
"epoch": 0.4089060987415295,
"grad_norm": 0.645811140537262,
"learning_rate": 4.3226473629782837e-05,
"loss": 1.4892,
"step": 264
},
{
"epoch": 0.41200387221684415,
"grad_norm": 0.8079040050506592,
"learning_rate": 4.317476732161324e-05,
"loss": 1.4952,
"step": 266
},
{
"epoch": 0.41510164569215874,
"grad_norm": 0.72142094373703,
"learning_rate": 4.312306101344365e-05,
"loss": 1.4852,
"step": 268
},
{
"epoch": 0.4181994191674734,
"grad_norm": 0.6570938229560852,
"learning_rate": 4.3071354705274045e-05,
"loss": 1.4905,
"step": 270
},
{
"epoch": 0.421297192642788,
"grad_norm": 0.8945090770721436,
"learning_rate": 4.301964839710445e-05,
"loss": 1.4761,
"step": 272
},
{
"epoch": 0.4243949661181026,
"grad_norm": 0.6998433470726013,
"learning_rate": 4.2967942088934855e-05,
"loss": 1.4766,
"step": 274
},
{
"epoch": 0.4274927395934172,
"grad_norm": 0.8252823948860168,
"learning_rate": 4.291623578076526e-05,
"loss": 1.4991,
"step": 276
},
{
"epoch": 0.43059051306873186,
"grad_norm": 0.7982930541038513,
"learning_rate": 4.286452947259566e-05,
"loss": 1.491,
"step": 278
},
{
"epoch": 0.43368828654404645,
"grad_norm": 0.7150436043739319,
"learning_rate": 4.2812823164426064e-05,
"loss": 1.4785,
"step": 280
},
{
"epoch": 0.4367860600193611,
"grad_norm": 0.7387247681617737,
"learning_rate": 4.276111685625647e-05,
"loss": 1.4922,
"step": 282
},
{
"epoch": 0.4398838334946757,
"grad_norm": 0.6871611475944519,
"learning_rate": 4.270941054808687e-05,
"loss": 1.4837,
"step": 284
},
{
"epoch": 0.44298160696999034,
"grad_norm": 0.8222848176956177,
"learning_rate": 4.265770423991727e-05,
"loss": 1.4894,
"step": 286
},
{
"epoch": 0.44607938044530493,
"grad_norm": 0.7156632542610168,
"learning_rate": 4.260599793174768e-05,
"loss": 1.4967,
"step": 288
},
{
"epoch": 0.4491771539206196,
"grad_norm": 0.8731895685195923,
"learning_rate": 4.255429162357808e-05,
"loss": 1.4721,
"step": 290
},
{
"epoch": 0.45227492739593417,
"grad_norm": 0.7288525104522705,
"learning_rate": 4.250258531540848e-05,
"loss": 1.4777,
"step": 292
},
{
"epoch": 0.4553727008712488,
"grad_norm": 0.582266092300415,
"learning_rate": 4.2450879007238886e-05,
"loss": 1.4536,
"step": 294
},
{
"epoch": 0.4584704743465634,
"grad_norm": 0.8917784094810486,
"learning_rate": 4.239917269906929e-05,
"loss": 1.4913,
"step": 296
},
{
"epoch": 0.46156824782187805,
"grad_norm": 0.7268736958503723,
"learning_rate": 4.2347466390899696e-05,
"loss": 1.5163,
"step": 298
},
{
"epoch": 0.46466602129719264,
"grad_norm": 0.7215520143508911,
"learning_rate": 4.2295760082730094e-05,
"loss": 1.5,
"step": 300
},
{
"epoch": 0.46776379477250724,
"grad_norm": 0.7149571180343628,
"learning_rate": 4.22440537745605e-05,
"loss": 1.5089,
"step": 302
},
{
"epoch": 0.4708615682478219,
"grad_norm": 0.8469173312187195,
"learning_rate": 4.2192347466390904e-05,
"loss": 1.4913,
"step": 304
},
{
"epoch": 0.4739593417231365,
"grad_norm": 0.7890795469284058,
"learning_rate": 4.21406411582213e-05,
"loss": 1.4625,
"step": 306
},
{
"epoch": 0.4770571151984511,
"grad_norm": 0.6617977023124695,
"learning_rate": 4.208893485005171e-05,
"loss": 1.468,
"step": 308
},
{
"epoch": 0.4801548886737657,
"grad_norm": 0.8981150388717651,
"learning_rate": 4.203722854188211e-05,
"loss": 1.4746,
"step": 310
},
{
"epoch": 0.48325266214908036,
"grad_norm": 0.733159601688385,
"learning_rate": 4.198552223371252e-05,
"loss": 1.4685,
"step": 312
},
{
"epoch": 0.48635043562439495,
"grad_norm": 0.8201707601547241,
"learning_rate": 4.1933815925542916e-05,
"loss": 1.4735,
"step": 314
},
{
"epoch": 0.4894482090997096,
"grad_norm": 0.7135993242263794,
"learning_rate": 4.188210961737332e-05,
"loss": 1.4879,
"step": 316
},
{
"epoch": 0.4925459825750242,
"grad_norm": 0.9840111136436462,
"learning_rate": 4.1830403309203726e-05,
"loss": 1.5049,
"step": 318
},
{
"epoch": 0.49564375605033884,
"grad_norm": 0.8042699098587036,
"learning_rate": 4.177869700103413e-05,
"loss": 1.4553,
"step": 320
},
{
"epoch": 0.4987415295256534,
"grad_norm": 0.8507205843925476,
"learning_rate": 4.172699069286453e-05,
"loss": 1.4756,
"step": 322
},
{
"epoch": 0.501839303000968,
"grad_norm": 0.9723739624023438,
"learning_rate": 4.1675284384694934e-05,
"loss": 1.4621,
"step": 324
},
{
"epoch": 0.5049370764762827,
"grad_norm": 0.9678776264190674,
"learning_rate": 4.162357807652534e-05,
"loss": 1.4715,
"step": 326
},
{
"epoch": 0.5080348499515973,
"grad_norm": 1.031358242034912,
"learning_rate": 4.1571871768355745e-05,
"loss": 1.4919,
"step": 328
},
{
"epoch": 0.5111326234269119,
"grad_norm": 0.7192648649215698,
"learning_rate": 4.152016546018614e-05,
"loss": 1.4659,
"step": 330
},
{
"epoch": 0.5142303969022265,
"grad_norm": 0.9962320923805237,
"learning_rate": 4.146845915201655e-05,
"loss": 1.4544,
"step": 332
},
{
"epoch": 0.5173281703775412,
"grad_norm": 0.7830464243888855,
"learning_rate": 4.141675284384695e-05,
"loss": 1.457,
"step": 334
},
{
"epoch": 0.5204259438528558,
"grad_norm": 0.8971250653266907,
"learning_rate": 4.136504653567735e-05,
"loss": 1.4669,
"step": 336
},
{
"epoch": 0.5235237173281704,
"grad_norm": 0.7550873160362244,
"learning_rate": 4.1313340227507756e-05,
"loss": 1.464,
"step": 338
},
{
"epoch": 0.526621490803485,
"grad_norm": 0.9138239622116089,
"learning_rate": 4.126163391933816e-05,
"loss": 1.4861,
"step": 340
},
{
"epoch": 0.5297192642787996,
"grad_norm": 0.8233465552330017,
"learning_rate": 4.1209927611168567e-05,
"loss": 1.4731,
"step": 342
},
{
"epoch": 0.5328170377541143,
"grad_norm": 0.8664296865463257,
"learning_rate": 4.1158221302998965e-05,
"loss": 1.4576,
"step": 344
},
{
"epoch": 0.5359148112294289,
"grad_norm": 0.8798326849937439,
"learning_rate": 4.110651499482937e-05,
"loss": 1.4711,
"step": 346
},
{
"epoch": 0.5390125847047434,
"grad_norm": 0.9018617272377014,
"learning_rate": 4.1054808686659775e-05,
"loss": 1.4753,
"step": 348
},
{
"epoch": 0.542110358180058,
"grad_norm": 1.0225512981414795,
"learning_rate": 4.100310237849018e-05,
"loss": 1.4546,
"step": 350
},
{
"epoch": 0.5452081316553727,
"grad_norm": 0.8715205192565918,
"learning_rate": 4.095139607032058e-05,
"loss": 1.4685,
"step": 352
},
{
"epoch": 0.5483059051306873,
"grad_norm": 0.912350058555603,
"learning_rate": 4.0899689762150983e-05,
"loss": 1.4591,
"step": 354
},
{
"epoch": 0.5514036786060019,
"grad_norm": 0.8565901517868042,
"learning_rate": 4.084798345398139e-05,
"loss": 1.45,
"step": 356
},
{
"epoch": 0.5545014520813165,
"grad_norm": 1.0796242952346802,
"learning_rate": 4.079627714581179e-05,
"loss": 1.4703,
"step": 358
},
{
"epoch": 0.5575992255566312,
"grad_norm": 0.8250919580459595,
"learning_rate": 4.074457083764219e-05,
"loss": 1.4465,
"step": 360
},
{
"epoch": 0.5606969990319458,
"grad_norm": 1.1239674091339111,
"learning_rate": 4.06928645294726e-05,
"loss": 1.4664,
"step": 362
},
{
"epoch": 0.5637947725072604,
"grad_norm": 0.8356937170028687,
"learning_rate": 4.0641158221303e-05,
"loss": 1.4736,
"step": 364
},
{
"epoch": 0.566892545982575,
"grad_norm": 0.9538320302963257,
"learning_rate": 4.05894519131334e-05,
"loss": 1.4685,
"step": 366
},
{
"epoch": 0.5699903194578897,
"grad_norm": 0.9279565215110779,
"learning_rate": 4.0537745604963805e-05,
"loss": 1.4837,
"step": 368
},
{
"epoch": 0.5730880929332043,
"grad_norm": 0.7745909690856934,
"learning_rate": 4.048603929679421e-05,
"loss": 1.4496,
"step": 370
},
{
"epoch": 0.5761858664085189,
"grad_norm": 1.0521866083145142,
"learning_rate": 4.0434332988624616e-05,
"loss": 1.4368,
"step": 372
},
{
"epoch": 0.5792836398838335,
"grad_norm": 1.0289032459259033,
"learning_rate": 4.0382626680455014e-05,
"loss": 1.4522,
"step": 374
},
{
"epoch": 0.5823814133591481,
"grad_norm": 1.0961400270462036,
"learning_rate": 4.033092037228542e-05,
"loss": 1.4528,
"step": 376
},
{
"epoch": 0.5854791868344628,
"grad_norm": 0.8788878321647644,
"learning_rate": 4.0279214064115824e-05,
"loss": 1.4644,
"step": 378
},
{
"epoch": 0.5885769603097774,
"grad_norm": 0.8058956861495972,
"learning_rate": 4.022750775594622e-05,
"loss": 1.4437,
"step": 380
},
{
"epoch": 0.5916747337850919,
"grad_norm": 0.7191058993339539,
"learning_rate": 4.017580144777663e-05,
"loss": 1.4553,
"step": 382
},
{
"epoch": 0.5947725072604065,
"grad_norm": 0.9950371980667114,
"learning_rate": 4.012409513960703e-05,
"loss": 1.4601,
"step": 384
},
{
"epoch": 0.5978702807357212,
"grad_norm": 1.0689505338668823,
"learning_rate": 4.007238883143744e-05,
"loss": 1.4298,
"step": 386
},
{
"epoch": 0.6009680542110358,
"grad_norm": 1.0722795724868774,
"learning_rate": 4.0020682523267836e-05,
"loss": 1.4477,
"step": 388
},
{
"epoch": 0.6040658276863504,
"grad_norm": 1.1227514743804932,
"learning_rate": 3.996897621509824e-05,
"loss": 1.4691,
"step": 390
},
{
"epoch": 0.607163601161665,
"grad_norm": 0.9659287333488464,
"learning_rate": 3.9917269906928646e-05,
"loss": 1.4575,
"step": 392
},
{
"epoch": 0.6102613746369797,
"grad_norm": 1.0061907768249512,
"learning_rate": 3.986556359875905e-05,
"loss": 1.4678,
"step": 394
},
{
"epoch": 0.6133591481122943,
"grad_norm": 1.0684504508972168,
"learning_rate": 3.981385729058945e-05,
"loss": 1.451,
"step": 396
},
{
"epoch": 0.6164569215876089,
"grad_norm": 0.7698823809623718,
"learning_rate": 3.9762150982419854e-05,
"loss": 1.468,
"step": 398
},
{
"epoch": 0.6195546950629235,
"grad_norm": 1.1838396787643433,
"learning_rate": 3.971044467425026e-05,
"loss": 1.4412,
"step": 400
},
{
"epoch": 0.6195546950629235,
"eval_loss": 1.4199515581130981,
"eval_runtime": 500.1361,
"eval_samples_per_second": 41.309,
"eval_steps_per_second": 5.165,
"step": 400
},
{
"epoch": 0.6226524685382382,
"grad_norm": 0.9022509455680847,
"learning_rate": 3.965873836608066e-05,
"loss": 1.4453,
"step": 402
},
{
"epoch": 0.6257502420135528,
"grad_norm": 0.9168387055397034,
"learning_rate": 3.960703205791106e-05,
"loss": 1.4888,
"step": 404
},
{
"epoch": 0.6288480154888674,
"grad_norm": 1.0163190364837646,
"learning_rate": 3.9555325749741475e-05,
"loss": 1.449,
"step": 406
},
{
"epoch": 0.631945788964182,
"grad_norm": 1.0365885496139526,
"learning_rate": 3.950361944157187e-05,
"loss": 1.4691,
"step": 408
},
{
"epoch": 0.6350435624394967,
"grad_norm": 0.9853517413139343,
"learning_rate": 3.945191313340228e-05,
"loss": 1.4544,
"step": 410
},
{
"epoch": 0.6381413359148113,
"grad_norm": 1.1572599411010742,
"learning_rate": 3.940020682523268e-05,
"loss": 1.4577,
"step": 412
},
{
"epoch": 0.6412391093901258,
"grad_norm": 0.9835549592971802,
"learning_rate": 3.934850051706309e-05,
"loss": 1.4408,
"step": 414
},
{
"epoch": 0.6443368828654404,
"grad_norm": 0.792939305305481,
"learning_rate": 3.9296794208893487e-05,
"loss": 1.4579,
"step": 416
},
{
"epoch": 0.647434656340755,
"grad_norm": 0.9052080512046814,
"learning_rate": 3.924508790072389e-05,
"loss": 1.452,
"step": 418
},
{
"epoch": 0.6505324298160697,
"grad_norm": 0.8612220287322998,
"learning_rate": 3.91933815925543e-05,
"loss": 1.4652,
"step": 420
},
{
"epoch": 0.6536302032913843,
"grad_norm": 0.9994277358055115,
"learning_rate": 3.91416752843847e-05,
"loss": 1.4543,
"step": 422
},
{
"epoch": 0.6567279767666989,
"grad_norm": 0.9547328948974609,
"learning_rate": 3.90899689762151e-05,
"loss": 1.4588,
"step": 424
},
{
"epoch": 0.6598257502420135,
"grad_norm": 0.9314976930618286,
"learning_rate": 3.9038262668045505e-05,
"loss": 1.4889,
"step": 426
},
{
"epoch": 0.6629235237173282,
"grad_norm": 0.891118586063385,
"learning_rate": 3.898655635987591e-05,
"loss": 1.4451,
"step": 428
},
{
"epoch": 0.6660212971926428,
"grad_norm": 0.8572834730148315,
"learning_rate": 3.8934850051706315e-05,
"loss": 1.4312,
"step": 430
},
{
"epoch": 0.6691190706679574,
"grad_norm": 0.994143545627594,
"learning_rate": 3.8883143743536714e-05,
"loss": 1.4357,
"step": 432
},
{
"epoch": 0.672216844143272,
"grad_norm": 1.099258542060852,
"learning_rate": 3.883143743536712e-05,
"loss": 1.4601,
"step": 434
},
{
"epoch": 0.6753146176185867,
"grad_norm": 0.7827516794204712,
"learning_rate": 3.8779731127197524e-05,
"loss": 1.4351,
"step": 436
},
{
"epoch": 0.6784123910939013,
"grad_norm": 0.952328085899353,
"learning_rate": 3.872802481902792e-05,
"loss": 1.4374,
"step": 438
},
{
"epoch": 0.6815101645692159,
"grad_norm": 0.8575209379196167,
"learning_rate": 3.867631851085833e-05,
"loss": 1.4427,
"step": 440
},
{
"epoch": 0.6846079380445305,
"grad_norm": 0.848095178604126,
"learning_rate": 3.862461220268873e-05,
"loss": 1.4337,
"step": 442
},
{
"epoch": 0.6877057115198452,
"grad_norm": 0.8709319233894348,
"learning_rate": 3.857290589451914e-05,
"loss": 1.4647,
"step": 444
},
{
"epoch": 0.6908034849951598,
"grad_norm": 0.9184627532958984,
"learning_rate": 3.8521199586349535e-05,
"loss": 1.4248,
"step": 446
},
{
"epoch": 0.6939012584704743,
"grad_norm": 0.7510619759559631,
"learning_rate": 3.846949327817994e-05,
"loss": 1.4524,
"step": 448
},
{
"epoch": 0.6969990319457889,
"grad_norm": 0.9397213459014893,
"learning_rate": 3.8417786970010346e-05,
"loss": 1.4324,
"step": 450
},
{
"epoch": 0.7000968054211035,
"grad_norm": 0.9663003087043762,
"learning_rate": 3.836608066184075e-05,
"loss": 1.4538,
"step": 452
},
{
"epoch": 0.7031945788964182,
"grad_norm": 1.021768569946289,
"learning_rate": 3.831437435367115e-05,
"loss": 1.4208,
"step": 454
},
{
"epoch": 0.7062923523717328,
"grad_norm": 0.7242577075958252,
"learning_rate": 3.8262668045501554e-05,
"loss": 1.4426,
"step": 456
},
{
"epoch": 0.7093901258470474,
"grad_norm": 0.8870192766189575,
"learning_rate": 3.821096173733196e-05,
"loss": 1.4317,
"step": 458
},
{
"epoch": 0.712487899322362,
"grad_norm": 0.7298364043235779,
"learning_rate": 3.815925542916236e-05,
"loss": 1.453,
"step": 460
},
{
"epoch": 0.7155856727976767,
"grad_norm": 1.1145060062408447,
"learning_rate": 3.810754912099276e-05,
"loss": 1.422,
"step": 462
},
{
"epoch": 0.7186834462729913,
"grad_norm": 0.7591832876205444,
"learning_rate": 3.805584281282317e-05,
"loss": 1.4558,
"step": 464
},
{
"epoch": 0.7217812197483059,
"grad_norm": 0.9220917224884033,
"learning_rate": 3.800413650465357e-05,
"loss": 1.4594,
"step": 466
},
{
"epoch": 0.7248789932236205,
"grad_norm": 0.768500030040741,
"learning_rate": 3.795243019648397e-05,
"loss": 1.4314,
"step": 468
},
{
"epoch": 0.7279767666989352,
"grad_norm": 1.0767769813537598,
"learning_rate": 3.7900723888314376e-05,
"loss": 1.4502,
"step": 470
},
{
"epoch": 0.7310745401742498,
"grad_norm": 1.253908634185791,
"learning_rate": 3.784901758014478e-05,
"loss": 1.4252,
"step": 472
},
{
"epoch": 0.7341723136495644,
"grad_norm": 0.943250298500061,
"learning_rate": 3.7797311271975186e-05,
"loss": 1.4248,
"step": 474
},
{
"epoch": 0.737270087124879,
"grad_norm": 0.8801707625389099,
"learning_rate": 3.7745604963805584e-05,
"loss": 1.4419,
"step": 476
},
{
"epoch": 0.7403678606001937,
"grad_norm": 0.6957564949989319,
"learning_rate": 3.769389865563599e-05,
"loss": 1.4259,
"step": 478
},
{
"epoch": 0.7434656340755083,
"grad_norm": 0.8454000353813171,
"learning_rate": 3.7642192347466395e-05,
"loss": 1.4236,
"step": 480
},
{
"epoch": 0.7465634075508228,
"grad_norm": 0.8354184031486511,
"learning_rate": 3.759048603929679e-05,
"loss": 1.4391,
"step": 482
},
{
"epoch": 0.7496611810261374,
"grad_norm": 0.8452537059783936,
"learning_rate": 3.75387797311272e-05,
"loss": 1.4217,
"step": 484
},
{
"epoch": 0.752758954501452,
"grad_norm": 0.7954672574996948,
"learning_rate": 3.74870734229576e-05,
"loss": 1.4427,
"step": 486
},
{
"epoch": 0.7558567279767667,
"grad_norm": 0.8007389307022095,
"learning_rate": 3.743536711478801e-05,
"loss": 1.4352,
"step": 488
},
{
"epoch": 0.7589545014520813,
"grad_norm": 0.9226048588752747,
"learning_rate": 3.7383660806618406e-05,
"loss": 1.4431,
"step": 490
},
{
"epoch": 0.7620522749273959,
"grad_norm": 0.9642147421836853,
"learning_rate": 3.733195449844881e-05,
"loss": 1.4365,
"step": 492
},
{
"epoch": 0.7651500484027105,
"grad_norm": 0.8833573460578918,
"learning_rate": 3.7280248190279217e-05,
"loss": 1.4198,
"step": 494
},
{
"epoch": 0.7682478218780252,
"grad_norm": 0.7712429165840149,
"learning_rate": 3.722854188210962e-05,
"loss": 1.4407,
"step": 496
},
{
"epoch": 0.7713455953533398,
"grad_norm": 0.8324020504951477,
"learning_rate": 3.717683557394002e-05,
"loss": 1.4472,
"step": 498
},
{
"epoch": 0.7744433688286544,
"grad_norm": 0.855499267578125,
"learning_rate": 3.7125129265770425e-05,
"loss": 1.4107,
"step": 500
},
{
"epoch": 0.777541142303969,
"grad_norm": 0.9357978105545044,
"learning_rate": 3.707342295760083e-05,
"loss": 1.4538,
"step": 502
},
{
"epoch": 0.7806389157792837,
"grad_norm": 0.8887574076652527,
"learning_rate": 3.702171664943123e-05,
"loss": 1.4268,
"step": 504
},
{
"epoch": 0.7837366892545983,
"grad_norm": 0.7522751688957214,
"learning_rate": 3.6970010341261633e-05,
"loss": 1.4156,
"step": 506
},
{
"epoch": 0.7868344627299129,
"grad_norm": 0.8015060424804688,
"learning_rate": 3.691830403309204e-05,
"loss": 1.4157,
"step": 508
},
{
"epoch": 0.7899322362052275,
"grad_norm": 0.8720948100090027,
"learning_rate": 3.6866597724922444e-05,
"loss": 1.4153,
"step": 510
},
{
"epoch": 0.7930300096805422,
"grad_norm": 0.9627205729484558,
"learning_rate": 3.681489141675284e-05,
"loss": 1.4361,
"step": 512
},
{
"epoch": 0.7961277831558567,
"grad_norm": 0.8693482279777527,
"learning_rate": 3.676318510858325e-05,
"loss": 1.4587,
"step": 514
},
{
"epoch": 0.7992255566311713,
"grad_norm": 0.9331101775169373,
"learning_rate": 3.671147880041365e-05,
"loss": 1.4454,
"step": 516
},
{
"epoch": 0.8023233301064859,
"grad_norm": 0.9783412218093872,
"learning_rate": 3.665977249224406e-05,
"loss": 1.4434,
"step": 518
},
{
"epoch": 0.8054211035818006,
"grad_norm": 0.828762412071228,
"learning_rate": 3.6608066184074455e-05,
"loss": 1.4054,
"step": 520
},
{
"epoch": 0.8085188770571152,
"grad_norm": 1.0303949117660522,
"learning_rate": 3.655635987590486e-05,
"loss": 1.4385,
"step": 522
},
{
"epoch": 0.8116166505324298,
"grad_norm": 0.7832565903663635,
"learning_rate": 3.6504653567735266e-05,
"loss": 1.4367,
"step": 524
},
{
"epoch": 0.8147144240077444,
"grad_norm": 0.8763161301612854,
"learning_rate": 3.645294725956567e-05,
"loss": 1.4005,
"step": 526
},
{
"epoch": 0.817812197483059,
"grad_norm": 0.8118229508399963,
"learning_rate": 3.640124095139607e-05,
"loss": 1.4349,
"step": 528
},
{
"epoch": 0.8209099709583737,
"grad_norm": 0.9044933319091797,
"learning_rate": 3.6349534643226474e-05,
"loss": 1.4393,
"step": 530
},
{
"epoch": 0.8240077444336883,
"grad_norm": 1.0195043087005615,
"learning_rate": 3.629782833505688e-05,
"loss": 1.4283,
"step": 532
},
{
"epoch": 0.8271055179090029,
"grad_norm": 0.8480585813522339,
"learning_rate": 3.624612202688728e-05,
"loss": 1.4317,
"step": 534
},
{
"epoch": 0.8302032913843175,
"grad_norm": 0.9876317381858826,
"learning_rate": 3.619441571871768e-05,
"loss": 1.4356,
"step": 536
},
{
"epoch": 0.8333010648596322,
"grad_norm": 0.880251944065094,
"learning_rate": 3.614270941054809e-05,
"loss": 1.4226,
"step": 538
},
{
"epoch": 0.8363988383349468,
"grad_norm": 0.9709184169769287,
"learning_rate": 3.609100310237849e-05,
"loss": 1.4318,
"step": 540
},
{
"epoch": 0.8394966118102614,
"grad_norm": 1.1328480243682861,
"learning_rate": 3.603929679420889e-05,
"loss": 1.4243,
"step": 542
},
{
"epoch": 0.842594385285576,
"grad_norm": 1.0186697244644165,
"learning_rate": 3.5987590486039296e-05,
"loss": 1.4318,
"step": 544
},
{
"epoch": 0.8456921587608907,
"grad_norm": 1.1424143314361572,
"learning_rate": 3.59358841778697e-05,
"loss": 1.434,
"step": 546
},
{
"epoch": 0.8487899322362052,
"grad_norm": 0.8991900086402893,
"learning_rate": 3.5884177869700106e-05,
"loss": 1.425,
"step": 548
},
{
"epoch": 0.8518877057115198,
"grad_norm": 1.0953640937805176,
"learning_rate": 3.5832471561530504e-05,
"loss": 1.4208,
"step": 550
},
{
"epoch": 0.8549854791868344,
"grad_norm": 0.8714589476585388,
"learning_rate": 3.578076525336091e-05,
"loss": 1.4133,
"step": 552
},
{
"epoch": 0.8580832526621491,
"grad_norm": 1.155774474143982,
"learning_rate": 3.5729058945191315e-05,
"loss": 1.4217,
"step": 554
},
{
"epoch": 0.8611810261374637,
"grad_norm": 0.7970629334449768,
"learning_rate": 3.567735263702171e-05,
"loss": 1.4225,
"step": 556
},
{
"epoch": 0.8642787996127783,
"grad_norm": 0.8982592821121216,
"learning_rate": 3.562564632885212e-05,
"loss": 1.4445,
"step": 558
},
{
"epoch": 0.8673765730880929,
"grad_norm": 0.8706935048103333,
"learning_rate": 3.557394002068252e-05,
"loss": 1.4471,
"step": 560
},
{
"epoch": 0.8704743465634075,
"grad_norm": 0.8587532043457031,
"learning_rate": 3.552223371251293e-05,
"loss": 1.4198,
"step": 562
},
{
"epoch": 0.8735721200387222,
"grad_norm": 1.0104795694351196,
"learning_rate": 3.5470527404343326e-05,
"loss": 1.4024,
"step": 564
},
{
"epoch": 0.8766698935140368,
"grad_norm": 0.9810327887535095,
"learning_rate": 3.541882109617373e-05,
"loss": 1.4077,
"step": 566
},
{
"epoch": 0.8797676669893514,
"grad_norm": 0.8219797611236572,
"learning_rate": 3.536711478800414e-05,
"loss": 1.4101,
"step": 568
},
{
"epoch": 0.882865440464666,
"grad_norm": 0.9589121341705322,
"learning_rate": 3.531540847983454e-05,
"loss": 1.4186,
"step": 570
},
{
"epoch": 0.8859632139399807,
"grad_norm": 0.7841024398803711,
"learning_rate": 3.526370217166495e-05,
"loss": 1.4176,
"step": 572
},
{
"epoch": 0.8890609874152953,
"grad_norm": 0.8052421808242798,
"learning_rate": 3.521199586349535e-05,
"loss": 1.423,
"step": 574
},
{
"epoch": 0.8921587608906099,
"grad_norm": 1.0032799243927002,
"learning_rate": 3.516028955532576e-05,
"loss": 1.3992,
"step": 576
},
{
"epoch": 0.8952565343659245,
"grad_norm": 0.934697151184082,
"learning_rate": 3.5108583247156155e-05,
"loss": 1.4145,
"step": 578
},
{
"epoch": 0.8983543078412392,
"grad_norm": 0.8936486840248108,
"learning_rate": 3.505687693898656e-05,
"loss": 1.4508,
"step": 580
},
{
"epoch": 0.9014520813165537,
"grad_norm": 1.3050090074539185,
"learning_rate": 3.5005170630816965e-05,
"loss": 1.4105,
"step": 582
},
{
"epoch": 0.9045498547918683,
"grad_norm": 1.2944504022598267,
"learning_rate": 3.4953464322647364e-05,
"loss": 1.4274,
"step": 584
},
{
"epoch": 0.9076476282671829,
"grad_norm": 0.7632983922958374,
"learning_rate": 3.490175801447777e-05,
"loss": 1.43,
"step": 586
},
{
"epoch": 0.9107454017424976,
"grad_norm": 1.2242131233215332,
"learning_rate": 3.4850051706308174e-05,
"loss": 1.4083,
"step": 588
},
{
"epoch": 0.9138431752178122,
"grad_norm": 0.9511488676071167,
"learning_rate": 3.479834539813858e-05,
"loss": 1.411,
"step": 590
},
{
"epoch": 0.9169409486931268,
"grad_norm": 0.9538015723228455,
"learning_rate": 3.474663908996898e-05,
"loss": 1.4136,
"step": 592
},
{
"epoch": 0.9200387221684414,
"grad_norm": 0.9940462708473206,
"learning_rate": 3.469493278179938e-05,
"loss": 1.4199,
"step": 594
},
{
"epoch": 0.9231364956437561,
"grad_norm": 1.045178771018982,
"learning_rate": 3.464322647362979e-05,
"loss": 1.4096,
"step": 596
},
{
"epoch": 0.9262342691190707,
"grad_norm": 0.9768006205558777,
"learning_rate": 3.459152016546019e-05,
"loss": 1.4235,
"step": 598
},
{
"epoch": 0.9293320425943853,
"grad_norm": 1.0503100156784058,
"learning_rate": 3.453981385729059e-05,
"loss": 1.4259,
"step": 600
},
{
"epoch": 0.9324298160696999,
"grad_norm": 1.1488350629806519,
"learning_rate": 3.4488107549120996e-05,
"loss": 1.4031,
"step": 602
},
{
"epoch": 0.9355275895450145,
"grad_norm": 1.0303666591644287,
"learning_rate": 3.44364012409514e-05,
"loss": 1.4023,
"step": 604
},
{
"epoch": 0.9386253630203292,
"grad_norm": 0.9419746398925781,
"learning_rate": 3.4384694932781806e-05,
"loss": 1.4151,
"step": 606
},
{
"epoch": 0.9417231364956438,
"grad_norm": 0.9052528738975525,
"learning_rate": 3.4332988624612204e-05,
"loss": 1.4177,
"step": 608
},
{
"epoch": 0.9448209099709584,
"grad_norm": 1.0753135681152344,
"learning_rate": 3.428128231644261e-05,
"loss": 1.4063,
"step": 610
},
{
"epoch": 0.947918683446273,
"grad_norm": 0.7896863222122192,
"learning_rate": 3.4229576008273014e-05,
"loss": 1.4309,
"step": 612
},
{
"epoch": 0.9510164569215876,
"grad_norm": 1.0495641231536865,
"learning_rate": 3.417786970010341e-05,
"loss": 1.4284,
"step": 614
},
{
"epoch": 0.9541142303969022,
"grad_norm": 1.0048576593399048,
"learning_rate": 3.412616339193382e-05,
"loss": 1.4346,
"step": 616
},
{
"epoch": 0.9572120038722168,
"grad_norm": 1.0318708419799805,
"learning_rate": 3.407445708376422e-05,
"loss": 1.4106,
"step": 618
},
{
"epoch": 0.9603097773475314,
"grad_norm": 0.9739704132080078,
"learning_rate": 3.402275077559463e-05,
"loss": 1.4137,
"step": 620
},
{
"epoch": 0.9634075508228461,
"grad_norm": 1.0641124248504639,
"learning_rate": 3.3971044467425026e-05,
"loss": 1.4154,
"step": 622
},
{
"epoch": 0.9665053242981607,
"grad_norm": 1.143355369567871,
"learning_rate": 3.391933815925543e-05,
"loss": 1.4049,
"step": 624
},
{
"epoch": 0.9696030977734753,
"grad_norm": 0.7641253471374512,
"learning_rate": 3.3867631851085836e-05,
"loss": 1.4153,
"step": 626
},
{
"epoch": 0.9727008712487899,
"grad_norm": 0.9126153588294983,
"learning_rate": 3.381592554291624e-05,
"loss": 1.4219,
"step": 628
},
{
"epoch": 0.9757986447241046,
"grad_norm": 0.8339759111404419,
"learning_rate": 3.376421923474664e-05,
"loss": 1.4234,
"step": 630
},
{
"epoch": 0.9788964181994192,
"grad_norm": 1.062849760055542,
"learning_rate": 3.3712512926577045e-05,
"loss": 1.4298,
"step": 632
},
{
"epoch": 0.9819941916747338,
"grad_norm": 0.880806565284729,
"learning_rate": 3.366080661840745e-05,
"loss": 1.4041,
"step": 634
},
{
"epoch": 0.9850919651500484,
"grad_norm": 0.9244954586029053,
"learning_rate": 3.360910031023785e-05,
"loss": 1.4208,
"step": 636
},
{
"epoch": 0.988189738625363,
"grad_norm": 0.9386717677116394,
"learning_rate": 3.355739400206825e-05,
"loss": 1.4153,
"step": 638
},
{
"epoch": 0.9912875121006777,
"grad_norm": 0.7148683667182922,
"learning_rate": 3.350568769389866e-05,
"loss": 1.4211,
"step": 640
},
{
"epoch": 0.9943852855759923,
"grad_norm": 1.0988705158233643,
"learning_rate": 3.345398138572906e-05,
"loss": 1.4169,
"step": 642
},
{
"epoch": 0.9974830590513069,
"grad_norm": 0.9161446690559387,
"learning_rate": 3.340227507755946e-05,
"loss": 1.4126,
"step": 644
},
{
"epoch": 1.0005808325266214,
"grad_norm": 0.9653096199035645,
"learning_rate": 3.3350568769389867e-05,
"loss": 1.4004,
"step": 646
},
{
"epoch": 1.003678606001936,
"grad_norm": 1.2281991243362427,
"learning_rate": 3.329886246122027e-05,
"loss": 1.3948,
"step": 648
},
{
"epoch": 1.0067763794772506,
"grad_norm": 0.8875632882118225,
"learning_rate": 3.324715615305068e-05,
"loss": 1.3869,
"step": 650
},
{
"epoch": 1.0098741529525654,
"grad_norm": 1.2403393983840942,
"learning_rate": 3.3195449844881075e-05,
"loss": 1.3794,
"step": 652
},
{
"epoch": 1.01297192642788,
"grad_norm": 0.9899982810020447,
"learning_rate": 3.314374353671148e-05,
"loss": 1.3781,
"step": 654
},
{
"epoch": 1.0160696999031946,
"grad_norm": 1.2559030055999756,
"learning_rate": 3.3092037228541885e-05,
"loss": 1.3978,
"step": 656
},
{
"epoch": 1.0191674733785092,
"grad_norm": 0.9205394387245178,
"learning_rate": 3.3040330920372283e-05,
"loss": 1.384,
"step": 658
},
{
"epoch": 1.0222652468538238,
"grad_norm": 1.1866810321807861,
"learning_rate": 3.298862461220269e-05,
"loss": 1.3989,
"step": 660
},
{
"epoch": 1.0253630203291384,
"grad_norm": 0.8332041501998901,
"learning_rate": 3.2936918304033094e-05,
"loss": 1.373,
"step": 662
},
{
"epoch": 1.028460793804453,
"grad_norm": 0.9644818902015686,
"learning_rate": 3.28852119958635e-05,
"loss": 1.3982,
"step": 664
},
{
"epoch": 1.0315585672797676,
"grad_norm": 0.9065265655517578,
"learning_rate": 3.28335056876939e-05,
"loss": 1.388,
"step": 666
},
{
"epoch": 1.0346563407550824,
"grad_norm": 0.8498512506484985,
"learning_rate": 3.27817993795243e-05,
"loss": 1.3833,
"step": 668
},
{
"epoch": 1.037754114230397,
"grad_norm": 0.7631977796554565,
"learning_rate": 3.273009307135471e-05,
"loss": 1.381,
"step": 670
},
{
"epoch": 1.0408518877057116,
"grad_norm": 0.9017680883407593,
"learning_rate": 3.267838676318511e-05,
"loss": 1.4068,
"step": 672
},
{
"epoch": 1.0439496611810262,
"grad_norm": 1.02823007106781,
"learning_rate": 3.262668045501551e-05,
"loss": 1.4036,
"step": 674
},
{
"epoch": 1.0470474346563408,
"grad_norm": 1.0055862665176392,
"learning_rate": 3.2574974146845916e-05,
"loss": 1.3691,
"step": 676
},
{
"epoch": 1.0501452081316553,
"grad_norm": 0.9213855862617493,
"learning_rate": 3.252326783867632e-05,
"loss": 1.4011,
"step": 678
},
{
"epoch": 1.05324298160697,
"grad_norm": 0.9935958385467529,
"learning_rate": 3.247156153050672e-05,
"loss": 1.3893,
"step": 680
},
{
"epoch": 1.0563407550822845,
"grad_norm": 0.8763697743415833,
"learning_rate": 3.2419855222337124e-05,
"loss": 1.3952,
"step": 682
},
{
"epoch": 1.0594385285575991,
"grad_norm": 1.15850830078125,
"learning_rate": 3.236814891416753e-05,
"loss": 1.3923,
"step": 684
},
{
"epoch": 1.062536302032914,
"grad_norm": 0.9471246600151062,
"learning_rate": 3.2316442605997934e-05,
"loss": 1.3884,
"step": 686
},
{
"epoch": 1.0656340755082285,
"grad_norm": 0.7925785779953003,
"learning_rate": 3.226473629782833e-05,
"loss": 1.3752,
"step": 688
},
{
"epoch": 1.0687318489835431,
"grad_norm": 0.9303650856018066,
"learning_rate": 3.221302998965874e-05,
"loss": 1.3719,
"step": 690
},
{
"epoch": 1.0718296224588577,
"grad_norm": 0.9009895324707031,
"learning_rate": 3.216132368148914e-05,
"loss": 1.376,
"step": 692
},
{
"epoch": 1.0749273959341723,
"grad_norm": 0.922558605670929,
"learning_rate": 3.210961737331955e-05,
"loss": 1.3873,
"step": 694
},
{
"epoch": 1.078025169409487,
"grad_norm": 0.9685287475585938,
"learning_rate": 3.2057911065149946e-05,
"loss": 1.3911,
"step": 696
},
{
"epoch": 1.0811229428848015,
"grad_norm": 1.0427310466766357,
"learning_rate": 3.200620475698035e-05,
"loss": 1.3862,
"step": 698
},
{
"epoch": 1.084220716360116,
"grad_norm": 0.8039479851722717,
"learning_rate": 3.1954498448810756e-05,
"loss": 1.3968,
"step": 700
},
{
"epoch": 1.0873184898354307,
"grad_norm": 0.7638404965400696,
"learning_rate": 3.190279214064116e-05,
"loss": 1.3917,
"step": 702
},
{
"epoch": 1.0904162633107455,
"grad_norm": 0.8520601391792297,
"learning_rate": 3.185108583247156e-05,
"loss": 1.3864,
"step": 704
},
{
"epoch": 1.09351403678606,
"grad_norm": 0.7571600079536438,
"learning_rate": 3.1799379524301965e-05,
"loss": 1.3905,
"step": 706
},
{
"epoch": 1.0966118102613747,
"grad_norm": 0.8143354654312134,
"learning_rate": 3.174767321613237e-05,
"loss": 1.3478,
"step": 708
},
{
"epoch": 1.0997095837366893,
"grad_norm": 0.9007526636123657,
"learning_rate": 3.169596690796277e-05,
"loss": 1.3845,
"step": 710
},
{
"epoch": 1.1028073572120038,
"grad_norm": 0.7659597396850586,
"learning_rate": 3.164426059979317e-05,
"loss": 1.4011,
"step": 712
},
{
"epoch": 1.1059051306873184,
"grad_norm": 0.9849894642829895,
"learning_rate": 3.159255429162358e-05,
"loss": 1.4066,
"step": 714
},
{
"epoch": 1.109002904162633,
"grad_norm": 0.7712810635566711,
"learning_rate": 3.154084798345398e-05,
"loss": 1.3568,
"step": 716
},
{
"epoch": 1.1121006776379476,
"grad_norm": 0.9364888668060303,
"learning_rate": 3.148914167528438e-05,
"loss": 1.3842,
"step": 718
},
{
"epoch": 1.1151984511132624,
"grad_norm": 0.8143067359924316,
"learning_rate": 3.1437435367114786e-05,
"loss": 1.3957,
"step": 720
},
{
"epoch": 1.118296224588577,
"grad_norm": 1.0575618743896484,
"learning_rate": 3.138572905894519e-05,
"loss": 1.3808,
"step": 722
},
{
"epoch": 1.1213939980638916,
"grad_norm": 0.9788165092468262,
"learning_rate": 3.13340227507756e-05,
"loss": 1.4097,
"step": 724
},
{
"epoch": 1.1244917715392062,
"grad_norm": 0.8391342163085938,
"learning_rate": 3.1282316442605995e-05,
"loss": 1.3683,
"step": 726
},
{
"epoch": 1.1275895450145208,
"grad_norm": 1.1145310401916504,
"learning_rate": 3.123061013443641e-05,
"loss": 1.3898,
"step": 728
},
{
"epoch": 1.1306873184898354,
"grad_norm": 1.1125495433807373,
"learning_rate": 3.117890382626681e-05,
"loss": 1.383,
"step": 730
},
{
"epoch": 1.13378509196515,
"grad_norm": 0.8851980566978455,
"learning_rate": 3.112719751809721e-05,
"loss": 1.3912,
"step": 732
},
{
"epoch": 1.1368828654404646,
"grad_norm": 0.869816243648529,
"learning_rate": 3.1075491209927615e-05,
"loss": 1.3855,
"step": 734
},
{
"epoch": 1.1399806389157794,
"grad_norm": 0.9421548247337341,
"learning_rate": 3.102378490175802e-05,
"loss": 1.3921,
"step": 736
},
{
"epoch": 1.143078412391094,
"grad_norm": 0.9962127208709717,
"learning_rate": 3.097207859358842e-05,
"loss": 1.3872,
"step": 738
},
{
"epoch": 1.1461761858664086,
"grad_norm": 0.8962863087654114,
"learning_rate": 3.0920372285418824e-05,
"loss": 1.3813,
"step": 740
},
{
"epoch": 1.1492739593417232,
"grad_norm": 1.142207384109497,
"learning_rate": 3.086866597724923e-05,
"loss": 1.379,
"step": 742
},
{
"epoch": 1.1523717328170378,
"grad_norm": 0.839261531829834,
"learning_rate": 3.0816959669079634e-05,
"loss": 1.3581,
"step": 744
},
{
"epoch": 1.1554695062923523,
"grad_norm": 1.087727665901184,
"learning_rate": 3.076525336091003e-05,
"loss": 1.3847,
"step": 746
},
{
"epoch": 1.158567279767667,
"grad_norm": 1.204419732093811,
"learning_rate": 3.071354705274044e-05,
"loss": 1.3883,
"step": 748
},
{
"epoch": 1.1616650532429815,
"grad_norm": 0.9747138023376465,
"learning_rate": 3.066184074457084e-05,
"loss": 1.3939,
"step": 750
},
{
"epoch": 1.1647628267182961,
"grad_norm": 1.068014144897461,
"learning_rate": 3.061013443640125e-05,
"loss": 1.3776,
"step": 752
},
{
"epoch": 1.167860600193611,
"grad_norm": 0.9767001271247864,
"learning_rate": 3.0558428128231646e-05,
"loss": 1.4095,
"step": 754
},
{
"epoch": 1.1709583736689255,
"grad_norm": 0.8887537717819214,
"learning_rate": 3.050672182006205e-05,
"loss": 1.3689,
"step": 756
},
{
"epoch": 1.1740561471442401,
"grad_norm": 1.0799994468688965,
"learning_rate": 3.0455015511892452e-05,
"loss": 1.3742,
"step": 758
},
{
"epoch": 1.1771539206195547,
"grad_norm": 0.8181743025779724,
"learning_rate": 3.0403309203722857e-05,
"loss": 1.377,
"step": 760
},
{
"epoch": 1.1802516940948693,
"grad_norm": 0.8285690546035767,
"learning_rate": 3.035160289555326e-05,
"loss": 1.3915,
"step": 762
},
{
"epoch": 1.1833494675701839,
"grad_norm": 0.8738031387329102,
"learning_rate": 3.0299896587383664e-05,
"loss": 1.4063,
"step": 764
},
{
"epoch": 1.1864472410454985,
"grad_norm": 0.8122093677520752,
"learning_rate": 3.0248190279214066e-05,
"loss": 1.3806,
"step": 766
},
{
"epoch": 1.189545014520813,
"grad_norm": 0.9666309952735901,
"learning_rate": 3.019648397104447e-05,
"loss": 1.3841,
"step": 768
},
{
"epoch": 1.1926427879961277,
"grad_norm": 0.7673875689506531,
"learning_rate": 3.0144777662874873e-05,
"loss": 1.3989,
"step": 770
},
{
"epoch": 1.1957405614714425,
"grad_norm": 1.207763910293579,
"learning_rate": 3.0093071354705278e-05,
"loss": 1.398,
"step": 772
},
{
"epoch": 1.198838334946757,
"grad_norm": 1.1100952625274658,
"learning_rate": 3.004136504653568e-05,
"loss": 1.3796,
"step": 774
},
{
"epoch": 1.2019361084220717,
"grad_norm": 0.8612858653068542,
"learning_rate": 2.9989658738366084e-05,
"loss": 1.3632,
"step": 776
},
{
"epoch": 1.2050338818973862,
"grad_norm": 0.9066482782363892,
"learning_rate": 2.9937952430196486e-05,
"loss": 1.3791,
"step": 778
},
{
"epoch": 1.2081316553727008,
"grad_norm": 0.8077186346054077,
"learning_rate": 2.988624612202689e-05,
"loss": 1.3735,
"step": 780
},
{
"epoch": 1.2112294288480154,
"grad_norm": 0.7607460618019104,
"learning_rate": 2.9834539813857293e-05,
"loss": 1.376,
"step": 782
},
{
"epoch": 1.21432720232333,
"grad_norm": 0.8543524146080017,
"learning_rate": 2.9782833505687695e-05,
"loss": 1.3708,
"step": 784
},
{
"epoch": 1.2174249757986448,
"grad_norm": 0.8722901344299316,
"learning_rate": 2.97311271975181e-05,
"loss": 1.3886,
"step": 786
},
{
"epoch": 1.2205227492739594,
"grad_norm": 0.8278937935829163,
"learning_rate": 2.96794208893485e-05,
"loss": 1.3654,
"step": 788
},
{
"epoch": 1.223620522749274,
"grad_norm": 0.8393619656562805,
"learning_rate": 2.9627714581178906e-05,
"loss": 1.3842,
"step": 790
},
{
"epoch": 1.2267182962245886,
"grad_norm": 0.8890239596366882,
"learning_rate": 2.9576008273009308e-05,
"loss": 1.3767,
"step": 792
},
{
"epoch": 1.2298160696999032,
"grad_norm": 0.9097030162811279,
"learning_rate": 2.9524301964839713e-05,
"loss": 1.3511,
"step": 794
},
{
"epoch": 1.2329138431752178,
"grad_norm": 1.070699691772461,
"learning_rate": 2.9472595656670115e-05,
"loss": 1.372,
"step": 796
},
{
"epoch": 1.2360116166505324,
"grad_norm": 0.994193971157074,
"learning_rate": 2.942088934850052e-05,
"loss": 1.3826,
"step": 798
},
{
"epoch": 1.239109390125847,
"grad_norm": 0.8802192807197571,
"learning_rate": 2.936918304033092e-05,
"loss": 1.3608,
"step": 800
},
{
"epoch": 1.239109390125847,
"eval_loss": 1.3684968948364258,
"eval_runtime": 500.0031,
"eval_samples_per_second": 41.32,
"eval_steps_per_second": 5.166,
"step": 800
},
{
"epoch": 1.2422071636011616,
"grad_norm": 0.9239991903305054,
"learning_rate": 2.9317476732161327e-05,
"loss": 1.4047,
"step": 802
},
{
"epoch": 1.2453049370764764,
"grad_norm": 0.7434487342834473,
"learning_rate": 2.926577042399173e-05,
"loss": 1.3958,
"step": 804
},
{
"epoch": 1.248402710551791,
"grad_norm": 0.7530505061149597,
"learning_rate": 2.921406411582213e-05,
"loss": 1.3642,
"step": 806
},
{
"epoch": 1.2515004840271056,
"grad_norm": 0.79911208152771,
"learning_rate": 2.9162357807652535e-05,
"loss": 1.3506,
"step": 808
},
{
"epoch": 1.2545982575024202,
"grad_norm": 0.6747287511825562,
"learning_rate": 2.9110651499482937e-05,
"loss": 1.3716,
"step": 810
},
{
"epoch": 1.2576960309777347,
"grad_norm": 0.8054267764091492,
"learning_rate": 2.9058945191313342e-05,
"loss": 1.3587,
"step": 812
},
{
"epoch": 1.2607938044530493,
"grad_norm": 0.8911522030830383,
"learning_rate": 2.9007238883143744e-05,
"loss": 1.3895,
"step": 814
},
{
"epoch": 1.263891577928364,
"grad_norm": 0.9144203662872314,
"learning_rate": 2.895553257497415e-05,
"loss": 1.3768,
"step": 816
},
{
"epoch": 1.2669893514036787,
"grad_norm": 0.7684288024902344,
"learning_rate": 2.890382626680455e-05,
"loss": 1.3694,
"step": 818
},
{
"epoch": 1.2700871248789931,
"grad_norm": 0.868026077747345,
"learning_rate": 2.8852119958634955e-05,
"loss": 1.3739,
"step": 820
},
{
"epoch": 1.273184898354308,
"grad_norm": 0.9168595671653748,
"learning_rate": 2.8800413650465357e-05,
"loss": 1.3813,
"step": 822
},
{
"epoch": 1.2762826718296225,
"grad_norm": 0.9085325002670288,
"learning_rate": 2.8748707342295762e-05,
"loss": 1.3727,
"step": 824
},
{
"epoch": 1.279380445304937,
"grad_norm": 0.8276653289794922,
"learning_rate": 2.8697001034126164e-05,
"loss": 1.3726,
"step": 826
},
{
"epoch": 1.2824782187802517,
"grad_norm": 0.9563087821006775,
"learning_rate": 2.864529472595657e-05,
"loss": 1.3881,
"step": 828
},
{
"epoch": 1.2855759922555663,
"grad_norm": 0.8035735487937927,
"learning_rate": 2.859358841778697e-05,
"loss": 1.3732,
"step": 830
},
{
"epoch": 1.2886737657308809,
"grad_norm": 0.9055673480033875,
"learning_rate": 2.8541882109617372e-05,
"loss": 1.3691,
"step": 832
},
{
"epoch": 1.2917715392061955,
"grad_norm": 0.8407905697822571,
"learning_rate": 2.8490175801447777e-05,
"loss": 1.368,
"step": 834
},
{
"epoch": 1.2948693126815103,
"grad_norm": 0.8415255546569824,
"learning_rate": 2.843846949327818e-05,
"loss": 1.396,
"step": 836
},
{
"epoch": 1.2979670861568247,
"grad_norm": 0.8884280920028687,
"learning_rate": 2.8386763185108584e-05,
"loss": 1.3676,
"step": 838
},
{
"epoch": 1.3010648596321395,
"grad_norm": 0.7399088740348816,
"learning_rate": 2.8335056876938986e-05,
"loss": 1.3803,
"step": 840
},
{
"epoch": 1.304162633107454,
"grad_norm": 0.9572098851203918,
"learning_rate": 2.828335056876939e-05,
"loss": 1.365,
"step": 842
},
{
"epoch": 1.3072604065827687,
"grad_norm": 1.2893517017364502,
"learning_rate": 2.8231644260599793e-05,
"loss": 1.3971,
"step": 844
},
{
"epoch": 1.3103581800580832,
"grad_norm": 0.9179888963699341,
"learning_rate": 2.8179937952430198e-05,
"loss": 1.3602,
"step": 846
},
{
"epoch": 1.3134559535333978,
"grad_norm": 0.867713987827301,
"learning_rate": 2.81282316442606e-05,
"loss": 1.3971,
"step": 848
},
{
"epoch": 1.3165537270087124,
"grad_norm": 0.9152940511703491,
"learning_rate": 2.8076525336091004e-05,
"loss": 1.3714,
"step": 850
},
{
"epoch": 1.319651500484027,
"grad_norm": 1.19929039478302,
"learning_rate": 2.8024819027921406e-05,
"loss": 1.3897,
"step": 852
},
{
"epoch": 1.3227492739593418,
"grad_norm": 1.3702645301818848,
"learning_rate": 2.7973112719751808e-05,
"loss": 1.3727,
"step": 854
},
{
"epoch": 1.3258470474346564,
"grad_norm": 0.8655095100402832,
"learning_rate": 2.7921406411582213e-05,
"loss": 1.377,
"step": 856
},
{
"epoch": 1.328944820909971,
"grad_norm": 0.9268757104873657,
"learning_rate": 2.7869700103412615e-05,
"loss": 1.3847,
"step": 858
},
{
"epoch": 1.3320425943852856,
"grad_norm": 0.9466863870620728,
"learning_rate": 2.781799379524302e-05,
"loss": 1.3784,
"step": 860
},
{
"epoch": 1.3351403678606002,
"grad_norm": 0.8696274757385254,
"learning_rate": 2.776628748707342e-05,
"loss": 1.3832,
"step": 862
},
{
"epoch": 1.3382381413359148,
"grad_norm": 0.8872708678245544,
"learning_rate": 2.7714581178903826e-05,
"loss": 1.3613,
"step": 864
},
{
"epoch": 1.3413359148112294,
"grad_norm": 0.9495521187782288,
"learning_rate": 2.7662874870734228e-05,
"loss": 1.3744,
"step": 866
},
{
"epoch": 1.344433688286544,
"grad_norm": 0.9160442352294922,
"learning_rate": 2.7611168562564633e-05,
"loss": 1.3635,
"step": 868
},
{
"epoch": 1.3475314617618586,
"grad_norm": 1.015899896621704,
"learning_rate": 2.7559462254395035e-05,
"loss": 1.3958,
"step": 870
},
{
"epoch": 1.3506292352371734,
"grad_norm": 1.0616685152053833,
"learning_rate": 2.750775594622544e-05,
"loss": 1.3594,
"step": 872
},
{
"epoch": 1.353727008712488,
"grad_norm": 0.9547629952430725,
"learning_rate": 2.745604963805584e-05,
"loss": 1.3835,
"step": 874
},
{
"epoch": 1.3568247821878026,
"grad_norm": 0.8538408279418945,
"learning_rate": 2.7404343329886247e-05,
"loss": 1.3441,
"step": 876
},
{
"epoch": 1.3599225556631171,
"grad_norm": 1.0383230447769165,
"learning_rate": 2.735263702171665e-05,
"loss": 1.3528,
"step": 878
},
{
"epoch": 1.3630203291384317,
"grad_norm": 1.053682804107666,
"learning_rate": 2.730093071354705e-05,
"loss": 1.3871,
"step": 880
},
{
"epoch": 1.3661181026137463,
"grad_norm": 0.8979085683822632,
"learning_rate": 2.7249224405377455e-05,
"loss": 1.3763,
"step": 882
},
{
"epoch": 1.369215876089061,
"grad_norm": 0.8620943427085876,
"learning_rate": 2.7197518097207857e-05,
"loss": 1.3604,
"step": 884
},
{
"epoch": 1.3723136495643757,
"grad_norm": 0.8336718082427979,
"learning_rate": 2.7145811789038262e-05,
"loss": 1.3479,
"step": 886
},
{
"epoch": 1.37541142303969,
"grad_norm": 0.9134451150894165,
"learning_rate": 2.7094105480868664e-05,
"loss": 1.3716,
"step": 888
},
{
"epoch": 1.378509196515005,
"grad_norm": 0.8077151775360107,
"learning_rate": 2.7042399172699072e-05,
"loss": 1.3766,
"step": 890
},
{
"epoch": 1.3816069699903195,
"grad_norm": 1.0257856845855713,
"learning_rate": 2.6990692864529477e-05,
"loss": 1.4035,
"step": 892
},
{
"epoch": 1.384704743465634,
"grad_norm": 0.9979709386825562,
"learning_rate": 2.693898655635988e-05,
"loss": 1.3616,
"step": 894
},
{
"epoch": 1.3878025169409487,
"grad_norm": 0.9246943593025208,
"learning_rate": 2.6887280248190284e-05,
"loss": 1.3599,
"step": 896
},
{
"epoch": 1.3909002904162633,
"grad_norm": 0.9028282165527344,
"learning_rate": 2.6835573940020685e-05,
"loss": 1.3517,
"step": 898
},
{
"epoch": 1.3939980638915779,
"grad_norm": 1.0988807678222656,
"learning_rate": 2.678386763185109e-05,
"loss": 1.3631,
"step": 900
},
{
"epoch": 1.3970958373668925,
"grad_norm": 0.928338885307312,
"learning_rate": 2.6732161323681492e-05,
"loss": 1.3746,
"step": 902
},
{
"epoch": 1.4001936108422073,
"grad_norm": 0.8771430253982544,
"learning_rate": 2.6680455015511897e-05,
"loss": 1.3812,
"step": 904
},
{
"epoch": 1.4032913843175217,
"grad_norm": 0.8561460971832275,
"learning_rate": 2.66287487073423e-05,
"loss": 1.3785,
"step": 906
},
{
"epoch": 1.4063891577928365,
"grad_norm": 0.808969259262085,
"learning_rate": 2.6577042399172704e-05,
"loss": 1.3753,
"step": 908
},
{
"epoch": 1.409486931268151,
"grad_norm": 0.935157299041748,
"learning_rate": 2.6525336091003106e-05,
"loss": 1.3408,
"step": 910
},
{
"epoch": 1.4125847047434656,
"grad_norm": 0.884665310382843,
"learning_rate": 2.6473629782833507e-05,
"loss": 1.3733,
"step": 912
},
{
"epoch": 1.4156824782187802,
"grad_norm": 0.8393154740333557,
"learning_rate": 2.6421923474663913e-05,
"loss": 1.3707,
"step": 914
},
{
"epoch": 1.4187802516940948,
"grad_norm": 0.9536909461021423,
"learning_rate": 2.6370217166494314e-05,
"loss": 1.3876,
"step": 916
},
{
"epoch": 1.4218780251694094,
"grad_norm": 1.0733585357666016,
"learning_rate": 2.631851085832472e-05,
"loss": 1.3852,
"step": 918
},
{
"epoch": 1.424975798644724,
"grad_norm": 0.9174052476882935,
"learning_rate": 2.626680455015512e-05,
"loss": 1.3534,
"step": 920
},
{
"epoch": 1.4280735721200388,
"grad_norm": 0.9805439114570618,
"learning_rate": 2.6215098241985526e-05,
"loss": 1.3651,
"step": 922
},
{
"epoch": 1.4311713455953534,
"grad_norm": 1.0409832000732422,
"learning_rate": 2.6163391933815928e-05,
"loss": 1.3703,
"step": 924
},
{
"epoch": 1.434269119070668,
"grad_norm": 0.8675093650817871,
"learning_rate": 2.6111685625646333e-05,
"loss": 1.3563,
"step": 926
},
{
"epoch": 1.4373668925459826,
"grad_norm": 0.9059470295906067,
"learning_rate": 2.6059979317476734e-05,
"loss": 1.3721,
"step": 928
},
{
"epoch": 1.4404646660212972,
"grad_norm": 1.078581690788269,
"learning_rate": 2.600827300930714e-05,
"loss": 1.3809,
"step": 930
},
{
"epoch": 1.4435624394966118,
"grad_norm": 0.7785590291023254,
"learning_rate": 2.595656670113754e-05,
"loss": 1.365,
"step": 932
},
{
"epoch": 1.4466602129719264,
"grad_norm": 0.7777726650238037,
"learning_rate": 2.5904860392967943e-05,
"loss": 1.3651,
"step": 934
},
{
"epoch": 1.449757986447241,
"grad_norm": 1.0098230838775635,
"learning_rate": 2.5853154084798348e-05,
"loss": 1.3776,
"step": 936
},
{
"epoch": 1.4528557599225556,
"grad_norm": 0.8591383695602417,
"learning_rate": 2.580144777662875e-05,
"loss": 1.3874,
"step": 938
},
{
"epoch": 1.4559535333978704,
"grad_norm": 0.8739891052246094,
"learning_rate": 2.5749741468459155e-05,
"loss": 1.3532,
"step": 940
},
{
"epoch": 1.459051306873185,
"grad_norm": 1.0277025699615479,
"learning_rate": 2.5698035160289556e-05,
"loss": 1.3717,
"step": 942
},
{
"epoch": 1.4621490803484996,
"grad_norm": 0.8807665705680847,
"learning_rate": 2.564632885211996e-05,
"loss": 1.3819,
"step": 944
},
{
"epoch": 1.4652468538238141,
"grad_norm": 0.8883543610572815,
"learning_rate": 2.5594622543950363e-05,
"loss": 1.3844,
"step": 946
},
{
"epoch": 1.4683446272991287,
"grad_norm": 0.9567596316337585,
"learning_rate": 2.5542916235780768e-05,
"loss": 1.361,
"step": 948
},
{
"epoch": 1.4714424007744433,
"grad_norm": 0.8780364990234375,
"learning_rate": 2.549120992761117e-05,
"loss": 1.3509,
"step": 950
},
{
"epoch": 1.474540174249758,
"grad_norm": 0.9909296631813049,
"learning_rate": 2.5439503619441575e-05,
"loss": 1.3497,
"step": 952
},
{
"epoch": 1.4776379477250727,
"grad_norm": 0.802102267742157,
"learning_rate": 2.5387797311271977e-05,
"loss": 1.3924,
"step": 954
},
{
"epoch": 1.480735721200387,
"grad_norm": 0.9961832165718079,
"learning_rate": 2.5336091003102382e-05,
"loss": 1.3684,
"step": 956
},
{
"epoch": 1.483833494675702,
"grad_norm": 0.8613944053649902,
"learning_rate": 2.5284384694932783e-05,
"loss": 1.4083,
"step": 958
},
{
"epoch": 1.4869312681510165,
"grad_norm": 0.8471227884292603,
"learning_rate": 2.5232678386763185e-05,
"loss": 1.3734,
"step": 960
},
{
"epoch": 1.490029041626331,
"grad_norm": 1.068398118019104,
"learning_rate": 2.518097207859359e-05,
"loss": 1.358,
"step": 962
},
{
"epoch": 1.4931268151016457,
"grad_norm": 0.8443578481674194,
"learning_rate": 2.5129265770423992e-05,
"loss": 1.3777,
"step": 964
},
{
"epoch": 1.4962245885769603,
"grad_norm": 0.8146198391914368,
"learning_rate": 2.5077559462254397e-05,
"loss": 1.3617,
"step": 966
},
{
"epoch": 1.4993223620522749,
"grad_norm": 0.8507145643234253,
"learning_rate": 2.50258531540848e-05,
"loss": 1.3742,
"step": 968
},
{
"epoch": 1.5024201355275895,
"grad_norm": 1.1908308267593384,
"learning_rate": 2.4974146845915204e-05,
"loss": 1.3716,
"step": 970
},
{
"epoch": 1.5055179090029043,
"grad_norm": 0.9929447770118713,
"learning_rate": 2.4922440537745605e-05,
"loss": 1.3774,
"step": 972
},
{
"epoch": 1.5086156824782186,
"grad_norm": 0.8368676900863647,
"learning_rate": 2.487073422957601e-05,
"loss": 1.3765,
"step": 974
},
{
"epoch": 1.5117134559535335,
"grad_norm": 1.0263066291809082,
"learning_rate": 2.4819027921406412e-05,
"loss": 1.3573,
"step": 976
},
{
"epoch": 1.514811229428848,
"grad_norm": 0.8279297947883606,
"learning_rate": 2.4767321613236817e-05,
"loss": 1.3713,
"step": 978
},
{
"epoch": 1.5179090029041626,
"grad_norm": 1.0560111999511719,
"learning_rate": 2.471561530506722e-05,
"loss": 1.3709,
"step": 980
},
{
"epoch": 1.5210067763794772,
"grad_norm": 0.9566187262535095,
"learning_rate": 2.466390899689762e-05,
"loss": 1.3839,
"step": 982
},
{
"epoch": 1.5241045498547918,
"grad_norm": 0.9892044067382812,
"learning_rate": 2.4612202688728026e-05,
"loss": 1.3733,
"step": 984
},
{
"epoch": 1.5272023233301066,
"grad_norm": 0.8076044321060181,
"learning_rate": 2.4560496380558427e-05,
"loss": 1.3751,
"step": 986
},
{
"epoch": 1.530300096805421,
"grad_norm": 0.7843705415725708,
"learning_rate": 2.4508790072388832e-05,
"loss": 1.3499,
"step": 988
},
{
"epoch": 1.5333978702807358,
"grad_norm": 0.8170126676559448,
"learning_rate": 2.4457083764219234e-05,
"loss": 1.3634,
"step": 990
},
{
"epoch": 1.5364956437560502,
"grad_norm": 0.9551861882209778,
"learning_rate": 2.440537745604964e-05,
"loss": 1.3563,
"step": 992
},
{
"epoch": 1.539593417231365,
"grad_norm": 0.8271490931510925,
"learning_rate": 2.435367114788004e-05,
"loss": 1.3608,
"step": 994
},
{
"epoch": 1.5426911907066796,
"grad_norm": 0.9556779861450195,
"learning_rate": 2.4301964839710446e-05,
"loss": 1.3652,
"step": 996
},
{
"epoch": 1.5457889641819942,
"grad_norm": 0.9999971985816956,
"learning_rate": 2.4250258531540848e-05,
"loss": 1.3429,
"step": 998
},
{
"epoch": 1.5488867376573088,
"grad_norm": 0.8976193070411682,
"learning_rate": 2.4198552223371253e-05,
"loss": 1.3591,
"step": 1000
},
{
"epoch": 1.5519845111326234,
"grad_norm": 0.9682601690292358,
"learning_rate": 2.4146845915201654e-05,
"loss": 1.3683,
"step": 1002
},
{
"epoch": 1.5550822846079382,
"grad_norm": 1.0349007844924927,
"learning_rate": 2.409513960703206e-05,
"loss": 1.3673,
"step": 1004
},
{
"epoch": 1.5581800580832526,
"grad_norm": 0.9338064193725586,
"learning_rate": 2.404343329886246e-05,
"loss": 1.391,
"step": 1006
},
{
"epoch": 1.5612778315585674,
"grad_norm": 0.8901142477989197,
"learning_rate": 2.3991726990692863e-05,
"loss": 1.3488,
"step": 1008
},
{
"epoch": 1.5643756050338817,
"grad_norm": 0.9426191449165344,
"learning_rate": 2.394002068252327e-05,
"loss": 1.3566,
"step": 1010
},
{
"epoch": 1.5674733785091965,
"grad_norm": 0.9889611601829529,
"learning_rate": 2.3888314374353673e-05,
"loss": 1.3834,
"step": 1012
},
{
"epoch": 1.5705711519845111,
"grad_norm": 0.8977054357528687,
"learning_rate": 2.3836608066184078e-05,
"loss": 1.3682,
"step": 1014
},
{
"epoch": 1.5736689254598257,
"grad_norm": 1.0564823150634766,
"learning_rate": 2.378490175801448e-05,
"loss": 1.3758,
"step": 1016
},
{
"epoch": 1.5767666989351403,
"grad_norm": 1.1357567310333252,
"learning_rate": 2.3733195449844885e-05,
"loss": 1.3696,
"step": 1018
},
{
"epoch": 1.579864472410455,
"grad_norm": 0.933795690536499,
"learning_rate": 2.3681489141675287e-05,
"loss": 1.3749,
"step": 1020
},
{
"epoch": 1.5829622458857697,
"grad_norm": 0.8098678588867188,
"learning_rate": 2.3629782833505688e-05,
"loss": 1.3497,
"step": 1022
},
{
"epoch": 1.586060019361084,
"grad_norm": 0.9030234813690186,
"learning_rate": 2.3578076525336093e-05,
"loss": 1.3555,
"step": 1024
},
{
"epoch": 1.589157792836399,
"grad_norm": 0.9926664233207703,
"learning_rate": 2.3526370217166495e-05,
"loss": 1.3732,
"step": 1026
},
{
"epoch": 1.5922555663117135,
"grad_norm": 1.2584630250930786,
"learning_rate": 2.34746639089969e-05,
"loss": 1.3608,
"step": 1028
},
{
"epoch": 1.595353339787028,
"grad_norm": 1.0363199710845947,
"learning_rate": 2.3422957600827302e-05,
"loss": 1.3647,
"step": 1030
},
{
"epoch": 1.5984511132623427,
"grad_norm": 1.238027572631836,
"learning_rate": 2.3371251292657707e-05,
"loss": 1.3522,
"step": 1032
},
{
"epoch": 1.6015488867376573,
"grad_norm": 0.6948149800300598,
"learning_rate": 2.331954498448811e-05,
"loss": 1.3697,
"step": 1034
},
{
"epoch": 1.604646660212972,
"grad_norm": 0.8757950663566589,
"learning_rate": 2.3267838676318514e-05,
"loss": 1.3593,
"step": 1036
},
{
"epoch": 1.6077444336882865,
"grad_norm": 1.110155701637268,
"learning_rate": 2.3216132368148915e-05,
"loss": 1.3813,
"step": 1038
},
{
"epoch": 1.6108422071636013,
"grad_norm": 0.8213835954666138,
"learning_rate": 2.316442605997932e-05,
"loss": 1.3739,
"step": 1040
},
{
"epoch": 1.6139399806389156,
"grad_norm": 0.8836016654968262,
"learning_rate": 2.3112719751809722e-05,
"loss": 1.3706,
"step": 1042
},
{
"epoch": 1.6170377541142305,
"grad_norm": 1.0370168685913086,
"learning_rate": 2.3061013443640127e-05,
"loss": 1.3669,
"step": 1044
},
{
"epoch": 1.620135527589545,
"grad_norm": 0.8061625957489014,
"learning_rate": 2.300930713547053e-05,
"loss": 1.3712,
"step": 1046
},
{
"epoch": 1.6232333010648596,
"grad_norm": 0.8144744038581848,
"learning_rate": 2.295760082730093e-05,
"loss": 1.3686,
"step": 1048
},
{
"epoch": 1.6263310745401742,
"grad_norm": 1.1386702060699463,
"learning_rate": 2.2905894519131335e-05,
"loss": 1.3453,
"step": 1050
},
{
"epoch": 1.6294288480154888,
"grad_norm": 0.9613929986953735,
"learning_rate": 2.2854188210961737e-05,
"loss": 1.3518,
"step": 1052
},
{
"epoch": 1.6325266214908036,
"grad_norm": 0.7813166975975037,
"learning_rate": 2.2802481902792142e-05,
"loss": 1.3571,
"step": 1054
},
{
"epoch": 1.635624394966118,
"grad_norm": 0.8500548601150513,
"learning_rate": 2.2750775594622544e-05,
"loss": 1.3408,
"step": 1056
},
{
"epoch": 1.6387221684414328,
"grad_norm": 0.8827762603759766,
"learning_rate": 2.269906928645295e-05,
"loss": 1.3491,
"step": 1058
},
{
"epoch": 1.6418199419167472,
"grad_norm": 0.8917422890663147,
"learning_rate": 2.264736297828335e-05,
"loss": 1.3676,
"step": 1060
},
{
"epoch": 1.644917715392062,
"grad_norm": 0.9541721940040588,
"learning_rate": 2.2595656670113756e-05,
"loss": 1.3504,
"step": 1062
},
{
"epoch": 1.6480154888673766,
"grad_norm": 1.1979867219924927,
"learning_rate": 2.2543950361944157e-05,
"loss": 1.3873,
"step": 1064
},
{
"epoch": 1.6511132623426912,
"grad_norm": 0.9107701182365417,
"learning_rate": 2.2492244053774563e-05,
"loss": 1.3822,
"step": 1066
},
{
"epoch": 1.6542110358180058,
"grad_norm": 1.0378977060317993,
"learning_rate": 2.2440537745604964e-05,
"loss": 1.3589,
"step": 1068
},
{
"epoch": 1.6573088092933204,
"grad_norm": 0.9246495962142944,
"learning_rate": 2.2388831437435366e-05,
"loss": 1.338,
"step": 1070
},
{
"epoch": 1.6604065827686352,
"grad_norm": 1.034191370010376,
"learning_rate": 2.233712512926577e-05,
"loss": 1.3443,
"step": 1072
},
{
"epoch": 1.6635043562439495,
"grad_norm": 0.812461256980896,
"learning_rate": 2.2285418821096173e-05,
"loss": 1.3717,
"step": 1074
},
{
"epoch": 1.6666021297192644,
"grad_norm": 1.0656987428665161,
"learning_rate": 2.2233712512926578e-05,
"loss": 1.3782,
"step": 1076
},
{
"epoch": 1.669699903194579,
"grad_norm": 0.9221978783607483,
"learning_rate": 2.218200620475698e-05,
"loss": 1.3511,
"step": 1078
},
{
"epoch": 1.6727976766698935,
"grad_norm": 1.0364100933074951,
"learning_rate": 2.2130299896587384e-05,
"loss": 1.3518,
"step": 1080
},
{
"epoch": 1.6758954501452081,
"grad_norm": 1.0864959955215454,
"learning_rate": 2.2078593588417786e-05,
"loss": 1.3379,
"step": 1082
},
{
"epoch": 1.6789932236205227,
"grad_norm": 0.9392344951629639,
"learning_rate": 2.202688728024819e-05,
"loss": 1.3683,
"step": 1084
},
{
"epoch": 1.6820909970958373,
"grad_norm": 0.8275219798088074,
"learning_rate": 2.1975180972078593e-05,
"loss": 1.3516,
"step": 1086
},
{
"epoch": 1.685188770571152,
"grad_norm": 0.9747416377067566,
"learning_rate": 2.1923474663908998e-05,
"loss": 1.3571,
"step": 1088
},
{
"epoch": 1.6882865440464667,
"grad_norm": 0.8325587511062622,
"learning_rate": 2.1871768355739403e-05,
"loss": 1.3732,
"step": 1090
},
{
"epoch": 1.691384317521781,
"grad_norm": 1.1191556453704834,
"learning_rate": 2.1820062047569805e-05,
"loss": 1.3539,
"step": 1092
},
{
"epoch": 1.694482090997096,
"grad_norm": 0.9786492586135864,
"learning_rate": 2.176835573940021e-05,
"loss": 1.3506,
"step": 1094
},
{
"epoch": 1.6975798644724105,
"grad_norm": 0.7830746173858643,
"learning_rate": 2.171664943123061e-05,
"loss": 1.3699,
"step": 1096
},
{
"epoch": 1.700677637947725,
"grad_norm": 0.8177460432052612,
"learning_rate": 2.1664943123061017e-05,
"loss": 1.35,
"step": 1098
},
{
"epoch": 1.7037754114230397,
"grad_norm": 0.7574586868286133,
"learning_rate": 2.1613236814891418e-05,
"loss": 1.3609,
"step": 1100
},
{
"epoch": 1.7068731848983543,
"grad_norm": 0.909091055393219,
"learning_rate": 2.1561530506721823e-05,
"loss": 1.3593,
"step": 1102
},
{
"epoch": 1.709970958373669,
"grad_norm": 0.8122137188911438,
"learning_rate": 2.1509824198552225e-05,
"loss": 1.3665,
"step": 1104
},
{
"epoch": 1.7130687318489835,
"grad_norm": 0.7794236540794373,
"learning_rate": 2.145811789038263e-05,
"loss": 1.3435,
"step": 1106
},
{
"epoch": 1.7161665053242983,
"grad_norm": 0.7815309166908264,
"learning_rate": 2.1406411582213032e-05,
"loss": 1.3652,
"step": 1108
},
{
"epoch": 1.7192642787996126,
"grad_norm": 0.791810154914856,
"learning_rate": 2.1354705274043433e-05,
"loss": 1.3294,
"step": 1110
},
{
"epoch": 1.7223620522749274,
"grad_norm": 1.0140234231948853,
"learning_rate": 2.130299896587384e-05,
"loss": 1.3682,
"step": 1112
},
{
"epoch": 1.725459825750242,
"grad_norm": 0.9673962593078613,
"learning_rate": 2.125129265770424e-05,
"loss": 1.3639,
"step": 1114
},
{
"epoch": 1.7285575992255566,
"grad_norm": 0.8091711401939392,
"learning_rate": 2.1199586349534645e-05,
"loss": 1.3487,
"step": 1116
},
{
"epoch": 1.7316553727008712,
"grad_norm": 0.8248768448829651,
"learning_rate": 2.1147880041365047e-05,
"loss": 1.3643,
"step": 1118
},
{
"epoch": 1.7347531461761858,
"grad_norm": 0.9795010089874268,
"learning_rate": 2.1096173733195452e-05,
"loss": 1.3715,
"step": 1120
},
{
"epoch": 1.7378509196515006,
"grad_norm": 0.7902389764785767,
"learning_rate": 2.1044467425025854e-05,
"loss": 1.3501,
"step": 1122
},
{
"epoch": 1.740948693126815,
"grad_norm": 1.280175805091858,
"learning_rate": 2.099276111685626e-05,
"loss": 1.3497,
"step": 1124
},
{
"epoch": 1.7440464666021298,
"grad_norm": 0.9128603339195251,
"learning_rate": 2.094105480868666e-05,
"loss": 1.3568,
"step": 1126
},
{
"epoch": 1.7471442400774442,
"grad_norm": 0.7820084095001221,
"learning_rate": 2.0889348500517066e-05,
"loss": 1.3853,
"step": 1128
},
{
"epoch": 1.750242013552759,
"grad_norm": 0.9994757771492004,
"learning_rate": 2.0837642192347467e-05,
"loss": 1.3501,
"step": 1130
},
{
"epoch": 1.7533397870280736,
"grad_norm": 0.9045569896697998,
"learning_rate": 2.0785935884177872e-05,
"loss": 1.3594,
"step": 1132
},
{
"epoch": 1.7564375605033882,
"grad_norm": 0.8555303812026978,
"learning_rate": 2.0734229576008274e-05,
"loss": 1.3539,
"step": 1134
},
{
"epoch": 1.7595353339787028,
"grad_norm": 1.0530476570129395,
"learning_rate": 2.0682523267838676e-05,
"loss": 1.3504,
"step": 1136
},
{
"epoch": 1.7626331074540174,
"grad_norm": 0.904148519039154,
"learning_rate": 2.063081695966908e-05,
"loss": 1.3544,
"step": 1138
},
{
"epoch": 1.7657308809293322,
"grad_norm": 0.8729182481765747,
"learning_rate": 2.0579110651499482e-05,
"loss": 1.3694,
"step": 1140
},
{
"epoch": 1.7688286544046465,
"grad_norm": 0.8215417265892029,
"learning_rate": 2.0527404343329888e-05,
"loss": 1.3497,
"step": 1142
},
{
"epoch": 1.7719264278799614,
"grad_norm": 0.8960113525390625,
"learning_rate": 2.047569803516029e-05,
"loss": 1.3416,
"step": 1144
},
{
"epoch": 1.775024201355276,
"grad_norm": 0.8761835098266602,
"learning_rate": 2.0423991726990694e-05,
"loss": 1.3485,
"step": 1146
},
{
"epoch": 1.7781219748305905,
"grad_norm": 0.9275888204574585,
"learning_rate": 2.0372285418821096e-05,
"loss": 1.3792,
"step": 1148
},
{
"epoch": 1.7812197483059051,
"grad_norm": 1.0560438632965088,
"learning_rate": 2.03205791106515e-05,
"loss": 1.3543,
"step": 1150
},
{
"epoch": 1.7843175217812197,
"grad_norm": 0.8371681571006775,
"learning_rate": 2.0268872802481903e-05,
"loss": 1.3322,
"step": 1152
},
{
"epoch": 1.7874152952565345,
"grad_norm": 1.2260630130767822,
"learning_rate": 2.0217166494312308e-05,
"loss": 1.365,
"step": 1154
},
{
"epoch": 1.790513068731849,
"grad_norm": 0.9227527976036072,
"learning_rate": 2.016546018614271e-05,
"loss": 1.3334,
"step": 1156
},
{
"epoch": 1.7936108422071637,
"grad_norm": 0.8147873878479004,
"learning_rate": 2.011375387797311e-05,
"loss": 1.3524,
"step": 1158
},
{
"epoch": 1.796708615682478,
"grad_norm": 1.107546091079712,
"learning_rate": 2.0062047569803516e-05,
"loss": 1.3383,
"step": 1160
},
{
"epoch": 1.799806389157793,
"grad_norm": 0.9934420585632324,
"learning_rate": 2.0010341261633918e-05,
"loss": 1.3381,
"step": 1162
},
{
"epoch": 1.8029041626331075,
"grad_norm": 0.9304853677749634,
"learning_rate": 1.9958634953464323e-05,
"loss": 1.3344,
"step": 1164
},
{
"epoch": 1.806001936108422,
"grad_norm": 0.9126875996589661,
"learning_rate": 1.9906928645294725e-05,
"loss": 1.3671,
"step": 1166
},
{
"epoch": 1.8090997095837367,
"grad_norm": 1.0258123874664307,
"learning_rate": 1.985522233712513e-05,
"loss": 1.3598,
"step": 1168
},
{
"epoch": 1.8121974830590513,
"grad_norm": 0.8520185947418213,
"learning_rate": 1.980351602895553e-05,
"loss": 1.3646,
"step": 1170
},
{
"epoch": 1.815295256534366,
"grad_norm": 1.0395876169204712,
"learning_rate": 1.9751809720785936e-05,
"loss": 1.3408,
"step": 1172
},
{
"epoch": 1.8183930300096804,
"grad_norm": 0.8687078952789307,
"learning_rate": 1.970010341261634e-05,
"loss": 1.3568,
"step": 1174
},
{
"epoch": 1.8214908034849953,
"grad_norm": 1.1169476509094238,
"learning_rate": 1.9648397104446743e-05,
"loss": 1.3519,
"step": 1176
},
{
"epoch": 1.8245885769603096,
"grad_norm": 0.9429073929786682,
"learning_rate": 1.959669079627715e-05,
"loss": 1.3437,
"step": 1178
},
{
"epoch": 1.8276863504356244,
"grad_norm": 0.9102051258087158,
"learning_rate": 1.954498448810755e-05,
"loss": 1.3629,
"step": 1180
},
{
"epoch": 1.830784123910939,
"grad_norm": 0.8507852554321289,
"learning_rate": 1.9493278179937955e-05,
"loss": 1.3536,
"step": 1182
},
{
"epoch": 1.8338818973862536,
"grad_norm": 0.8034945130348206,
"learning_rate": 1.9441571871768357e-05,
"loss": 1.3421,
"step": 1184
},
{
"epoch": 1.8369796708615682,
"grad_norm": 0.9575487971305847,
"learning_rate": 1.9389865563598762e-05,
"loss": 1.3565,
"step": 1186
},
{
"epoch": 1.8400774443368828,
"grad_norm": 0.8735955953598022,
"learning_rate": 1.9338159255429164e-05,
"loss": 1.3537,
"step": 1188
},
{
"epoch": 1.8431752178121976,
"grad_norm": 0.834586501121521,
"learning_rate": 1.928645294725957e-05,
"loss": 1.3935,
"step": 1190
},
{
"epoch": 1.846272991287512,
"grad_norm": 0.8274103999137878,
"learning_rate": 1.923474663908997e-05,
"loss": 1.347,
"step": 1192
},
{
"epoch": 1.8493707647628268,
"grad_norm": 0.9628223180770874,
"learning_rate": 1.9183040330920375e-05,
"loss": 1.3623,
"step": 1194
},
{
"epoch": 1.8524685382381412,
"grad_norm": 0.8151761293411255,
"learning_rate": 1.9131334022750777e-05,
"loss": 1.3565,
"step": 1196
},
{
"epoch": 1.855566311713456,
"grad_norm": 0.8839893341064453,
"learning_rate": 1.907962771458118e-05,
"loss": 1.3636,
"step": 1198
},
{
"epoch": 1.8586640851887706,
"grad_norm": 0.8234869837760925,
"learning_rate": 1.9027921406411584e-05,
"loss": 1.3188,
"step": 1200
},
{
"epoch": 1.8586640851887706,
"eval_loss": 1.341654896736145,
"eval_runtime": 499.7205,
"eval_samples_per_second": 41.343,
"eval_steps_per_second": 5.169,
"step": 1200
},
{
"epoch": 1.8617618586640852,
"grad_norm": 0.7479894757270813,
"learning_rate": 1.8976215098241985e-05,
"loss": 1.3527,
"step": 1202
},
{
"epoch": 1.8648596321393998,
"grad_norm": 0.8051818609237671,
"learning_rate": 1.892450879007239e-05,
"loss": 1.339,
"step": 1204
},
{
"epoch": 1.8679574056147144,
"grad_norm": 0.9621079564094543,
"learning_rate": 1.8872802481902792e-05,
"loss": 1.3735,
"step": 1206
},
{
"epoch": 1.8710551790900292,
"grad_norm": 0.9237180352210999,
"learning_rate": 1.8821096173733197e-05,
"loss": 1.3566,
"step": 1208
},
{
"epoch": 1.8741529525653435,
"grad_norm": 1.00318443775177,
"learning_rate": 1.87693898655636e-05,
"loss": 1.3396,
"step": 1210
},
{
"epoch": 1.8772507260406583,
"grad_norm": 0.8000593185424805,
"learning_rate": 1.8717683557394004e-05,
"loss": 1.3669,
"step": 1212
},
{
"epoch": 1.880348499515973,
"grad_norm": 0.827609121799469,
"learning_rate": 1.8665977249224406e-05,
"loss": 1.3355,
"step": 1214
},
{
"epoch": 1.8834462729912875,
"grad_norm": 1.0329563617706299,
"learning_rate": 1.861427094105481e-05,
"loss": 1.3708,
"step": 1216
},
{
"epoch": 1.8865440464666021,
"grad_norm": 1.0466892719268799,
"learning_rate": 1.8562564632885213e-05,
"loss": 1.3493,
"step": 1218
},
{
"epoch": 1.8896418199419167,
"grad_norm": 0.9400922060012817,
"learning_rate": 1.8510858324715614e-05,
"loss": 1.3441,
"step": 1220
},
{
"epoch": 1.8927395934172315,
"grad_norm": 0.9035273194313049,
"learning_rate": 1.845915201654602e-05,
"loss": 1.3534,
"step": 1222
},
{
"epoch": 1.895837366892546,
"grad_norm": 0.8702762126922607,
"learning_rate": 1.840744570837642e-05,
"loss": 1.3616,
"step": 1224
},
{
"epoch": 1.8989351403678607,
"grad_norm": 0.9379782676696777,
"learning_rate": 1.8355739400206826e-05,
"loss": 1.3557,
"step": 1226
},
{
"epoch": 1.902032913843175,
"grad_norm": 1.03324294090271,
"learning_rate": 1.8304033092037228e-05,
"loss": 1.3486,
"step": 1228
},
{
"epoch": 1.90513068731849,
"grad_norm": 0.7788193821907043,
"learning_rate": 1.8252326783867633e-05,
"loss": 1.3658,
"step": 1230
},
{
"epoch": 1.9082284607938045,
"grad_norm": 0.879900336265564,
"learning_rate": 1.8200620475698034e-05,
"loss": 1.3363,
"step": 1232
},
{
"epoch": 1.911326234269119,
"grad_norm": 0.9988526105880737,
"learning_rate": 1.814891416752844e-05,
"loss": 1.3396,
"step": 1234
},
{
"epoch": 1.9144240077444337,
"grad_norm": 1.0158812999725342,
"learning_rate": 1.809720785935884e-05,
"loss": 1.3513,
"step": 1236
},
{
"epoch": 1.9175217812197483,
"grad_norm": 0.8834120035171509,
"learning_rate": 1.8045501551189246e-05,
"loss": 1.3536,
"step": 1238
},
{
"epoch": 1.920619554695063,
"grad_norm": 1.0367848873138428,
"learning_rate": 1.7993795243019648e-05,
"loss": 1.3462,
"step": 1240
},
{
"epoch": 1.9237173281703774,
"grad_norm": 0.9409236311912537,
"learning_rate": 1.7942088934850053e-05,
"loss": 1.3435,
"step": 1242
},
{
"epoch": 1.9268151016456923,
"grad_norm": 0.7982214093208313,
"learning_rate": 1.7890382626680455e-05,
"loss": 1.3319,
"step": 1244
},
{
"epoch": 1.9299128751210066,
"grad_norm": 1.1070462465286255,
"learning_rate": 1.7838676318510856e-05,
"loss": 1.3373,
"step": 1246
},
{
"epoch": 1.9330106485963214,
"grad_norm": 1.0409610271453857,
"learning_rate": 1.778697001034126e-05,
"loss": 1.3551,
"step": 1248
},
{
"epoch": 1.936108422071636,
"grad_norm": 0.9913906455039978,
"learning_rate": 1.7735263702171663e-05,
"loss": 1.3651,
"step": 1250
},
{
"epoch": 1.9392061955469506,
"grad_norm": 0.9144983887672424,
"learning_rate": 1.768355739400207e-05,
"loss": 1.3688,
"step": 1252
},
{
"epoch": 1.9423039690222652,
"grad_norm": 1.01792311668396,
"learning_rate": 1.7631851085832473e-05,
"loss": 1.3368,
"step": 1254
},
{
"epoch": 1.9454017424975798,
"grad_norm": 0.8271951675415039,
"learning_rate": 1.758014477766288e-05,
"loss": 1.3565,
"step": 1256
},
{
"epoch": 1.9484995159728946,
"grad_norm": 1.008579134941101,
"learning_rate": 1.752843846949328e-05,
"loss": 1.3498,
"step": 1258
},
{
"epoch": 1.951597289448209,
"grad_norm": 1.0562330484390259,
"learning_rate": 1.7476732161323682e-05,
"loss": 1.3745,
"step": 1260
},
{
"epoch": 1.9546950629235238,
"grad_norm": 0.9627982974052429,
"learning_rate": 1.7425025853154087e-05,
"loss": 1.3569,
"step": 1262
},
{
"epoch": 1.9577928363988384,
"grad_norm": 1.1486949920654297,
"learning_rate": 1.737331954498449e-05,
"loss": 1.3576,
"step": 1264
},
{
"epoch": 1.960890609874153,
"grad_norm": 0.8623875379562378,
"learning_rate": 1.7321613236814894e-05,
"loss": 1.3331,
"step": 1266
},
{
"epoch": 1.9639883833494676,
"grad_norm": 0.8003185987472534,
"learning_rate": 1.7269906928645295e-05,
"loss": 1.3126,
"step": 1268
},
{
"epoch": 1.9670861568247822,
"grad_norm": 0.8993198275566101,
"learning_rate": 1.72182006204757e-05,
"loss": 1.3348,
"step": 1270
},
{
"epoch": 1.9701839303000968,
"grad_norm": 0.7497487664222717,
"learning_rate": 1.7166494312306102e-05,
"loss": 1.3587,
"step": 1272
},
{
"epoch": 1.9732817037754113,
"grad_norm": 0.9242996573448181,
"learning_rate": 1.7114788004136507e-05,
"loss": 1.3526,
"step": 1274
},
{
"epoch": 1.9763794772507262,
"grad_norm": 0.9362899661064148,
"learning_rate": 1.706308169596691e-05,
"loss": 1.3403,
"step": 1276
},
{
"epoch": 1.9794772507260405,
"grad_norm": 0.9214730262756348,
"learning_rate": 1.7011375387797314e-05,
"loss": 1.3561,
"step": 1278
},
{
"epoch": 1.9825750242013553,
"grad_norm": 1.1729867458343506,
"learning_rate": 1.6959669079627716e-05,
"loss": 1.3712,
"step": 1280
},
{
"epoch": 1.98567279767667,
"grad_norm": 0.8579219579696655,
"learning_rate": 1.690796277145812e-05,
"loss": 1.3562,
"step": 1282
},
{
"epoch": 1.9887705711519845,
"grad_norm": 0.819837212562561,
"learning_rate": 1.6856256463288522e-05,
"loss": 1.3196,
"step": 1284
},
{
"epoch": 1.9918683446272991,
"grad_norm": 1.0892577171325684,
"learning_rate": 1.6804550155118924e-05,
"loss": 1.3139,
"step": 1286
},
{
"epoch": 1.9949661181026137,
"grad_norm": 0.9745960831642151,
"learning_rate": 1.675284384694933e-05,
"loss": 1.3474,
"step": 1288
},
{
"epoch": 1.9980638915779285,
"grad_norm": 0.9176591038703918,
"learning_rate": 1.670113753877973e-05,
"loss": 1.3377,
"step": 1290
},
{
"epoch": 2.001161665053243,
"grad_norm": 0.8982537388801575,
"learning_rate": 1.6649431230610136e-05,
"loss": 1.3667,
"step": 1292
},
{
"epoch": 2.0042594385285577,
"grad_norm": 0.8645797371864319,
"learning_rate": 1.6597724922440538e-05,
"loss": 1.3238,
"step": 1294
},
{
"epoch": 2.007357212003872,
"grad_norm": 1.0574814081192017,
"learning_rate": 1.6546018614270943e-05,
"loss": 1.3436,
"step": 1296
},
{
"epoch": 2.010454985479187,
"grad_norm": 0.8636010885238647,
"learning_rate": 1.6494312306101344e-05,
"loss": 1.3321,
"step": 1298
},
{
"epoch": 2.0135527589545013,
"grad_norm": 1.040042519569397,
"learning_rate": 1.644260599793175e-05,
"loss": 1.3475,
"step": 1300
},
{
"epoch": 2.016650532429816,
"grad_norm": 0.7811307907104492,
"learning_rate": 1.639089968976215e-05,
"loss": 1.3386,
"step": 1302
},
{
"epoch": 2.019748305905131,
"grad_norm": 0.9275119304656982,
"learning_rate": 1.6339193381592556e-05,
"loss": 1.338,
"step": 1304
},
{
"epoch": 2.0228460793804452,
"grad_norm": 0.8792182803153992,
"learning_rate": 1.6287487073422958e-05,
"loss": 1.341,
"step": 1306
},
{
"epoch": 2.02594385285576,
"grad_norm": 0.9327546954154968,
"learning_rate": 1.623578076525336e-05,
"loss": 1.325,
"step": 1308
},
{
"epoch": 2.0290416263310744,
"grad_norm": 0.9593343734741211,
"learning_rate": 1.6184074457083765e-05,
"loss": 1.3111,
"step": 1310
},
{
"epoch": 2.0321393998063892,
"grad_norm": 0.8487372994422913,
"learning_rate": 1.6132368148914166e-05,
"loss": 1.3308,
"step": 1312
},
{
"epoch": 2.0352371732817036,
"grad_norm": 1.0663917064666748,
"learning_rate": 1.608066184074457e-05,
"loss": 1.3126,
"step": 1314
},
{
"epoch": 2.0383349467570184,
"grad_norm": 0.9352003931999207,
"learning_rate": 1.6028955532574973e-05,
"loss": 1.3374,
"step": 1316
},
{
"epoch": 2.041432720232333,
"grad_norm": 0.8087659478187561,
"learning_rate": 1.5977249224405378e-05,
"loss": 1.3381,
"step": 1318
},
{
"epoch": 2.0445304937076476,
"grad_norm": 0.8721085786819458,
"learning_rate": 1.592554291623578e-05,
"loss": 1.3492,
"step": 1320
},
{
"epoch": 2.0476282671829624,
"grad_norm": 1.012121319770813,
"learning_rate": 1.5873836608066185e-05,
"loss": 1.3422,
"step": 1322
},
{
"epoch": 2.050726040658277,
"grad_norm": 0.8746726512908936,
"learning_rate": 1.5822130299896586e-05,
"loss": 1.3518,
"step": 1324
},
{
"epoch": 2.0538238141335916,
"grad_norm": 0.9453880786895752,
"learning_rate": 1.577042399172699e-05,
"loss": 1.3224,
"step": 1326
},
{
"epoch": 2.056921587608906,
"grad_norm": 1.383927583694458,
"learning_rate": 1.5718717683557393e-05,
"loss": 1.3334,
"step": 1328
},
{
"epoch": 2.060019361084221,
"grad_norm": 0.8216990232467651,
"learning_rate": 1.56670113753878e-05,
"loss": 1.3386,
"step": 1330
},
{
"epoch": 2.063117134559535,
"grad_norm": 0.8967849612236023,
"learning_rate": 1.5615305067218203e-05,
"loss": 1.3212,
"step": 1332
},
{
"epoch": 2.06621490803485,
"grad_norm": 0.960881233215332,
"learning_rate": 1.5563598759048605e-05,
"loss": 1.3269,
"step": 1334
},
{
"epoch": 2.069312681510165,
"grad_norm": 0.8327577114105225,
"learning_rate": 1.551189245087901e-05,
"loss": 1.3226,
"step": 1336
},
{
"epoch": 2.072410454985479,
"grad_norm": 0.9150763154029846,
"learning_rate": 1.5460186142709412e-05,
"loss": 1.3438,
"step": 1338
},
{
"epoch": 2.075508228460794,
"grad_norm": 0.7916013598442078,
"learning_rate": 1.5408479834539817e-05,
"loss": 1.3305,
"step": 1340
},
{
"epoch": 2.0786060019361083,
"grad_norm": 0.9902190566062927,
"learning_rate": 1.535677352637022e-05,
"loss": 1.3598,
"step": 1342
},
{
"epoch": 2.081703775411423,
"grad_norm": 0.9081457853317261,
"learning_rate": 1.5305067218200624e-05,
"loss": 1.3469,
"step": 1344
},
{
"epoch": 2.0848015488867375,
"grad_norm": 0.9101652503013611,
"learning_rate": 1.5253360910031025e-05,
"loss": 1.3147,
"step": 1346
},
{
"epoch": 2.0878993223620523,
"grad_norm": 0.9023634791374207,
"learning_rate": 1.5201654601861429e-05,
"loss": 1.3674,
"step": 1348
},
{
"epoch": 2.0909970958373667,
"grad_norm": 0.9344819188117981,
"learning_rate": 1.5149948293691832e-05,
"loss": 1.3074,
"step": 1350
},
{
"epoch": 2.0940948693126815,
"grad_norm": 0.8912569880485535,
"learning_rate": 1.5098241985522235e-05,
"loss": 1.3398,
"step": 1352
},
{
"epoch": 2.0971926427879963,
"grad_norm": 0.9664559960365295,
"learning_rate": 1.5046535677352639e-05,
"loss": 1.3286,
"step": 1354
},
{
"epoch": 2.1002904162633107,
"grad_norm": 0.8646228909492493,
"learning_rate": 1.4994829369183042e-05,
"loss": 1.346,
"step": 1356
},
{
"epoch": 2.1033881897386255,
"grad_norm": 0.939831554889679,
"learning_rate": 1.4943123061013446e-05,
"loss": 1.3479,
"step": 1358
},
{
"epoch": 2.10648596321394,
"grad_norm": 1.0427614450454712,
"learning_rate": 1.4891416752843847e-05,
"loss": 1.3293,
"step": 1360
},
{
"epoch": 2.1095837366892547,
"grad_norm": 0.8650059700012207,
"learning_rate": 1.483971044467425e-05,
"loss": 1.3368,
"step": 1362
},
{
"epoch": 2.112681510164569,
"grad_norm": 0.8453037142753601,
"learning_rate": 1.4788004136504654e-05,
"loss": 1.3463,
"step": 1364
},
{
"epoch": 2.115779283639884,
"grad_norm": 1.010087490081787,
"learning_rate": 1.4736297828335057e-05,
"loss": 1.3337,
"step": 1366
},
{
"epoch": 2.1188770571151982,
"grad_norm": 0.9835523962974548,
"learning_rate": 1.468459152016546e-05,
"loss": 1.3423,
"step": 1368
},
{
"epoch": 2.121974830590513,
"grad_norm": 0.9542858600616455,
"learning_rate": 1.4632885211995864e-05,
"loss": 1.3606,
"step": 1370
},
{
"epoch": 2.125072604065828,
"grad_norm": 0.9768648147583008,
"learning_rate": 1.4581178903826268e-05,
"loss": 1.3388,
"step": 1372
},
{
"epoch": 2.1281703775411422,
"grad_norm": 0.87848961353302,
"learning_rate": 1.4529472595656671e-05,
"loss": 1.3323,
"step": 1374
},
{
"epoch": 2.131268151016457,
"grad_norm": 0.9899694919586182,
"learning_rate": 1.4477766287487074e-05,
"loss": 1.3188,
"step": 1376
},
{
"epoch": 2.1343659244917714,
"grad_norm": 0.9858341813087463,
"learning_rate": 1.4426059979317478e-05,
"loss": 1.3228,
"step": 1378
},
{
"epoch": 2.1374636979670862,
"grad_norm": 0.9731205105781555,
"learning_rate": 1.4374353671147881e-05,
"loss": 1.3143,
"step": 1380
},
{
"epoch": 2.1405614714424006,
"grad_norm": 0.9824615716934204,
"learning_rate": 1.4322647362978284e-05,
"loss": 1.3515,
"step": 1382
},
{
"epoch": 2.1436592449177154,
"grad_norm": 0.90585857629776,
"learning_rate": 1.4270941054808686e-05,
"loss": 1.3191,
"step": 1384
},
{
"epoch": 2.1467570183930302,
"grad_norm": 1.0936884880065918,
"learning_rate": 1.421923474663909e-05,
"loss": 1.3297,
"step": 1386
},
{
"epoch": 2.1498547918683446,
"grad_norm": 0.9065744280815125,
"learning_rate": 1.4167528438469493e-05,
"loss": 1.3311,
"step": 1388
},
{
"epoch": 2.1529525653436594,
"grad_norm": 0.8460130095481873,
"learning_rate": 1.4115822130299896e-05,
"loss": 1.3343,
"step": 1390
},
{
"epoch": 2.156050338818974,
"grad_norm": 0.7978271842002869,
"learning_rate": 1.40641158221303e-05,
"loss": 1.3213,
"step": 1392
},
{
"epoch": 2.1591481122942886,
"grad_norm": 0.9271676540374756,
"learning_rate": 1.4012409513960703e-05,
"loss": 1.3292,
"step": 1394
},
{
"epoch": 2.162245885769603,
"grad_norm": 0.963083028793335,
"learning_rate": 1.3960703205791106e-05,
"loss": 1.3412,
"step": 1396
},
{
"epoch": 2.165343659244918,
"grad_norm": 0.8550283908843994,
"learning_rate": 1.390899689762151e-05,
"loss": 1.3149,
"step": 1398
},
{
"epoch": 2.168441432720232,
"grad_norm": 1.3056062459945679,
"learning_rate": 1.3857290589451913e-05,
"loss": 1.3165,
"step": 1400
},
{
"epoch": 2.171539206195547,
"grad_norm": 0.825268566608429,
"learning_rate": 1.3805584281282317e-05,
"loss": 1.3398,
"step": 1402
},
{
"epoch": 2.1746369796708613,
"grad_norm": 0.8259047269821167,
"learning_rate": 1.375387797311272e-05,
"loss": 1.3278,
"step": 1404
},
{
"epoch": 2.177734753146176,
"grad_norm": 0.7692115306854248,
"learning_rate": 1.3702171664943123e-05,
"loss": 1.3364,
"step": 1406
},
{
"epoch": 2.180832526621491,
"grad_norm": 0.9740896224975586,
"learning_rate": 1.3650465356773525e-05,
"loss": 1.3327,
"step": 1408
},
{
"epoch": 2.1839303000968053,
"grad_norm": 0.7695585489273071,
"learning_rate": 1.3598759048603928e-05,
"loss": 1.3565,
"step": 1410
},
{
"epoch": 2.18702807357212,
"grad_norm": 0.8049722909927368,
"learning_rate": 1.3547052740434332e-05,
"loss": 1.3434,
"step": 1412
},
{
"epoch": 2.1901258470474345,
"grad_norm": 0.8844389915466309,
"learning_rate": 1.3495346432264739e-05,
"loss": 1.3263,
"step": 1414
},
{
"epoch": 2.1932236205227493,
"grad_norm": 0.7504433393478394,
"learning_rate": 1.3443640124095142e-05,
"loss": 1.3431,
"step": 1416
},
{
"epoch": 2.1963213939980637,
"grad_norm": 0.9303148984909058,
"learning_rate": 1.3391933815925545e-05,
"loss": 1.3294,
"step": 1418
},
{
"epoch": 2.1994191674733785,
"grad_norm": 0.9886261224746704,
"learning_rate": 1.3340227507755949e-05,
"loss": 1.3321,
"step": 1420
},
{
"epoch": 2.2025169409486933,
"grad_norm": 0.8835451006889343,
"learning_rate": 1.3288521199586352e-05,
"loss": 1.321,
"step": 1422
},
{
"epoch": 2.2056147144240077,
"grad_norm": 1.0087958574295044,
"learning_rate": 1.3236814891416754e-05,
"loss": 1.3516,
"step": 1424
},
{
"epoch": 2.2087124878993225,
"grad_norm": 0.8284295201301575,
"learning_rate": 1.3185108583247157e-05,
"loss": 1.321,
"step": 1426
},
{
"epoch": 2.211810261374637,
"grad_norm": 0.8967974781990051,
"learning_rate": 1.313340227507756e-05,
"loss": 1.3416,
"step": 1428
},
{
"epoch": 2.2149080348499517,
"grad_norm": 1.1133179664611816,
"learning_rate": 1.3081695966907964e-05,
"loss": 1.3238,
"step": 1430
},
{
"epoch": 2.218005808325266,
"grad_norm": 0.9074902534484863,
"learning_rate": 1.3029989658738367e-05,
"loss": 1.3198,
"step": 1432
},
{
"epoch": 2.221103581800581,
"grad_norm": 0.8816152215003967,
"learning_rate": 1.297828335056877e-05,
"loss": 1.3507,
"step": 1434
},
{
"epoch": 2.2242013552758952,
"grad_norm": 1.0369545221328735,
"learning_rate": 1.2926577042399174e-05,
"loss": 1.297,
"step": 1436
},
{
"epoch": 2.22729912875121,
"grad_norm": 0.8075978755950928,
"learning_rate": 1.2874870734229577e-05,
"loss": 1.3501,
"step": 1438
},
{
"epoch": 2.230396902226525,
"grad_norm": 1.2508447170257568,
"learning_rate": 1.282316442605998e-05,
"loss": 1.3539,
"step": 1440
},
{
"epoch": 2.2334946757018392,
"grad_norm": 0.9969581365585327,
"learning_rate": 1.2771458117890384e-05,
"loss": 1.3049,
"step": 1442
},
{
"epoch": 2.236592449177154,
"grad_norm": 0.921631932258606,
"learning_rate": 1.2719751809720788e-05,
"loss": 1.3437,
"step": 1444
},
{
"epoch": 2.2396902226524684,
"grad_norm": 1.0279971361160278,
"learning_rate": 1.2668045501551191e-05,
"loss": 1.3413,
"step": 1446
},
{
"epoch": 2.2427879961277832,
"grad_norm": 1.0447874069213867,
"learning_rate": 1.2616339193381593e-05,
"loss": 1.3308,
"step": 1448
},
{
"epoch": 2.2458857696030976,
"grad_norm": 0.843579113483429,
"learning_rate": 1.2564632885211996e-05,
"loss": 1.3133,
"step": 1450
},
{
"epoch": 2.2489835430784124,
"grad_norm": 0.8838000297546387,
"learning_rate": 1.25129265770424e-05,
"loss": 1.3288,
"step": 1452
},
{
"epoch": 2.252081316553727,
"grad_norm": 0.9393033385276794,
"learning_rate": 1.2461220268872803e-05,
"loss": 1.3425,
"step": 1454
},
{
"epoch": 2.2551790900290416,
"grad_norm": 0.8526115417480469,
"learning_rate": 1.2409513960703206e-05,
"loss": 1.3372,
"step": 1456
},
{
"epoch": 2.2582768635043564,
"grad_norm": 0.9398928880691528,
"learning_rate": 1.235780765253361e-05,
"loss": 1.3173,
"step": 1458
},
{
"epoch": 2.261374636979671,
"grad_norm": 1.0209931135177612,
"learning_rate": 1.2306101344364013e-05,
"loss": 1.3368,
"step": 1460
},
{
"epoch": 2.2644724104549856,
"grad_norm": 0.9040766954421997,
"learning_rate": 1.2254395036194416e-05,
"loss": 1.357,
"step": 1462
},
{
"epoch": 2.2675701839303,
"grad_norm": 1.052363395690918,
"learning_rate": 1.220268872802482e-05,
"loss": 1.3252,
"step": 1464
},
{
"epoch": 2.270667957405615,
"grad_norm": 1.0517691373825073,
"learning_rate": 1.2150982419855223e-05,
"loss": 1.333,
"step": 1466
},
{
"epoch": 2.273765730880929,
"grad_norm": 1.0435551404953003,
"learning_rate": 1.2099276111685626e-05,
"loss": 1.3338,
"step": 1468
},
{
"epoch": 2.276863504356244,
"grad_norm": 0.9312208890914917,
"learning_rate": 1.204756980351603e-05,
"loss": 1.318,
"step": 1470
},
{
"epoch": 2.2799612778315588,
"grad_norm": 1.1775990724563599,
"learning_rate": 1.1995863495346431e-05,
"loss": 1.3202,
"step": 1472
},
{
"epoch": 2.283059051306873,
"grad_norm": 1.0058298110961914,
"learning_rate": 1.1944157187176836e-05,
"loss": 1.3165,
"step": 1474
},
{
"epoch": 2.286156824782188,
"grad_norm": 1.1250808238983154,
"learning_rate": 1.189245087900724e-05,
"loss": 1.3298,
"step": 1476
},
{
"epoch": 2.2892545982575023,
"grad_norm": 0.8414492607116699,
"learning_rate": 1.1840744570837643e-05,
"loss": 1.3391,
"step": 1478
},
{
"epoch": 2.292352371732817,
"grad_norm": 0.8035596609115601,
"learning_rate": 1.1789038262668047e-05,
"loss": 1.3327,
"step": 1480
},
{
"epoch": 2.2954501452081315,
"grad_norm": 0.8101987242698669,
"learning_rate": 1.173733195449845e-05,
"loss": 1.3102,
"step": 1482
},
{
"epoch": 2.2985479186834463,
"grad_norm": 0.9012944102287292,
"learning_rate": 1.1685625646328853e-05,
"loss": 1.3534,
"step": 1484
},
{
"epoch": 2.301645692158761,
"grad_norm": 0.7570741176605225,
"learning_rate": 1.1633919338159257e-05,
"loss": 1.317,
"step": 1486
},
{
"epoch": 2.3047434656340755,
"grad_norm": 0.7619568109512329,
"learning_rate": 1.158221302998966e-05,
"loss": 1.3601,
"step": 1488
},
{
"epoch": 2.3078412391093903,
"grad_norm": 0.9099006056785583,
"learning_rate": 1.1530506721820064e-05,
"loss": 1.3239,
"step": 1490
},
{
"epoch": 2.3109390125847047,
"grad_norm": 0.7822088599205017,
"learning_rate": 1.1478800413650465e-05,
"loss": 1.3185,
"step": 1492
},
{
"epoch": 2.3140367860600195,
"grad_norm": 0.862535834312439,
"learning_rate": 1.1427094105480869e-05,
"loss": 1.3329,
"step": 1494
},
{
"epoch": 2.317134559535334,
"grad_norm": 1.3833560943603516,
"learning_rate": 1.1375387797311272e-05,
"loss": 1.3377,
"step": 1496
},
{
"epoch": 2.3202323330106487,
"grad_norm": 0.8927620053291321,
"learning_rate": 1.1323681489141675e-05,
"loss": 1.3084,
"step": 1498
},
{
"epoch": 2.323330106485963,
"grad_norm": 0.8435688018798828,
"learning_rate": 1.1271975180972079e-05,
"loss": 1.3322,
"step": 1500
},
{
"epoch": 2.326427879961278,
"grad_norm": 0.9227290153503418,
"learning_rate": 1.1220268872802482e-05,
"loss": 1.3394,
"step": 1502
},
{
"epoch": 2.3295256534365922,
"grad_norm": 0.8425549268722534,
"learning_rate": 1.1168562564632885e-05,
"loss": 1.3486,
"step": 1504
},
{
"epoch": 2.332623426911907,
"grad_norm": 0.8057267069816589,
"learning_rate": 1.1116856256463289e-05,
"loss": 1.3151,
"step": 1506
},
{
"epoch": 2.335721200387222,
"grad_norm": 0.9685359597206116,
"learning_rate": 1.1065149948293692e-05,
"loss": 1.339,
"step": 1508
},
{
"epoch": 2.3388189738625362,
"grad_norm": 0.9330448508262634,
"learning_rate": 1.1013443640124096e-05,
"loss": 1.3112,
"step": 1510
},
{
"epoch": 2.341916747337851,
"grad_norm": 0.9905188083648682,
"learning_rate": 1.0961737331954499e-05,
"loss": 1.3447,
"step": 1512
},
{
"epoch": 2.3450145208131654,
"grad_norm": 0.9230495691299438,
"learning_rate": 1.0910031023784902e-05,
"loss": 1.3457,
"step": 1514
},
{
"epoch": 2.3481122942884802,
"grad_norm": 0.7517797350883484,
"learning_rate": 1.0858324715615306e-05,
"loss": 1.3053,
"step": 1516
},
{
"epoch": 2.3512100677637946,
"grad_norm": 1.13046395778656,
"learning_rate": 1.0806618407445709e-05,
"loss": 1.3442,
"step": 1518
},
{
"epoch": 2.3543078412391094,
"grad_norm": 0.9535288214683533,
"learning_rate": 1.0754912099276113e-05,
"loss": 1.3437,
"step": 1520
},
{
"epoch": 2.3574056147144242,
"grad_norm": 0.9758418798446655,
"learning_rate": 1.0703205791106516e-05,
"loss": 1.3327,
"step": 1522
},
{
"epoch": 2.3605033881897386,
"grad_norm": 0.8258436322212219,
"learning_rate": 1.065149948293692e-05,
"loss": 1.3508,
"step": 1524
},
{
"epoch": 2.3636011616650534,
"grad_norm": 0.9934467077255249,
"learning_rate": 1.0599793174767323e-05,
"loss": 1.3058,
"step": 1526
},
{
"epoch": 2.3666989351403678,
"grad_norm": 0.8944813013076782,
"learning_rate": 1.0548086866597726e-05,
"loss": 1.3083,
"step": 1528
},
{
"epoch": 2.3697967086156826,
"grad_norm": 0.8731038570404053,
"learning_rate": 1.049638055842813e-05,
"loss": 1.3299,
"step": 1530
},
{
"epoch": 2.372894482090997,
"grad_norm": 0.9087830781936646,
"learning_rate": 1.0444674250258533e-05,
"loss": 1.3284,
"step": 1532
},
{
"epoch": 2.3759922555663118,
"grad_norm": 0.8245522975921631,
"learning_rate": 1.0392967942088936e-05,
"loss": 1.3308,
"step": 1534
},
{
"epoch": 2.379090029041626,
"grad_norm": 0.9423663020133972,
"learning_rate": 1.0341261633919338e-05,
"loss": 1.3403,
"step": 1536
},
{
"epoch": 2.382187802516941,
"grad_norm": 0.9050272107124329,
"learning_rate": 1.0289555325749741e-05,
"loss": 1.3267,
"step": 1538
},
{
"epoch": 2.3852855759922553,
"grad_norm": 0.7859249711036682,
"learning_rate": 1.0237849017580145e-05,
"loss": 1.3241,
"step": 1540
},
{
"epoch": 2.38838334946757,
"grad_norm": 0.8981680274009705,
"learning_rate": 1.0186142709410548e-05,
"loss": 1.3367,
"step": 1542
},
{
"epoch": 2.391481122942885,
"grad_norm": 0.9353106021881104,
"learning_rate": 1.0134436401240951e-05,
"loss": 1.3391,
"step": 1544
},
{
"epoch": 2.3945788964181993,
"grad_norm": 0.9247782826423645,
"learning_rate": 1.0082730093071355e-05,
"loss": 1.3047,
"step": 1546
},
{
"epoch": 2.397676669893514,
"grad_norm": 1.141741156578064,
"learning_rate": 1.0031023784901758e-05,
"loss": 1.3307,
"step": 1548
},
{
"epoch": 2.4007744433688285,
"grad_norm": 0.8088661432266235,
"learning_rate": 9.979317476732161e-06,
"loss": 1.3363,
"step": 1550
},
{
"epoch": 2.4038722168441433,
"grad_norm": 0.8670098185539246,
"learning_rate": 9.927611168562565e-06,
"loss": 1.3425,
"step": 1552
},
{
"epoch": 2.4069699903194577,
"grad_norm": 0.8612157702445984,
"learning_rate": 9.875904860392968e-06,
"loss": 1.3253,
"step": 1554
},
{
"epoch": 2.4100677637947725,
"grad_norm": 0.9053961634635925,
"learning_rate": 9.824198552223372e-06,
"loss": 1.3154,
"step": 1556
},
{
"epoch": 2.4131655372700873,
"grad_norm": 0.84452223777771,
"learning_rate": 9.772492244053775e-06,
"loss": 1.3404,
"step": 1558
},
{
"epoch": 2.4162633107454017,
"grad_norm": 0.839474618434906,
"learning_rate": 9.720785935884178e-06,
"loss": 1.3392,
"step": 1560
},
{
"epoch": 2.4193610842207165,
"grad_norm": 0.8587937355041504,
"learning_rate": 9.669079627714582e-06,
"loss": 1.3269,
"step": 1562
},
{
"epoch": 2.422458857696031,
"grad_norm": 0.781345009803772,
"learning_rate": 9.617373319544985e-06,
"loss": 1.3248,
"step": 1564
},
{
"epoch": 2.4255566311713457,
"grad_norm": 0.775817334651947,
"learning_rate": 9.565667011375389e-06,
"loss": 1.3338,
"step": 1566
},
{
"epoch": 2.42865440464666,
"grad_norm": 0.7844461798667908,
"learning_rate": 9.513960703205792e-06,
"loss": 1.3266,
"step": 1568
},
{
"epoch": 2.431752178121975,
"grad_norm": 0.7972658276557922,
"learning_rate": 9.462254395036195e-06,
"loss": 1.3392,
"step": 1570
},
{
"epoch": 2.4348499515972897,
"grad_norm": 0.850536048412323,
"learning_rate": 9.410548086866599e-06,
"loss": 1.3425,
"step": 1572
},
{
"epoch": 2.437947725072604,
"grad_norm": 1.1697067022323608,
"learning_rate": 9.358841778697002e-06,
"loss": 1.3243,
"step": 1574
},
{
"epoch": 2.441045498547919,
"grad_norm": 0.8385635614395142,
"learning_rate": 9.307135470527405e-06,
"loss": 1.3275,
"step": 1576
},
{
"epoch": 2.4441432720232332,
"grad_norm": 0.8202130794525146,
"learning_rate": 9.255429162357807e-06,
"loss": 1.3032,
"step": 1578
},
{
"epoch": 2.447241045498548,
"grad_norm": 0.8149744272232056,
"learning_rate": 9.20372285418821e-06,
"loss": 1.3303,
"step": 1580
},
{
"epoch": 2.4503388189738624,
"grad_norm": 0.9332587122917175,
"learning_rate": 9.152016546018614e-06,
"loss": 1.3391,
"step": 1582
},
{
"epoch": 2.4534365924491772,
"grad_norm": 1.014574646949768,
"learning_rate": 9.100310237849017e-06,
"loss": 1.3455,
"step": 1584
},
{
"epoch": 2.4565343659244916,
"grad_norm": 0.7690302133560181,
"learning_rate": 9.04860392967942e-06,
"loss": 1.3276,
"step": 1586
},
{
"epoch": 2.4596321393998064,
"grad_norm": 1.0496488809585571,
"learning_rate": 8.996897621509824e-06,
"loss": 1.3287,
"step": 1588
},
{
"epoch": 2.4627299128751208,
"grad_norm": 1.0259231328964233,
"learning_rate": 8.945191313340227e-06,
"loss": 1.3298,
"step": 1590
},
{
"epoch": 2.4658276863504356,
"grad_norm": 0.8106045126914978,
"learning_rate": 8.89348500517063e-06,
"loss": 1.311,
"step": 1592
},
{
"epoch": 2.4689254598257504,
"grad_norm": 0.9428908824920654,
"learning_rate": 8.841778697001036e-06,
"loss": 1.336,
"step": 1594
},
{
"epoch": 2.4720232333010648,
"grad_norm": 0.9283081293106079,
"learning_rate": 8.79007238883144e-06,
"loss": 1.3321,
"step": 1596
},
{
"epoch": 2.4751210067763796,
"grad_norm": 0.97194504737854,
"learning_rate": 8.738366080661841e-06,
"loss": 1.3358,
"step": 1598
},
{
"epoch": 2.478218780251694,
"grad_norm": 0.8776614665985107,
"learning_rate": 8.686659772492244e-06,
"loss": 1.3296,
"step": 1600
},
{
"epoch": 2.478218780251694,
"eval_loss": 1.326022982597351,
"eval_runtime": 499.9577,
"eval_samples_per_second": 41.323,
"eval_steps_per_second": 5.166,
"step": 1600
},
{
"epoch": 2.4813165537270088,
"grad_norm": 0.8625020384788513,
"learning_rate": 8.634953464322648e-06,
"loss": 1.3126,
"step": 1602
},
{
"epoch": 2.484414327202323,
"grad_norm": 0.7639974355697632,
"learning_rate": 8.583247156153051e-06,
"loss": 1.3452,
"step": 1604
},
{
"epoch": 2.487512100677638,
"grad_norm": 0.7709546089172363,
"learning_rate": 8.531540847983454e-06,
"loss": 1.3427,
"step": 1606
},
{
"epoch": 2.4906098741529528,
"grad_norm": 0.7982486486434937,
"learning_rate": 8.479834539813858e-06,
"loss": 1.3243,
"step": 1608
},
{
"epoch": 2.493707647628267,
"grad_norm": 0.9817091226577759,
"learning_rate": 8.428128231644261e-06,
"loss": 1.3376,
"step": 1610
},
{
"epoch": 2.496805421103582,
"grad_norm": 0.7842255234718323,
"learning_rate": 8.376421923474665e-06,
"loss": 1.3327,
"step": 1612
},
{
"epoch": 2.4999031945788963,
"grad_norm": 0.8229419589042664,
"learning_rate": 8.324715615305068e-06,
"loss": 1.3284,
"step": 1614
},
{
"epoch": 2.503000968054211,
"grad_norm": 1.0628559589385986,
"learning_rate": 8.273009307135471e-06,
"loss": 1.3413,
"step": 1616
},
{
"epoch": 2.5060987415295255,
"grad_norm": 0.8059125542640686,
"learning_rate": 8.221302998965875e-06,
"loss": 1.3313,
"step": 1618
},
{
"epoch": 2.5091965150048403,
"grad_norm": 0.8325386047363281,
"learning_rate": 8.169596690796278e-06,
"loss": 1.3221,
"step": 1620
},
{
"epoch": 2.512294288480155,
"grad_norm": 0.9245994091033936,
"learning_rate": 8.11789038262668e-06,
"loss": 1.3133,
"step": 1622
},
{
"epoch": 2.5153920619554695,
"grad_norm": 0.9119100570678711,
"learning_rate": 8.066184074457083e-06,
"loss": 1.3211,
"step": 1624
},
{
"epoch": 2.518489835430784,
"grad_norm": 0.9153457283973694,
"learning_rate": 8.014477766287486e-06,
"loss": 1.3226,
"step": 1626
},
{
"epoch": 2.5215876089060987,
"grad_norm": 0.8128604292869568,
"learning_rate": 7.96277145811789e-06,
"loss": 1.3218,
"step": 1628
},
{
"epoch": 2.5246853823814135,
"grad_norm": 0.8200322985649109,
"learning_rate": 7.911065149948293e-06,
"loss": 1.3358,
"step": 1630
},
{
"epoch": 2.527783155856728,
"grad_norm": 0.9329957365989685,
"learning_rate": 7.859358841778697e-06,
"loss": 1.3175,
"step": 1632
},
{
"epoch": 2.5308809293320427,
"grad_norm": 0.9390591979026794,
"learning_rate": 7.807652533609102e-06,
"loss": 1.3442,
"step": 1634
},
{
"epoch": 2.5339787028073575,
"grad_norm": 0.8232764601707458,
"learning_rate": 7.755946225439505e-06,
"loss": 1.3388,
"step": 1636
},
{
"epoch": 2.537076476282672,
"grad_norm": 0.810404360294342,
"learning_rate": 7.704239917269908e-06,
"loss": 1.3216,
"step": 1638
},
{
"epoch": 2.5401742497579862,
"grad_norm": 0.7799009084701538,
"learning_rate": 7.652533609100312e-06,
"loss": 1.3386,
"step": 1640
},
{
"epoch": 2.543272023233301,
"grad_norm": 0.8875685334205627,
"learning_rate": 7.600827300930714e-06,
"loss": 1.3168,
"step": 1642
},
{
"epoch": 2.546369796708616,
"grad_norm": 0.9037766456604004,
"learning_rate": 7.549120992761118e-06,
"loss": 1.3129,
"step": 1644
},
{
"epoch": 2.54946757018393,
"grad_norm": 0.825951099395752,
"learning_rate": 7.497414684591521e-06,
"loss": 1.306,
"step": 1646
},
{
"epoch": 2.552565343659245,
"grad_norm": 0.9290631413459778,
"learning_rate": 7.445708376421924e-06,
"loss": 1.3159,
"step": 1648
},
{
"epoch": 2.5556631171345594,
"grad_norm": 0.9565717577934265,
"learning_rate": 7.394002068252327e-06,
"loss": 1.3072,
"step": 1650
},
{
"epoch": 2.558760890609874,
"grad_norm": 0.7212813496589661,
"learning_rate": 7.34229576008273e-06,
"loss": 1.3388,
"step": 1652
},
{
"epoch": 2.5618586640851886,
"grad_norm": 0.950728178024292,
"learning_rate": 7.290589451913134e-06,
"loss": 1.3248,
"step": 1654
},
{
"epoch": 2.5649564375605034,
"grad_norm": 0.850387454032898,
"learning_rate": 7.238883143743537e-06,
"loss": 1.3425,
"step": 1656
},
{
"epoch": 2.568054211035818,
"grad_norm": 0.9093496203422546,
"learning_rate": 7.1871768355739405e-06,
"loss": 1.3259,
"step": 1658
},
{
"epoch": 2.5711519845111326,
"grad_norm": 0.8944652676582336,
"learning_rate": 7.135470527404343e-06,
"loss": 1.333,
"step": 1660
},
{
"epoch": 2.5742497579864474,
"grad_norm": 0.7491154670715332,
"learning_rate": 7.0837642192347465e-06,
"loss": 1.3226,
"step": 1662
},
{
"epoch": 2.5773475314617618,
"grad_norm": 0.8667898178100586,
"learning_rate": 7.03205791106515e-06,
"loss": 1.3275,
"step": 1664
},
{
"epoch": 2.5804453049370766,
"grad_norm": 1.0023432970046997,
"learning_rate": 6.980351602895553e-06,
"loss": 1.3132,
"step": 1666
},
{
"epoch": 2.583543078412391,
"grad_norm": 0.8249279856681824,
"learning_rate": 6.928645294725957e-06,
"loss": 1.3388,
"step": 1668
},
{
"epoch": 2.5866408518877058,
"grad_norm": 0.9107469320297241,
"learning_rate": 6.87693898655636e-06,
"loss": 1.3089,
"step": 1670
},
{
"epoch": 2.5897386253630206,
"grad_norm": 0.8998382687568665,
"learning_rate": 6.8252326783867625e-06,
"loss": 1.3584,
"step": 1672
},
{
"epoch": 2.592836398838335,
"grad_norm": 0.9280401468276978,
"learning_rate": 6.773526370217166e-06,
"loss": 1.3507,
"step": 1674
},
{
"epoch": 2.5959341723136493,
"grad_norm": 0.7948800921440125,
"learning_rate": 6.721820062047571e-06,
"loss": 1.3209,
"step": 1676
},
{
"epoch": 2.599031945788964,
"grad_norm": 0.7695499062538147,
"learning_rate": 6.670113753877974e-06,
"loss": 1.3084,
"step": 1678
},
{
"epoch": 2.602129719264279,
"grad_norm": 0.8108364939689636,
"learning_rate": 6.618407445708377e-06,
"loss": 1.3213,
"step": 1680
},
{
"epoch": 2.6052274927395933,
"grad_norm": 0.9111447930335999,
"learning_rate": 6.56670113753878e-06,
"loss": 1.2948,
"step": 1682
},
{
"epoch": 2.608325266214908,
"grad_norm": 1.0574729442596436,
"learning_rate": 6.514994829369184e-06,
"loss": 1.3369,
"step": 1684
},
{
"epoch": 2.6114230396902225,
"grad_norm": 0.8202560544013977,
"learning_rate": 6.463288521199587e-06,
"loss": 1.3296,
"step": 1686
},
{
"epoch": 2.6145208131655373,
"grad_norm": 0.754135251045227,
"learning_rate": 6.41158221302999e-06,
"loss": 1.3332,
"step": 1688
},
{
"epoch": 2.6176185866408517,
"grad_norm": 1.0181078910827637,
"learning_rate": 6.359875904860394e-06,
"loss": 1.3354,
"step": 1690
},
{
"epoch": 2.6207163601161665,
"grad_norm": 0.8614677786827087,
"learning_rate": 6.308169596690796e-06,
"loss": 1.3197,
"step": 1692
},
{
"epoch": 2.6238141335914813,
"grad_norm": 0.8117063641548157,
"learning_rate": 6.2564632885212e-06,
"loss": 1.3158,
"step": 1694
},
{
"epoch": 2.6269119070667957,
"grad_norm": 0.9908379912376404,
"learning_rate": 6.204756980351603e-06,
"loss": 1.3188,
"step": 1696
},
{
"epoch": 2.6300096805421105,
"grad_norm": 0.9882792830467224,
"learning_rate": 6.153050672182006e-06,
"loss": 1.3113,
"step": 1698
},
{
"epoch": 2.633107454017425,
"grad_norm": 0.7588277459144592,
"learning_rate": 6.10134436401241e-06,
"loss": 1.316,
"step": 1700
},
{
"epoch": 2.6362052274927397,
"grad_norm": 0.8949116468429565,
"learning_rate": 6.049638055842813e-06,
"loss": 1.3356,
"step": 1702
},
{
"epoch": 2.639303000968054,
"grad_norm": 0.8811196088790894,
"learning_rate": 5.997931747673216e-06,
"loss": 1.3548,
"step": 1704
},
{
"epoch": 2.642400774443369,
"grad_norm": 0.8543995022773743,
"learning_rate": 5.94622543950362e-06,
"loss": 1.3364,
"step": 1706
},
{
"epoch": 2.6454985479186837,
"grad_norm": 0.7959784865379333,
"learning_rate": 5.894519131334023e-06,
"loss": 1.3041,
"step": 1708
},
{
"epoch": 2.648596321393998,
"grad_norm": 0.7849721908569336,
"learning_rate": 5.842812823164427e-06,
"loss": 1.3543,
"step": 1710
},
{
"epoch": 2.651694094869313,
"grad_norm": 0.7859067916870117,
"learning_rate": 5.79110651499483e-06,
"loss": 1.311,
"step": 1712
},
{
"epoch": 2.654791868344627,
"grad_norm": 0.9818124771118164,
"learning_rate": 5.739400206825233e-06,
"loss": 1.3261,
"step": 1714
},
{
"epoch": 2.657889641819942,
"grad_norm": 0.8855445981025696,
"learning_rate": 5.687693898655636e-06,
"loss": 1.3115,
"step": 1716
},
{
"epoch": 2.6609874152952564,
"grad_norm": 0.8744826316833496,
"learning_rate": 5.635987590486039e-06,
"loss": 1.3077,
"step": 1718
},
{
"epoch": 2.664085188770571,
"grad_norm": 0.8999655246734619,
"learning_rate": 5.584281282316443e-06,
"loss": 1.3253,
"step": 1720
},
{
"epoch": 2.667182962245886,
"grad_norm": 0.8045452833175659,
"learning_rate": 5.532574974146846e-06,
"loss": 1.3129,
"step": 1722
},
{
"epoch": 2.6702807357212004,
"grad_norm": 0.8236184120178223,
"learning_rate": 5.4808686659772495e-06,
"loss": 1.3116,
"step": 1724
},
{
"epoch": 2.6733785091965148,
"grad_norm": 0.8479505777359009,
"learning_rate": 5.429162357807653e-06,
"loss": 1.3516,
"step": 1726
},
{
"epoch": 2.6764762826718296,
"grad_norm": 0.9431778192520142,
"learning_rate": 5.377456049638056e-06,
"loss": 1.3424,
"step": 1728
},
{
"epoch": 2.6795740561471444,
"grad_norm": 0.9968345165252686,
"learning_rate": 5.32574974146846e-06,
"loss": 1.3224,
"step": 1730
},
{
"epoch": 2.6826718296224588,
"grad_norm": 0.9029082655906677,
"learning_rate": 5.274043433298863e-06,
"loss": 1.3157,
"step": 1732
},
{
"epoch": 2.6857696030977736,
"grad_norm": 1.0283854007720947,
"learning_rate": 5.222337125129266e-06,
"loss": 1.3191,
"step": 1734
},
{
"epoch": 2.688867376573088,
"grad_norm": 0.7591708898544312,
"learning_rate": 5.170630816959669e-06,
"loss": 1.3148,
"step": 1736
},
{
"epoch": 2.6919651500484028,
"grad_norm": 0.8233999609947205,
"learning_rate": 5.118924508790072e-06,
"loss": 1.323,
"step": 1738
},
{
"epoch": 2.695062923523717,
"grad_norm": 0.82486891746521,
"learning_rate": 5.067218200620476e-06,
"loss": 1.3123,
"step": 1740
},
{
"epoch": 2.698160696999032,
"grad_norm": 0.9283408522605896,
"learning_rate": 5.015511892450879e-06,
"loss": 1.3116,
"step": 1742
},
{
"epoch": 2.7012584704743468,
"grad_norm": 0.8742640614509583,
"learning_rate": 4.9638055842812824e-06,
"loss": 1.3175,
"step": 1744
},
{
"epoch": 2.704356243949661,
"grad_norm": 0.8373504877090454,
"learning_rate": 4.912099276111686e-06,
"loss": 1.3284,
"step": 1746
},
{
"epoch": 2.707454017424976,
"grad_norm": 0.9012168645858765,
"learning_rate": 4.860392967942089e-06,
"loss": 1.3171,
"step": 1748
},
{
"epoch": 2.7105517909002903,
"grad_norm": 0.9048004746437073,
"learning_rate": 4.8086866597724926e-06,
"loss": 1.3148,
"step": 1750
},
{
"epoch": 2.713649564375605,
"grad_norm": 0.8509101271629333,
"learning_rate": 4.756980351602896e-06,
"loss": 1.3238,
"step": 1752
},
{
"epoch": 2.7167473378509195,
"grad_norm": 0.8519226908683777,
"learning_rate": 4.705274043433299e-06,
"loss": 1.3065,
"step": 1754
},
{
"epoch": 2.7198451113262343,
"grad_norm": 0.869109034538269,
"learning_rate": 4.653567735263703e-06,
"loss": 1.3211,
"step": 1756
},
{
"epoch": 2.722942884801549,
"grad_norm": 0.9159611463546753,
"learning_rate": 4.601861427094105e-06,
"loss": 1.334,
"step": 1758
},
{
"epoch": 2.7260406582768635,
"grad_norm": 1.0725860595703125,
"learning_rate": 4.550155118924509e-06,
"loss": 1.33,
"step": 1760
},
{
"epoch": 2.729138431752178,
"grad_norm": 1.0192230939865112,
"learning_rate": 4.498448810754912e-06,
"loss": 1.3405,
"step": 1762
},
{
"epoch": 2.7322362052274927,
"grad_norm": 0.7872644066810608,
"learning_rate": 4.446742502585315e-06,
"loss": 1.3092,
"step": 1764
},
{
"epoch": 2.7353339787028075,
"grad_norm": 0.8949226140975952,
"learning_rate": 4.39503619441572e-06,
"loss": 1.339,
"step": 1766
},
{
"epoch": 2.738431752178122,
"grad_norm": 0.9832562208175659,
"learning_rate": 4.343329886246122e-06,
"loss": 1.2972,
"step": 1768
},
{
"epoch": 2.7415295256534367,
"grad_norm": 1.0453011989593506,
"learning_rate": 4.2916235780765255e-06,
"loss": 1.3298,
"step": 1770
},
{
"epoch": 2.7446272991287515,
"grad_norm": 0.9331648945808411,
"learning_rate": 4.239917269906929e-06,
"loss": 1.3314,
"step": 1772
},
{
"epoch": 2.747725072604066,
"grad_norm": 0.7941224575042725,
"learning_rate": 4.188210961737332e-06,
"loss": 1.3243,
"step": 1774
},
{
"epoch": 2.75082284607938,
"grad_norm": 0.7843746542930603,
"learning_rate": 4.136504653567736e-06,
"loss": 1.3183,
"step": 1776
},
{
"epoch": 2.753920619554695,
"grad_norm": 0.8948341608047485,
"learning_rate": 4.084798345398139e-06,
"loss": 1.3386,
"step": 1778
},
{
"epoch": 2.75701839303001,
"grad_norm": 0.8391401171684265,
"learning_rate": 4.0330920372285416e-06,
"loss": 1.3312,
"step": 1780
},
{
"epoch": 2.760116166505324,
"grad_norm": 0.7821982502937317,
"learning_rate": 3.981385729058945e-06,
"loss": 1.3155,
"step": 1782
},
{
"epoch": 2.763213939980639,
"grad_norm": 0.8525044322013855,
"learning_rate": 3.929679420889348e-06,
"loss": 1.3249,
"step": 1784
},
{
"epoch": 2.7663117134559534,
"grad_norm": 0.8398758172988892,
"learning_rate": 3.8779731127197525e-06,
"loss": 1.3264,
"step": 1786
},
{
"epoch": 2.769409486931268,
"grad_norm": 0.9518385529518127,
"learning_rate": 3.826266804550156e-06,
"loss": 1.3416,
"step": 1788
},
{
"epoch": 2.7725072604065826,
"grad_norm": 0.7718288898468018,
"learning_rate": 3.774560496380559e-06,
"loss": 1.3353,
"step": 1790
},
{
"epoch": 2.7756050338818974,
"grad_norm": 0.8064902424812317,
"learning_rate": 3.722854188210962e-06,
"loss": 1.3179,
"step": 1792
},
{
"epoch": 2.778702807357212,
"grad_norm": 0.9301968216896057,
"learning_rate": 3.671147880041365e-06,
"loss": 1.3467,
"step": 1794
},
{
"epoch": 2.7818005808325266,
"grad_norm": 0.8251471519470215,
"learning_rate": 3.6194415718717686e-06,
"loss": 1.3189,
"step": 1796
},
{
"epoch": 2.7848983543078414,
"grad_norm": 0.9618167877197266,
"learning_rate": 3.5677352637021715e-06,
"loss": 1.3134,
"step": 1798
},
{
"epoch": 2.7879961277831558,
"grad_norm": 0.8314012885093689,
"learning_rate": 3.516028955532575e-06,
"loss": 1.3318,
"step": 1800
},
{
"epoch": 2.7910939012584706,
"grad_norm": 0.7383018136024475,
"learning_rate": 3.4643226473629783e-06,
"loss": 1.3161,
"step": 1802
},
{
"epoch": 2.794191674733785,
"grad_norm": 0.7468191385269165,
"learning_rate": 3.4126163391933813e-06,
"loss": 1.3165,
"step": 1804
},
{
"epoch": 2.7972894482090997,
"grad_norm": 0.9003493785858154,
"learning_rate": 3.3609100310237855e-06,
"loss": 1.3431,
"step": 1806
},
{
"epoch": 2.8003872216844146,
"grad_norm": 0.9882494807243347,
"learning_rate": 3.3092037228541884e-06,
"loss": 1.347,
"step": 1808
},
{
"epoch": 2.803484995159729,
"grad_norm": 0.8110159039497375,
"learning_rate": 3.257497414684592e-06,
"loss": 1.3262,
"step": 1810
},
{
"epoch": 2.8065827686350433,
"grad_norm": 1.0101655721664429,
"learning_rate": 3.205791106514995e-06,
"loss": 1.3372,
"step": 1812
},
{
"epoch": 2.809680542110358,
"grad_norm": 0.8719637393951416,
"learning_rate": 3.154084798345398e-06,
"loss": 1.3051,
"step": 1814
},
{
"epoch": 2.812778315585673,
"grad_norm": 0.868802011013031,
"learning_rate": 3.1023784901758015e-06,
"loss": 1.3299,
"step": 1816
},
{
"epoch": 2.8158760890609873,
"grad_norm": 0.832973301410675,
"learning_rate": 3.050672182006205e-06,
"loss": 1.3182,
"step": 1818
},
{
"epoch": 2.818973862536302,
"grad_norm": 0.8184823393821716,
"learning_rate": 2.998965873836608e-06,
"loss": 1.3083,
"step": 1820
},
{
"epoch": 2.822071636011617,
"grad_norm": 0.909972071647644,
"learning_rate": 2.9472595656670117e-06,
"loss": 1.3224,
"step": 1822
},
{
"epoch": 2.8251694094869313,
"grad_norm": 0.7917800545692444,
"learning_rate": 2.895553257497415e-06,
"loss": 1.325,
"step": 1824
},
{
"epoch": 2.8282671829622457,
"grad_norm": 0.847621738910675,
"learning_rate": 2.843846949327818e-06,
"loss": 1.2891,
"step": 1826
},
{
"epoch": 2.8313649564375605,
"grad_norm": 0.8181946873664856,
"learning_rate": 2.7921406411582214e-06,
"loss": 1.3118,
"step": 1828
},
{
"epoch": 2.8344627299128753,
"grad_norm": 0.8904102444648743,
"learning_rate": 2.7404343329886247e-06,
"loss": 1.323,
"step": 1830
},
{
"epoch": 2.8375605033881897,
"grad_norm": 0.9051535129547119,
"learning_rate": 2.688728024819028e-06,
"loss": 1.3313,
"step": 1832
},
{
"epoch": 2.8406582768635045,
"grad_norm": 0.9703993201255798,
"learning_rate": 2.6370217166494315e-06,
"loss": 1.3269,
"step": 1834
},
{
"epoch": 2.843756050338819,
"grad_norm": 0.9004696011543274,
"learning_rate": 2.5853154084798345e-06,
"loss": 1.3368,
"step": 1836
},
{
"epoch": 2.8468538238141337,
"grad_norm": 0.8330061435699463,
"learning_rate": 2.533609100310238e-06,
"loss": 1.3121,
"step": 1838
},
{
"epoch": 2.849951597289448,
"grad_norm": 0.8227541446685791,
"learning_rate": 2.4819027921406412e-06,
"loss": 1.3146,
"step": 1840
},
{
"epoch": 2.853049370764763,
"grad_norm": 0.945093035697937,
"learning_rate": 2.4301964839710446e-06,
"loss": 1.3237,
"step": 1842
},
{
"epoch": 2.8561471442400777,
"grad_norm": 0.7945578694343567,
"learning_rate": 2.378490175801448e-06,
"loss": 1.3121,
"step": 1844
},
{
"epoch": 2.859244917715392,
"grad_norm": 0.7976880669593811,
"learning_rate": 2.3267838676318514e-06,
"loss": 1.3276,
"step": 1846
},
{
"epoch": 2.862342691190707,
"grad_norm": 0.8588410019874573,
"learning_rate": 2.2750775594622543e-06,
"loss": 1.3509,
"step": 1848
},
{
"epoch": 2.865440464666021,
"grad_norm": 0.8236850500106812,
"learning_rate": 2.2233712512926577e-06,
"loss": 1.3435,
"step": 1850
},
{
"epoch": 2.868538238141336,
"grad_norm": 1.0473836660385132,
"learning_rate": 2.171664943123061e-06,
"loss": 1.3452,
"step": 1852
},
{
"epoch": 2.8716360116166504,
"grad_norm": 0.8921092748641968,
"learning_rate": 2.1199586349534644e-06,
"loss": 1.3129,
"step": 1854
},
{
"epoch": 2.874733785091965,
"grad_norm": 0.9309847950935364,
"learning_rate": 2.068252326783868e-06,
"loss": 1.3445,
"step": 1856
},
{
"epoch": 2.87783155856728,
"grad_norm": 0.8526076078414917,
"learning_rate": 2.0165460186142708e-06,
"loss": 1.3123,
"step": 1858
},
{
"epoch": 2.8809293320425944,
"grad_norm": 0.8316435813903809,
"learning_rate": 1.964839710444674e-06,
"loss": 1.3425,
"step": 1860
},
{
"epoch": 2.8840271055179088,
"grad_norm": 0.7766702771186829,
"learning_rate": 1.913133402275078e-06,
"loss": 1.3024,
"step": 1862
},
{
"epoch": 2.8871248789932236,
"grad_norm": 0.7230123281478882,
"learning_rate": 1.861427094105481e-06,
"loss": 1.3261,
"step": 1864
},
{
"epoch": 2.8902226524685384,
"grad_norm": 0.9641100168228149,
"learning_rate": 1.8097207859358843e-06,
"loss": 1.3273,
"step": 1866
},
{
"epoch": 2.8933204259438527,
"grad_norm": 0.9102080464363098,
"learning_rate": 1.7580144777662875e-06,
"loss": 1.3343,
"step": 1868
},
{
"epoch": 2.8964181994191676,
"grad_norm": 0.8651391267776489,
"learning_rate": 1.7063081695966906e-06,
"loss": 1.3075,
"step": 1870
},
{
"epoch": 2.899515972894482,
"grad_norm": 0.7797631025314331,
"learning_rate": 1.6546018614270942e-06,
"loss": 1.3214,
"step": 1872
},
{
"epoch": 2.9026137463697967,
"grad_norm": 0.7178356051445007,
"learning_rate": 1.6028955532574976e-06,
"loss": 1.3413,
"step": 1874
},
{
"epoch": 2.905711519845111,
"grad_norm": 0.7826308608055115,
"learning_rate": 1.5511892450879008e-06,
"loss": 1.3227,
"step": 1876
},
{
"epoch": 2.908809293320426,
"grad_norm": 0.7831746339797974,
"learning_rate": 1.499482936918304e-06,
"loss": 1.3056,
"step": 1878
},
{
"epoch": 2.9119070667957407,
"grad_norm": 0.9170383214950562,
"learning_rate": 1.4477766287487075e-06,
"loss": 1.2875,
"step": 1880
},
{
"epoch": 2.915004840271055,
"grad_norm": 0.8851009607315063,
"learning_rate": 1.3960703205791107e-06,
"loss": 1.3318,
"step": 1882
},
{
"epoch": 2.91810261374637,
"grad_norm": 0.8773500323295593,
"learning_rate": 1.344364012409514e-06,
"loss": 1.3284,
"step": 1884
},
{
"epoch": 2.9212003872216843,
"grad_norm": 0.8349604606628418,
"learning_rate": 1.2926577042399172e-06,
"loss": 1.3202,
"step": 1886
},
{
"epoch": 2.924298160696999,
"grad_norm": 0.7429217100143433,
"learning_rate": 1.2409513960703206e-06,
"loss": 1.2914,
"step": 1888
},
{
"epoch": 2.9273959341723135,
"grad_norm": 0.8590971231460571,
"learning_rate": 1.189245087900724e-06,
"loss": 1.2972,
"step": 1890
},
{
"epoch": 2.9304937076476283,
"grad_norm": 0.7867938280105591,
"learning_rate": 1.1375387797311272e-06,
"loss": 1.3253,
"step": 1892
},
{
"epoch": 2.933591481122943,
"grad_norm": 0.8447002172470093,
"learning_rate": 1.0858324715615305e-06,
"loss": 1.3536,
"step": 1894
},
{
"epoch": 2.9366892545982575,
"grad_norm": 0.778304934501648,
"learning_rate": 1.034126163391934e-06,
"loss": 1.3141,
"step": 1896
},
{
"epoch": 2.9397870280735723,
"grad_norm": 0.9816215634346008,
"learning_rate": 9.82419855222337e-07,
"loss": 1.3013,
"step": 1898
},
{
"epoch": 2.9428848015488867,
"grad_norm": 0.7762349843978882,
"learning_rate": 9.307135470527405e-07,
"loss": 1.319,
"step": 1900
},
{
"epoch": 2.9459825750242015,
"grad_norm": 0.807734489440918,
"learning_rate": 8.790072388831437e-07,
"loss": 1.2891,
"step": 1902
},
{
"epoch": 2.949080348499516,
"grad_norm": 0.9189450144767761,
"learning_rate": 8.273009307135471e-07,
"loss": 1.314,
"step": 1904
},
{
"epoch": 2.9521781219748306,
"grad_norm": 0.8544689416885376,
"learning_rate": 7.755946225439504e-07,
"loss": 1.3229,
"step": 1906
},
{
"epoch": 2.9552758954501455,
"grad_norm": 0.7630512714385986,
"learning_rate": 7.238883143743538e-07,
"loss": 1.305,
"step": 1908
},
{
"epoch": 2.95837366892546,
"grad_norm": 0.770119845867157,
"learning_rate": 6.72182006204757e-07,
"loss": 1.3178,
"step": 1910
},
{
"epoch": 2.961471442400774,
"grad_norm": 0.7441233396530151,
"learning_rate": 6.204756980351603e-07,
"loss": 1.3228,
"step": 1912
},
{
"epoch": 2.964569215876089,
"grad_norm": 0.849139392375946,
"learning_rate": 5.687693898655636e-07,
"loss": 1.322,
"step": 1914
},
{
"epoch": 2.967666989351404,
"grad_norm": 0.976645290851593,
"learning_rate": 5.17063081695967e-07,
"loss": 1.3295,
"step": 1916
},
{
"epoch": 2.970764762826718,
"grad_norm": 0.8053218722343445,
"learning_rate": 4.6535677352637023e-07,
"loss": 1.3309,
"step": 1918
},
{
"epoch": 2.973862536302033,
"grad_norm": 0.824623167514801,
"learning_rate": 4.1365046535677355e-07,
"loss": 1.3316,
"step": 1920
},
{
"epoch": 2.9769603097773474,
"grad_norm": 0.8549762964248657,
"learning_rate": 3.619441571871769e-07,
"loss": 1.3427,
"step": 1922
},
{
"epoch": 2.980058083252662,
"grad_norm": 0.8293663859367371,
"learning_rate": 3.1023784901758015e-07,
"loss": 1.321,
"step": 1924
},
{
"epoch": 2.9831558567279766,
"grad_norm": 0.810080885887146,
"learning_rate": 2.585315408479835e-07,
"loss": 1.3027,
"step": 1926
},
{
"epoch": 2.9862536302032914,
"grad_norm": 0.863522469997406,
"learning_rate": 2.0682523267838678e-07,
"loss": 1.3436,
"step": 1928
},
{
"epoch": 2.989351403678606,
"grad_norm": 0.9304227828979492,
"learning_rate": 1.5511892450879008e-07,
"loss": 1.3263,
"step": 1930
},
{
"epoch": 2.9924491771539206,
"grad_norm": 0.8415968418121338,
"learning_rate": 1.0341261633919339e-07,
"loss": 1.309,
"step": 1932
},
{
"epoch": 2.9955469506292354,
"grad_norm": 0.9068368673324585,
"learning_rate": 5.1706308169596694e-08,
"loss": 1.3331,
"step": 1934
},
{
"epoch": 2.9970958373668926,
"step": 1935,
"total_flos": 9.200823520827802e+17,
"train_loss": 1.443482890055161,
"train_runtime": 51287.5168,
"train_samples_per_second": 9.668,
"train_steps_per_second": 0.038
},
{
"epoch": 2.9970958373668926,
"eval_loss": 1.3220494985580444,
"eval_runtime": 499.8645,
"eval_samples_per_second": 41.331,
"eval_steps_per_second": 5.167,
"step": 1935
},
{
"epoch": 2.9970958373668926,
"eval_loss": 1.3218390941619873,
"eval_runtime": 500.7224,
"eval_samples_per_second": 41.26,
"eval_steps_per_second": 5.159,
"step": 1935
}
],
"logging_steps": 2,
"max_steps": 1935,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.200823520827802e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}