9b-36 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
f5b0d62 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1410,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00425531914893617,
"grad_norm": 0.9765625,
"learning_rate": 1.6901408450704225e-07,
"loss": 1.472063422203064,
"step": 2
},
{
"epoch": 0.00851063829787234,
"grad_norm": 0.94140625,
"learning_rate": 5.070422535211268e-07,
"loss": 1.9255280494689941,
"step": 4
},
{
"epoch": 0.01276595744680851,
"grad_norm": 1.9296875,
"learning_rate": 8.450704225352114e-07,
"loss": 1.9632502794265747,
"step": 6
},
{
"epoch": 0.01702127659574468,
"grad_norm": 1.34375,
"learning_rate": 1.1830985915492958e-06,
"loss": 1.6374425888061523,
"step": 8
},
{
"epoch": 0.02127659574468085,
"grad_norm": 1.1953125,
"learning_rate": 1.5211267605633803e-06,
"loss": 1.959162950515747,
"step": 10
},
{
"epoch": 0.02553191489361702,
"grad_norm": 0.9140625,
"learning_rate": 1.8591549295774647e-06,
"loss": 1.4726247787475586,
"step": 12
},
{
"epoch": 0.029787234042553193,
"grad_norm": 1.765625,
"learning_rate": 2.1971830985915494e-06,
"loss": 2.0769670009613037,
"step": 14
},
{
"epoch": 0.03404255319148936,
"grad_norm": 0.953125,
"learning_rate": 2.535211267605634e-06,
"loss": 1.9118707180023193,
"step": 16
},
{
"epoch": 0.03829787234042553,
"grad_norm": 1.171875,
"learning_rate": 2.8732394366197183e-06,
"loss": 1.7807828187942505,
"step": 18
},
{
"epoch": 0.0425531914893617,
"grad_norm": 1.5390625,
"learning_rate": 3.2112676056338028e-06,
"loss": 1.9391090869903564,
"step": 20
},
{
"epoch": 0.04680851063829787,
"grad_norm": 0.72265625,
"learning_rate": 3.549295774647887e-06,
"loss": 1.6522634029388428,
"step": 22
},
{
"epoch": 0.05106382978723404,
"grad_norm": 1.1328125,
"learning_rate": 3.887323943661972e-06,
"loss": 1.692237138748169,
"step": 24
},
{
"epoch": 0.05531914893617021,
"grad_norm": 1.1015625,
"learning_rate": 4.225352112676056e-06,
"loss": 1.443329930305481,
"step": 26
},
{
"epoch": 0.059574468085106386,
"grad_norm": 1.0625,
"learning_rate": 4.563380281690141e-06,
"loss": 1.758739948272705,
"step": 28
},
{
"epoch": 0.06382978723404255,
"grad_norm": 0.8515625,
"learning_rate": 4.901408450704226e-06,
"loss": 1.6877835988998413,
"step": 30
},
{
"epoch": 0.06808510638297872,
"grad_norm": 1.5078125,
"learning_rate": 5.2394366197183095e-06,
"loss": 1.468690037727356,
"step": 32
},
{
"epoch": 0.07234042553191489,
"grad_norm": 0.64453125,
"learning_rate": 5.577464788732395e-06,
"loss": 1.6828500032424927,
"step": 34
},
{
"epoch": 0.07659574468085106,
"grad_norm": 1.0546875,
"learning_rate": 5.915492957746479e-06,
"loss": 1.6752516031265259,
"step": 36
},
{
"epoch": 0.08085106382978724,
"grad_norm": 2.984375,
"learning_rate": 6.253521126760563e-06,
"loss": 1.9001795053482056,
"step": 38
},
{
"epoch": 0.0851063829787234,
"grad_norm": 0.9296875,
"learning_rate": 6.591549295774649e-06,
"loss": 1.6414787769317627,
"step": 40
},
{
"epoch": 0.08936170212765958,
"grad_norm": 0.8828125,
"learning_rate": 6.929577464788733e-06,
"loss": 1.3303271532058716,
"step": 42
},
{
"epoch": 0.09361702127659574,
"grad_norm": 0.80078125,
"learning_rate": 7.267605633802817e-06,
"loss": 1.5457786321640015,
"step": 44
},
{
"epoch": 0.09787234042553192,
"grad_norm": 0.54296875,
"learning_rate": 7.605633802816902e-06,
"loss": 1.4271644353866577,
"step": 46
},
{
"epoch": 0.10212765957446808,
"grad_norm": 0.6484375,
"learning_rate": 7.943661971830987e-06,
"loss": 1.5979524850845337,
"step": 48
},
{
"epoch": 0.10638297872340426,
"grad_norm": 1.0234375,
"learning_rate": 8.28169014084507e-06,
"loss": 1.6684672832489014,
"step": 50
},
{
"epoch": 0.11063829787234042,
"grad_norm": 0.7109375,
"learning_rate": 8.619718309859156e-06,
"loss": 1.3746291399002075,
"step": 52
},
{
"epoch": 0.1148936170212766,
"grad_norm": 0.5546875,
"learning_rate": 8.95774647887324e-06,
"loss": 1.4159908294677734,
"step": 54
},
{
"epoch": 0.11914893617021277,
"grad_norm": 0.58984375,
"learning_rate": 9.295774647887323e-06,
"loss": 1.2559518814086914,
"step": 56
},
{
"epoch": 0.12340425531914893,
"grad_norm": 0.6484375,
"learning_rate": 9.63380281690141e-06,
"loss": 1.4071341753005981,
"step": 58
},
{
"epoch": 0.1276595744680851,
"grad_norm": 0.74609375,
"learning_rate": 9.971830985915494e-06,
"loss": 1.325224757194519,
"step": 60
},
{
"epoch": 0.13191489361702127,
"grad_norm": 0.94921875,
"learning_rate": 1.0309859154929577e-05,
"loss": 1.2854632139205933,
"step": 62
},
{
"epoch": 0.13617021276595745,
"grad_norm": 0.87890625,
"learning_rate": 1.0647887323943662e-05,
"loss": 1.0856443643569946,
"step": 64
},
{
"epoch": 0.14042553191489363,
"grad_norm": 0.4296875,
"learning_rate": 1.0985915492957746e-05,
"loss": 1.4248228073120117,
"step": 66
},
{
"epoch": 0.14468085106382977,
"grad_norm": 0.50390625,
"learning_rate": 1.1323943661971831e-05,
"loss": 1.3485311269760132,
"step": 68
},
{
"epoch": 0.14893617021276595,
"grad_norm": 0.6328125,
"learning_rate": 1.1661971830985917e-05,
"loss": 1.3656905889511108,
"step": 70
},
{
"epoch": 0.15319148936170213,
"grad_norm": 0.6171875,
"learning_rate": 1.2e-05,
"loss": 1.4325069189071655,
"step": 72
},
{
"epoch": 0.1574468085106383,
"grad_norm": 0.69921875,
"learning_rate": 1.1999947154376356e-05,
"loss": 1.541415810585022,
"step": 74
},
{
"epoch": 0.16170212765957448,
"grad_norm": 0.53515625,
"learning_rate": 1.199978861866902e-05,
"loss": 1.385392665863037,
"step": 76
},
{
"epoch": 0.16595744680851063,
"grad_norm": 0.609375,
"learning_rate": 1.19995243963688e-05,
"loss": 1.2694331407546997,
"step": 78
},
{
"epoch": 0.1702127659574468,
"grad_norm": 1.125,
"learning_rate": 1.1999154493293607e-05,
"loss": 1.5753132104873657,
"step": 80
},
{
"epoch": 0.17446808510638298,
"grad_norm": 0.546875,
"learning_rate": 1.1998678917588341e-05,
"loss": 1.2333686351776123,
"step": 82
},
{
"epoch": 0.17872340425531916,
"grad_norm": 0.63671875,
"learning_rate": 1.1998097679724704e-05,
"loss": 1.2822571992874146,
"step": 84
},
{
"epoch": 0.1829787234042553,
"grad_norm": 0.5546875,
"learning_rate": 1.1997410792500985e-05,
"loss": 1.3188749551773071,
"step": 86
},
{
"epoch": 0.18723404255319148,
"grad_norm": 0.71875,
"learning_rate": 1.1996618271041757e-05,
"loss": 1.3399384021759033,
"step": 88
},
{
"epoch": 0.19148936170212766,
"grad_norm": 0.75390625,
"learning_rate": 1.1995720132797555e-05,
"loss": 1.3193027973175049,
"step": 90
},
{
"epoch": 0.19574468085106383,
"grad_norm": 0.8515625,
"learning_rate": 1.1994716397544498e-05,
"loss": 1.30392324924469,
"step": 92
},
{
"epoch": 0.2,
"grad_norm": 0.45703125,
"learning_rate": 1.1993607087383841e-05,
"loss": 1.1891350746154785,
"step": 94
},
{
"epoch": 0.20425531914893616,
"grad_norm": 0.51171875,
"learning_rate": 1.1992392226741494e-05,
"loss": 1.2335644960403442,
"step": 96
},
{
"epoch": 0.20851063829787234,
"grad_norm": 0.48046875,
"learning_rate": 1.1991071842367492e-05,
"loss": 1.3029327392578125,
"step": 98
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.55859375,
"learning_rate": 1.1989645963335381e-05,
"loss": 1.2645999193191528,
"step": 100
},
{
"epoch": 0.2170212765957447,
"grad_norm": 0.83203125,
"learning_rate": 1.1988114621041614e-05,
"loss": 1.2268767356872559,
"step": 102
},
{
"epoch": 0.22127659574468084,
"grad_norm": 0.90234375,
"learning_rate": 1.1986477849204828e-05,
"loss": 1.1907193660736084,
"step": 104
},
{
"epoch": 0.225531914893617,
"grad_norm": 0.64453125,
"learning_rate": 1.1984735683865123e-05,
"loss": 1.31586754322052,
"step": 106
},
{
"epoch": 0.2297872340425532,
"grad_norm": 1.0703125,
"learning_rate": 1.1982888163383247e-05,
"loss": 1.299729347229004,
"step": 108
},
{
"epoch": 0.23404255319148937,
"grad_norm": 1.84375,
"learning_rate": 1.1980935328439775e-05,
"loss": 1.708440899848938,
"step": 110
},
{
"epoch": 0.23829787234042554,
"grad_norm": 0.73828125,
"learning_rate": 1.1978877222034202e-05,
"loss": 1.2829785346984863,
"step": 112
},
{
"epoch": 0.2425531914893617,
"grad_norm": 0.68359375,
"learning_rate": 1.197671388948399e-05,
"loss": 1.272111415863037,
"step": 114
},
{
"epoch": 0.24680851063829787,
"grad_norm": 0.421875,
"learning_rate": 1.1974445378423578e-05,
"loss": 1.3535809516906738,
"step": 116
},
{
"epoch": 0.251063829787234,
"grad_norm": 0.85546875,
"learning_rate": 1.1972071738803339e-05,
"loss": 1.2550489902496338,
"step": 118
},
{
"epoch": 0.2553191489361702,
"grad_norm": 1.0859375,
"learning_rate": 1.1969593022888462e-05,
"loss": 1.2029892206192017,
"step": 120
},
{
"epoch": 0.25957446808510637,
"grad_norm": 0.80078125,
"learning_rate": 1.1967009285257822e-05,
"loss": 1.0597739219665527,
"step": 122
},
{
"epoch": 0.26382978723404255,
"grad_norm": 0.5546875,
"learning_rate": 1.1964320582802759e-05,
"loss": 1.2965384721755981,
"step": 124
},
{
"epoch": 0.2680851063829787,
"grad_norm": 0.3515625,
"learning_rate": 1.196152697472584e-05,
"loss": 1.3368679285049438,
"step": 126
},
{
"epoch": 0.2723404255319149,
"grad_norm": 0.79296875,
"learning_rate": 1.1958628522539549e-05,
"loss": 1.3335758447647095,
"step": 128
},
{
"epoch": 0.2765957446808511,
"grad_norm": 0.470703125,
"learning_rate": 1.1955625290064935e-05,
"loss": 1.3016529083251953,
"step": 130
},
{
"epoch": 0.28085106382978725,
"grad_norm": 0.8671875,
"learning_rate": 1.1952517343430199e-05,
"loss": 1.253875494003296,
"step": 132
},
{
"epoch": 0.2851063829787234,
"grad_norm": 0.478515625,
"learning_rate": 1.1949304751069256e-05,
"loss": 1.2634450197219849,
"step": 134
},
{
"epoch": 0.28936170212765955,
"grad_norm": 1.2109375,
"learning_rate": 1.1945987583720202e-05,
"loss": 1.294474482536316,
"step": 136
},
{
"epoch": 0.2936170212765957,
"grad_norm": 0.85546875,
"learning_rate": 1.194256591442378e-05,
"loss": 1.2694545984268188,
"step": 138
},
{
"epoch": 0.2978723404255319,
"grad_norm": 0.484375,
"learning_rate": 1.1939039818521758e-05,
"loss": 1.4072679281234741,
"step": 140
},
{
"epoch": 0.3021276595744681,
"grad_norm": 0.58203125,
"learning_rate": 1.1935409373655282e-05,
"loss": 1.3019527196884155,
"step": 142
},
{
"epoch": 0.30638297872340425,
"grad_norm": 1.125,
"learning_rate": 1.1931674659763148e-05,
"loss": 1.4703279733657837,
"step": 144
},
{
"epoch": 0.31063829787234043,
"grad_norm": 1.453125,
"learning_rate": 1.1927835759080058e-05,
"loss": 1.1757651567459106,
"step": 146
},
{
"epoch": 0.3148936170212766,
"grad_norm": 0.58203125,
"learning_rate": 1.1923892756134807e-05,
"loss": 1.2418992519378662,
"step": 148
},
{
"epoch": 0.3191489361702128,
"grad_norm": 0.55078125,
"learning_rate": 1.1919845737748413e-05,
"loss": 1.1974143981933594,
"step": 150
},
{
"epoch": 0.32340425531914896,
"grad_norm": 0.8359375,
"learning_rate": 1.1915694793032215e-05,
"loss": 1.3293455839157104,
"step": 152
},
{
"epoch": 0.3276595744680851,
"grad_norm": 0.66015625,
"learning_rate": 1.1911440013385906e-05,
"loss": 1.1985448598861694,
"step": 154
},
{
"epoch": 0.33191489361702126,
"grad_norm": 0.46875,
"learning_rate": 1.1907081492495521e-05,
"loss": 1.2568351030349731,
"step": 156
},
{
"epoch": 0.33617021276595743,
"grad_norm": 0.453125,
"learning_rate": 1.1902619326331371e-05,
"loss": 1.2663094997406006,
"step": 158
},
{
"epoch": 0.3404255319148936,
"grad_norm": 0.384765625,
"learning_rate": 1.1898053613145944e-05,
"loss": 1.1971551179885864,
"step": 160
},
{
"epoch": 0.3446808510638298,
"grad_norm": 0.75390625,
"learning_rate": 1.1893384453471717e-05,
"loss": 1.2108319997787476,
"step": 162
},
{
"epoch": 0.34893617021276596,
"grad_norm": 0.423828125,
"learning_rate": 1.1888611950118964e-05,
"loss": 1.2176121473312378,
"step": 164
},
{
"epoch": 0.35319148936170214,
"grad_norm": 0.546875,
"learning_rate": 1.188373620817349e-05,
"loss": 1.2852199077606201,
"step": 166
},
{
"epoch": 0.3574468085106383,
"grad_norm": 0.90234375,
"learning_rate": 1.1878757334994293e-05,
"loss": 1.137981653213501,
"step": 168
},
{
"epoch": 0.3617021276595745,
"grad_norm": 0.490234375,
"learning_rate": 1.1873675440211238e-05,
"loss": 1.2986195087432861,
"step": 170
},
{
"epoch": 0.3659574468085106,
"grad_norm": 0.54296875,
"learning_rate": 1.1868490635722617e-05,
"loss": 1.2511855363845825,
"step": 172
},
{
"epoch": 0.3702127659574468,
"grad_norm": 0.365234375,
"learning_rate": 1.186320303569269e-05,
"loss": 1.2008732557296753,
"step": 174
},
{
"epoch": 0.37446808510638296,
"grad_norm": 0.58203125,
"learning_rate": 1.185781275654917e-05,
"loss": 1.3959091901779175,
"step": 176
},
{
"epoch": 0.37872340425531914,
"grad_norm": 0.53125,
"learning_rate": 1.1852319916980676e-05,
"loss": 1.3956475257873535,
"step": 178
},
{
"epoch": 0.3829787234042553,
"grad_norm": 0.63671875,
"learning_rate": 1.1846724637934086e-05,
"loss": 1.1432154178619385,
"step": 180
},
{
"epoch": 0.3872340425531915,
"grad_norm": 0.4140625,
"learning_rate": 1.184102704261191e-05,
"loss": 1.198095679283142,
"step": 182
},
{
"epoch": 0.39148936170212767,
"grad_norm": 0.52734375,
"learning_rate": 1.1835227256469556e-05,
"loss": 1.126910924911499,
"step": 184
},
{
"epoch": 0.39574468085106385,
"grad_norm": 0.546875,
"learning_rate": 1.1829325407212569e-05,
"loss": 1.340002179145813,
"step": 186
},
{
"epoch": 0.4,
"grad_norm": 0.56640625,
"learning_rate": 1.1823321624793831e-05,
"loss": 1.2044755220413208,
"step": 188
},
{
"epoch": 0.40425531914893614,
"grad_norm": 0.42578125,
"learning_rate": 1.1817216041410678e-05,
"loss": 1.1999846696853638,
"step": 190
},
{
"epoch": 0.4085106382978723,
"grad_norm": 0.94140625,
"learning_rate": 1.181100879150202e-05,
"loss": 1.2849934101104736,
"step": 192
},
{
"epoch": 0.4127659574468085,
"grad_norm": 0.3671875,
"learning_rate": 1.180470001174535e-05,
"loss": 1.3638895750045776,
"step": 194
},
{
"epoch": 0.41702127659574467,
"grad_norm": 0.5390625,
"learning_rate": 1.179828984105375e-05,
"loss": 1.2097505331039429,
"step": 196
},
{
"epoch": 0.42127659574468085,
"grad_norm": 1.140625,
"learning_rate": 1.1791778420572834e-05,
"loss": 1.2969235181808472,
"step": 198
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.70703125,
"learning_rate": 1.1785165893677632e-05,
"loss": 1.3074672222137451,
"step": 200
},
{
"epoch": 0.4297872340425532,
"grad_norm": 0.57421875,
"learning_rate": 1.1778452405969437e-05,
"loss": 1.2175475358963013,
"step": 202
},
{
"epoch": 0.4340425531914894,
"grad_norm": 1.0859375,
"learning_rate": 1.1771638105272605e-05,
"loss": 1.1964837312698364,
"step": 204
},
{
"epoch": 0.43829787234042555,
"grad_norm": 0.455078125,
"learning_rate": 1.176472314163129e-05,
"loss": 1.2108904123306274,
"step": 206
},
{
"epoch": 0.4425531914893617,
"grad_norm": 0.4609375,
"learning_rate": 1.1757707667306142e-05,
"loss": 1.2564092874526978,
"step": 208
},
{
"epoch": 0.44680851063829785,
"grad_norm": 0.625,
"learning_rate": 1.1750591836770963e-05,
"loss": 1.2397825717926025,
"step": 210
},
{
"epoch": 0.451063829787234,
"grad_norm": 0.58984375,
"learning_rate": 1.1743375806709292e-05,
"loss": 1.141276478767395,
"step": 212
},
{
"epoch": 0.4553191489361702,
"grad_norm": 0.43359375,
"learning_rate": 1.1736059736010964e-05,
"loss": 1.2472527027130127,
"step": 214
},
{
"epoch": 0.4595744680851064,
"grad_norm": 0.4921875,
"learning_rate": 1.1728643785768619e-05,
"loss": 1.2373621463775635,
"step": 216
},
{
"epoch": 0.46382978723404256,
"grad_norm": 0.578125,
"learning_rate": 1.1721128119274132e-05,
"loss": 1.3174031972885132,
"step": 218
},
{
"epoch": 0.46808510638297873,
"grad_norm": 1.0390625,
"learning_rate": 1.171351290201504e-05,
"loss": 1.4028608798980713,
"step": 220
},
{
"epoch": 0.4723404255319149,
"grad_norm": 0.75390625,
"learning_rate": 1.170579830167089e-05,
"loss": 1.2434858083724976,
"step": 222
},
{
"epoch": 0.4765957446808511,
"grad_norm": 0.60546875,
"learning_rate": 1.1697984488109536e-05,
"loss": 1.2289927005767822,
"step": 224
},
{
"epoch": 0.4808510638297872,
"grad_norm": 0.43359375,
"learning_rate": 1.1690071633383422e-05,
"loss": 1.1950970888137817,
"step": 226
},
{
"epoch": 0.4851063829787234,
"grad_norm": 0.423828125,
"learning_rate": 1.168205991172577e-05,
"loss": 1.398798942565918,
"step": 228
},
{
"epoch": 0.48936170212765956,
"grad_norm": 0.50390625,
"learning_rate": 1.1673949499546763e-05,
"loss": 1.2393437623977661,
"step": 230
},
{
"epoch": 0.49361702127659574,
"grad_norm": 0.75,
"learning_rate": 1.166574057542964e-05,
"loss": 1.2385178804397583,
"step": 232
},
{
"epoch": 0.4978723404255319,
"grad_norm": 0.6875,
"learning_rate": 1.165743332012679e-05,
"loss": 1.4011635780334473,
"step": 234
},
{
"epoch": 0.502127659574468,
"grad_norm": 0.52734375,
"learning_rate": 1.1649027916555742e-05,
"loss": 1.2445231676101685,
"step": 236
},
{
"epoch": 0.5063829787234042,
"grad_norm": 0.53515625,
"learning_rate": 1.1640524549795163e-05,
"loss": 1.2868069410324097,
"step": 238
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.369140625,
"learning_rate": 1.1631923407080772e-05,
"loss": 1.3375487327575684,
"step": 240
},
{
"epoch": 0.5148936170212766,
"grad_norm": 0.62109375,
"learning_rate": 1.1623224677801212e-05,
"loss": 1.109569787979126,
"step": 242
},
{
"epoch": 0.5191489361702127,
"grad_norm": 2.34375,
"learning_rate": 1.1614428553493886e-05,
"loss": 1.1656110286712646,
"step": 244
},
{
"epoch": 0.5234042553191489,
"grad_norm": 0.95703125,
"learning_rate": 1.160553522784075e-05,
"loss": 1.159610629081726,
"step": 246
},
{
"epoch": 0.5276595744680851,
"grad_norm": 0.474609375,
"learning_rate": 1.1596544896664021e-05,
"loss": 1.24387788772583,
"step": 248
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.408203125,
"learning_rate": 1.1587457757921896e-05,
"loss": 1.324474811553955,
"step": 250
},
{
"epoch": 0.5361702127659574,
"grad_norm": 0.4296875,
"learning_rate": 1.1578274011704169e-05,
"loss": 1.4482465982437134,
"step": 252
},
{
"epoch": 0.5404255319148936,
"grad_norm": 0.88671875,
"learning_rate": 1.1568993860227838e-05,
"loss": 1.425924301147461,
"step": 254
},
{
"epoch": 0.5446808510638298,
"grad_norm": 1.6796875,
"learning_rate": 1.155961750783265e-05,
"loss": 1.6282589435577393,
"step": 256
},
{
"epoch": 0.548936170212766,
"grad_norm": 0.68359375,
"learning_rate": 1.1550145160976607e-05,
"loss": 1.294546127319336,
"step": 258
},
{
"epoch": 0.5531914893617021,
"grad_norm": 0.416015625,
"learning_rate": 1.1540577028231398e-05,
"loss": 1.2809118032455444,
"step": 260
},
{
"epoch": 0.5574468085106383,
"grad_norm": 1.0546875,
"learning_rate": 1.1530913320277837e-05,
"loss": 1.2208646535873413,
"step": 262
},
{
"epoch": 0.5617021276595745,
"grad_norm": 0.55859375,
"learning_rate": 1.1521154249901204e-05,
"loss": 1.2243047952651978,
"step": 264
},
{
"epoch": 0.5659574468085107,
"grad_norm": 0.640625,
"learning_rate": 1.1511300031986567e-05,
"loss": 1.325520634651184,
"step": 266
},
{
"epoch": 0.5702127659574469,
"grad_norm": 0.50390625,
"learning_rate": 1.1501350883514048e-05,
"loss": 1.1810495853424072,
"step": 268
},
{
"epoch": 0.574468085106383,
"grad_norm": 0.9140625,
"learning_rate": 1.149130702355404e-05,
"loss": 1.360308289527893,
"step": 270
},
{
"epoch": 0.5787234042553191,
"grad_norm": 0.5859375,
"learning_rate": 1.14811686732624e-05,
"loss": 1.2189104557037354,
"step": 272
},
{
"epoch": 0.5829787234042553,
"grad_norm": 0.66015625,
"learning_rate": 1.1470936055875562e-05,
"loss": 1.3855215311050415,
"step": 274
},
{
"epoch": 0.5872340425531914,
"grad_norm": 0.44140625,
"learning_rate": 1.1460609396705629e-05,
"loss": 1.239030361175537,
"step": 276
},
{
"epoch": 0.5914893617021276,
"grad_norm": 0.51171875,
"learning_rate": 1.1450188923135407e-05,
"loss": 1.2763073444366455,
"step": 278
},
{
"epoch": 0.5957446808510638,
"grad_norm": 1.6796875,
"learning_rate": 1.1439674864613413e-05,
"loss": 1.1475056409835815,
"step": 280
},
{
"epoch": 0.6,
"grad_norm": 0.91015625,
"learning_rate": 1.14290674526488e-05,
"loss": 1.3000105619430542,
"step": 282
},
{
"epoch": 0.6042553191489362,
"grad_norm": 0.59765625,
"learning_rate": 1.1418366920806277e-05,
"loss": 1.2847286462783813,
"step": 284
},
{
"epoch": 0.6085106382978723,
"grad_norm": 0.328125,
"learning_rate": 1.1407573504700965e-05,
"loss": 1.2533907890319824,
"step": 286
},
{
"epoch": 0.6127659574468085,
"grad_norm": 0.447265625,
"learning_rate": 1.1396687441993191e-05,
"loss": 1.092968463897705,
"step": 288
},
{
"epoch": 0.6170212765957447,
"grad_norm": 0.3984375,
"learning_rate": 1.1385708972383283e-05,
"loss": 1.4811941385269165,
"step": 290
},
{
"epoch": 0.6212765957446809,
"grad_norm": 0.5234375,
"learning_rate": 1.1374638337606272e-05,
"loss": 1.2241995334625244,
"step": 292
},
{
"epoch": 0.625531914893617,
"grad_norm": 0.39453125,
"learning_rate": 1.1363475781426572e-05,
"loss": 1.273016095161438,
"step": 294
},
{
"epoch": 0.6297872340425532,
"grad_norm": 0.486328125,
"learning_rate": 1.1352221549632619e-05,
"loss": 1.3111282587051392,
"step": 296
},
{
"epoch": 0.6340425531914894,
"grad_norm": 0.369140625,
"learning_rate": 1.134087589003145e-05,
"loss": 1.2370787858963013,
"step": 298
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.9609375,
"learning_rate": 1.132943905244326e-05,
"loss": 1.2171998023986816,
"step": 300
},
{
"epoch": 0.6425531914893617,
"grad_norm": 0.61328125,
"learning_rate": 1.1317911288695888e-05,
"loss": 1.3028873205184937,
"step": 302
},
{
"epoch": 0.6468085106382979,
"grad_norm": 0.388671875,
"learning_rate": 1.1306292852619274e-05,
"loss": 1.2210191488265991,
"step": 304
},
{
"epoch": 0.6510638297872341,
"grad_norm": 0.51953125,
"learning_rate": 1.129458400003988e-05,
"loss": 1.2221373319625854,
"step": 306
},
{
"epoch": 0.6553191489361702,
"grad_norm": 0.7109375,
"learning_rate": 1.1282784988775045e-05,
"loss": 1.236470341682434,
"step": 308
},
{
"epoch": 0.6595744680851063,
"grad_norm": 0.390625,
"learning_rate": 1.1270896078627315e-05,
"loss": 1.0521761178970337,
"step": 310
},
{
"epoch": 0.6638297872340425,
"grad_norm": 0.859375,
"learning_rate": 1.125891753137872e-05,
"loss": 1.1648889780044556,
"step": 312
},
{
"epoch": 0.6680851063829787,
"grad_norm": 0.484375,
"learning_rate": 1.1246849610785009e-05,
"loss": 1.2399919033050537,
"step": 314
},
{
"epoch": 0.6723404255319149,
"grad_norm": 0.69140625,
"learning_rate": 1.1234692582569843e-05,
"loss": 1.2077488899230957,
"step": 316
},
{
"epoch": 0.676595744680851,
"grad_norm": 1.234375,
"learning_rate": 1.1222446714418947e-05,
"loss": 1.4379267692565918,
"step": 318
},
{
"epoch": 0.6808510638297872,
"grad_norm": 0.349609375,
"learning_rate": 1.1210112275974216e-05,
"loss": 1.2180498838424683,
"step": 320
},
{
"epoch": 0.6851063829787234,
"grad_norm": 0.49609375,
"learning_rate": 1.1197689538827766e-05,
"loss": 1.190024971961975,
"step": 322
},
{
"epoch": 0.6893617021276596,
"grad_norm": 0.78515625,
"learning_rate": 1.1185178776515973e-05,
"loss": 1.2704949378967285,
"step": 324
},
{
"epoch": 0.6936170212765957,
"grad_norm": 0.50390625,
"learning_rate": 1.1172580264513435e-05,
"loss": 1.2116349935531616,
"step": 326
},
{
"epoch": 0.6978723404255319,
"grad_norm": 1.015625,
"learning_rate": 1.1159894280226908e-05,
"loss": 1.4247322082519531,
"step": 328
},
{
"epoch": 0.7021276595744681,
"grad_norm": 0.76171875,
"learning_rate": 1.114712110298921e-05,
"loss": 1.222773551940918,
"step": 330
},
{
"epoch": 0.7063829787234043,
"grad_norm": 0.443359375,
"learning_rate": 1.1134261014053054e-05,
"loss": 1.2406312227249146,
"step": 332
},
{
"epoch": 0.7106382978723405,
"grad_norm": 0.41015625,
"learning_rate": 1.1121314296584864e-05,
"loss": 1.1038767099380493,
"step": 334
},
{
"epoch": 0.7148936170212766,
"grad_norm": 0.96875,
"learning_rate": 1.1108281235658543e-05,
"loss": 1.2219905853271484,
"step": 336
},
{
"epoch": 0.7191489361702128,
"grad_norm": 0.71484375,
"learning_rate": 1.1095162118249182e-05,
"loss": 1.2996376752853394,
"step": 338
},
{
"epoch": 0.723404255319149,
"grad_norm": 0.69140625,
"learning_rate": 1.1081957233226762e-05,
"loss": 1.2108495235443115,
"step": 340
},
{
"epoch": 0.7276595744680852,
"grad_norm": 4.40625,
"learning_rate": 1.1068666871349777e-05,
"loss": 1.1036784648895264,
"step": 342
},
{
"epoch": 0.7319148936170212,
"grad_norm": 0.75,
"learning_rate": 1.1055291325258833e-05,
"loss": 1.1888855695724487,
"step": 344
},
{
"epoch": 0.7361702127659574,
"grad_norm": 2.484375,
"learning_rate": 1.1041830889470211e-05,
"loss": 1.2789053916931152,
"step": 346
},
{
"epoch": 0.7404255319148936,
"grad_norm": 0.66015625,
"learning_rate": 1.1028285860369379e-05,
"loss": 1.2360132932662964,
"step": 348
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.6875,
"learning_rate": 1.1014656536204471e-05,
"loss": 1.271801233291626,
"step": 350
},
{
"epoch": 0.7489361702127659,
"grad_norm": 0.333984375,
"learning_rate": 1.1000943217079704e-05,
"loss": 1.177423119544983,
"step": 352
},
{
"epoch": 0.7531914893617021,
"grad_norm": 0.431640625,
"learning_rate": 1.098714620494879e-05,
"loss": 1.1379421949386597,
"step": 354
},
{
"epoch": 0.7574468085106383,
"grad_norm": 0.462890625,
"learning_rate": 1.0973265803608273e-05,
"loss": 1.293025255203247,
"step": 356
},
{
"epoch": 0.7617021276595745,
"grad_norm": 0.34765625,
"learning_rate": 1.0959302318690851e-05,
"loss": 1.1501177549362183,
"step": 358
},
{
"epoch": 0.7659574468085106,
"grad_norm": 0.36328125,
"learning_rate": 1.0945256057658632e-05,
"loss": 1.1921217441558838,
"step": 360
},
{
"epoch": 0.7702127659574468,
"grad_norm": 0.6171875,
"learning_rate": 1.0931127329796376e-05,
"loss": 1.219430923461914,
"step": 362
},
{
"epoch": 0.774468085106383,
"grad_norm": 3.84375,
"learning_rate": 1.0916916446204684e-05,
"loss": 1.2632174491882324,
"step": 364
},
{
"epoch": 0.7787234042553192,
"grad_norm": 1.2421875,
"learning_rate": 1.090262371979314e-05,
"loss": 1.1648533344268799,
"step": 366
},
{
"epoch": 0.7829787234042553,
"grad_norm": 0.392578125,
"learning_rate": 1.0888249465273429e-05,
"loss": 1.1504024267196655,
"step": 368
},
{
"epoch": 0.7872340425531915,
"grad_norm": 0.66796875,
"learning_rate": 1.08737939991524e-05,
"loss": 1.2344441413879395,
"step": 370
},
{
"epoch": 0.7914893617021277,
"grad_norm": 1.234375,
"learning_rate": 1.0859257639725105e-05,
"loss": 1.1171855926513672,
"step": 372
},
{
"epoch": 0.7957446808510639,
"grad_norm": 0.42578125,
"learning_rate": 1.0844640707067789e-05,
"loss": 1.0803868770599365,
"step": 374
},
{
"epoch": 0.8,
"grad_norm": 0.7109375,
"learning_rate": 1.0829943523030833e-05,
"loss": 1.1519043445587158,
"step": 376
},
{
"epoch": 0.8042553191489362,
"grad_norm": 0.478515625,
"learning_rate": 1.0815166411231678e-05,
"loss": 1.2066103219985962,
"step": 378
},
{
"epoch": 0.8085106382978723,
"grad_norm": 0.55859375,
"learning_rate": 1.0800309697047694e-05,
"loss": 1.2266093492507935,
"step": 380
},
{
"epoch": 0.8127659574468085,
"grad_norm": 0.5078125,
"learning_rate": 1.0785373707609015e-05,
"loss": 1.1117401123046875,
"step": 382
},
{
"epoch": 0.8170212765957446,
"grad_norm": 1.4609375,
"learning_rate": 1.0770358771791342e-05,
"loss": 1.210506796836853,
"step": 384
},
{
"epoch": 0.8212765957446808,
"grad_norm": 0.5,
"learning_rate": 1.0755265220208694e-05,
"loss": 1.0881282091140747,
"step": 386
},
{
"epoch": 0.825531914893617,
"grad_norm": 0.380859375,
"learning_rate": 1.0740093385206134e-05,
"loss": 1.1627310514450073,
"step": 388
},
{
"epoch": 0.8297872340425532,
"grad_norm": 0.60546875,
"learning_rate": 1.0724843600852442e-05,
"loss": 1.3014237880706787,
"step": 390
},
{
"epoch": 0.8340425531914893,
"grad_norm": 0.388671875,
"learning_rate": 1.0709516202932775e-05,
"loss": 1.1474575996398926,
"step": 392
},
{
"epoch": 0.8382978723404255,
"grad_norm": 0.578125,
"learning_rate": 1.0694111528941255e-05,
"loss": 1.0830378532409668,
"step": 394
},
{
"epoch": 0.8425531914893617,
"grad_norm": 0.44140625,
"learning_rate": 1.0678629918073552e-05,
"loss": 1.3125864267349243,
"step": 396
},
{
"epoch": 0.8468085106382979,
"grad_norm": 0.447265625,
"learning_rate": 1.0663071711219407e-05,
"loss": 1.2408422231674194,
"step": 398
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.6484375,
"learning_rate": 1.0647437250955132e-05,
"loss": 1.164583444595337,
"step": 400
},
{
"epoch": 0.8553191489361702,
"grad_norm": 0.82421875,
"learning_rate": 1.0631726881536062e-05,
"loss": 1.215876579284668,
"step": 402
},
{
"epoch": 0.8595744680851064,
"grad_norm": 0.66015625,
"learning_rate": 1.0615940948888973e-05,
"loss": 1.1813125610351562,
"step": 404
},
{
"epoch": 0.8638297872340426,
"grad_norm": 0.59765625,
"learning_rate": 1.0600079800604474e-05,
"loss": 1.2217594385147095,
"step": 406
},
{
"epoch": 0.8680851063829788,
"grad_norm": 2.921875,
"learning_rate": 1.0584143785929342e-05,
"loss": 1.2609615325927734,
"step": 408
},
{
"epoch": 0.8723404255319149,
"grad_norm": 0.62890625,
"learning_rate": 1.0568133255758849e-05,
"loss": 1.143092393875122,
"step": 410
},
{
"epoch": 0.8765957446808511,
"grad_norm": 0.62109375,
"learning_rate": 1.0552048562629009e-05,
"loss": 1.2375463247299194,
"step": 412
},
{
"epoch": 0.8808510638297873,
"grad_norm": 0.74609375,
"learning_rate": 1.0535890060708838e-05,
"loss": 1.1186902523040771,
"step": 414
},
{
"epoch": 0.8851063829787233,
"grad_norm": 0.439453125,
"learning_rate": 1.0519658105792554e-05,
"loss": 1.1387929916381836,
"step": 416
},
{
"epoch": 0.8893617021276595,
"grad_norm": 1.421875,
"learning_rate": 1.0503353055291729e-05,
"loss": 1.181614875793457,
"step": 418
},
{
"epoch": 0.8936170212765957,
"grad_norm": 1.03125,
"learning_rate": 1.0486975268227431e-05,
"loss": 1.308741807937622,
"step": 420
},
{
"epoch": 0.8978723404255319,
"grad_norm": 1.21875,
"learning_rate": 1.0470525105222318e-05,
"loss": 1.0869234800338745,
"step": 422
},
{
"epoch": 0.902127659574468,
"grad_norm": 0.443359375,
"learning_rate": 1.0454002928492686e-05,
"loss": 1.1498181819915771,
"step": 424
},
{
"epoch": 0.9063829787234042,
"grad_norm": 0.6796875,
"learning_rate": 1.0437409101840513e-05,
"loss": 1.3278398513793945,
"step": 426
},
{
"epoch": 0.9106382978723404,
"grad_norm": 0.44140625,
"learning_rate": 1.0420743990645426e-05,
"loss": 1.2144547700881958,
"step": 428
},
{
"epoch": 0.9148936170212766,
"grad_norm": 0.486328125,
"learning_rate": 1.0404007961856676e-05,
"loss": 1.191633701324463,
"step": 430
},
{
"epoch": 0.9191489361702128,
"grad_norm": 0.5078125,
"learning_rate": 1.0387201383985043e-05,
"loss": 1.2432807683944702,
"step": 432
},
{
"epoch": 0.9234042553191489,
"grad_norm": 0.703125,
"learning_rate": 1.0370324627094734e-05,
"loss": 1.5649425983428955,
"step": 434
},
{
"epoch": 0.9276595744680851,
"grad_norm": 0.55859375,
"learning_rate": 1.0353378062795224e-05,
"loss": 1.2039592266082764,
"step": 436
},
{
"epoch": 0.9319148936170213,
"grad_norm": 0.49609375,
"learning_rate": 1.033636206423308e-05,
"loss": 1.1712656021118164,
"step": 438
},
{
"epoch": 0.9361702127659575,
"grad_norm": 0.75390625,
"learning_rate": 1.0319277006083738e-05,
"loss": 1.030342936515808,
"step": 440
},
{
"epoch": 0.9404255319148936,
"grad_norm": 0.74609375,
"learning_rate": 1.0302123264543267e-05,
"loss": 1.1908173561096191,
"step": 442
},
{
"epoch": 0.9446808510638298,
"grad_norm": 1.5234375,
"learning_rate": 1.028490121732007e-05,
"loss": 1.174695611000061,
"step": 444
},
{
"epoch": 0.948936170212766,
"grad_norm": 1.8203125,
"learning_rate": 1.026761124362657e-05,
"loss": 1.3273422718048096,
"step": 446
},
{
"epoch": 0.9531914893617022,
"grad_norm": 0.3828125,
"learning_rate": 1.0250253724170875e-05,
"loss": 1.162235975265503,
"step": 448
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.53515625,
"learning_rate": 1.0232829041148372e-05,
"loss": 1.1651887893676758,
"step": 450
},
{
"epoch": 0.9617021276595744,
"grad_norm": 0.48828125,
"learning_rate": 1.0215337578233328e-05,
"loss": 1.1634246110916138,
"step": 452
},
{
"epoch": 0.9659574468085106,
"grad_norm": 0.5625,
"learning_rate": 1.019777972057044e-05,
"loss": 1.0295268297195435,
"step": 454
},
{
"epoch": 0.9702127659574468,
"grad_norm": 0.41796875,
"learning_rate": 1.0180155854766348e-05,
"loss": 1.178024411201477,
"step": 456
},
{
"epoch": 0.9744680851063829,
"grad_norm": 0.62109375,
"learning_rate": 1.0162466368881124e-05,
"loss": 1.2120832204818726,
"step": 458
},
{
"epoch": 0.9787234042553191,
"grad_norm": 0.50390625,
"learning_rate": 1.0144711652419738e-05,
"loss": 1.1555849313735962,
"step": 460
},
{
"epoch": 0.9829787234042553,
"grad_norm": 4.0625,
"learning_rate": 1.0126892096323463e-05,
"loss": 1.2941299676895142,
"step": 462
},
{
"epoch": 0.9872340425531915,
"grad_norm": 2.34375,
"learning_rate": 1.0109008092961276e-05,
"loss": 1.0498948097229004,
"step": 464
},
{
"epoch": 0.9914893617021276,
"grad_norm": 0.71875,
"learning_rate": 1.0091060036121233e-05,
"loss": 1.2505208253860474,
"step": 466
},
{
"epoch": 0.9957446808510638,
"grad_norm": 0.51953125,
"learning_rate": 1.0073048321001766e-05,
"loss": 1.1784660816192627,
"step": 468
},
{
"epoch": 1.0,
"grad_norm": 0.6875,
"learning_rate": 1.0054973344203011e-05,
"loss": 1.2162238359451294,
"step": 470
},
{
"epoch": 1.004255319148936,
"grad_norm": 0.33203125,
"learning_rate": 1.003683550371806e-05,
"loss": 0.902032196521759,
"step": 472
},
{
"epoch": 1.0085106382978724,
"grad_norm": 0.84375,
"learning_rate": 1.00186351989242e-05,
"loss": 0.6829485893249512,
"step": 474
},
{
"epoch": 1.0127659574468084,
"grad_norm": 0.380859375,
"learning_rate": 1.0000372830574128e-05,
"loss": 0.9958571195602417,
"step": 476
},
{
"epoch": 1.0170212765957447,
"grad_norm": 0.341796875,
"learning_rate": 9.982048800787103e-06,
"loss": 0.8577584624290466,
"step": 478
},
{
"epoch": 1.0212765957446808,
"grad_norm": 0.443359375,
"learning_rate": 9.96366351304012e-06,
"loss": 0.7623387575149536,
"step": 480
},
{
"epoch": 1.025531914893617,
"grad_norm": 0.443359375,
"learning_rate": 9.945217372159019e-06,
"loss": 0.6408636569976807,
"step": 482
},
{
"epoch": 1.0297872340425531,
"grad_norm": 0.40625,
"learning_rate": 9.926710784309548e-06,
"loss": 0.8527731895446777,
"step": 484
},
{
"epoch": 1.0340425531914894,
"grad_norm": 0.71875,
"learning_rate": 9.908144156988452e-06,
"loss": 1.0902431011199951,
"step": 486
},
{
"epoch": 1.0382978723404255,
"grad_norm": 0.5703125,
"learning_rate": 9.88951789901448e-06,
"loss": 0.9952311515808105,
"step": 488
},
{
"epoch": 1.0425531914893618,
"grad_norm": 0.466796875,
"learning_rate": 9.87083242051939e-06,
"loss": 1.0575801134109497,
"step": 490
},
{
"epoch": 1.0468085106382978,
"grad_norm": 0.8515625,
"learning_rate": 9.852088132938916e-06,
"loss": 0.8896694779396057,
"step": 492
},
{
"epoch": 1.0510638297872341,
"grad_norm": 0.61328125,
"learning_rate": 9.833285449003712e-06,
"loss": 0.8272213935852051,
"step": 494
},
{
"epoch": 1.0553191489361702,
"grad_norm": 0.490234375,
"learning_rate": 9.814424782730261e-06,
"loss": 0.897000789642334,
"step": 496
},
{
"epoch": 1.0595744680851065,
"grad_norm": 0.6015625,
"learning_rate": 9.79550654941176e-06,
"loss": 0.7115342020988464,
"step": 498
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.458984375,
"learning_rate": 9.776531165608975e-06,
"loss": 0.7840989232063293,
"step": 500
},
{
"epoch": 1.0680851063829788,
"grad_norm": 0.400390625,
"learning_rate": 9.757499049141065e-06,
"loss": 0.8686625361442566,
"step": 502
},
{
"epoch": 1.0723404255319149,
"grad_norm": 0.455078125,
"learning_rate": 9.738410619076393e-06,
"loss": 0.5279070138931274,
"step": 504
},
{
"epoch": 1.076595744680851,
"grad_norm": 1.2421875,
"learning_rate": 9.71926629572329e-06,
"loss": 0.7969399094581604,
"step": 506
},
{
"epoch": 1.0808510638297872,
"grad_norm": 1.3828125,
"learning_rate": 9.7000665006208e-06,
"loss": 0.9214133024215698,
"step": 508
},
{
"epoch": 1.0851063829787233,
"grad_norm": 0.88671875,
"learning_rate": 9.680811656529397e-06,
"loss": 0.8827441930770874,
"step": 510
},
{
"epoch": 1.0893617021276596,
"grad_norm": 1.1328125,
"learning_rate": 9.661502187421687e-06,
"loss": 0.7750219702720642,
"step": 512
},
{
"epoch": 1.0936170212765957,
"grad_norm": 0.384765625,
"learning_rate": 9.64213851847306e-06,
"loss": 0.7688886523246765,
"step": 514
},
{
"epoch": 1.097872340425532,
"grad_norm": 0.64453125,
"learning_rate": 9.62272107605233e-06,
"loss": 0.9912289977073669,
"step": 516
},
{
"epoch": 1.102127659574468,
"grad_norm": 0.9765625,
"learning_rate": 9.603250287712357e-06,
"loss": 0.8116132020950317,
"step": 518
},
{
"epoch": 1.1063829787234043,
"grad_norm": 0.6171875,
"learning_rate": 9.583726582180619e-06,
"loss": 0.5431628227233887,
"step": 520
},
{
"epoch": 1.1106382978723404,
"grad_norm": 0.82421875,
"learning_rate": 9.564150389349784e-06,
"loss": 0.7063818573951721,
"step": 522
},
{
"epoch": 1.1148936170212767,
"grad_norm": 0.46484375,
"learning_rate": 9.544522140268226e-06,
"loss": 0.8259474635124207,
"step": 524
},
{
"epoch": 1.1191489361702127,
"grad_norm": 0.65625,
"learning_rate": 9.524842267130567e-06,
"loss": 0.8532420992851257,
"step": 526
},
{
"epoch": 1.123404255319149,
"grad_norm": 0.65625,
"learning_rate": 9.505111203268119e-06,
"loss": 0.7610599398612976,
"step": 528
},
{
"epoch": 1.127659574468085,
"grad_norm": 0.59375,
"learning_rate": 9.48532938313937e-06,
"loss": 0.8436508178710938,
"step": 530
},
{
"epoch": 1.1319148936170214,
"grad_norm": 0.353515625,
"learning_rate": 9.465497242320423e-06,
"loss": 1.2464487552642822,
"step": 532
},
{
"epoch": 1.1361702127659574,
"grad_norm": 0.60546875,
"learning_rate": 9.445615217495373e-06,
"loss": 0.7736493945121765,
"step": 534
},
{
"epoch": 1.1404255319148937,
"grad_norm": 0.5546875,
"learning_rate": 9.42568374644672e-06,
"loss": 0.9255214333534241,
"step": 536
},
{
"epoch": 1.1446808510638298,
"grad_norm": 0.52734375,
"learning_rate": 9.40570326804573e-06,
"loss": 0.7744427919387817,
"step": 538
},
{
"epoch": 1.148936170212766,
"grad_norm": 0.251953125,
"learning_rate": 9.385674222242742e-06,
"loss": 0.6865782737731934,
"step": 540
},
{
"epoch": 1.1531914893617021,
"grad_norm": 6.25,
"learning_rate": 9.365597050057524e-06,
"loss": 0.8758373260498047,
"step": 542
},
{
"epoch": 1.1574468085106382,
"grad_norm": 0.48046875,
"learning_rate": 9.345472193569518e-06,
"loss": 0.8117732405662537,
"step": 544
},
{
"epoch": 1.1617021276595745,
"grad_norm": 0.44140625,
"learning_rate": 9.325300095908145e-06,
"loss": 0.9483519196510315,
"step": 546
},
{
"epoch": 1.1659574468085105,
"grad_norm": 0.4921875,
"learning_rate": 9.305081201243022e-06,
"loss": 0.660556972026825,
"step": 548
},
{
"epoch": 1.1702127659574468,
"grad_norm": 0.828125,
"learning_rate": 9.284815954774185e-06,
"loss": 0.7756091952323914,
"step": 550
},
{
"epoch": 1.174468085106383,
"grad_norm": 0.62109375,
"learning_rate": 9.264504802722297e-06,
"loss": 0.8955855369567871,
"step": 552
},
{
"epoch": 1.1787234042553192,
"grad_norm": 0.58203125,
"learning_rate": 9.244148192318819e-06,
"loss": 0.8398646712303162,
"step": 554
},
{
"epoch": 1.1829787234042553,
"grad_norm": 0.97265625,
"learning_rate": 9.223746571796152e-06,
"loss": 0.8468598127365112,
"step": 556
},
{
"epoch": 1.1872340425531915,
"grad_norm": 0.490234375,
"learning_rate": 9.203300390377784e-06,
"loss": 0.6725097298622131,
"step": 558
},
{
"epoch": 1.1914893617021276,
"grad_norm": 0.59765625,
"learning_rate": 9.182810098268377e-06,
"loss": 0.7907771468162537,
"step": 560
},
{
"epoch": 1.195744680851064,
"grad_norm": 0.453125,
"learning_rate": 9.162276146643881e-06,
"loss": 0.8897430896759033,
"step": 562
},
{
"epoch": 1.2,
"grad_norm": 1.515625,
"learning_rate": 9.141698987641577e-06,
"loss": 0.9244027137756348,
"step": 564
},
{
"epoch": 1.2042553191489362,
"grad_norm": 1.2734375,
"learning_rate": 9.121079074350135e-06,
"loss": 0.8451488614082336,
"step": 566
},
{
"epoch": 1.2085106382978723,
"grad_norm": 1.6015625,
"learning_rate": 9.100416860799625e-06,
"loss": 0.9149748682975769,
"step": 568
},
{
"epoch": 1.2127659574468086,
"grad_norm": 0.443359375,
"learning_rate": 9.079712801951533e-06,
"loss": 0.8140401244163513,
"step": 570
},
{
"epoch": 1.2170212765957447,
"grad_norm": 0.640625,
"learning_rate": 9.058967353688733e-06,
"loss": 0.8866817355155945,
"step": 572
},
{
"epoch": 1.2212765957446807,
"grad_norm": 0.70703125,
"learning_rate": 9.038180972805454e-06,
"loss": 0.8173488974571228,
"step": 574
},
{
"epoch": 1.225531914893617,
"grad_norm": 0.55859375,
"learning_rate": 9.017354116997226e-06,
"loss": 0.7841181755065918,
"step": 576
},
{
"epoch": 1.2297872340425533,
"grad_norm": 0.48046875,
"learning_rate": 8.99648724485079e-06,
"loss": 0.5890490412712097,
"step": 578
},
{
"epoch": 1.2340425531914894,
"grad_norm": 0.578125,
"learning_rate": 8.975580815834008e-06,
"loss": 0.5997076034545898,
"step": 580
},
{
"epoch": 1.2382978723404254,
"grad_norm": 1.1328125,
"learning_rate": 8.954635290285748e-06,
"loss": 0.6937717199325562,
"step": 582
},
{
"epoch": 1.2425531914893617,
"grad_norm": 0.73046875,
"learning_rate": 8.933651129405741e-06,
"loss": 0.7356208562850952,
"step": 584
},
{
"epoch": 1.2468085106382978,
"grad_norm": 0.875,
"learning_rate": 8.912628795244435e-06,
"loss": 0.8549614548683167,
"step": 586
},
{
"epoch": 1.251063829787234,
"grad_norm": 0.40234375,
"learning_rate": 8.891568750692811e-06,
"loss": 0.645767092704773,
"step": 588
},
{
"epoch": 1.2553191489361701,
"grad_norm": 0.61328125,
"learning_rate": 8.870471459472202e-06,
"loss": 0.9579916596412659,
"step": 590
},
{
"epoch": 1.2595744680851064,
"grad_norm": 0.498046875,
"learning_rate": 8.849337386124065e-06,
"loss": 0.6670525670051575,
"step": 592
},
{
"epoch": 1.2638297872340425,
"grad_norm": 0.5234375,
"learning_rate": 8.828166995999771e-06,
"loss": 0.9148899912834167,
"step": 594
},
{
"epoch": 1.2680851063829788,
"grad_norm": 0.6015625,
"learning_rate": 8.806960755250352e-06,
"loss": 0.9241386651992798,
"step": 596
},
{
"epoch": 1.2723404255319148,
"grad_norm": 0.8046875,
"learning_rate": 8.785719130816227e-06,
"loss": 0.8401479721069336,
"step": 598
},
{
"epoch": 1.2765957446808511,
"grad_norm": 0.59765625,
"learning_rate": 8.76444259041694e-06,
"loss": 0.9863938689231873,
"step": 600
},
{
"epoch": 1.2808510638297872,
"grad_norm": 0.58984375,
"learning_rate": 8.743131602540837e-06,
"loss": 0.9384634494781494,
"step": 602
},
{
"epoch": 1.2851063829787235,
"grad_norm": 0.6484375,
"learning_rate": 8.721786636434773e-06,
"loss": 0.7852924466133118,
"step": 604
},
{
"epoch": 1.2893617021276595,
"grad_norm": 0.490234375,
"learning_rate": 8.70040816209377e-06,
"loss": 0.9877030849456787,
"step": 606
},
{
"epoch": 1.2936170212765958,
"grad_norm": 0.494140625,
"learning_rate": 8.67899665025066e-06,
"loss": 0.7262607216835022,
"step": 608
},
{
"epoch": 1.297872340425532,
"grad_norm": 1.578125,
"learning_rate": 8.657552572365738e-06,
"loss": 1.0153322219848633,
"step": 610
},
{
"epoch": 1.302127659574468,
"grad_norm": 0.34375,
"learning_rate": 8.636076400616361e-06,
"loss": 0.8889206051826477,
"step": 612
},
{
"epoch": 1.3063829787234043,
"grad_norm": 0.40234375,
"learning_rate": 8.614568607886572e-06,
"loss": 1.0539144277572632,
"step": 614
},
{
"epoch": 1.3106382978723405,
"grad_norm": 0.392578125,
"learning_rate": 8.593029667756665e-06,
"loss": 0.9332261085510254,
"step": 616
},
{
"epoch": 1.3148936170212766,
"grad_norm": 0.4609375,
"learning_rate": 8.57146005449278e-06,
"loss": 0.7537972331047058,
"step": 618
},
{
"epoch": 1.3191489361702127,
"grad_norm": 0.361328125,
"learning_rate": 8.549860243036443e-06,
"loss": 0.8345380425453186,
"step": 620
},
{
"epoch": 1.323404255319149,
"grad_norm": 0.70703125,
"learning_rate": 8.528230708994113e-06,
"loss": 0.8078710436820984,
"step": 622
},
{
"epoch": 1.327659574468085,
"grad_norm": 1.5390625,
"learning_rate": 8.506571928626716e-06,
"loss": 0.6944683790206909,
"step": 624
},
{
"epoch": 1.3319148936170213,
"grad_norm": 0.333984375,
"learning_rate": 8.484884378839148e-06,
"loss": 0.8724764585494995,
"step": 626
},
{
"epoch": 1.3361702127659574,
"grad_norm": 0.7109375,
"learning_rate": 8.463168537169782e-06,
"loss": 0.9229905009269714,
"step": 628
},
{
"epoch": 1.3404255319148937,
"grad_norm": 0.466796875,
"learning_rate": 8.44142488177995e-06,
"loss": 0.8973690271377563,
"step": 630
},
{
"epoch": 1.3446808510638297,
"grad_norm": 0.5390625,
"learning_rate": 8.419653891443415e-06,
"loss": 0.8710704445838928,
"step": 632
},
{
"epoch": 1.348936170212766,
"grad_norm": 1.890625,
"learning_rate": 8.397856045535826e-06,
"loss": 0.9143708348274231,
"step": 634
},
{
"epoch": 1.353191489361702,
"grad_norm": 0.40234375,
"learning_rate": 8.37603182402417e-06,
"loss": 0.7919833660125732,
"step": 636
},
{
"epoch": 1.3574468085106384,
"grad_norm": 0.341796875,
"learning_rate": 8.354181707456192e-06,
"loss": 0.7822130918502808,
"step": 638
},
{
"epoch": 1.3617021276595744,
"grad_norm": 0.40625,
"learning_rate": 8.332306176949824e-06,
"loss": 0.635791003704071,
"step": 640
},
{
"epoch": 1.3659574468085105,
"grad_norm": 0.341796875,
"learning_rate": 8.310405714182593e-06,
"loss": 0.765158474445343,
"step": 642
},
{
"epoch": 1.3702127659574468,
"grad_norm": 0.56640625,
"learning_rate": 8.288480801380998e-06,
"loss": 0.526314914226532,
"step": 644
},
{
"epoch": 1.374468085106383,
"grad_norm": 0.392578125,
"learning_rate": 8.266531921309911e-06,
"loss": 0.8815028071403503,
"step": 646
},
{
"epoch": 1.3787234042553191,
"grad_norm": 0.59765625,
"learning_rate": 8.244559557261944e-06,
"loss": 0.8624444007873535,
"step": 648
},
{
"epoch": 1.3829787234042552,
"grad_norm": 0.69921875,
"learning_rate": 8.22256419304679e-06,
"loss": 1.1067816019058228,
"step": 650
},
{
"epoch": 1.3872340425531915,
"grad_norm": 0.408203125,
"learning_rate": 8.200546312980595e-06,
"loss": 0.8086753487586975,
"step": 652
},
{
"epoch": 1.3914893617021278,
"grad_norm": 0.412109375,
"learning_rate": 8.17850640187528e-06,
"loss": 0.8894110321998596,
"step": 654
},
{
"epoch": 1.3957446808510638,
"grad_norm": 0.38671875,
"learning_rate": 8.156444945027855e-06,
"loss": 0.9589279294013977,
"step": 656
},
{
"epoch": 1.4,
"grad_norm": 0.82421875,
"learning_rate": 8.134362428209765e-06,
"loss": 0.8438636064529419,
"step": 658
},
{
"epoch": 1.4042553191489362,
"grad_norm": 0.412109375,
"learning_rate": 8.11225933765616e-06,
"loss": 0.7788761258125305,
"step": 660
},
{
"epoch": 1.4085106382978723,
"grad_norm": 0.361328125,
"learning_rate": 8.090136160055213e-06,
"loss": 0.8602153658866882,
"step": 662
},
{
"epoch": 1.4127659574468086,
"grad_norm": 0.64453125,
"learning_rate": 8.067993382537386e-06,
"loss": 1.1651355028152466,
"step": 664
},
{
"epoch": 1.4170212765957446,
"grad_norm": 0.376953125,
"learning_rate": 8.045831492664716e-06,
"loss": 0.8709754347801208,
"step": 666
},
{
"epoch": 1.421276595744681,
"grad_norm": 0.36328125,
"learning_rate": 8.023650978420076e-06,
"loss": 0.8617551922798157,
"step": 668
},
{
"epoch": 1.425531914893617,
"grad_norm": 0.322265625,
"learning_rate": 8.001452328196425e-06,
"loss": 0.7164908647537231,
"step": 670
},
{
"epoch": 1.4297872340425533,
"grad_norm": 0.68359375,
"learning_rate": 7.979236030786065e-06,
"loss": 0.874544084072113,
"step": 672
},
{
"epoch": 1.4340425531914893,
"grad_norm": 0.357421875,
"learning_rate": 7.957002575369866e-06,
"loss": 0.8772100806236267,
"step": 674
},
{
"epoch": 1.4382978723404256,
"grad_norm": 0.82421875,
"learning_rate": 7.934752451506499e-06,
"loss": 0.8531442880630493,
"step": 676
},
{
"epoch": 1.4425531914893617,
"grad_norm": 0.703125,
"learning_rate": 7.912486149121662e-06,
"loss": 0.8926745653152466,
"step": 678
},
{
"epoch": 1.4468085106382977,
"grad_norm": 0.302734375,
"learning_rate": 7.89020415849729e-06,
"loss": 0.8355059623718262,
"step": 680
},
{
"epoch": 1.451063829787234,
"grad_norm": 0.5234375,
"learning_rate": 7.867906970260748e-06,
"loss": 0.7553901076316833,
"step": 682
},
{
"epoch": 1.4553191489361703,
"grad_norm": 0.427734375,
"learning_rate": 7.845595075374053e-06,
"loss": 0.7148939967155457,
"step": 684
},
{
"epoch": 1.4595744680851064,
"grad_norm": 0.8203125,
"learning_rate": 7.823268965123027e-06,
"loss": 0.7749176621437073,
"step": 686
},
{
"epoch": 1.4638297872340424,
"grad_norm": 1.6796875,
"learning_rate": 7.800929131106519e-06,
"loss": 1.0506820678710938,
"step": 688
},
{
"epoch": 1.4680851063829787,
"grad_norm": 0.67578125,
"learning_rate": 7.77857606522555e-06,
"loss": 0.5485996603965759,
"step": 690
},
{
"epoch": 1.472340425531915,
"grad_norm": 0.416015625,
"learning_rate": 7.756210259672503e-06,
"loss": 0.8781046271324158,
"step": 692
},
{
"epoch": 1.476595744680851,
"grad_norm": 0.435546875,
"learning_rate": 7.733832206920267e-06,
"loss": 0.8102371692657471,
"step": 694
},
{
"epoch": 1.4808510638297872,
"grad_norm": 0.4296875,
"learning_rate": 7.711442399711406e-06,
"loss": 0.8387575149536133,
"step": 696
},
{
"epoch": 1.4851063829787234,
"grad_norm": 0.357421875,
"learning_rate": 7.689041331047307e-06,
"loss": 0.7191005945205688,
"step": 698
},
{
"epoch": 1.4893617021276595,
"grad_norm": 0.77734375,
"learning_rate": 7.66662949417732e-06,
"loss": 0.560632586479187,
"step": 700
},
{
"epoch": 1.4936170212765958,
"grad_norm": 0.8515625,
"learning_rate": 7.644207382587906e-06,
"loss": 0.8454610705375671,
"step": 702
},
{
"epoch": 1.4978723404255319,
"grad_norm": 0.455078125,
"learning_rate": 7.621775489991757e-06,
"loss": 0.5917819738388062,
"step": 704
},
{
"epoch": 1.5021276595744681,
"grad_norm": 0.69140625,
"learning_rate": 7.599334310316937e-06,
"loss": 0.8950475454330444,
"step": 706
},
{
"epoch": 1.5063829787234042,
"grad_norm": 0.83203125,
"learning_rate": 7.576884337696004e-06,
"loss": 0.9987728595733643,
"step": 708
},
{
"epoch": 1.5106382978723403,
"grad_norm": 0.5,
"learning_rate": 7.554426066455125e-06,
"loss": 0.8234822154045105,
"step": 710
},
{
"epoch": 1.5148936170212766,
"grad_norm": 1.15625,
"learning_rate": 7.5319599911031986e-06,
"loss": 0.948941707611084,
"step": 712
},
{
"epoch": 1.5191489361702128,
"grad_norm": 0.95703125,
"learning_rate": 7.509486606320955e-06,
"loss": 0.8466644883155823,
"step": 714
},
{
"epoch": 1.523404255319149,
"grad_norm": 2.265625,
"learning_rate": 7.487006406950077e-06,
"loss": 0.7706676721572876,
"step": 716
},
{
"epoch": 1.527659574468085,
"grad_norm": 0.42578125,
"learning_rate": 7.464519887982301e-06,
"loss": 0.8639274835586548,
"step": 718
},
{
"epoch": 1.5319148936170213,
"grad_norm": 0.28515625,
"learning_rate": 7.442027544548502e-06,
"loss": 0.8100276589393616,
"step": 720
},
{
"epoch": 1.5361702127659576,
"grad_norm": 0.71484375,
"learning_rate": 7.419529871907815e-06,
"loss": 0.8926405310630798,
"step": 722
},
{
"epoch": 1.5404255319148936,
"grad_norm": 0.341796875,
"learning_rate": 7.397027365436715e-06,
"loss": 0.8414310216903687,
"step": 724
},
{
"epoch": 1.5446808510638297,
"grad_norm": 1.1953125,
"learning_rate": 7.374520520618113e-06,
"loss": 0.8629379868507385,
"step": 726
},
{
"epoch": 1.548936170212766,
"grad_norm": 0.37890625,
"learning_rate": 7.352009833030451e-06,
"loss": 0.8124608397483826,
"step": 728
},
{
"epoch": 1.5531914893617023,
"grad_norm": 0.361328125,
"learning_rate": 7.329495798336777e-06,
"loss": 1.0221534967422485,
"step": 730
},
{
"epoch": 1.5574468085106383,
"grad_norm": 0.369140625,
"learning_rate": 7.306978912273843e-06,
"loss": 0.6406850218772888,
"step": 732
},
{
"epoch": 1.5617021276595744,
"grad_norm": 2.296875,
"learning_rate": 7.284459670641185e-06,
"loss": 0.6190369129180908,
"step": 734
},
{
"epoch": 1.5659574468085107,
"grad_norm": 0.384765625,
"learning_rate": 7.261938569290206e-06,
"loss": 0.8675222396850586,
"step": 736
},
{
"epoch": 1.570212765957447,
"grad_norm": 0.609375,
"learning_rate": 7.239416104113262e-06,
"loss": 0.8379670977592468,
"step": 738
},
{
"epoch": 1.574468085106383,
"grad_norm": 0.376953125,
"learning_rate": 7.216892771032732e-06,
"loss": 0.7264598608016968,
"step": 740
},
{
"epoch": 1.578723404255319,
"grad_norm": 0.54296875,
"learning_rate": 7.1943690659901095e-06,
"loss": 0.8947696685791016,
"step": 742
},
{
"epoch": 1.5829787234042554,
"grad_norm": 0.7109375,
"learning_rate": 7.17184548493508e-06,
"loss": 0.7789361476898193,
"step": 744
},
{
"epoch": 1.5872340425531914,
"grad_norm": 0.400390625,
"learning_rate": 7.149322523814594e-06,
"loss": 0.8117201328277588,
"step": 746
},
{
"epoch": 1.5914893617021275,
"grad_norm": 0.455078125,
"learning_rate": 7.1268006785619575e-06,
"loss": 0.7403523921966553,
"step": 748
},
{
"epoch": 1.5957446808510638,
"grad_norm": 0.58984375,
"learning_rate": 7.104280445085897e-06,
"loss": 0.8037891387939453,
"step": 750
},
{
"epoch": 1.6,
"grad_norm": 0.435546875,
"learning_rate": 7.081762319259662e-06,
"loss": 0.8160814642906189,
"step": 752
},
{
"epoch": 1.6042553191489362,
"grad_norm": 0.326171875,
"learning_rate": 7.0592467969100836e-06,
"loss": 0.7555669546127319,
"step": 754
},
{
"epoch": 1.6085106382978722,
"grad_norm": 0.78515625,
"learning_rate": 7.036734373806672e-06,
"loss": 0.8494399785995483,
"step": 756
},
{
"epoch": 1.6127659574468085,
"grad_norm": 0.41015625,
"learning_rate": 7.01422554565069e-06,
"loss": 1.0269806385040283,
"step": 758
},
{
"epoch": 1.6170212765957448,
"grad_norm": 0.466796875,
"learning_rate": 6.991720808064251e-06,
"loss": 0.9812240600585938,
"step": 760
},
{
"epoch": 1.6212765957446809,
"grad_norm": 1.671875,
"learning_rate": 6.969220656579391e-06,
"loss": 0.8393826484680176,
"step": 762
},
{
"epoch": 1.625531914893617,
"grad_norm": 0.486328125,
"learning_rate": 6.946725586627165e-06,
"loss": 0.9660863876342773,
"step": 764
},
{
"epoch": 1.6297872340425532,
"grad_norm": 0.310546875,
"learning_rate": 6.924236093526747e-06,
"loss": 1.0426111221313477,
"step": 766
},
{
"epoch": 1.6340425531914895,
"grad_norm": 0.470703125,
"learning_rate": 6.901752672474499e-06,
"loss": 0.6731575727462769,
"step": 768
},
{
"epoch": 1.6382978723404256,
"grad_norm": 0.458984375,
"learning_rate": 6.879275818533095e-06,
"loss": 0.9503965377807617,
"step": 770
},
{
"epoch": 1.6425531914893616,
"grad_norm": 0.486328125,
"learning_rate": 6.8568060266206056e-06,
"loss": 1.0612298250198364,
"step": 772
},
{
"epoch": 1.646808510638298,
"grad_norm": 0.6875,
"learning_rate": 6.834343791499595e-06,
"loss": 0.7399391531944275,
"step": 774
},
{
"epoch": 1.6510638297872342,
"grad_norm": 0.64453125,
"learning_rate": 6.811889607766242e-06,
"loss": 0.6109141707420349,
"step": 776
},
{
"epoch": 1.65531914893617,
"grad_norm": 0.3515625,
"learning_rate": 6.789443969839441e-06,
"loss": 0.8604304790496826,
"step": 778
},
{
"epoch": 1.6595744680851063,
"grad_norm": 0.59375,
"learning_rate": 6.767007371949911e-06,
"loss": 0.864715576171875,
"step": 780
},
{
"epoch": 1.6638297872340426,
"grad_norm": 0.3984375,
"learning_rate": 6.744580308129327e-06,
"loss": 0.8427615165710449,
"step": 782
},
{
"epoch": 1.6680851063829787,
"grad_norm": 0.53125,
"learning_rate": 6.722163272199424e-06,
"loss": 0.9220309853553772,
"step": 784
},
{
"epoch": 1.6723404255319148,
"grad_norm": 1.046875,
"learning_rate": 6.69975675776114e-06,
"loss": 0.8783171772956848,
"step": 786
},
{
"epoch": 1.676595744680851,
"grad_norm": 0.30078125,
"learning_rate": 6.677361258183735e-06,
"loss": 0.6494432687759399,
"step": 788
},
{
"epoch": 1.6808510638297873,
"grad_norm": 0.462890625,
"learning_rate": 6.6549772665939346e-06,
"loss": 0.8931559920310974,
"step": 790
},
{
"epoch": 1.6851063829787234,
"grad_norm": 0.6796875,
"learning_rate": 6.632605275865074e-06,
"loss": 0.7723158597946167,
"step": 792
},
{
"epoch": 1.6893617021276595,
"grad_norm": 0.474609375,
"learning_rate": 6.610245778606232e-06,
"loss": 0.9853664636611938,
"step": 794
},
{
"epoch": 1.6936170212765957,
"grad_norm": 0.54296875,
"learning_rate": 6.587899267151401e-06,
"loss": 0.7868849635124207,
"step": 796
},
{
"epoch": 1.697872340425532,
"grad_norm": 0.51953125,
"learning_rate": 6.56556623354864e-06,
"loss": 0.852700412273407,
"step": 798
},
{
"epoch": 1.702127659574468,
"grad_norm": 0.443359375,
"learning_rate": 6.543247169549232e-06,
"loss": 0.8994773626327515,
"step": 800
},
{
"epoch": 1.7063829787234042,
"grad_norm": 0.34765625,
"learning_rate": 6.520942566596868e-06,
"loss": 0.8999802470207214,
"step": 802
},
{
"epoch": 1.7106382978723405,
"grad_norm": 0.5,
"learning_rate": 6.4986529158168215e-06,
"loss": 0.7869191765785217,
"step": 804
},
{
"epoch": 1.7148936170212767,
"grad_norm": 0.64453125,
"learning_rate": 6.476378708005135e-06,
"loss": 0.8270288705825806,
"step": 806
},
{
"epoch": 1.7191489361702128,
"grad_norm": 0.75,
"learning_rate": 6.454120433617804e-06,
"loss": 0.9229409694671631,
"step": 808
},
{
"epoch": 1.7234042553191489,
"grad_norm": 0.59765625,
"learning_rate": 6.431878582759994e-06,
"loss": 0.7548995614051819,
"step": 810
},
{
"epoch": 1.7276595744680852,
"grad_norm": 0.357421875,
"learning_rate": 6.409653645175241e-06,
"loss": 0.8321532607078552,
"step": 812
},
{
"epoch": 1.7319148936170212,
"grad_norm": 0.62109375,
"learning_rate": 6.387446110234658e-06,
"loss": 0.6601775288581848,
"step": 814
},
{
"epoch": 1.7361702127659573,
"grad_norm": 0.34765625,
"learning_rate": 6.365256466926183e-06,
"loss": 0.8633728623390198,
"step": 816
},
{
"epoch": 1.7404255319148936,
"grad_norm": 0.447265625,
"learning_rate": 6.343085203843786e-06,
"loss": 0.9041755199432373,
"step": 818
},
{
"epoch": 1.7446808510638299,
"grad_norm": 0.53515625,
"learning_rate": 6.32093280917673e-06,
"loss": 0.8834015727043152,
"step": 820
},
{
"epoch": 1.748936170212766,
"grad_norm": 0.466796875,
"learning_rate": 6.29879977069881e-06,
"loss": 0.7971745133399963,
"step": 822
},
{
"epoch": 1.753191489361702,
"grad_norm": 0.359375,
"learning_rate": 6.2766865757576164e-06,
"loss": 0.8187481164932251,
"step": 824
},
{
"epoch": 1.7574468085106383,
"grad_norm": 0.58203125,
"learning_rate": 6.254593711263813e-06,
"loss": 0.8846163153648376,
"step": 826
},
{
"epoch": 1.7617021276595746,
"grad_norm": 0.875,
"learning_rate": 6.232521663680393e-06,
"loss": 0.9830833077430725,
"step": 828
},
{
"epoch": 1.7659574468085106,
"grad_norm": 0.953125,
"learning_rate": 6.210470919011992e-06,
"loss": 0.7482036352157593,
"step": 830
},
{
"epoch": 1.7702127659574467,
"grad_norm": 0.50390625,
"learning_rate": 6.188441962794179e-06,
"loss": 0.8920266628265381,
"step": 832
},
{
"epoch": 1.774468085106383,
"grad_norm": 0.3359375,
"learning_rate": 6.166435280082749e-06,
"loss": 0.8772265315055847,
"step": 834
},
{
"epoch": 1.7787234042553193,
"grad_norm": 0.435546875,
"learning_rate": 6.1444513554430745e-06,
"loss": 0.8204891681671143,
"step": 836
},
{
"epoch": 1.7829787234042553,
"grad_norm": 0.4765625,
"learning_rate": 6.122490672939405e-06,
"loss": 0.5873453617095947,
"step": 838
},
{
"epoch": 1.7872340425531914,
"grad_norm": 0.44921875,
"learning_rate": 6.100553716124224e-06,
"loss": 0.8039622902870178,
"step": 840
},
{
"epoch": 1.7914893617021277,
"grad_norm": 1.15625,
"learning_rate": 6.078640968027598e-06,
"loss": 0.6872312426567078,
"step": 842
},
{
"epoch": 1.795744680851064,
"grad_norm": 0.78125,
"learning_rate": 6.056752911146548e-06,
"loss": 0.8578442931175232,
"step": 844
},
{
"epoch": 1.8,
"grad_norm": 0.4921875,
"learning_rate": 6.034890027434413e-06,
"loss": 0.7564026117324829,
"step": 846
},
{
"epoch": 1.804255319148936,
"grad_norm": 0.78515625,
"learning_rate": 6.013052798290241e-06,
"loss": 0.8832213878631592,
"step": 848
},
{
"epoch": 1.8085106382978724,
"grad_norm": 0.62109375,
"learning_rate": 5.9912417045482e-06,
"loss": 0.8571723699569702,
"step": 850
},
{
"epoch": 1.8127659574468085,
"grad_norm": 0.60546875,
"learning_rate": 5.969457226466977e-06,
"loss": 0.824770450592041,
"step": 852
},
{
"epoch": 1.8170212765957445,
"grad_norm": 0.38671875,
"learning_rate": 5.9476998437192066e-06,
"loss": 0.8723496794700623,
"step": 854
},
{
"epoch": 1.8212765957446808,
"grad_norm": 0.388671875,
"learning_rate": 5.925970035380918e-06,
"loss": 0.7535234093666077,
"step": 856
},
{
"epoch": 1.825531914893617,
"grad_norm": 0.40625,
"learning_rate": 5.904268279920973e-06,
"loss": 0.9033308625221252,
"step": 858
},
{
"epoch": 1.8297872340425532,
"grad_norm": 0.345703125,
"learning_rate": 5.88259505519054e-06,
"loss": 0.670329749584198,
"step": 860
},
{
"epoch": 1.8340425531914892,
"grad_norm": 0.43359375,
"learning_rate": 5.860950838412565e-06,
"loss": 0.8669137358665466,
"step": 862
},
{
"epoch": 1.8382978723404255,
"grad_norm": 0.357421875,
"learning_rate": 5.839336106171274e-06,
"loss": 0.8537063598632812,
"step": 864
},
{
"epoch": 1.8425531914893618,
"grad_norm": 0.6640625,
"learning_rate": 5.81775133440167e-06,
"loss": 0.8618923425674438,
"step": 866
},
{
"epoch": 1.8468085106382979,
"grad_norm": 0.400390625,
"learning_rate": 5.79619699837905e-06,
"loss": 0.7936420440673828,
"step": 868
},
{
"epoch": 1.851063829787234,
"grad_norm": 0.46484375,
"learning_rate": 5.774673572708554e-06,
"loss": 0.7838106155395508,
"step": 870
},
{
"epoch": 1.8553191489361702,
"grad_norm": 0.455078125,
"learning_rate": 5.753181531314708e-06,
"loss": 0.8583153486251831,
"step": 872
},
{
"epoch": 1.8595744680851065,
"grad_norm": 0.43359375,
"learning_rate": 5.7317213474309764e-06,
"loss": 0.9282540678977966,
"step": 874
},
{
"epoch": 1.8638297872340426,
"grad_norm": 0.75,
"learning_rate": 5.710293493589363e-06,
"loss": 0.6059424877166748,
"step": 876
},
{
"epoch": 1.8680851063829786,
"grad_norm": 3.71875,
"learning_rate": 5.688898441609994e-06,
"loss": 0.9776955842971802,
"step": 878
},
{
"epoch": 1.872340425531915,
"grad_norm": 0.72265625,
"learning_rate": 5.6675366625907264e-06,
"loss": 0.900459885597229,
"step": 880
},
{
"epoch": 1.8765957446808512,
"grad_norm": 0.458984375,
"learning_rate": 5.646208626896784e-06,
"loss": 0.758176326751709,
"step": 882
},
{
"epoch": 1.8808510638297873,
"grad_norm": 0.41796875,
"learning_rate": 5.624914804150397e-06,
"loss": 0.8674149513244629,
"step": 884
},
{
"epoch": 1.8851063829787233,
"grad_norm": 0.6796875,
"learning_rate": 5.6036556632204564e-06,
"loss": 0.778677761554718,
"step": 886
},
{
"epoch": 1.8893617021276596,
"grad_norm": 0.63671875,
"learning_rate": 5.582431672212195e-06,
"loss": 0.8965961933135986,
"step": 888
},
{
"epoch": 1.8936170212765957,
"grad_norm": 0.408203125,
"learning_rate": 5.5612432984568815e-06,
"loss": 0.5581719279289246,
"step": 890
},
{
"epoch": 1.8978723404255318,
"grad_norm": 1.609375,
"learning_rate": 5.5400910085015275e-06,
"loss": 0.8819167017936707,
"step": 892
},
{
"epoch": 1.902127659574468,
"grad_norm": 0.490234375,
"learning_rate": 5.518975268098611e-06,
"loss": 0.9992945194244385,
"step": 894
},
{
"epoch": 1.9063829787234043,
"grad_norm": 0.330078125,
"learning_rate": 5.497896542195829e-06,
"loss": 0.6863605976104736,
"step": 896
},
{
"epoch": 1.9106382978723404,
"grad_norm": 0.447265625,
"learning_rate": 5.476855294925857e-06,
"loss": 0.7966746687889099,
"step": 898
},
{
"epoch": 1.9148936170212765,
"grad_norm": 0.310546875,
"learning_rate": 5.455851989596123e-06,
"loss": 1.0022021532058716,
"step": 900
},
{
"epoch": 1.9191489361702128,
"grad_norm": 0.3359375,
"learning_rate": 5.434887088678614e-06,
"loss": 0.7175713181495667,
"step": 902
},
{
"epoch": 1.923404255319149,
"grad_norm": 0.43359375,
"learning_rate": 5.413961053799693e-06,
"loss": 0.787550687789917,
"step": 904
},
{
"epoch": 1.9276595744680851,
"grad_norm": 0.53515625,
"learning_rate": 5.393074345729926e-06,
"loss": 0.9805369973182678,
"step": 906
},
{
"epoch": 1.9319148936170212,
"grad_norm": 0.71875,
"learning_rate": 5.372227424373942e-06,
"loss": 0.90399169921875,
"step": 908
},
{
"epoch": 1.9361702127659575,
"grad_norm": 0.49609375,
"learning_rate": 5.351420748760311e-06,
"loss": 0.8127355575561523,
"step": 910
},
{
"epoch": 1.9404255319148938,
"grad_norm": 0.34765625,
"learning_rate": 5.330654777031428e-06,
"loss": 0.9437844157218933,
"step": 912
},
{
"epoch": 1.9446808510638298,
"grad_norm": 0.412109375,
"learning_rate": 5.309929966433428e-06,
"loss": 1.0004428625106812,
"step": 914
},
{
"epoch": 1.9489361702127659,
"grad_norm": 1.953125,
"learning_rate": 5.289246773306118e-06,
"loss": 0.8540473580360413,
"step": 916
},
{
"epoch": 1.9531914893617022,
"grad_norm": 0.51171875,
"learning_rate": 5.268605653072935e-06,
"loss": 0.7977997660636902,
"step": 918
},
{
"epoch": 1.9574468085106385,
"grad_norm": 0.494140625,
"learning_rate": 5.248007060230907e-06,
"loss": 0.9748218655586243,
"step": 920
},
{
"epoch": 1.9617021276595743,
"grad_norm": 0.41796875,
"learning_rate": 5.227451448340651e-06,
"loss": 0.86171555519104,
"step": 922
},
{
"epoch": 1.9659574468085106,
"grad_norm": 0.353515625,
"learning_rate": 5.206939270016393e-06,
"loss": 0.8841200470924377,
"step": 924
},
{
"epoch": 1.9702127659574469,
"grad_norm": 0.50390625,
"learning_rate": 5.186470976915983e-06,
"loss": 0.9302433133125305,
"step": 926
},
{
"epoch": 1.974468085106383,
"grad_norm": 0.484375,
"learning_rate": 5.166047019730971e-06,
"loss": 0.6985507011413574,
"step": 928
},
{
"epoch": 1.978723404255319,
"grad_norm": 0.3984375,
"learning_rate": 5.145667848176675e-06,
"loss": 0.9847785830497742,
"step": 930
},
{
"epoch": 1.9829787234042553,
"grad_norm": 0.4140625,
"learning_rate": 5.1253339109822705e-06,
"loss": 0.9930030703544617,
"step": 932
},
{
"epoch": 1.9872340425531916,
"grad_norm": 0.5703125,
"learning_rate": 5.10504565588092e-06,
"loss": 0.7830001711845398,
"step": 934
},
{
"epoch": 1.9914893617021276,
"grad_norm": 0.365234375,
"learning_rate": 5.084803529599915e-06,
"loss": 0.607052743434906,
"step": 936
},
{
"epoch": 1.9957446808510637,
"grad_norm": 1.0078125,
"learning_rate": 5.064607977850834e-06,
"loss": 0.9631056785583496,
"step": 938
},
{
"epoch": 2.0,
"grad_norm": 0.60546875,
"learning_rate": 5.044459445319727e-06,
"loss": 0.6884191036224365,
"step": 940
},
{
"epoch": 2.0042553191489363,
"grad_norm": 0.4296875,
"learning_rate": 5.024358375657334e-06,
"loss": 0.5563607215881348,
"step": 942
},
{
"epoch": 2.008510638297872,
"grad_norm": 0.412109375,
"learning_rate": 5.004305211469303e-06,
"loss": 0.5658197402954102,
"step": 944
},
{
"epoch": 2.0127659574468084,
"grad_norm": 0.4453125,
"learning_rate": 4.984300394306453e-06,
"loss": 0.5938859581947327,
"step": 946
},
{
"epoch": 2.0170212765957447,
"grad_norm": 0.416015625,
"learning_rate": 4.964344364655053e-06,
"loss": 0.5363519191741943,
"step": 948
},
{
"epoch": 2.021276595744681,
"grad_norm": 0.66015625,
"learning_rate": 4.944437561927118e-06,
"loss": 0.6647061109542847,
"step": 950
},
{
"epoch": 2.025531914893617,
"grad_norm": 0.357421875,
"learning_rate": 4.92458042445073e-06,
"loss": 0.557117223739624,
"step": 952
},
{
"epoch": 2.029787234042553,
"grad_norm": 0.427734375,
"learning_rate": 4.9047733894603946e-06,
"loss": 0.3953529894351959,
"step": 954
},
{
"epoch": 2.0340425531914894,
"grad_norm": 0.439453125,
"learning_rate": 4.88501689308741e-06,
"loss": 0.779535710811615,
"step": 956
},
{
"epoch": 2.0382978723404257,
"grad_norm": 0.36328125,
"learning_rate": 4.8653113703502695e-06,
"loss": 0.5275522470474243,
"step": 958
},
{
"epoch": 2.0425531914893615,
"grad_norm": 0.51953125,
"learning_rate": 4.845657255145068e-06,
"loss": 0.5947195291519165,
"step": 960
},
{
"epoch": 2.046808510638298,
"grad_norm": 2.640625,
"learning_rate": 4.8260549802359605e-06,
"loss": 0.6270468235015869,
"step": 962
},
{
"epoch": 2.051063829787234,
"grad_norm": 0.64453125,
"learning_rate": 4.806504977245636e-06,
"loss": 0.6905896067619324,
"step": 964
},
{
"epoch": 2.0553191489361704,
"grad_norm": 0.51953125,
"learning_rate": 4.7870076766457995e-06,
"loss": 0.5533561110496521,
"step": 966
},
{
"epoch": 2.0595744680851062,
"grad_norm": 0.609375,
"learning_rate": 4.767563507747705e-06,
"loss": 0.6810728311538696,
"step": 968
},
{
"epoch": 2.0638297872340425,
"grad_norm": 0.337890625,
"learning_rate": 4.748172898692704e-06,
"loss": 0.3691895306110382,
"step": 970
},
{
"epoch": 2.068085106382979,
"grad_norm": 0.6953125,
"learning_rate": 4.728836276442803e-06,
"loss": 0.5883108377456665,
"step": 972
},
{
"epoch": 2.072340425531915,
"grad_norm": 0.498046875,
"learning_rate": 4.7095540667712775e-06,
"loss": 0.5326440334320068,
"step": 974
},
{
"epoch": 2.076595744680851,
"grad_norm": 1.84375,
"learning_rate": 4.690326694253294e-06,
"loss": 0.41566312313079834,
"step": 976
},
{
"epoch": 2.0808510638297872,
"grad_norm": 0.62109375,
"learning_rate": 4.671154582256559e-06,
"loss": 0.7029457688331604,
"step": 978
},
{
"epoch": 2.0851063829787235,
"grad_norm": 0.2412109375,
"learning_rate": 4.6520381529319954e-06,
"loss": 0.4108755588531494,
"step": 980
},
{
"epoch": 2.0893617021276594,
"grad_norm": 0.4609375,
"learning_rate": 4.632977827204445e-06,
"loss": 0.4902803599834442,
"step": 982
},
{
"epoch": 2.0936170212765957,
"grad_norm": 0.98828125,
"learning_rate": 4.613974024763411e-06,
"loss": 0.5197535753250122,
"step": 984
},
{
"epoch": 2.097872340425532,
"grad_norm": 0.341796875,
"learning_rate": 4.595027164053805e-06,
"loss": 0.4887603521347046,
"step": 986
},
{
"epoch": 2.1021276595744682,
"grad_norm": 0.45703125,
"learning_rate": 4.5761376622667406e-06,
"loss": 0.276875376701355,
"step": 988
},
{
"epoch": 2.106382978723404,
"grad_norm": 0.375,
"learning_rate": 4.557305935330346e-06,
"loss": 0.6325949430465698,
"step": 990
},
{
"epoch": 2.1106382978723404,
"grad_norm": 0.451171875,
"learning_rate": 4.538532397900599e-06,
"loss": 0.6041569709777832,
"step": 992
},
{
"epoch": 2.1148936170212767,
"grad_norm": 0.408203125,
"learning_rate": 4.519817463352204e-06,
"loss": 0.6599090099334717,
"step": 994
},
{
"epoch": 2.119148936170213,
"grad_norm": 0.68359375,
"learning_rate": 4.5011615437694915e-06,
"loss": 0.5671730041503906,
"step": 996
},
{
"epoch": 2.123404255319149,
"grad_norm": 0.51953125,
"learning_rate": 4.48256504993734e-06,
"loss": 0.5928320288658142,
"step": 998
},
{
"epoch": 2.127659574468085,
"grad_norm": 0.396484375,
"learning_rate": 4.464028391332129e-06,
"loss": 0.20121610164642334,
"step": 1000
},
{
"epoch": 2.1319148936170214,
"grad_norm": 0.4140625,
"learning_rate": 4.445551976112725e-06,
"loss": 0.7131472826004028,
"step": 1002
},
{
"epoch": 2.1361702127659576,
"grad_norm": 0.52734375,
"learning_rate": 4.4271362111115006e-06,
"loss": 0.5065695643424988,
"step": 1004
},
{
"epoch": 2.1404255319148935,
"grad_norm": 0.765625,
"learning_rate": 4.408781501825362e-06,
"loss": 0.733562707901001,
"step": 1006
},
{
"epoch": 2.1446808510638298,
"grad_norm": 0.439453125,
"learning_rate": 4.390488252406838e-06,
"loss": 0.5062799453735352,
"step": 1008
},
{
"epoch": 2.148936170212766,
"grad_norm": 0.310546875,
"learning_rate": 4.372256865655169e-06,
"loss": 0.39719632267951965,
"step": 1010
},
{
"epoch": 2.153191489361702,
"grad_norm": 0.384765625,
"learning_rate": 4.354087743007433e-06,
"loss": 0.5480824112892151,
"step": 1012
},
{
"epoch": 2.157446808510638,
"grad_norm": 0.640625,
"learning_rate": 4.335981284529725e-06,
"loss": 0.5634360909461975,
"step": 1014
},
{
"epoch": 2.1617021276595745,
"grad_norm": 0.5390625,
"learning_rate": 4.317937888908333e-06,
"loss": 0.6165044903755188,
"step": 1016
},
{
"epoch": 2.1659574468085108,
"grad_norm": 0.66796875,
"learning_rate": 4.2999579534409626e-06,
"loss": 0.3983045220375061,
"step": 1018
},
{
"epoch": 2.1702127659574466,
"grad_norm": 0.73828125,
"learning_rate": 4.282041874027989e-06,
"loss": 0.41795188188552856,
"step": 1020
},
{
"epoch": 2.174468085106383,
"grad_norm": 0.474609375,
"learning_rate": 4.264190045163742e-06,
"loss": 0.6024309396743774,
"step": 1022
},
{
"epoch": 2.178723404255319,
"grad_norm": 1.015625,
"learning_rate": 4.246402859927817e-06,
"loss": 0.6394532918930054,
"step": 1024
},
{
"epoch": 2.1829787234042555,
"grad_norm": 0.55078125,
"learning_rate": 4.22868070997642e-06,
"loss": 0.4610865116119385,
"step": 1026
},
{
"epoch": 2.1872340425531913,
"grad_norm": 0.40234375,
"learning_rate": 4.211023985533748e-06,
"loss": 0.5758063197135925,
"step": 1028
},
{
"epoch": 2.1914893617021276,
"grad_norm": 0.443359375,
"learning_rate": 4.1934330753833885e-06,
"loss": 0.6651563048362732,
"step": 1030
},
{
"epoch": 2.195744680851064,
"grad_norm": 3.75,
"learning_rate": 4.175908366859766e-06,
"loss": 0.5991113185882568,
"step": 1032
},
{
"epoch": 2.2,
"grad_norm": 0.470703125,
"learning_rate": 4.158450245839608e-06,
"loss": 0.6895382404327393,
"step": 1034
},
{
"epoch": 2.204255319148936,
"grad_norm": 0.76953125,
"learning_rate": 4.141059096733455e-06,
"loss": 0.4550260305404663,
"step": 1036
},
{
"epoch": 2.2085106382978723,
"grad_norm": 0.42578125,
"learning_rate": 4.123735302477193e-06,
"loss": 0.4480676054954529,
"step": 1038
},
{
"epoch": 2.2127659574468086,
"grad_norm": 0.419921875,
"learning_rate": 4.106479244523616e-06,
"loss": 0.5520376563072205,
"step": 1040
},
{
"epoch": 2.217021276595745,
"grad_norm": 0.470703125,
"learning_rate": 4.0892913028340335e-06,
"loss": 0.6519399285316467,
"step": 1042
},
{
"epoch": 2.2212765957446807,
"grad_norm": 0.44921875,
"learning_rate": 4.072171855869905e-06,
"loss": 0.5653026700019836,
"step": 1044
},
{
"epoch": 2.225531914893617,
"grad_norm": 0.44140625,
"learning_rate": 4.055121280584499e-06,
"loss": 0.5862460732460022,
"step": 1046
},
{
"epoch": 2.2297872340425533,
"grad_norm": 1.1484375,
"learning_rate": 4.038139952414603e-06,
"loss": 0.8048577308654785,
"step": 1048
},
{
"epoch": 2.2340425531914896,
"grad_norm": 0.396484375,
"learning_rate": 4.02122824527225e-06,
"loss": 0.530511736869812,
"step": 1050
},
{
"epoch": 2.2382978723404254,
"grad_norm": 0.5234375,
"learning_rate": 4.004386531536482e-06,
"loss": 0.43314328789711,
"step": 1052
},
{
"epoch": 2.2425531914893617,
"grad_norm": 0.57421875,
"learning_rate": 3.987615182045163e-06,
"loss": 0.5556919574737549,
"step": 1054
},
{
"epoch": 2.246808510638298,
"grad_norm": 0.392578125,
"learning_rate": 3.9709145660868015e-06,
"loss": 0.6972762942314148,
"step": 1056
},
{
"epoch": 2.251063829787234,
"grad_norm": 0.55859375,
"learning_rate": 3.9542850513924275e-06,
"loss": 0.31911152601242065,
"step": 1058
},
{
"epoch": 2.25531914893617,
"grad_norm": 0.65234375,
"learning_rate": 3.9377270041274875e-06,
"loss": 0.6526750922203064,
"step": 1060
},
{
"epoch": 2.2595744680851064,
"grad_norm": 0.76171875,
"learning_rate": 3.921240788883785e-06,
"loss": 0.5144931077957153,
"step": 1062
},
{
"epoch": 2.2638297872340427,
"grad_norm": 0.69921875,
"learning_rate": 3.904826768671458e-06,
"loss": 0.7288011312484741,
"step": 1064
},
{
"epoch": 2.2680851063829786,
"grad_norm": 0.578125,
"learning_rate": 3.888485304910978e-06,
"loss": 0.6101799607276917,
"step": 1066
},
{
"epoch": 2.272340425531915,
"grad_norm": 0.6796875,
"learning_rate": 3.8722167574252e-06,
"loss": 0.5383592247962952,
"step": 1068
},
{
"epoch": 2.276595744680851,
"grad_norm": 0.82421875,
"learning_rate": 3.856021484431428e-06,
"loss": 0.6244062185287476,
"step": 1070
},
{
"epoch": 2.2808510638297874,
"grad_norm": 0.41796875,
"learning_rate": 3.839899842533538e-06,
"loss": 0.4686053991317749,
"step": 1072
},
{
"epoch": 2.2851063829787233,
"grad_norm": 0.5078125,
"learning_rate": 3.823852186714121e-06,
"loss": 0.5087999105453491,
"step": 1074
},
{
"epoch": 2.2893617021276595,
"grad_norm": 0.48828125,
"learning_rate": 3.80787887032667e-06,
"loss": 0.4900204837322235,
"step": 1076
},
{
"epoch": 2.293617021276596,
"grad_norm": 0.70703125,
"learning_rate": 3.7919802450877993e-06,
"loss": 0.5593716502189636,
"step": 1078
},
{
"epoch": 2.297872340425532,
"grad_norm": 0.859375,
"learning_rate": 3.7761566610694882e-06,
"loss": 0.3470194339752197,
"step": 1080
},
{
"epoch": 2.302127659574468,
"grad_norm": 0.5546875,
"learning_rate": 3.7604084666913924e-06,
"loss": 0.28270450234413147,
"step": 1082
},
{
"epoch": 2.3063829787234043,
"grad_norm": 0.4609375,
"learning_rate": 3.74473600871316e-06,
"loss": 0.6159269213676453,
"step": 1084
},
{
"epoch": 2.3106382978723405,
"grad_norm": 0.38671875,
"learning_rate": 3.729139632226795e-06,
"loss": 0.46399620175361633,
"step": 1086
},
{
"epoch": 2.3148936170212764,
"grad_norm": 1.3046875,
"learning_rate": 3.713619680649067e-06,
"loss": 0.39948195219039917,
"step": 1088
},
{
"epoch": 2.3191489361702127,
"grad_norm": 1.890625,
"learning_rate": 3.698176495713943e-06,
"loss": 0.4936513602733612,
"step": 1090
},
{
"epoch": 2.323404255319149,
"grad_norm": 0.453125,
"learning_rate": 3.6828104174650614e-06,
"loss": 0.6025733351707458,
"step": 1092
},
{
"epoch": 2.3276595744680852,
"grad_norm": 0.42578125,
"learning_rate": 3.667521784248253e-06,
"loss": 0.5419857501983643,
"step": 1094
},
{
"epoch": 2.331914893617021,
"grad_norm": 0.5625,
"learning_rate": 3.652310932704083e-06,
"loss": 0.5457516312599182,
"step": 1096
},
{
"epoch": 2.3361702127659574,
"grad_norm": 0.6015625,
"learning_rate": 3.637178197760443e-06,
"loss": 0.5860179662704468,
"step": 1098
},
{
"epoch": 2.3404255319148937,
"grad_norm": 0.36328125,
"learning_rate": 3.6221239126251687e-06,
"loss": 0.4711592197418213,
"step": 1100
},
{
"epoch": 2.34468085106383,
"grad_norm": 0.64453125,
"learning_rate": 3.6071484087787147e-06,
"loss": 0.6296599507331848,
"step": 1102
},
{
"epoch": 2.348936170212766,
"grad_norm": 1.96875,
"learning_rate": 3.59225201596685e-06,
"loss": 0.7384690046310425,
"step": 1104
},
{
"epoch": 2.353191489361702,
"grad_norm": 0.63671875,
"learning_rate": 3.577435062193391e-06,
"loss": 0.5660156607627869,
"step": 1106
},
{
"epoch": 2.3574468085106384,
"grad_norm": 1.875,
"learning_rate": 3.562697873712993e-06,
"loss": 0.5146188139915466,
"step": 1108
},
{
"epoch": 2.3617021276595747,
"grad_norm": 1.046875,
"learning_rate": 3.548040775023951e-06,
"loss": 0.4210270643234253,
"step": 1110
},
{
"epoch": 2.3659574468085105,
"grad_norm": 0.796875,
"learning_rate": 3.5334640888610656e-06,
"loss": 0.4388498365879059,
"step": 1112
},
{
"epoch": 2.370212765957447,
"grad_norm": 1.90625,
"learning_rate": 3.5189681361885336e-06,
"loss": 0.3667604327201843,
"step": 1114
},
{
"epoch": 2.374468085106383,
"grad_norm": 0.36328125,
"learning_rate": 3.5045532361928817e-06,
"loss": 0.4419676959514618,
"step": 1116
},
{
"epoch": 2.378723404255319,
"grad_norm": 0.66015625,
"learning_rate": 3.490219706275933e-06,
"loss": 0.6218468546867371,
"step": 1118
},
{
"epoch": 2.382978723404255,
"grad_norm": 0.435546875,
"learning_rate": 3.4759678620478234e-06,
"loss": 0.4756940007209778,
"step": 1120
},
{
"epoch": 2.3872340425531915,
"grad_norm": 0.375,
"learning_rate": 3.4617980173200518e-06,
"loss": 0.6557533144950867,
"step": 1122
},
{
"epoch": 2.391489361702128,
"grad_norm": 0.42578125,
"learning_rate": 3.447710484098571e-06,
"loss": 0.3975709080696106,
"step": 1124
},
{
"epoch": 2.395744680851064,
"grad_norm": 0.58203125,
"learning_rate": 3.43370557257691e-06,
"loss": 0.5444962382316589,
"step": 1126
},
{
"epoch": 2.4,
"grad_norm": 0.47265625,
"learning_rate": 3.4197835911293578e-06,
"loss": 0.48340773582458496,
"step": 1128
},
{
"epoch": 2.404255319148936,
"grad_norm": 0.5625,
"learning_rate": 3.4059448463041582e-06,
"loss": 0.8209078311920166,
"step": 1130
},
{
"epoch": 2.4085106382978725,
"grad_norm": 0.462890625,
"learning_rate": 3.3921896428167704e-06,
"loss": 0.6566969156265259,
"step": 1132
},
{
"epoch": 2.4127659574468083,
"grad_norm": 0.439453125,
"learning_rate": 3.378518283543155e-06,
"loss": 0.7115936875343323,
"step": 1134
},
{
"epoch": 2.4170212765957446,
"grad_norm": 0.6875,
"learning_rate": 3.3649310695131094e-06,
"loss": 0.48289287090301514,
"step": 1136
},
{
"epoch": 2.421276595744681,
"grad_norm": 0.462890625,
"learning_rate": 3.3514282999036305e-06,
"loss": 0.3552096486091614,
"step": 1138
},
{
"epoch": 2.425531914893617,
"grad_norm": 0.400390625,
"learning_rate": 3.3380102720323343e-06,
"loss": 0.635092556476593,
"step": 1140
},
{
"epoch": 2.429787234042553,
"grad_norm": 0.66796875,
"learning_rate": 3.324677281350911e-06,
"loss": 0.4491591453552246,
"step": 1142
},
{
"epoch": 2.4340425531914893,
"grad_norm": 0.859375,
"learning_rate": 3.3114296214386135e-06,
"loss": 0.5700670480728149,
"step": 1144
},
{
"epoch": 2.4382978723404256,
"grad_norm": 1.2265625,
"learning_rate": 3.2982675839957957e-06,
"loss": 0.6150033473968506,
"step": 1146
},
{
"epoch": 2.4425531914893615,
"grad_norm": 0.5703125,
"learning_rate": 3.28519145883749e-06,
"loss": 0.2981261909008026,
"step": 1148
},
{
"epoch": 2.4468085106382977,
"grad_norm": 0.4140625,
"learning_rate": 3.2722015338870253e-06,
"loss": 0.43131235241889954,
"step": 1150
},
{
"epoch": 2.451063829787234,
"grad_norm": 1.1640625,
"learning_rate": 3.2592980951696847e-06,
"loss": 0.5070037841796875,
"step": 1152
},
{
"epoch": 2.4553191489361703,
"grad_norm": 1.078125,
"learning_rate": 3.2464814268064147e-06,
"loss": 0.4555862843990326,
"step": 1154
},
{
"epoch": 2.4595744680851066,
"grad_norm": 0.49609375,
"learning_rate": 3.2337518110075632e-06,
"loss": 0.5812932252883911,
"step": 1156
},
{
"epoch": 2.4638297872340424,
"grad_norm": 1.359375,
"learning_rate": 3.221109528066664e-06,
"loss": 0.5926228761672974,
"step": 1158
},
{
"epoch": 2.4680851063829787,
"grad_norm": 0.828125,
"learning_rate": 3.2085548563542688e-06,
"loss": 0.6022335290908813,
"step": 1160
},
{
"epoch": 2.472340425531915,
"grad_norm": 0.7890625,
"learning_rate": 3.19608807231182e-06,
"loss": 0.5389635562896729,
"step": 1162
},
{
"epoch": 2.476595744680851,
"grad_norm": 0.50390625,
"learning_rate": 3.1837094504455587e-06,
"loss": 0.586044192314148,
"step": 1164
},
{
"epoch": 2.480851063829787,
"grad_norm": 0.482421875,
"learning_rate": 3.17141926332048e-06,
"loss": 0.5692299604415894,
"step": 1166
},
{
"epoch": 2.4851063829787234,
"grad_norm": 0.44140625,
"learning_rate": 3.159217781554335e-06,
"loss": 0.658069372177124,
"step": 1168
},
{
"epoch": 2.4893617021276597,
"grad_norm": 0.416015625,
"learning_rate": 3.1471052738116726e-06,
"loss": 0.5921551585197449,
"step": 1170
},
{
"epoch": 2.4936170212765956,
"grad_norm": 0.35546875,
"learning_rate": 3.135082006797918e-06,
"loss": 0.45771515369415283,
"step": 1172
},
{
"epoch": 2.497872340425532,
"grad_norm": 0.490234375,
"learning_rate": 3.123148245253508e-06,
"loss": 0.3539358079433441,
"step": 1174
},
{
"epoch": 2.502127659574468,
"grad_norm": 0.41796875,
"learning_rate": 3.111304251948056e-06,
"loss": 0.6486715078353882,
"step": 1176
},
{
"epoch": 2.506382978723404,
"grad_norm": 0.75,
"learning_rate": 3.0995502876745657e-06,
"loss": 0.3491562008857727,
"step": 1178
},
{
"epoch": 2.5106382978723403,
"grad_norm": 1.046875,
"learning_rate": 3.087886611243692e-06,
"loss": 0.554216742515564,
"step": 1180
},
{
"epoch": 2.5148936170212766,
"grad_norm": 0.447265625,
"learning_rate": 3.076313479478042e-06,
"loss": 0.46358993649482727,
"step": 1182
},
{
"epoch": 2.519148936170213,
"grad_norm": 2.625,
"learning_rate": 3.064831147206519e-06,
"loss": 0.7309602499008179,
"step": 1184
},
{
"epoch": 2.523404255319149,
"grad_norm": 0.380859375,
"learning_rate": 3.05343986725871e-06,
"loss": 0.5900013446807861,
"step": 1186
},
{
"epoch": 2.527659574468085,
"grad_norm": 1.734375,
"learning_rate": 3.0421398904593186e-06,
"loss": 0.8710350394248962,
"step": 1188
},
{
"epoch": 2.5319148936170213,
"grad_norm": 0.34375,
"learning_rate": 3.030931465622647e-06,
"loss": 0.7665842175483704,
"step": 1190
},
{
"epoch": 2.5361702127659576,
"grad_norm": 0.423828125,
"learning_rate": 3.0198148395471105e-06,
"loss": 0.5311375260353088,
"step": 1192
},
{
"epoch": 2.5404255319148934,
"grad_norm": 0.41015625,
"learning_rate": 3.00879025700981e-06,
"loss": 0.2682938873767853,
"step": 1194
},
{
"epoch": 2.5446808510638297,
"grad_norm": 0.9140625,
"learning_rate": 2.997857960761137e-06,
"loss": 0.5427325367927551,
"step": 1196
},
{
"epoch": 2.548936170212766,
"grad_norm": 0.765625,
"learning_rate": 2.98701819151943e-06,
"loss": 0.49154531955718994,
"step": 1198
},
{
"epoch": 2.5531914893617023,
"grad_norm": 0.39453125,
"learning_rate": 2.976271187965673e-06,
"loss": 0.5094670057296753,
"step": 1200
},
{
"epoch": 2.5574468085106385,
"grad_norm": 0.71875,
"learning_rate": 2.9656171867382446e-06,
"loss": 0.4511142075061798,
"step": 1202
},
{
"epoch": 2.5617021276595744,
"grad_norm": 0.5859375,
"learning_rate": 2.955056422427704e-06,
"loss": 0.5634865760803223,
"step": 1204
},
{
"epoch": 2.5659574468085107,
"grad_norm": 0.51953125,
"learning_rate": 2.9445891275716233e-06,
"loss": 0.3763676583766937,
"step": 1206
},
{
"epoch": 2.570212765957447,
"grad_norm": 0.703125,
"learning_rate": 2.9342155326494704e-06,
"loss": 0.5212900638580322,
"step": 1208
},
{
"epoch": 2.574468085106383,
"grad_norm": 0.72265625,
"learning_rate": 2.9239358660775357e-06,
"loss": 0.4663785994052887,
"step": 1210
},
{
"epoch": 2.578723404255319,
"grad_norm": 0.50390625,
"learning_rate": 2.9137503542038966e-06,
"loss": 0.5414974093437195,
"step": 1212
},
{
"epoch": 2.5829787234042554,
"grad_norm": 0.470703125,
"learning_rate": 2.903659221303441e-06,
"loss": 0.6152816414833069,
"step": 1214
},
{
"epoch": 2.5872340425531917,
"grad_norm": 0.455078125,
"learning_rate": 2.893662689572925e-06,
"loss": 0.42417243123054504,
"step": 1216
},
{
"epoch": 2.5914893617021275,
"grad_norm": 0.419921875,
"learning_rate": 2.883760979126076e-06,
"loss": 0.6008761525154114,
"step": 1218
},
{
"epoch": 2.595744680851064,
"grad_norm": 0.90234375,
"learning_rate": 2.8739543079887554e-06,
"loss": 0.749297022819519,
"step": 1220
},
{
"epoch": 2.6,
"grad_norm": 0.94140625,
"learning_rate": 2.8642428920941513e-06,
"loss": 0.6406426429748535,
"step": 1222
},
{
"epoch": 2.604255319148936,
"grad_norm": 0.63671875,
"learning_rate": 2.8546269452780275e-06,
"loss": 0.5915369391441345,
"step": 1224
},
{
"epoch": 2.608510638297872,
"grad_norm": 2.078125,
"learning_rate": 2.8451066792740108e-06,
"loss": 0.7708158493041992,
"step": 1226
},
{
"epoch": 2.6127659574468085,
"grad_norm": 0.796875,
"learning_rate": 2.835682303708931e-06,
"loss": 0.2944878339767456,
"step": 1228
},
{
"epoch": 2.617021276595745,
"grad_norm": 0.56640625,
"learning_rate": 2.826354026098208e-06,
"loss": 0.4445026218891144,
"step": 1230
},
{
"epoch": 2.621276595744681,
"grad_norm": 0.546875,
"learning_rate": 2.817122051841277e-06,
"loss": 0.5953022837638855,
"step": 1232
},
{
"epoch": 2.625531914893617,
"grad_norm": 0.66015625,
"learning_rate": 2.807986584217072e-06,
"loss": 0.47725632786750793,
"step": 1234
},
{
"epoch": 2.629787234042553,
"grad_norm": 3.375,
"learning_rate": 2.7989478243795434e-06,
"loss": 0.5917444229125977,
"step": 1236
},
{
"epoch": 2.6340425531914895,
"grad_norm": 0.58984375,
"learning_rate": 2.790005971353233e-06,
"loss": 0.6352754831314087,
"step": 1238
},
{
"epoch": 2.6382978723404253,
"grad_norm": 0.400390625,
"learning_rate": 2.7811612220288905e-06,
"loss": 0.5205258131027222,
"step": 1240
},
{
"epoch": 2.6425531914893616,
"grad_norm": 0.53515625,
"learning_rate": 2.77241377115914e-06,
"loss": 0.716691255569458,
"step": 1242
},
{
"epoch": 2.646808510638298,
"grad_norm": 0.74609375,
"learning_rate": 2.7637638113541866e-06,
"loss": 0.3764870762825012,
"step": 1244
},
{
"epoch": 2.651063829787234,
"grad_norm": 0.95703125,
"learning_rate": 2.755211533077581e-06,
"loss": 0.5524653196334839,
"step": 1246
},
{
"epoch": 2.65531914893617,
"grad_norm": 0.4375,
"learning_rate": 2.746757124642024e-06,
"loss": 0.5442506074905396,
"step": 1248
},
{
"epoch": 2.6595744680851063,
"grad_norm": 0.53515625,
"learning_rate": 2.7384007722052168e-06,
"loss": 0.5800641775131226,
"step": 1250
},
{
"epoch": 2.6638297872340426,
"grad_norm": 0.5546875,
"learning_rate": 2.7301426597657662e-06,
"loss": 0.5853485465049744,
"step": 1252
},
{
"epoch": 2.6680851063829785,
"grad_norm": 0.53515625,
"learning_rate": 2.721982969159132e-06,
"loss": 0.38345175981521606,
"step": 1254
},
{
"epoch": 2.6723404255319148,
"grad_norm": 0.421875,
"learning_rate": 2.7139218800536224e-06,
"loss": 0.6944982409477234,
"step": 1256
},
{
"epoch": 2.676595744680851,
"grad_norm": 0.384765625,
"learning_rate": 2.7059595699464363e-06,
"loss": 0.5350843667984009,
"step": 1258
},
{
"epoch": 2.6808510638297873,
"grad_norm": 0.380859375,
"learning_rate": 2.6980962141597594e-06,
"loss": 0.5438748598098755,
"step": 1260
},
{
"epoch": 2.6851063829787236,
"grad_norm": 0.431640625,
"learning_rate": 2.6903319858369005e-06,
"loss": 0.7831379175186157,
"step": 1262
},
{
"epoch": 2.6893617021276595,
"grad_norm": 0.83203125,
"learning_rate": 2.6826670559384784e-06,
"loss": 0.3888491094112396,
"step": 1264
},
{
"epoch": 2.6936170212765957,
"grad_norm": 0.63671875,
"learning_rate": 2.6751015932386615e-06,
"loss": 0.4081690311431885,
"step": 1266
},
{
"epoch": 2.697872340425532,
"grad_norm": 0.55859375,
"learning_rate": 2.6676357643214467e-06,
"loss": 0.757609486579895,
"step": 1268
},
{
"epoch": 2.702127659574468,
"grad_norm": 0.359375,
"learning_rate": 2.660269733576995e-06,
"loss": 0.2168269008398056,
"step": 1270
},
{
"epoch": 2.706382978723404,
"grad_norm": 0.4453125,
"learning_rate": 2.6530036631980093e-06,
"loss": 0.5121868848800659,
"step": 1272
},
{
"epoch": 2.7106382978723405,
"grad_norm": 0.67578125,
"learning_rate": 2.6458377131761655e-06,
"loss": 0.588572084903717,
"step": 1274
},
{
"epoch": 2.7148936170212767,
"grad_norm": 0.396484375,
"learning_rate": 2.6387720412985873e-06,
"loss": 0.5837306380271912,
"step": 1276
},
{
"epoch": 2.719148936170213,
"grad_norm": 1.2734375,
"learning_rate": 2.631806803144373e-06,
"loss": 0.5779358148574829,
"step": 1278
},
{
"epoch": 2.723404255319149,
"grad_norm": 1.1171875,
"learning_rate": 2.624942152081171e-06,
"loss": 0.42244261503219604,
"step": 1280
},
{
"epoch": 2.727659574468085,
"grad_norm": 1.1796875,
"learning_rate": 2.6181782392618002e-06,
"loss": 0.5723677277565002,
"step": 1282
},
{
"epoch": 2.731914893617021,
"grad_norm": 0.65625,
"learning_rate": 2.611515213620924e-06,
"loss": 0.6737433075904846,
"step": 1284
},
{
"epoch": 2.7361702127659573,
"grad_norm": 0.953125,
"learning_rate": 2.604953221871769e-06,
"loss": 0.6697869300842285,
"step": 1286
},
{
"epoch": 2.7404255319148936,
"grad_norm": 0.431640625,
"learning_rate": 2.5984924085028968e-06,
"loss": 0.41797778010368347,
"step": 1288
},
{
"epoch": 2.74468085106383,
"grad_norm": 0.53515625,
"learning_rate": 2.5921329157750205e-06,
"loss": 0.6901787519454956,
"step": 1290
},
{
"epoch": 2.748936170212766,
"grad_norm": 0.62109375,
"learning_rate": 2.5858748837178724e-06,
"loss": 0.48409298062324524,
"step": 1292
},
{
"epoch": 2.753191489361702,
"grad_norm": 1.4765625,
"learning_rate": 2.579718450127124e-06,
"loss": 0.48840850591659546,
"step": 1294
},
{
"epoch": 2.7574468085106383,
"grad_norm": 0.515625,
"learning_rate": 2.5736637505613453e-06,
"loss": 0.5451318621635437,
"step": 1296
},
{
"epoch": 2.7617021276595746,
"grad_norm": 0.609375,
"learning_rate": 2.5677109183390254e-06,
"loss": 0.3569204807281494,
"step": 1298
},
{
"epoch": 2.7659574468085104,
"grad_norm": 0.43359375,
"learning_rate": 2.5618600845356374e-06,
"loss": 0.6634436845779419,
"step": 1300
},
{
"epoch": 2.7702127659574467,
"grad_norm": 0.416015625,
"learning_rate": 2.5561113779807473e-06,
"loss": 0.40077003836631775,
"step": 1302
},
{
"epoch": 2.774468085106383,
"grad_norm": 2.5625,
"learning_rate": 2.550464925255182e-06,
"loss": 0.49653542041778564,
"step": 1304
},
{
"epoch": 2.7787234042553193,
"grad_norm": 0.498046875,
"learning_rate": 2.544920850688239e-06,
"loss": 0.3718079626560211,
"step": 1306
},
{
"epoch": 2.7829787234042556,
"grad_norm": 0.490234375,
"learning_rate": 2.5394792763549506e-06,
"loss": 0.6696460843086243,
"step": 1308
},
{
"epoch": 2.7872340425531914,
"grad_norm": 0.494140625,
"learning_rate": 2.534140322073397e-06,
"loss": 0.4750995337963104,
"step": 1310
},
{
"epoch": 2.7914893617021277,
"grad_norm": 0.412109375,
"learning_rate": 2.5289041054020637e-06,
"loss": 0.38971856236457825,
"step": 1312
},
{
"epoch": 2.795744680851064,
"grad_norm": 0.44921875,
"learning_rate": 2.523770741637259e-06,
"loss": 0.5828387141227722,
"step": 1314
},
{
"epoch": 2.8,
"grad_norm": 0.494140625,
"learning_rate": 2.518740343810568e-06,
"loss": 0.2812992334365845,
"step": 1316
},
{
"epoch": 2.804255319148936,
"grad_norm": 0.69921875,
"learning_rate": 2.513813022686371e-06,
"loss": 0.6449145674705505,
"step": 1318
},
{
"epoch": 2.8085106382978724,
"grad_norm": 0.59375,
"learning_rate": 2.5089888867594004e-06,
"loss": 0.42496779561042786,
"step": 1320
},
{
"epoch": 2.8127659574468087,
"grad_norm": 0.90625,
"learning_rate": 2.5042680422523538e-06,
"loss": 0.6403509974479675,
"step": 1322
},
{
"epoch": 2.8170212765957445,
"grad_norm": 0.65625,
"learning_rate": 2.4996505931135513e-06,
"loss": 0.5965058207511902,
"step": 1324
},
{
"epoch": 2.821276595744681,
"grad_norm": 0.55078125,
"learning_rate": 2.4951366410146506e-06,
"loss": 0.38872432708740234,
"step": 1326
},
{
"epoch": 2.825531914893617,
"grad_norm": 0.60546875,
"learning_rate": 2.4907262853484093e-06,
"loss": 0.47181040048599243,
"step": 1328
},
{
"epoch": 2.829787234042553,
"grad_norm": 0.40234375,
"learning_rate": 2.4864196232264913e-06,
"loss": 0.5333115458488464,
"step": 1330
},
{
"epoch": 2.8340425531914892,
"grad_norm": 0.6015625,
"learning_rate": 2.4822167494773325e-06,
"loss": 0.6577153205871582,
"step": 1332
},
{
"epoch": 2.8382978723404255,
"grad_norm": 0.55078125,
"learning_rate": 2.4781177566440513e-06,
"loss": 0.544109046459198,
"step": 1334
},
{
"epoch": 2.842553191489362,
"grad_norm": 0.96484375,
"learning_rate": 2.474122734982411e-06,
"loss": 0.26606178283691406,
"step": 1336
},
{
"epoch": 2.846808510638298,
"grad_norm": 0.3984375,
"learning_rate": 2.4702317724588332e-06,
"loss": 0.486730694770813,
"step": 1338
},
{
"epoch": 2.851063829787234,
"grad_norm": 0.58203125,
"learning_rate": 2.4664449547484595e-06,
"loss": 0.47592607140541077,
"step": 1340
},
{
"epoch": 2.8553191489361702,
"grad_norm": 0.5625,
"learning_rate": 2.462762365233268e-06,
"loss": 0.4367084801197052,
"step": 1342
},
{
"epoch": 2.8595744680851065,
"grad_norm": 0.4453125,
"learning_rate": 2.459184085000232e-06,
"loss": 0.3742711842060089,
"step": 1344
},
{
"epoch": 2.8638297872340424,
"grad_norm": 0.6328125,
"learning_rate": 2.455710192839539e-06,
"loss": 0.5936036705970764,
"step": 1346
},
{
"epoch": 2.8680851063829786,
"grad_norm": 0.404296875,
"learning_rate": 2.452340765242855e-06,
"loss": 0.6136466860771179,
"step": 1348
},
{
"epoch": 2.872340425531915,
"grad_norm": 0.625,
"learning_rate": 2.449075876401641e-06,
"loss": 0.6735158562660217,
"step": 1350
},
{
"epoch": 2.876595744680851,
"grad_norm": 0.62109375,
"learning_rate": 2.4459155982055145e-06,
"loss": 0.6614925861358643,
"step": 1352
},
{
"epoch": 2.8808510638297875,
"grad_norm": 1.015625,
"learning_rate": 2.4428600002406735e-06,
"loss": 0.780015230178833,
"step": 1354
},
{
"epoch": 2.8851063829787233,
"grad_norm": 0.51953125,
"learning_rate": 2.4399091497883596e-06,
"loss": 0.38140493631362915,
"step": 1356
},
{
"epoch": 2.8893617021276596,
"grad_norm": 0.482421875,
"learning_rate": 2.4370631118233766e-06,
"loss": 0.38466039299964905,
"step": 1358
},
{
"epoch": 2.8936170212765955,
"grad_norm": 0.5390625,
"learning_rate": 2.4343219490126636e-06,
"loss": 0.6831486821174622,
"step": 1360
},
{
"epoch": 2.8978723404255318,
"grad_norm": 0.474609375,
"learning_rate": 2.4316857217139125e-06,
"loss": 0.5675507187843323,
"step": 1362
},
{
"epoch": 2.902127659574468,
"grad_norm": 0.4765625,
"learning_rate": 2.429154487974237e-06,
"loss": 0.5387779474258423,
"step": 1364
},
{
"epoch": 2.9063829787234043,
"grad_norm": 0.466796875,
"learning_rate": 2.4267283035288974e-06,
"loss": 0.5070762634277344,
"step": 1366
},
{
"epoch": 2.9106382978723406,
"grad_norm": 0.404296875,
"learning_rate": 2.4244072218000737e-06,
"loss": 0.49968618154525757,
"step": 1368
},
{
"epoch": 2.9148936170212765,
"grad_norm": 0.439453125,
"learning_rate": 2.422191293895687e-06,
"loss": 0.7925405502319336,
"step": 1370
},
{
"epoch": 2.9191489361702128,
"grad_norm": 0.8203125,
"learning_rate": 2.4200805686082757e-06,
"loss": 0.4414962828159332,
"step": 1372
},
{
"epoch": 2.923404255319149,
"grad_norm": 0.72265625,
"learning_rate": 2.4180750924139205e-06,
"loss": 0.5193897485733032,
"step": 1374
},
{
"epoch": 2.927659574468085,
"grad_norm": 0.455078125,
"learning_rate": 2.4161749094712216e-06,
"loss": 0.5439836978912354,
"step": 1376
},
{
"epoch": 2.931914893617021,
"grad_norm": 1.015625,
"learning_rate": 2.414380061620327e-06,
"loss": 0.5974451899528503,
"step": 1378
},
{
"epoch": 2.9361702127659575,
"grad_norm": 0.482421875,
"learning_rate": 2.4126905883820076e-06,
"loss": 0.43398624658584595,
"step": 1380
},
{
"epoch": 2.9404255319148938,
"grad_norm": 0.69921875,
"learning_rate": 2.411106526956792e-06,
"loss": 0.7541142702102661,
"step": 1382
},
{
"epoch": 2.94468085106383,
"grad_norm": 0.498046875,
"learning_rate": 2.4096279122241438e-06,
"loss": 0.592811107635498,
"step": 1384
},
{
"epoch": 2.948936170212766,
"grad_norm": 0.392578125,
"learning_rate": 2.408254776741697e-06,
"loss": 0.6341920495033264,
"step": 1386
},
{
"epoch": 2.953191489361702,
"grad_norm": 0.76171875,
"learning_rate": 2.4069871507445332e-06,
"loss": 0.755580484867096,
"step": 1388
},
{
"epoch": 2.9574468085106385,
"grad_norm": 0.455078125,
"learning_rate": 2.4058250621445224e-06,
"loss": 0.682244598865509,
"step": 1390
},
{
"epoch": 2.9617021276595743,
"grad_norm": 0.484375,
"learning_rate": 2.4047685365297056e-06,
"loss": 0.5976744890213013,
"step": 1392
},
{
"epoch": 2.9659574468085106,
"grad_norm": 0.4375,
"learning_rate": 2.403817597163731e-06,
"loss": 0.5079911351203918,
"step": 1394
},
{
"epoch": 2.970212765957447,
"grad_norm": 1.4296875,
"learning_rate": 2.402972264985341e-06,
"loss": 0.4225712716579437,
"step": 1396
},
{
"epoch": 2.974468085106383,
"grad_norm": 0.57421875,
"learning_rate": 2.4022325586079132e-06,
"loss": 0.6215579509735107,
"step": 1398
},
{
"epoch": 2.978723404255319,
"grad_norm": 0.3828125,
"learning_rate": 2.4015984943190496e-06,
"loss": 0.455652117729187,
"step": 1400
},
{
"epoch": 2.9829787234042553,
"grad_norm": 0.70703125,
"learning_rate": 2.401070086080218e-06,
"loss": 0.5189418792724609,
"step": 1402
},
{
"epoch": 2.9872340425531916,
"grad_norm": 2.890625,
"learning_rate": 2.400647345526445e-06,
"loss": 0.5081955790519714,
"step": 1404
},
{
"epoch": 2.9914893617021274,
"grad_norm": 0.443359375,
"learning_rate": 2.400330281966059e-06,
"loss": 0.5243685841560364,
"step": 1406
},
{
"epoch": 2.9957446808510637,
"grad_norm": 0.6796875,
"learning_rate": 2.400118902380485e-06,
"loss": 0.6540034413337708,
"step": 1408
},
{
"epoch": 3.0,
"grad_norm": 1.359375,
"learning_rate": 2.400013211424094e-06,
"loss": 0.3355269134044647,
"step": 1410
},
{
"epoch": 3.0,
"step": 1410,
"total_flos": 4.1743170019314893e+18,
"train_loss": 0.8916917941037644,
"train_runtime": 10519.9057,
"train_samples_per_second": 4.289,
"train_steps_per_second": 0.134
}
],
"logging_steps": 2,
"max_steps": 1410,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.1743170019314893e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}