9b-115 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
1b228bb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1748,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004576659038901602,
"grad_norm": 1.0062716007232666,
"learning_rate": 9.09090909090909e-08,
"loss": 1.9361354112625122,
"step": 2
},
{
"epoch": 0.009153318077803204,
"grad_norm": 1.1005034446716309,
"learning_rate": 2.727272727272727e-07,
"loss": 1.9705393314361572,
"step": 4
},
{
"epoch": 0.013729977116704805,
"grad_norm": 2.435994863510132,
"learning_rate": 4.545454545454545e-07,
"loss": 1.9127593040466309,
"step": 6
},
{
"epoch": 0.018306636155606407,
"grad_norm": 4.2885661125183105,
"learning_rate": 6.363636363636363e-07,
"loss": 1.974473237991333,
"step": 8
},
{
"epoch": 0.02288329519450801,
"grad_norm": 1.8748376369476318,
"learning_rate": 8.181818181818182e-07,
"loss": 2.226160764694214,
"step": 10
},
{
"epoch": 0.02745995423340961,
"grad_norm": 2.666358232498169,
"learning_rate": 1e-06,
"loss": 2.0884294509887695,
"step": 12
},
{
"epoch": 0.032036613272311214,
"grad_norm": 0.6817159056663513,
"learning_rate": 1.1818181818181818e-06,
"loss": 1.8083596229553223,
"step": 14
},
{
"epoch": 0.036613272311212815,
"grad_norm": 0.9749477505683899,
"learning_rate": 1.3636363636363634e-06,
"loss": 1.7538858652114868,
"step": 16
},
{
"epoch": 0.041189931350114416,
"grad_norm": 3.5282654762268066,
"learning_rate": 1.5454545454545454e-06,
"loss": 1.3263461589813232,
"step": 18
},
{
"epoch": 0.04576659038901602,
"grad_norm": 1.0763177871704102,
"learning_rate": 1.7272727272727273e-06,
"loss": 1.3052548170089722,
"step": 20
},
{
"epoch": 0.05034324942791762,
"grad_norm": 4.2185378074646,
"learning_rate": 1.909090909090909e-06,
"loss": 0.8882780075073242,
"step": 22
},
{
"epoch": 0.05491990846681922,
"grad_norm": 4.18178129196167,
"learning_rate": 2.0909090909090907e-06,
"loss": 1.695899248123169,
"step": 24
},
{
"epoch": 0.059496567505720827,
"grad_norm": 1.5718276500701904,
"learning_rate": 2.2727272727272728e-06,
"loss": 1.406914472579956,
"step": 26
},
{
"epoch": 0.06407322654462243,
"grad_norm": 4.563444137573242,
"learning_rate": 2.4545454545454544e-06,
"loss": 0.9809740781784058,
"step": 28
},
{
"epoch": 0.06864988558352403,
"grad_norm": 0.6998274922370911,
"learning_rate": 2.636363636363636e-06,
"loss": 1.5562105178833008,
"step": 30
},
{
"epoch": 0.07322654462242563,
"grad_norm": 0.8538845777511597,
"learning_rate": 2.818181818181818e-06,
"loss": 1.6105421781539917,
"step": 32
},
{
"epoch": 0.07780320366132723,
"grad_norm": 0.6970071196556091,
"learning_rate": 3e-06,
"loss": 1.4942922592163086,
"step": 34
},
{
"epoch": 0.08237986270022883,
"grad_norm": 0.6283139586448669,
"learning_rate": 3.1818181818181817e-06,
"loss": 1.4711008071899414,
"step": 36
},
{
"epoch": 0.08695652173913043,
"grad_norm": 3.1694211959838867,
"learning_rate": 3.3636363636363637e-06,
"loss": 1.3710073232650757,
"step": 38
},
{
"epoch": 0.09153318077803203,
"grad_norm": 0.9061569571495056,
"learning_rate": 3.5454545454545454e-06,
"loss": 1.3197435140609741,
"step": 40
},
{
"epoch": 0.09610983981693363,
"grad_norm": 1.3489203453063965,
"learning_rate": 3.727272727272727e-06,
"loss": 1.3319520950317383,
"step": 42
},
{
"epoch": 0.10068649885583524,
"grad_norm": 0.7797649502754211,
"learning_rate": 3.909090909090909e-06,
"loss": 1.093435287475586,
"step": 44
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.7038461565971375,
"learning_rate": 4.090909090909091e-06,
"loss": 1.39493989944458,
"step": 46
},
{
"epoch": 0.10983981693363844,
"grad_norm": 4.629183769226074,
"learning_rate": 4.272727272727272e-06,
"loss": 1.6254442930221558,
"step": 48
},
{
"epoch": 0.11441647597254005,
"grad_norm": 0.7052221894264221,
"learning_rate": 4.454545454545454e-06,
"loss": 1.4149081707000732,
"step": 50
},
{
"epoch": 0.11899313501144165,
"grad_norm": 0.6668927669525146,
"learning_rate": 4.636363636363636e-06,
"loss": 1.1638028621673584,
"step": 52
},
{
"epoch": 0.12356979405034325,
"grad_norm": 1.8050854206085205,
"learning_rate": 4.818181818181818e-06,
"loss": 0.893460750579834,
"step": 54
},
{
"epoch": 0.12814645308924486,
"grad_norm": 0.6511980295181274,
"learning_rate": 4.9999999999999996e-06,
"loss": 1.3653751611709595,
"step": 56
},
{
"epoch": 0.13272311212814644,
"grad_norm": 1.0147000551223755,
"learning_rate": 5.181818181818181e-06,
"loss": 0.8794707655906677,
"step": 58
},
{
"epoch": 0.13729977116704806,
"grad_norm": 0.7929437756538391,
"learning_rate": 5.363636363636363e-06,
"loss": 1.4038774967193604,
"step": 60
},
{
"epoch": 0.14187643020594964,
"grad_norm": 1.6879537105560303,
"learning_rate": 5.545454545454545e-06,
"loss": 1.3440090417861938,
"step": 62
},
{
"epoch": 0.14645308924485126,
"grad_norm": 2.4623799324035645,
"learning_rate": 5.727272727272727e-06,
"loss": 1.2824642658233643,
"step": 64
},
{
"epoch": 0.15102974828375287,
"grad_norm": 0.8197077512741089,
"learning_rate": 5.9090909090909085e-06,
"loss": 1.3210649490356445,
"step": 66
},
{
"epoch": 0.15560640732265446,
"grad_norm": 0.5857513546943665,
"learning_rate": 6.090909090909091e-06,
"loss": 1.3227065801620483,
"step": 68
},
{
"epoch": 0.16018306636155608,
"grad_norm": 1.8220713138580322,
"learning_rate": 6.272727272727273e-06,
"loss": 1.1438708305358887,
"step": 70
},
{
"epoch": 0.16475972540045766,
"grad_norm": 1.1689298152923584,
"learning_rate": 6.454545454545454e-06,
"loss": 1.2297066450119019,
"step": 72
},
{
"epoch": 0.16933638443935928,
"grad_norm": 0.6110695004463196,
"learning_rate": 6.636363636363637e-06,
"loss": 1.2812331914901733,
"step": 74
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.8178548216819763,
"learning_rate": 6.818181818181818e-06,
"loss": 1.1592620611190796,
"step": 76
},
{
"epoch": 0.17848970251716248,
"grad_norm": 0.551541268825531,
"learning_rate": 7e-06,
"loss": 1.3165788650512695,
"step": 78
},
{
"epoch": 0.18306636155606407,
"grad_norm": 0.6924233436584473,
"learning_rate": 7.1818181818181815e-06,
"loss": 0.9857466220855713,
"step": 80
},
{
"epoch": 0.18764302059496568,
"grad_norm": 0.8627240657806396,
"learning_rate": 7.363636363636363e-06,
"loss": 1.5733084678649902,
"step": 82
},
{
"epoch": 0.19221967963386727,
"grad_norm": 1.4400972127914429,
"learning_rate": 7.545454545454546e-06,
"loss": 1.2073043584823608,
"step": 84
},
{
"epoch": 0.19679633867276888,
"grad_norm": 0.751667320728302,
"learning_rate": 7.727272727272727e-06,
"loss": 1.3525782823562622,
"step": 86
},
{
"epoch": 0.20137299771167047,
"grad_norm": 0.982463002204895,
"learning_rate": 7.909090909090909e-06,
"loss": 1.0466015338897705,
"step": 88
},
{
"epoch": 0.20594965675057209,
"grad_norm": 1.4768906831741333,
"learning_rate": 7.999993553025613e-06,
"loss": 1.5740259885787964,
"step": 90
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.8806318640708923,
"learning_rate": 7.999941977369066e-06,
"loss": 1.0948330163955688,
"step": 92
},
{
"epoch": 0.2151029748283753,
"grad_norm": 0.8388352394104004,
"learning_rate": 7.999838826794875e-06,
"loss": 1.3860080242156982,
"step": 94
},
{
"epoch": 0.21967963386727687,
"grad_norm": 1.9620765447616577,
"learning_rate": 7.999684102780836e-06,
"loss": 0.6549928784370422,
"step": 96
},
{
"epoch": 0.2242562929061785,
"grad_norm": 1.8647589683532715,
"learning_rate": 7.99947780754362e-06,
"loss": 1.1090216636657715,
"step": 98
},
{
"epoch": 0.2288329519450801,
"grad_norm": 0.47244909405708313,
"learning_rate": 7.999219944038733e-06,
"loss": 1.3478974103927612,
"step": 100
},
{
"epoch": 0.2334096109839817,
"grad_norm": 1.5291101932525635,
"learning_rate": 7.998910515960482e-06,
"loss": 1.2799420356750488,
"step": 102
},
{
"epoch": 0.2379862700228833,
"grad_norm": 0.7091822028160095,
"learning_rate": 7.99854952774192e-06,
"loss": 1.0800800323486328,
"step": 104
},
{
"epoch": 0.2425629290617849,
"grad_norm": 0.5217757225036621,
"learning_rate": 7.99813698455478e-06,
"loss": 1.2569258213043213,
"step": 106
},
{
"epoch": 0.2471395881006865,
"grad_norm": 0.8455982208251953,
"learning_rate": 7.997672892309399e-06,
"loss": 1.2004809379577637,
"step": 108
},
{
"epoch": 0.2517162471395881,
"grad_norm": 1.3651947975158691,
"learning_rate": 7.997157257654642e-06,
"loss": 0.9266209006309509,
"step": 110
},
{
"epoch": 0.2562929061784897,
"grad_norm": 2.029172658920288,
"learning_rate": 7.996590087977799e-06,
"loss": 0.6148236989974976,
"step": 112
},
{
"epoch": 0.2608695652173913,
"grad_norm": 1.3701057434082031,
"learning_rate": 7.995971391404479e-06,
"loss": 1.3206911087036133,
"step": 114
},
{
"epoch": 0.2654462242562929,
"grad_norm": 1.0008249282836914,
"learning_rate": 7.9953011767985e-06,
"loss": 1.5282856225967407,
"step": 116
},
{
"epoch": 0.2700228832951945,
"grad_norm": 1.6718090772628784,
"learning_rate": 7.994579453761756e-06,
"loss": 1.2926162481307983,
"step": 118
},
{
"epoch": 0.2745995423340961,
"grad_norm": 1.2445013523101807,
"learning_rate": 7.993806232634083e-06,
"loss": 1.14131498336792,
"step": 120
},
{
"epoch": 0.2791762013729977,
"grad_norm": 0.8166568279266357,
"learning_rate": 7.992981524493107e-06,
"loss": 0.9494431614875793,
"step": 122
},
{
"epoch": 0.2837528604118993,
"grad_norm": 1.60244882106781,
"learning_rate": 7.992105341154091e-06,
"loss": 1.0895594358444214,
"step": 124
},
{
"epoch": 0.28832951945080093,
"grad_norm": 0.7963911890983582,
"learning_rate": 7.99117769516976e-06,
"loss": 1.2929226160049438,
"step": 126
},
{
"epoch": 0.2929061784897025,
"grad_norm": 1.1980226039886475,
"learning_rate": 7.990198599830122e-06,
"loss": 1.0238358974456787,
"step": 128
},
{
"epoch": 0.2974828375286041,
"grad_norm": 0.8094168901443481,
"learning_rate": 7.989168069162285e-06,
"loss": 1.2957402467727661,
"step": 130
},
{
"epoch": 0.30205949656750575,
"grad_norm": 0.6273795366287231,
"learning_rate": 7.988086117930241e-06,
"loss": 1.316016435623169,
"step": 132
},
{
"epoch": 0.30663615560640733,
"grad_norm": 1.0645545721054077,
"learning_rate": 7.986952761634676e-06,
"loss": 0.9111831784248352,
"step": 134
},
{
"epoch": 0.3112128146453089,
"grad_norm": 0.4852690100669861,
"learning_rate": 7.985768016512724e-06,
"loss": 1.2701897621154785,
"step": 136
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.6316735744476318,
"learning_rate": 7.984531899537751e-06,
"loss": 1.3380374908447266,
"step": 138
},
{
"epoch": 0.32036613272311215,
"grad_norm": 0.6724331378936768,
"learning_rate": 7.98324442841911e-06,
"loss": 1.0774213075637817,
"step": 140
},
{
"epoch": 0.32494279176201374,
"grad_norm": 0.5823185443878174,
"learning_rate": 7.981905621601877e-06,
"loss": 1.2555238008499146,
"step": 142
},
{
"epoch": 0.3295194508009153,
"grad_norm": 0.6299719214439392,
"learning_rate": 7.9805154982666e-06,
"loss": 0.9718642830848694,
"step": 144
},
{
"epoch": 0.3340961098398169,
"grad_norm": 0.7177326679229736,
"learning_rate": 7.979074078329013e-06,
"loss": 1.0789949893951416,
"step": 146
},
{
"epoch": 0.33867276887871856,
"grad_norm": 1.0502492189407349,
"learning_rate": 7.977581382439763e-06,
"loss": 0.8612786531448364,
"step": 148
},
{
"epoch": 0.34324942791762014,
"grad_norm": 0.8035002946853638,
"learning_rate": 7.976037431984097e-06,
"loss": 1.327546238899231,
"step": 150
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.8929418325424194,
"learning_rate": 7.974442249081574e-06,
"loss": 0.7809689044952393,
"step": 152
},
{
"epoch": 0.3524027459954233,
"grad_norm": 0.9010236859321594,
"learning_rate": 7.972795856585738e-06,
"loss": 1.2843378782272339,
"step": 154
},
{
"epoch": 0.35697940503432496,
"grad_norm": 0.834439218044281,
"learning_rate": 7.971098278083786e-06,
"loss": 1.1299086809158325,
"step": 156
},
{
"epoch": 0.36155606407322655,
"grad_norm": 0.6221197247505188,
"learning_rate": 7.969349537896246e-06,
"loss": 1.282414436340332,
"step": 158
},
{
"epoch": 0.36613272311212813,
"grad_norm": 0.8410677313804626,
"learning_rate": 7.96754966107661e-06,
"loss": 1.2675057649612427,
"step": 160
},
{
"epoch": 0.3707093821510298,
"grad_norm": 0.9409856796264648,
"learning_rate": 7.965698673410988e-06,
"loss": 0.917771577835083,
"step": 162
},
{
"epoch": 0.37528604118993136,
"grad_norm": 0.4894721210002899,
"learning_rate": 7.963796601417737e-06,
"loss": 1.2448134422302246,
"step": 164
},
{
"epoch": 0.37986270022883295,
"grad_norm": 0.4557670056819916,
"learning_rate": 7.961843472347074e-06,
"loss": 1.253254771232605,
"step": 166
},
{
"epoch": 0.38443935926773454,
"grad_norm": 0.455538272857666,
"learning_rate": 7.959839314180691e-06,
"loss": 1.2726975679397583,
"step": 168
},
{
"epoch": 0.3890160183066362,
"grad_norm": 0.6479983329772949,
"learning_rate": 7.957784155631355e-06,
"loss": 0.9780842065811157,
"step": 170
},
{
"epoch": 0.39359267734553777,
"grad_norm": 0.7144932150840759,
"learning_rate": 7.955678026142495e-06,
"loss": 1.0534026622772217,
"step": 172
},
{
"epoch": 0.39816933638443935,
"grad_norm": 0.6702520251274109,
"learning_rate": 7.95352095588778e-06,
"loss": 0.9748213887214661,
"step": 174
},
{
"epoch": 0.40274599542334094,
"grad_norm": 0.6281000971794128,
"learning_rate": 7.951312975770682e-06,
"loss": 1.2368974685668945,
"step": 176
},
{
"epoch": 0.4073226544622426,
"grad_norm": 1.0416147708892822,
"learning_rate": 7.949054117424044e-06,
"loss": 0.9652445316314697,
"step": 178
},
{
"epoch": 0.41189931350114417,
"grad_norm": 0.36721283197402954,
"learning_rate": 7.946744413209623e-06,
"loss": 1.143812656402588,
"step": 180
},
{
"epoch": 0.41647597254004576,
"grad_norm": 0.7831065654754639,
"learning_rate": 7.944383896217614e-06,
"loss": 1.2468208074569702,
"step": 182
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.5314235687255859,
"learning_rate": 7.941972600266196e-06,
"loss": 1.23768150806427,
"step": 184
},
{
"epoch": 0.425629290617849,
"grad_norm": 1.5851482152938843,
"learning_rate": 7.939510559901035e-06,
"loss": 1.0880942344665527,
"step": 186
},
{
"epoch": 0.4302059496567506,
"grad_norm": 0.5965549349784851,
"learning_rate": 7.936997810394788e-06,
"loss": 1.32969069480896,
"step": 188
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.8173778653144836,
"learning_rate": 7.934434387746609e-06,
"loss": 1.1866960525512695,
"step": 190
},
{
"epoch": 0.43935926773455375,
"grad_norm": 1.3977761268615723,
"learning_rate": 7.931820328681615e-06,
"loss": 0.6947106719017029,
"step": 192
},
{
"epoch": 0.4439359267734554,
"grad_norm": 0.6947522759437561,
"learning_rate": 7.92915567065038e-06,
"loss": 1.0972782373428345,
"step": 194
},
{
"epoch": 0.448512585812357,
"grad_norm": 0.48104986548423767,
"learning_rate": 7.926440451828384e-06,
"loss": 1.2536075115203857,
"step": 196
},
{
"epoch": 0.45308924485125857,
"grad_norm": 0.4325115382671356,
"learning_rate": 7.923674711115476e-06,
"loss": 1.0892629623413086,
"step": 198
},
{
"epoch": 0.4576659038901602,
"grad_norm": 1.5888208150863647,
"learning_rate": 7.920858488135305e-06,
"loss": 1.1664975881576538,
"step": 200
},
{
"epoch": 0.4622425629290618,
"grad_norm": 1.1934003829956055,
"learning_rate": 7.917991823234762e-06,
"loss": 1.325711965560913,
"step": 202
},
{
"epoch": 0.4668192219679634,
"grad_norm": 0.9322173595428467,
"learning_rate": 7.915074757483403e-06,
"loss": 1.133060097694397,
"step": 204
},
{
"epoch": 0.47139588100686497,
"grad_norm": 1.1125494241714478,
"learning_rate": 7.91210733267285e-06,
"loss": 1.2609753608703613,
"step": 206
},
{
"epoch": 0.4759725400457666,
"grad_norm": 0.402997225522995,
"learning_rate": 7.909089591316204e-06,
"loss": 1.1940068006515503,
"step": 208
},
{
"epoch": 0.4805491990846682,
"grad_norm": 2.592806339263916,
"learning_rate": 7.906021576647428e-06,
"loss": 0.990831732749939,
"step": 210
},
{
"epoch": 0.4851258581235698,
"grad_norm": 2.9936859607696533,
"learning_rate": 7.902903332620733e-06,
"loss": 1.0334012508392334,
"step": 212
},
{
"epoch": 0.4897025171624714,
"grad_norm": 0.9770790338516235,
"learning_rate": 7.89973490390994e-06,
"loss": 0.8017320036888123,
"step": 214
},
{
"epoch": 0.494279176201373,
"grad_norm": 0.7555065751075745,
"learning_rate": 7.896516335907856e-06,
"loss": 0.8924547433853149,
"step": 216
},
{
"epoch": 0.4988558352402746,
"grad_norm": 0.6128323674201965,
"learning_rate": 7.893247674725605e-06,
"loss": 1.4787169694900513,
"step": 218
},
{
"epoch": 0.5034324942791762,
"grad_norm": 1.4939192533493042,
"learning_rate": 7.889928967191976e-06,
"loss": 0.9237180948257446,
"step": 220
},
{
"epoch": 0.5080091533180778,
"grad_norm": 1.1495096683502197,
"learning_rate": 7.886560260852757e-06,
"loss": 1.079113483428955,
"step": 222
},
{
"epoch": 0.5125858123569794,
"grad_norm": 0.7052268981933594,
"learning_rate": 7.883141603970044e-06,
"loss": 1.2029013633728027,
"step": 224
},
{
"epoch": 0.517162471395881,
"grad_norm": 0.6179990768432617,
"learning_rate": 7.879673045521558e-06,
"loss": 1.2771333456039429,
"step": 226
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.7824820280075073,
"learning_rate": 7.876154635199936e-06,
"loss": 1.0359081029891968,
"step": 228
},
{
"epoch": 0.5263157894736842,
"grad_norm": 1.3128830194473267,
"learning_rate": 7.872586423412026e-06,
"loss": 0.9820423722267151,
"step": 230
},
{
"epoch": 0.5308924485125858,
"grad_norm": 1.6290223598480225,
"learning_rate": 7.868968461278157e-06,
"loss": 1.2495921850204468,
"step": 232
},
{
"epoch": 0.5354691075514875,
"grad_norm": 0.8125019669532776,
"learning_rate": 7.865300800631418e-06,
"loss": 1.2157059907913208,
"step": 234
},
{
"epoch": 0.540045766590389,
"grad_norm": 0.6958226561546326,
"learning_rate": 7.861583494016904e-06,
"loss": 1.1479324102401733,
"step": 236
},
{
"epoch": 0.5446224256292906,
"grad_norm": 0.6316367387771606,
"learning_rate": 7.857816594690967e-06,
"loss": 1.2824212312698364,
"step": 238
},
{
"epoch": 0.5491990846681922,
"grad_norm": 0.7441359162330627,
"learning_rate": 7.854000156620456e-06,
"loss": 1.2517260313034058,
"step": 240
},
{
"epoch": 0.5537757437070938,
"grad_norm": 0.4228470027446747,
"learning_rate": 7.85013423448194e-06,
"loss": 1.5795619487762451,
"step": 242
},
{
"epoch": 0.5583524027459954,
"grad_norm": 0.5715126991271973,
"learning_rate": 7.846218883660927e-06,
"loss": 1.2237060070037842,
"step": 244
},
{
"epoch": 0.562929061784897,
"grad_norm": 1.9045436382293701,
"learning_rate": 7.842254160251073e-06,
"loss": 1.079658031463623,
"step": 246
},
{
"epoch": 0.5675057208237986,
"grad_norm": 1.443070411682129,
"learning_rate": 7.838240121053368e-06,
"loss": 0.7990419268608093,
"step": 248
},
{
"epoch": 0.5720823798627003,
"grad_norm": 0.6427567005157471,
"learning_rate": 7.834176823575338e-06,
"loss": 1.2759908437728882,
"step": 250
},
{
"epoch": 0.5766590389016019,
"grad_norm": 0.8063530325889587,
"learning_rate": 7.830064326030206e-06,
"loss": 1.292599081993103,
"step": 252
},
{
"epoch": 0.5812356979405034,
"grad_norm": 0.7218191623687744,
"learning_rate": 7.825902687336065e-06,
"loss": 1.2685648202896118,
"step": 254
},
{
"epoch": 0.585812356979405,
"grad_norm": 0.25028905272483826,
"learning_rate": 7.821691967115038e-06,
"loss": 1.4054419994354248,
"step": 256
},
{
"epoch": 0.5903890160183066,
"grad_norm": 0.6193069815635681,
"learning_rate": 7.817432225692415e-06,
"loss": 1.2499405145645142,
"step": 258
},
{
"epoch": 0.5949656750572082,
"grad_norm": 0.5317597985267639,
"learning_rate": 7.813123524095793e-06,
"loss": 0.9288328289985657,
"step": 260
},
{
"epoch": 0.5995423340961098,
"grad_norm": 0.4047868549823761,
"learning_rate": 7.808765924054205e-06,
"loss": 0.9483444690704346,
"step": 262
},
{
"epoch": 0.6041189931350115,
"grad_norm": 0.5687839388847351,
"learning_rate": 7.80435948799723e-06,
"loss": 1.1598079204559326,
"step": 264
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.48621973395347595,
"learning_rate": 7.799904279054102e-06,
"loss": 1.3577367067337036,
"step": 266
},
{
"epoch": 0.6132723112128147,
"grad_norm": 0.47008511424064636,
"learning_rate": 7.795400361052801e-06,
"loss": 0.8724009394645691,
"step": 268
},
{
"epoch": 0.6178489702517163,
"grad_norm": 0.4799520969390869,
"learning_rate": 7.790847798519149e-06,
"loss": 1.2594244480133057,
"step": 270
},
{
"epoch": 0.6224256292906178,
"grad_norm": 2.5323190689086914,
"learning_rate": 7.78624665667587e-06,
"loss": 1.0330383777618408,
"step": 272
},
{
"epoch": 0.6270022883295194,
"grad_norm": 0.4832422137260437,
"learning_rate": 7.781597001441669e-06,
"loss": 0.9544627666473389,
"step": 274
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.6891815066337585,
"learning_rate": 7.776898899430286e-06,
"loss": 0.8545240759849548,
"step": 276
},
{
"epoch": 0.6361556064073226,
"grad_norm": 0.9485855102539062,
"learning_rate": 7.772152417949531e-06,
"loss": 0.829641580581665,
"step": 278
},
{
"epoch": 0.6407322654462243,
"grad_norm": 1.344787359237671,
"learning_rate": 7.767357625000333e-06,
"loss": 0.8731590509414673,
"step": 280
},
{
"epoch": 0.6453089244851259,
"grad_norm": 1.829545497894287,
"learning_rate": 7.762514589275758e-06,
"loss": 0.6393548846244812,
"step": 282
},
{
"epoch": 0.6498855835240275,
"grad_norm": 0.5783978700637817,
"learning_rate": 7.757623380160026e-06,
"loss": 1.0918872356414795,
"step": 284
},
{
"epoch": 0.6544622425629291,
"grad_norm": 0.5951263904571533,
"learning_rate": 7.752684067727519e-06,
"loss": 1.551460862159729,
"step": 286
},
{
"epoch": 0.6590389016018307,
"grad_norm": 0.9634541273117065,
"learning_rate": 7.747696722741773e-06,
"loss": 0.8825801014900208,
"step": 288
},
{
"epoch": 0.6636155606407322,
"grad_norm": 0.8059589862823486,
"learning_rate": 7.742661416654473e-06,
"loss": 0.9882611036300659,
"step": 290
},
{
"epoch": 0.6681922196796338,
"grad_norm": 0.5588476657867432,
"learning_rate": 7.737578221604416e-06,
"loss": 1.2011680603027344,
"step": 292
},
{
"epoch": 0.6727688787185355,
"grad_norm": 0.9811854362487793,
"learning_rate": 7.732447210416492e-06,
"loss": 1.2716686725616455,
"step": 294
},
{
"epoch": 0.6773455377574371,
"grad_norm": 1.7746957540512085,
"learning_rate": 7.727268456600627e-06,
"loss": 0.8755344748497009,
"step": 296
},
{
"epoch": 0.6819221967963387,
"grad_norm": 0.6389427185058594,
"learning_rate": 7.722042034350742e-06,
"loss": 1.26163649559021,
"step": 298
},
{
"epoch": 0.6864988558352403,
"grad_norm": 0.6335075497627258,
"learning_rate": 7.71676801854368e-06,
"loss": 1.2194799184799194,
"step": 300
},
{
"epoch": 0.6910755148741419,
"grad_norm": 0.6459981799125671,
"learning_rate": 7.711446484738143e-06,
"loss": 1.0250097513198853,
"step": 302
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.382996916770935,
"learning_rate": 7.706077509173595e-06,
"loss": 0.839065432548523,
"step": 304
},
{
"epoch": 0.700228832951945,
"grad_norm": 1.7975640296936035,
"learning_rate": 7.70066116876919e-06,
"loss": 1.1089591979980469,
"step": 306
},
{
"epoch": 0.7048054919908466,
"grad_norm": 1.6443461179733276,
"learning_rate": 7.69519754112265e-06,
"loss": 1.1687304973602295,
"step": 308
},
{
"epoch": 0.7093821510297483,
"grad_norm": 3.3438291549682617,
"learning_rate": 7.68968670450917e-06,
"loss": 0.9455866813659668,
"step": 310
},
{
"epoch": 0.7139588100686499,
"grad_norm": 1.3685333728790283,
"learning_rate": 7.68412873788028e-06,
"loss": 1.100264310836792,
"step": 312
},
{
"epoch": 0.7185354691075515,
"grad_norm": 0.9479398131370544,
"learning_rate": 7.678523720862733e-06,
"loss": 1.0691652297973633,
"step": 314
},
{
"epoch": 0.7231121281464531,
"grad_norm": 1.0067410469055176,
"learning_rate": 7.672871733757345e-06,
"loss": 1.057770013809204,
"step": 316
},
{
"epoch": 0.7276887871853547,
"grad_norm": 1.1182042360305786,
"learning_rate": 7.667172857537857e-06,
"loss": 0.6116782426834106,
"step": 318
},
{
"epoch": 0.7322654462242563,
"grad_norm": 0.6759845614433289,
"learning_rate": 7.661427173849773e-06,
"loss": 0.9427492022514343,
"step": 320
},
{
"epoch": 0.7368421052631579,
"grad_norm": 1.3793621063232422,
"learning_rate": 7.655634765009187e-06,
"loss": 0.8351959586143494,
"step": 322
},
{
"epoch": 0.7414187643020596,
"grad_norm": 0.438629686832428,
"learning_rate": 7.649795714001604e-06,
"loss": 1.2085388898849487,
"step": 324
},
{
"epoch": 0.7459954233409611,
"grad_norm": 0.7775664329528809,
"learning_rate": 7.643910104480756e-06,
"loss": 1.1938120126724243,
"step": 326
},
{
"epoch": 0.7505720823798627,
"grad_norm": 0.49513670802116394,
"learning_rate": 7.637978020767396e-06,
"loss": 1.266683578491211,
"step": 328
},
{
"epoch": 0.7551487414187643,
"grad_norm": 0.5126597881317139,
"learning_rate": 7.631999547848101e-06,
"loss": 1.2423049211502075,
"step": 330
},
{
"epoch": 0.7597254004576659,
"grad_norm": 0.5846846103668213,
"learning_rate": 7.6259747713740375e-06,
"loss": 1.1801856756210327,
"step": 332
},
{
"epoch": 0.7643020594965675,
"grad_norm": 0.812533974647522,
"learning_rate": 7.619903777659752e-06,
"loss": 1.1939353942871094,
"step": 334
},
{
"epoch": 0.7688787185354691,
"grad_norm": 0.6008883118629456,
"learning_rate": 7.613786653681925e-06,
"loss": 1.2463386058807373,
"step": 336
},
{
"epoch": 0.7734553775743707,
"grad_norm": 0.888378381729126,
"learning_rate": 7.6076234870781235e-06,
"loss": 1.2093459367752075,
"step": 338
},
{
"epoch": 0.7780320366132724,
"grad_norm": 0.39801281690597534,
"learning_rate": 7.601414366145554e-06,
"loss": 1.2171316146850586,
"step": 340
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.5358569622039795,
"learning_rate": 7.5951593798397864e-06,
"loss": 1.2029497623443604,
"step": 342
},
{
"epoch": 0.7871853546910755,
"grad_norm": 0.7264346480369568,
"learning_rate": 7.588858617773492e-06,
"loss": 1.1039307117462158,
"step": 344
},
{
"epoch": 0.7917620137299771,
"grad_norm": 0.641931414604187,
"learning_rate": 7.582512170215146e-06,
"loss": 1.2453477382659912,
"step": 346
},
{
"epoch": 0.7963386727688787,
"grad_norm": 0.9022382497787476,
"learning_rate": 7.5761201280877445e-06,
"loss": 1.0837339162826538,
"step": 348
},
{
"epoch": 0.8009153318077803,
"grad_norm": 0.6649481654167175,
"learning_rate": 7.569682582967502e-06,
"loss": 1.307905673980713,
"step": 350
},
{
"epoch": 0.8054919908466819,
"grad_norm": 0.45646166801452637,
"learning_rate": 7.563199627082528e-06,
"loss": 1.236507534980774,
"step": 352
},
{
"epoch": 0.8100686498855835,
"grad_norm": 1.7149626016616821,
"learning_rate": 7.5566713533115215e-06,
"loss": 1.1184428930282593,
"step": 354
},
{
"epoch": 0.8146453089244852,
"grad_norm": 2.212782144546509,
"learning_rate": 7.550097855182428e-06,
"loss": 0.9626376628875732,
"step": 356
},
{
"epoch": 0.8192219679633868,
"grad_norm": 0.4924994111061096,
"learning_rate": 7.543479226871106e-06,
"loss": 1.239965796470642,
"step": 358
},
{
"epoch": 0.8237986270022883,
"grad_norm": 0.8329795002937317,
"learning_rate": 7.536815563199976e-06,
"loss": 1.0326392650604248,
"step": 360
},
{
"epoch": 0.8283752860411899,
"grad_norm": 0.9352472424507141,
"learning_rate": 7.530106959636661e-06,
"loss": 1.404736876487732,
"step": 362
},
{
"epoch": 0.8329519450800915,
"grad_norm": 0.575380265712738,
"learning_rate": 7.523353512292619e-06,
"loss": 1.152444839477539,
"step": 364
},
{
"epoch": 0.8375286041189931,
"grad_norm": 0.49745839834213257,
"learning_rate": 7.51655531792177e-06,
"loss": 1.269639015197754,
"step": 366
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.8237262964248657,
"learning_rate": 7.509712473919102e-06,
"loss": 1.484926462173462,
"step": 368
},
{
"epoch": 0.8466819221967964,
"grad_norm": 0.4979184865951538,
"learning_rate": 7.502825078319286e-06,
"loss": 1.1458359956741333,
"step": 370
},
{
"epoch": 0.851258581235698,
"grad_norm": 0.8595190644264221,
"learning_rate": 7.495893229795259e-06,
"loss": 1.0554736852645874,
"step": 372
},
{
"epoch": 0.8558352402745996,
"grad_norm": 0.35459527373313904,
"learning_rate": 7.488917027656824e-06,
"loss": 1.0308196544647217,
"step": 374
},
{
"epoch": 0.8604118993135011,
"grad_norm": 0.5745582580566406,
"learning_rate": 7.481896571849214e-06,
"loss": 0.8475030660629272,
"step": 376
},
{
"epoch": 0.8649885583524027,
"grad_norm": 0.4970310628414154,
"learning_rate": 7.4748319629516725e-06,
"loss": 1.235321283340454,
"step": 378
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.52949059009552,
"learning_rate": 7.467723302176002e-06,
"loss": 1.2889394760131836,
"step": 380
},
{
"epoch": 0.8741418764302059,
"grad_norm": 0.4958958625793457,
"learning_rate": 7.46057069136512e-06,
"loss": 1.2626429796218872,
"step": 382
},
{
"epoch": 0.8787185354691075,
"grad_norm": 0.49720507860183716,
"learning_rate": 7.453374232991599e-06,
"loss": 1.2046830654144287,
"step": 384
},
{
"epoch": 0.8832951945080092,
"grad_norm": 0.36593207716941833,
"learning_rate": 7.446134030156197e-06,
"loss": 1.0390335321426392,
"step": 386
},
{
"epoch": 0.8878718535469108,
"grad_norm": 0.49038416147232056,
"learning_rate": 7.438850186586382e-06,
"loss": 1.2193632125854492,
"step": 388
},
{
"epoch": 0.8924485125858124,
"grad_norm": 0.5042470097541809,
"learning_rate": 7.431522806634845e-06,
"loss": 1.2251640558242798,
"step": 390
},
{
"epoch": 0.897025171624714,
"grad_norm": 0.4519674777984619,
"learning_rate": 7.424151995278005e-06,
"loss": 0.8979975581169128,
"step": 392
},
{
"epoch": 0.9016018306636155,
"grad_norm": 0.8786867260932922,
"learning_rate": 7.416737858114503e-06,
"loss": 1.0744414329528809,
"step": 394
},
{
"epoch": 0.9061784897025171,
"grad_norm": 0.5781663656234741,
"learning_rate": 7.409280501363697e-06,
"loss": 0.9060631990432739,
"step": 396
},
{
"epoch": 0.9107551487414187,
"grad_norm": 0.6782101392745972,
"learning_rate": 7.4017800318641296e-06,
"loss": 1.0410590171813965,
"step": 398
},
{
"epoch": 0.9153318077803204,
"grad_norm": 0.7507611513137817,
"learning_rate": 7.394236557072005e-06,
"loss": 0.9724853038787842,
"step": 400
},
{
"epoch": 0.919908466819222,
"grad_norm": 0.8260324001312256,
"learning_rate": 7.386650185059644e-06,
"loss": 1.1944291591644287,
"step": 402
},
{
"epoch": 0.9244851258581236,
"grad_norm": 0.37671464681625366,
"learning_rate": 7.379021024513942e-06,
"loss": 1.2070642709732056,
"step": 404
},
{
"epoch": 0.9290617848970252,
"grad_norm": 0.39617058634757996,
"learning_rate": 7.371349184734808e-06,
"loss": 1.1787455081939697,
"step": 406
},
{
"epoch": 0.9336384439359268,
"grad_norm": 5.384557723999023,
"learning_rate": 7.3636347756335965e-06,
"loss": 0.9677321910858154,
"step": 408
},
{
"epoch": 0.9382151029748284,
"grad_norm": 0.4886989891529083,
"learning_rate": 7.355877907731536e-06,
"loss": 0.9373984932899475,
"step": 410
},
{
"epoch": 0.9427917620137299,
"grad_norm": 0.4791018068790436,
"learning_rate": 7.34807869215815e-06,
"loss": 1.203087329864502,
"step": 412
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.605128288269043,
"learning_rate": 7.340237240649653e-06,
"loss": 0.9440658688545227,
"step": 414
},
{
"epoch": 0.9519450800915332,
"grad_norm": 0.6311787962913513,
"learning_rate": 7.3323536655473606e-06,
"loss": 1.196823000907898,
"step": 416
},
{
"epoch": 0.9565217391304348,
"grad_norm": 1.343970775604248,
"learning_rate": 7.324428079796077e-06,
"loss": 0.7435826063156128,
"step": 418
},
{
"epoch": 0.9610983981693364,
"grad_norm": 0.4175608158111572,
"learning_rate": 7.316460596942473e-06,
"loss": 1.2795171737670898,
"step": 420
},
{
"epoch": 0.965675057208238,
"grad_norm": 20.577402114868164,
"learning_rate": 7.308451331133465e-06,
"loss": 0.8990521430969238,
"step": 422
},
{
"epoch": 0.9702517162471396,
"grad_norm": 0.568890392780304,
"learning_rate": 7.3004003971145765e-06,
"loss": 1.1218894720077515,
"step": 424
},
{
"epoch": 0.9748283752860412,
"grad_norm": 0.489161878824234,
"learning_rate": 7.292307910228291e-06,
"loss": 0.8210854530334473,
"step": 426
},
{
"epoch": 0.9794050343249427,
"grad_norm": 0.6115430593490601,
"learning_rate": 7.28417398641241e-06,
"loss": 0.9877936244010925,
"step": 428
},
{
"epoch": 0.9839816933638444,
"grad_norm": 0.7786864638328552,
"learning_rate": 7.275998742198379e-06,
"loss": 1.2866984605789185,
"step": 430
},
{
"epoch": 0.988558352402746,
"grad_norm": 1.2606936693191528,
"learning_rate": 7.267782294709628e-06,
"loss": 1.2489262819290161,
"step": 432
},
{
"epoch": 0.9931350114416476,
"grad_norm": 0.37724900245666504,
"learning_rate": 7.259524761659886e-06,
"loss": 1.177677869796753,
"step": 434
},
{
"epoch": 0.9977116704805492,
"grad_norm": 0.5038094520568848,
"learning_rate": 7.251226261351502e-06,
"loss": 0.8965859413146973,
"step": 436
},
{
"epoch": 1.002288329519451,
"grad_norm": 0.5005406141281128,
"learning_rate": 7.242886912673746e-06,
"loss": 1.2278937101364136,
"step": 438
},
{
"epoch": 1.0068649885583525,
"grad_norm": 0.7988545298576355,
"learning_rate": 7.234506835101103e-06,
"loss": 0.7848602533340454,
"step": 440
},
{
"epoch": 1.011441647597254,
"grad_norm": 0.7789489030838013,
"learning_rate": 7.22608614869157e-06,
"loss": 0.7383415699005127,
"step": 442
},
{
"epoch": 1.0160183066361557,
"grad_norm": 0.8829706311225891,
"learning_rate": 7.217624974084921e-06,
"loss": 1.0122401714324951,
"step": 444
},
{
"epoch": 1.0205949656750573,
"grad_norm": 0.8735617399215698,
"learning_rate": 7.209123432501e-06,
"loss": 0.7811689376831055,
"step": 446
},
{
"epoch": 1.0251716247139588,
"grad_norm": 2.609708309173584,
"learning_rate": 7.2005816457379634e-06,
"loss": 0.4612530767917633,
"step": 448
},
{
"epoch": 1.0297482837528604,
"grad_norm": 0.750656008720398,
"learning_rate": 7.191999736170548e-06,
"loss": 0.7795161008834839,
"step": 450
},
{
"epoch": 1.034324942791762,
"grad_norm": 2.268953800201416,
"learning_rate": 7.183377826748313e-06,
"loss": 0.902981698513031,
"step": 452
},
{
"epoch": 1.0389016018306636,
"grad_norm": 0.6813761591911316,
"learning_rate": 7.174716040993879e-06,
"loss": 1.0745810270309448,
"step": 454
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.7869388461112976,
"learning_rate": 7.166014503001159e-06,
"loss": 0.9484665393829346,
"step": 456
},
{
"epoch": 1.0480549199084668,
"grad_norm": 1.6086623668670654,
"learning_rate": 7.15727333743358e-06,
"loss": 1.0312416553497314,
"step": 458
},
{
"epoch": 1.0526315789473684,
"grad_norm": 1.016251564025879,
"learning_rate": 7.148492669522301e-06,
"loss": 0.8899783492088318,
"step": 460
},
{
"epoch": 1.05720823798627,
"grad_norm": 1.0114802122116089,
"learning_rate": 7.139672625064407e-06,
"loss": 0.6136757731437683,
"step": 462
},
{
"epoch": 1.0617848970251715,
"grad_norm": 1.2711036205291748,
"learning_rate": 7.130813330421122e-06,
"loss": 0.650338888168335,
"step": 464
},
{
"epoch": 1.0663615560640731,
"grad_norm": 0.5720494985580444,
"learning_rate": 7.12191491251599e-06,
"loss": 1.0441478490829468,
"step": 466
},
{
"epoch": 1.070938215102975,
"grad_norm": 1.6055749654769897,
"learning_rate": 7.112977498833056e-06,
"loss": 0.6316787004470825,
"step": 468
},
{
"epoch": 1.0755148741418765,
"grad_norm": 1.1760274171829224,
"learning_rate": 7.104001217415046e-06,
"loss": 1.0118086338043213,
"step": 470
},
{
"epoch": 1.080091533180778,
"grad_norm": 1.1817903518676758,
"learning_rate": 7.094986196861522e-06,
"loss": 0.8072628378868103,
"step": 472
},
{
"epoch": 1.0846681922196797,
"grad_norm": 0.6759648323059082,
"learning_rate": 7.085932566327053e-06,
"loss": 1.1159520149230957,
"step": 474
},
{
"epoch": 1.0892448512585813,
"grad_norm": 0.4874918758869171,
"learning_rate": 7.076840455519351e-06,
"loss": 0.8690568804740906,
"step": 476
},
{
"epoch": 1.0938215102974829,
"grad_norm": 0.3136481046676636,
"learning_rate": 7.067709994697427e-06,
"loss": 0.6553730964660645,
"step": 478
},
{
"epoch": 1.0983981693363845,
"grad_norm": 2.1855392456054688,
"learning_rate": 7.058541314669709e-06,
"loss": 0.8631330728530884,
"step": 480
},
{
"epoch": 1.102974828375286,
"grad_norm": 0.570698082447052,
"learning_rate": 7.049334546792182e-06,
"loss": 0.9237059950828552,
"step": 482
},
{
"epoch": 1.1075514874141876,
"grad_norm": 0.5239076614379883,
"learning_rate": 7.040089822966498e-06,
"loss": 1.1490368843078613,
"step": 484
},
{
"epoch": 1.1121281464530892,
"grad_norm": 0.6927589774131775,
"learning_rate": 7.030807275638089e-06,
"loss": 0.9833446741104126,
"step": 486
},
{
"epoch": 1.1167048054919908,
"grad_norm": 0.7562556266784668,
"learning_rate": 7.0214870377942695e-06,
"loss": 0.9206014275550842,
"step": 488
},
{
"epoch": 1.1212814645308924,
"grad_norm": 0.7045158743858337,
"learning_rate": 7.012129242962328e-06,
"loss": 0.9196767807006836,
"step": 490
},
{
"epoch": 1.125858123569794,
"grad_norm": 0.2471354752779007,
"learning_rate": 7.0027340252076204e-06,
"loss": 0.8276454210281372,
"step": 492
},
{
"epoch": 1.1304347826086956,
"grad_norm": 1.010713815689087,
"learning_rate": 6.9933015191316456e-06,
"loss": 0.7970508337020874,
"step": 494
},
{
"epoch": 1.1350114416475972,
"grad_norm": 0.7042145729064941,
"learning_rate": 6.983831859870115e-06,
"loss": 0.5978801250457764,
"step": 496
},
{
"epoch": 1.139588100686499,
"grad_norm": 1.2138888835906982,
"learning_rate": 6.9743251830910195e-06,
"loss": 0.8952941298484802,
"step": 498
},
{
"epoch": 1.1441647597254005,
"grad_norm": 6.797738552093506,
"learning_rate": 6.964781624992687e-06,
"loss": 0.862623393535614,
"step": 500
},
{
"epoch": 1.1487414187643021,
"grad_norm": 0.5526463985443115,
"learning_rate": 6.955201322301825e-06,
"loss": 0.7486683130264282,
"step": 502
},
{
"epoch": 1.1533180778032037,
"grad_norm": 1.4951832294464111,
"learning_rate": 6.9455844122715704e-06,
"loss": 1.175217628479004,
"step": 504
},
{
"epoch": 1.1578947368421053,
"grad_norm": 1.2167983055114746,
"learning_rate": 6.935931032679517e-06,
"loss": 0.7940334677696228,
"step": 506
},
{
"epoch": 1.162471395881007,
"grad_norm": 0.5917116403579712,
"learning_rate": 6.926241321825741e-06,
"loss": 0.5206097960472107,
"step": 508
},
{
"epoch": 1.1670480549199085,
"grad_norm": 0.9334290027618408,
"learning_rate": 6.916515418530827e-06,
"loss": 0.7569844126701355,
"step": 510
},
{
"epoch": 1.17162471395881,
"grad_norm": 0.45942240953445435,
"learning_rate": 6.906753462133869e-06,
"loss": 0.8078737258911133,
"step": 512
},
{
"epoch": 1.1762013729977117,
"grad_norm": 1.3607593774795532,
"learning_rate": 6.896955592490482e-06,
"loss": 0.9944840669631958,
"step": 514
},
{
"epoch": 1.1807780320366132,
"grad_norm": 0.7573413848876953,
"learning_rate": 6.887121949970796e-06,
"loss": 0.9973494410514832,
"step": 516
},
{
"epoch": 1.1853546910755148,
"grad_norm": 0.7948639988899231,
"learning_rate": 6.8772526754574424e-06,
"loss": 0.8695286512374878,
"step": 518
},
{
"epoch": 1.1899313501144164,
"grad_norm": 0.877565324306488,
"learning_rate": 6.867347910343539e-06,
"loss": 1.0234124660491943,
"step": 520
},
{
"epoch": 1.194508009153318,
"grad_norm": 0.9130783677101135,
"learning_rate": 6.857407796530663e-06,
"loss": 1.0572234392166138,
"step": 522
},
{
"epoch": 1.1990846681922196,
"grad_norm": 0.5426770448684692,
"learning_rate": 6.847432476426821e-06,
"loss": 0.7567615509033203,
"step": 524
},
{
"epoch": 1.2036613272311212,
"grad_norm": 3.1823055744171143,
"learning_rate": 6.8374220929443994e-06,
"loss": 1.0477484464645386,
"step": 526
},
{
"epoch": 1.208237986270023,
"grad_norm": 0.5345934629440308,
"learning_rate": 6.82737678949813e-06,
"loss": 0.9875526428222656,
"step": 528
},
{
"epoch": 1.2128146453089246,
"grad_norm": 1.0108869075775146,
"learning_rate": 6.817296710003026e-06,
"loss": 0.9593431353569031,
"step": 530
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.7925944328308105,
"learning_rate": 6.807181998872322e-06,
"loss": 1.1987873315811157,
"step": 532
},
{
"epoch": 1.2219679633867278,
"grad_norm": 0.9541200399398804,
"learning_rate": 6.797032801015407e-06,
"loss": 0.6915098428726196,
"step": 534
},
{
"epoch": 1.2265446224256293,
"grad_norm": 0.5937853455543518,
"learning_rate": 6.78684926183575e-06,
"loss": 1.0340373516082764,
"step": 536
},
{
"epoch": 1.231121281464531,
"grad_norm": 0.5773180723190308,
"learning_rate": 6.776631527228811e-06,
"loss": 1.004191279411316,
"step": 538
},
{
"epoch": 1.2356979405034325,
"grad_norm": 0.6959558129310608,
"learning_rate": 6.766379743579954e-06,
"loss": 1.0310280323028564,
"step": 540
},
{
"epoch": 1.240274599542334,
"grad_norm": 0.5717634558677673,
"learning_rate": 6.756094057762353e-06,
"loss": 0.8756218552589417,
"step": 542
},
{
"epoch": 1.2448512585812357,
"grad_norm": 0.6597534418106079,
"learning_rate": 6.74577461713488e-06,
"loss": 0.9393812417984009,
"step": 544
},
{
"epoch": 1.2494279176201373,
"grad_norm": 0.7435896992683411,
"learning_rate": 6.735421569540004e-06,
"loss": 0.5626208782196045,
"step": 546
},
{
"epoch": 1.2540045766590389,
"grad_norm": 0.7489412426948547,
"learning_rate": 6.7250350633016655e-06,
"loss": 0.7460745573043823,
"step": 548
},
{
"epoch": 1.2585812356979404,
"grad_norm": 1.607967495918274,
"learning_rate": 6.714615247223148e-06,
"loss": 0.7691885828971863,
"step": 550
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.45488154888153076,
"learning_rate": 6.7041622705849625e-06,
"loss": 1.0543756484985352,
"step": 552
},
{
"epoch": 1.2677345537757438,
"grad_norm": 0.4644421935081482,
"learning_rate": 6.693676283142687e-06,
"loss": 1.0251156091690063,
"step": 554
},
{
"epoch": 1.2723112128146452,
"grad_norm": 0.6865373253822327,
"learning_rate": 6.68315743512484e-06,
"loss": 0.9979171752929688,
"step": 556
},
{
"epoch": 1.276887871853547,
"grad_norm": 1.0765372514724731,
"learning_rate": 6.672605877230714e-06,
"loss": 0.42125648260116577,
"step": 558
},
{
"epoch": 1.2814645308924484,
"grad_norm": 0.5296036005020142,
"learning_rate": 6.662021760628231e-06,
"loss": 0.6392301917076111,
"step": 560
},
{
"epoch": 1.2860411899313502,
"grad_norm": 0.6849478483200073,
"learning_rate": 6.651405236951756e-06,
"loss": 1.038710355758667,
"step": 562
},
{
"epoch": 1.2906178489702518,
"grad_norm": 1.4734174013137817,
"learning_rate": 6.640756458299951e-06,
"loss": 0.9006748199462891,
"step": 564
},
{
"epoch": 1.2951945080091534,
"grad_norm": 4.2481369972229,
"learning_rate": 6.630075577233572e-06,
"loss": 0.9259494543075562,
"step": 566
},
{
"epoch": 1.299771167048055,
"grad_norm": 0.4776517450809479,
"learning_rate": 6.619362746773299e-06,
"loss": 1.0371928215026855,
"step": 568
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.0805084705352783,
"learning_rate": 6.608618120397533e-06,
"loss": 0.5052875280380249,
"step": 570
},
{
"epoch": 1.3089244851258581,
"grad_norm": 1.1686513423919678,
"learning_rate": 6.597841852040207e-06,
"loss": 0.756683349609375,
"step": 572
},
{
"epoch": 1.3135011441647597,
"grad_norm": 2.118833065032959,
"learning_rate": 6.587034096088575e-06,
"loss": 0.8216329216957092,
"step": 574
},
{
"epoch": 1.3180778032036613,
"grad_norm": 1.1685985326766968,
"learning_rate": 6.576195007380998e-06,
"loss": 1.0449680089950562,
"step": 576
},
{
"epoch": 1.322654462242563,
"grad_norm": 1.8055627346038818,
"learning_rate": 6.5653247412047324e-06,
"loss": 0.5522174835205078,
"step": 578
},
{
"epoch": 1.3272311212814645,
"grad_norm": 0.8082292079925537,
"learning_rate": 6.554423453293698e-06,
"loss": 0.9655364155769348,
"step": 580
},
{
"epoch": 1.331807780320366,
"grad_norm": 0.6602555513381958,
"learning_rate": 6.543491299826255e-06,
"loss": 1.1645584106445312,
"step": 582
},
{
"epoch": 1.3363844393592679,
"grad_norm": 0.5118871927261353,
"learning_rate": 6.532528437422959e-06,
"loss": 1.0177983045578003,
"step": 584
},
{
"epoch": 1.3409610983981692,
"grad_norm": 0.8692869544029236,
"learning_rate": 6.521535023144319e-06,
"loss": 0.7149632573127747,
"step": 586
},
{
"epoch": 1.345537757437071,
"grad_norm": 0.8488191962242126,
"learning_rate": 6.510511214488554e-06,
"loss": 1.0412425994873047,
"step": 588
},
{
"epoch": 1.3501144164759724,
"grad_norm": 0.5424400568008423,
"learning_rate": 6.499457169389324e-06,
"loss": 1.0742121934890747,
"step": 590
},
{
"epoch": 1.3546910755148742,
"grad_norm": 0.7974820733070374,
"learning_rate": 6.4883730462134754e-06,
"loss": 1.0228968858718872,
"step": 592
},
{
"epoch": 1.3592677345537758,
"grad_norm": 0.5494404435157776,
"learning_rate": 6.477259003758778e-06,
"loss": 0.8311363458633423,
"step": 594
},
{
"epoch": 1.3638443935926774,
"grad_norm": 1.1344202756881714,
"learning_rate": 6.466115201251637e-06,
"loss": 0.9677754640579224,
"step": 596
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.9898701310157776,
"learning_rate": 6.454941798344816e-06,
"loss": 0.8938733339309692,
"step": 598
},
{
"epoch": 1.3729977116704806,
"grad_norm": 0.843387246131897,
"learning_rate": 6.443738955115158e-06,
"loss": 0.7526968121528625,
"step": 600
},
{
"epoch": 1.3775743707093822,
"grad_norm": 0.6977670788764954,
"learning_rate": 6.432506832061283e-06,
"loss": 0.9662184119224548,
"step": 602
},
{
"epoch": 1.3821510297482837,
"grad_norm": 0.7610148191452026,
"learning_rate": 6.421245590101285e-06,
"loss": 1.0838618278503418,
"step": 604
},
{
"epoch": 1.3867276887871853,
"grad_norm": 1.5654877424240112,
"learning_rate": 6.409955390570444e-06,
"loss": 1.034435510635376,
"step": 606
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.5249960422515869,
"learning_rate": 6.398636395218895e-06,
"loss": 1.0667188167572021,
"step": 608
},
{
"epoch": 1.3958810068649885,
"grad_norm": 0.8519649505615234,
"learning_rate": 6.387288766209325e-06,
"loss": 1.0217965841293335,
"step": 610
},
{
"epoch": 1.40045766590389,
"grad_norm": 1.1158952713012695,
"learning_rate": 6.375912666114637e-06,
"loss": 0.834468424320221,
"step": 612
},
{
"epoch": 1.4050343249427917,
"grad_norm": 4.450192928314209,
"learning_rate": 6.364508257915633e-06,
"loss": 0.6208648085594177,
"step": 614
},
{
"epoch": 1.4096109839816933,
"grad_norm": 0.5419265031814575,
"learning_rate": 6.353075704998674e-06,
"loss": 1.0283397436141968,
"step": 616
},
{
"epoch": 1.414187643020595,
"grad_norm": 0.71892249584198,
"learning_rate": 6.341615171153334e-06,
"loss": 1.074945092201233,
"step": 618
},
{
"epoch": 1.4187643020594964,
"grad_norm": 0.46902936697006226,
"learning_rate": 6.330126820570066e-06,
"loss": 1.1522700786590576,
"step": 620
},
{
"epoch": 1.4233409610983982,
"grad_norm": 0.6041531562805176,
"learning_rate": 6.318610817837834e-06,
"loss": 0.9732744097709656,
"step": 622
},
{
"epoch": 1.4279176201372998,
"grad_norm": 1.0350428819656372,
"learning_rate": 6.307067327941769e-06,
"loss": 0.9294142127037048,
"step": 624
},
{
"epoch": 1.4324942791762014,
"grad_norm": 0.8469648361206055,
"learning_rate": 6.2954965162607995e-06,
"loss": 0.9963588714599609,
"step": 626
},
{
"epoch": 1.437070938215103,
"grad_norm": 0.471488356590271,
"learning_rate": 6.283898548565278e-06,
"loss": 1.0501569509506226,
"step": 628
},
{
"epoch": 1.4416475972540046,
"grad_norm": 0.2829485237598419,
"learning_rate": 6.272273591014614e-06,
"loss": 0.918632447719574,
"step": 630
},
{
"epoch": 1.4462242562929062,
"grad_norm": 0.7685482501983643,
"learning_rate": 6.260621810154889e-06,
"loss": 1.004805326461792,
"step": 632
},
{
"epoch": 1.4508009153318078,
"grad_norm": 0.5823658108711243,
"learning_rate": 6.24894337291647e-06,
"loss": 1.0062049627304077,
"step": 634
},
{
"epoch": 1.4553775743707094,
"grad_norm": 0.4799053966999054,
"learning_rate": 6.23723844661162e-06,
"loss": 0.9444741010665894,
"step": 636
},
{
"epoch": 1.459954233409611,
"grad_norm": 1.4403252601623535,
"learning_rate": 6.2255071989321e-06,
"loss": 0.3563728332519531,
"step": 638
},
{
"epoch": 1.4645308924485125,
"grad_norm": 0.5606778860092163,
"learning_rate": 6.2137497979467664e-06,
"loss": 1.084834098815918,
"step": 640
},
{
"epoch": 1.4691075514874141,
"grad_norm": 1.5342822074890137,
"learning_rate": 6.201966412099164e-06,
"loss": 0.88880455493927,
"step": 642
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.9060531258583069,
"learning_rate": 6.190157210205114e-06,
"loss": 0.7019488215446472,
"step": 644
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.6863506436347961,
"learning_rate": 6.17832236145029e-06,
"loss": 1.0984705686569214,
"step": 646
},
{
"epoch": 1.482837528604119,
"grad_norm": 0.5668753385543823,
"learning_rate": 6.1664620353878e-06,
"loss": 0.5110697150230408,
"step": 648
},
{
"epoch": 1.4874141876430205,
"grad_norm": 0.5630995631217957,
"learning_rate": 6.154576401935756e-06,
"loss": 1.0951957702636719,
"step": 650
},
{
"epoch": 1.4919908466819223,
"grad_norm": 0.49918022751808167,
"learning_rate": 6.1426656313748375e-06,
"loss": 0.6987397074699402,
"step": 652
},
{
"epoch": 1.4965675057208239,
"grad_norm": 1.3646057844161987,
"learning_rate": 6.130729894345851e-06,
"loss": 1.1859971284866333,
"step": 654
},
{
"epoch": 1.5011441647597255,
"grad_norm": 1.018013596534729,
"learning_rate": 6.118769361847293e-06,
"loss": 0.9196734428405762,
"step": 656
},
{
"epoch": 1.505720823798627,
"grad_norm": 0.35407713055610657,
"learning_rate": 6.106784205232888e-06,
"loss": 1.0442997217178345,
"step": 658
},
{
"epoch": 1.5102974828375286,
"grad_norm": 1.8714250326156616,
"learning_rate": 6.094774596209148e-06,
"loss": 0.6379448771476746,
"step": 660
},
{
"epoch": 1.5148741418764302,
"grad_norm": 0.4186966121196747,
"learning_rate": 6.082740706832897e-06,
"loss": 1.0916244983673096,
"step": 662
},
{
"epoch": 1.5194508009153318,
"grad_norm": 1.4749921560287476,
"learning_rate": 6.07068270950882e-06,
"loss": 1.0407416820526123,
"step": 664
},
{
"epoch": 1.5240274599542334,
"grad_norm": 0.8104400634765625,
"learning_rate": 6.0586007769869824e-06,
"loss": 0.4534456133842468,
"step": 666
},
{
"epoch": 1.528604118993135,
"grad_norm": 0.7797476053237915,
"learning_rate": 6.046495082360364e-06,
"loss": 1.105326771736145,
"step": 668
},
{
"epoch": 1.5331807780320366,
"grad_norm": 0.4904558062553406,
"learning_rate": 6.034365799062368e-06,
"loss": 0.7495214939117432,
"step": 670
},
{
"epoch": 1.5377574370709381,
"grad_norm": 0.4737433195114136,
"learning_rate": 6.022213100864351e-06,
"loss": 1.113190770149231,
"step": 672
},
{
"epoch": 1.54233409610984,
"grad_norm": 0.6883158087730408,
"learning_rate": 6.01003716187312e-06,
"loss": 0.7975556254386902,
"step": 674
},
{
"epoch": 1.5469107551487413,
"grad_norm": 7.956705093383789,
"learning_rate": 5.9978381565284456e-06,
"loss": 0.86496502161026,
"step": 676
},
{
"epoch": 1.5514874141876431,
"grad_norm": 0.6836668848991394,
"learning_rate": 5.985616259600559e-06,
"loss": 1.0243340730667114,
"step": 678
},
{
"epoch": 1.5560640732265445,
"grad_norm": 1.809205174446106,
"learning_rate": 5.973371646187653e-06,
"loss": 0.7693554759025574,
"step": 680
},
{
"epoch": 1.5606407322654463,
"grad_norm": 0.5701059699058533,
"learning_rate": 5.961104491713367e-06,
"loss": 1.0237873792648315,
"step": 682
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.6132348775863647,
"learning_rate": 5.948814971924277e-06,
"loss": 1.0140217542648315,
"step": 684
},
{
"epoch": 1.5697940503432495,
"grad_norm": 1.0674152374267578,
"learning_rate": 5.936503262887384e-06,
"loss": 0.8194214701652527,
"step": 686
},
{
"epoch": 1.574370709382151,
"grad_norm": 0.5556658506393433,
"learning_rate": 5.924169540987577e-06,
"loss": 1.0535566806793213,
"step": 688
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.9293789267539978,
"learning_rate": 5.911813982925118e-06,
"loss": 0.8254351615905762,
"step": 690
},
{
"epoch": 1.5835240274599542,
"grad_norm": 0.554201602935791,
"learning_rate": 5.8994367657131095e-06,
"loss": 0.769871711730957,
"step": 692
},
{
"epoch": 1.5881006864988558,
"grad_norm": 0.642005205154419,
"learning_rate": 5.887038066674952e-06,
"loss": 1.2380577325820923,
"step": 694
},
{
"epoch": 1.5926773455377574,
"grad_norm": 1.3365410566329956,
"learning_rate": 5.874618063441807e-06,
"loss": 0.9225341081619263,
"step": 696
},
{
"epoch": 1.597254004576659,
"grad_norm": 0.7327346205711365,
"learning_rate": 5.862176933950059e-06,
"loss": 0.8071609139442444,
"step": 698
},
{
"epoch": 1.6018306636155606,
"grad_norm": 0.5678279399871826,
"learning_rate": 5.849714856438752e-06,
"loss": 1.0254077911376953,
"step": 700
},
{
"epoch": 1.6064073226544622,
"grad_norm": 0.5600216388702393,
"learning_rate": 5.837232009447051e-06,
"loss": 0.8201320171356201,
"step": 702
},
{
"epoch": 1.610983981693364,
"grad_norm": 1.1897834539413452,
"learning_rate": 5.824728571811667e-06,
"loss": 0.7250789999961853,
"step": 704
},
{
"epoch": 1.6155606407322654,
"grad_norm": 1.4982539415359497,
"learning_rate": 5.812204722664317e-06,
"loss": 0.5860614776611328,
"step": 706
},
{
"epoch": 1.6201372997711672,
"grad_norm": 0.4147469103336334,
"learning_rate": 5.799660641429135e-06,
"loss": 1.0965193510055542,
"step": 708
},
{
"epoch": 1.6247139588100685,
"grad_norm": 0.8616934418678284,
"learning_rate": 5.787096507820122e-06,
"loss": 0.6443649530410767,
"step": 710
},
{
"epoch": 1.6292906178489703,
"grad_norm": 0.39017513394355774,
"learning_rate": 5.774512501838552e-06,
"loss": 1.081406593322754,
"step": 712
},
{
"epoch": 1.6338672768878717,
"grad_norm": 0.526287317276001,
"learning_rate": 5.761908803770406e-06,
"loss": 0.8772663474082947,
"step": 714
},
{
"epoch": 1.6384439359267735,
"grad_norm": 8.515963554382324,
"learning_rate": 5.7492855941837886e-06,
"loss": 0.7267716526985168,
"step": 716
},
{
"epoch": 1.643020594965675,
"grad_norm": 1.0429171323776245,
"learning_rate": 5.7366430539263335e-06,
"loss": 0.962812066078186,
"step": 718
},
{
"epoch": 1.6475972540045767,
"grad_norm": 0.571498453617096,
"learning_rate": 5.7239813641226185e-06,
"loss": 0.7786587476730347,
"step": 720
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.4392048418521881,
"learning_rate": 5.711300706171571e-06,
"loss": 0.6211113333702087,
"step": 722
},
{
"epoch": 1.6567505720823799,
"grad_norm": 61.38982391357422,
"learning_rate": 5.698601261743866e-06,
"loss": 0.8990558385848999,
"step": 724
},
{
"epoch": 1.6613272311212814,
"grad_norm": 3.1536953449249268,
"learning_rate": 5.685883212779324e-06,
"loss": 0.7748126983642578,
"step": 726
},
{
"epoch": 1.665903890160183,
"grad_norm": 1.6510205268859863,
"learning_rate": 5.673146741484308e-06,
"loss": 0.5790435671806335,
"step": 728
},
{
"epoch": 1.6704805491990846,
"grad_norm": 0.7334662675857544,
"learning_rate": 5.660392030329107e-06,
"loss": 0.8529476523399353,
"step": 730
},
{
"epoch": 1.6750572082379862,
"grad_norm": 0.5321954488754272,
"learning_rate": 5.647619262045326e-06,
"loss": 1.0975439548492432,
"step": 732
},
{
"epoch": 1.679633867276888,
"grad_norm": 0.49299484491348267,
"learning_rate": 5.634828619623269e-06,
"loss": 1.08144211769104,
"step": 734
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.47397059202194214,
"learning_rate": 5.622020286309315e-06,
"loss": 0.7880240678787231,
"step": 736
},
{
"epoch": 1.6887871853546912,
"grad_norm": 0.4795650839805603,
"learning_rate": 5.6091944456032896e-06,
"loss": 1.0663710832595825,
"step": 738
},
{
"epoch": 1.6933638443935926,
"grad_norm": 1.204559087753296,
"learning_rate": 5.5963512812558456e-06,
"loss": 0.6782804727554321,
"step": 740
},
{
"epoch": 1.6979405034324944,
"grad_norm": 0.5787196159362793,
"learning_rate": 5.583490977265819e-06,
"loss": 0.5317763686180115,
"step": 742
},
{
"epoch": 1.7025171624713957,
"grad_norm": 0.7164469361305237,
"learning_rate": 5.570613717877605e-06,
"loss": 1.0468082427978516,
"step": 744
},
{
"epoch": 1.7070938215102975,
"grad_norm": 0.5695748329162598,
"learning_rate": 5.557719687578507e-06,
"loss": 1.039925217628479,
"step": 746
},
{
"epoch": 1.7116704805491991,
"grad_norm": 0.6732376217842102,
"learning_rate": 5.544809071096098e-06,
"loss": 0.7266217470169067,
"step": 748
},
{
"epoch": 1.7162471395881007,
"grad_norm": 0.5064290761947632,
"learning_rate": 5.531882053395577e-06,
"loss": 0.7927247285842896,
"step": 750
},
{
"epoch": 1.7208237986270023,
"grad_norm": 0.4806613326072693,
"learning_rate": 5.5189388196771166e-06,
"loss": 1.0538541078567505,
"step": 752
},
{
"epoch": 1.7254004576659039,
"grad_norm": 0.46450909972190857,
"learning_rate": 5.5059795553732094e-06,
"loss": 0.7507013082504272,
"step": 754
},
{
"epoch": 1.7299771167048055,
"grad_norm": 0.627607524394989,
"learning_rate": 5.49300444614601e-06,
"loss": 1.0437747240066528,
"step": 756
},
{
"epoch": 1.734553775743707,
"grad_norm": 0.679521381855011,
"learning_rate": 5.4800136778846814e-06,
"loss": 1.0802175998687744,
"step": 758
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.5956997275352478,
"learning_rate": 5.467007436702721e-06,
"loss": 0.8692449331283569,
"step": 760
},
{
"epoch": 1.7437070938215102,
"grad_norm": 1.8609957695007324,
"learning_rate": 5.453985908935304e-06,
"loss": 0.7606918215751648,
"step": 762
},
{
"epoch": 1.748283752860412,
"grad_norm": 0.900421679019928,
"learning_rate": 5.440949281136612e-06,
"loss": 1.09746515750885,
"step": 764
},
{
"epoch": 1.7528604118993134,
"grad_norm": 0.48335880041122437,
"learning_rate": 5.4278977400771545e-06,
"loss": 0.8808884024620056,
"step": 766
},
{
"epoch": 1.7574370709382152,
"grad_norm": 0.8775651454925537,
"learning_rate": 5.4148314727411e-06,
"loss": 1.0945841073989868,
"step": 768
},
{
"epoch": 1.7620137299771166,
"grad_norm": 0.4176448881626129,
"learning_rate": 5.401750666323595e-06,
"loss": 0.6265594959259033,
"step": 770
},
{
"epoch": 1.7665903890160184,
"grad_norm": 1.7150315046310425,
"learning_rate": 5.3886555082280794e-06,
"loss": 0.5311535000801086,
"step": 772
},
{
"epoch": 1.7711670480549198,
"grad_norm": 0.482889860868454,
"learning_rate": 5.375546186063606e-06,
"loss": 0.9832878112792969,
"step": 774
},
{
"epoch": 1.7757437070938216,
"grad_norm": 0.6067187786102295,
"learning_rate": 5.362422887642148e-06,
"loss": 0.5880881547927856,
"step": 776
},
{
"epoch": 1.7803203661327232,
"grad_norm": 0.8287796974182129,
"learning_rate": 5.3492858009759115e-06,
"loss": 0.987903356552124,
"step": 778
},
{
"epoch": 1.7848970251716247,
"grad_norm": 3.733571767807007,
"learning_rate": 5.3361351142746425e-06,
"loss": 0.8192511796951294,
"step": 780
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.6592600345611572,
"learning_rate": 5.32297101594293e-06,
"loss": 1.0811833143234253,
"step": 782
},
{
"epoch": 1.794050343249428,
"grad_norm": 1.0604369640350342,
"learning_rate": 5.3097936945775034e-06,
"loss": 0.7256264686584473,
"step": 784
},
{
"epoch": 1.7986270022883295,
"grad_norm": 4.27371883392334,
"learning_rate": 5.2966033389645345e-06,
"loss": 0.6703461408615112,
"step": 786
},
{
"epoch": 1.803203661327231,
"grad_norm": 0.9975327849388123,
"learning_rate": 5.283400138076932e-06,
"loss": 0.6835663914680481,
"step": 788
},
{
"epoch": 1.8077803203661327,
"grad_norm": 0.7741729617118835,
"learning_rate": 5.270184281071633e-06,
"loss": 0.5445237159729004,
"step": 790
},
{
"epoch": 1.8123569794050343,
"grad_norm": 0.5315194129943848,
"learning_rate": 5.256955957286892e-06,
"loss": 0.9225422143936157,
"step": 792
},
{
"epoch": 1.816933638443936,
"grad_norm": 0.5030812621116638,
"learning_rate": 5.243715356239573e-06,
"loss": 1.0199389457702637,
"step": 794
},
{
"epoch": 1.8215102974828374,
"grad_norm": 0.5460532903671265,
"learning_rate": 5.230462667622431e-06,
"loss": 0.9883009195327759,
"step": 796
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.6946282386779785,
"learning_rate": 5.217198081301393e-06,
"loss": 1.1155096292495728,
"step": 798
},
{
"epoch": 1.8306636155606406,
"grad_norm": 0.5267988443374634,
"learning_rate": 5.20392178731284e-06,
"loss": 0.8357391357421875,
"step": 800
},
{
"epoch": 1.8352402745995424,
"grad_norm": 0.9400213956832886,
"learning_rate": 5.190633975860886e-06,
"loss": 1.2634918689727783,
"step": 802
},
{
"epoch": 1.8398169336384438,
"grad_norm": 0.4523729085922241,
"learning_rate": 5.1773348373146495e-06,
"loss": 1.0519038438796997,
"step": 804
},
{
"epoch": 1.8443935926773456,
"grad_norm": 0.5702998042106628,
"learning_rate": 5.164024562205527e-06,
"loss": 1.0271356105804443,
"step": 806
},
{
"epoch": 1.8489702517162472,
"grad_norm": 0.3808506429195404,
"learning_rate": 5.150703341224464e-06,
"loss": 1.0296131372451782,
"step": 808
},
{
"epoch": 1.8535469107551488,
"grad_norm": 0.49956274032592773,
"learning_rate": 5.137371365219225e-06,
"loss": 0.9183504581451416,
"step": 810
},
{
"epoch": 1.8581235697940504,
"grad_norm": 0.42203259468078613,
"learning_rate": 5.1240288251916576e-06,
"loss": 1.1768728494644165,
"step": 812
},
{
"epoch": 1.862700228832952,
"grad_norm": 0.39127910137176514,
"learning_rate": 5.110675912294954e-06,
"loss": 0.7790493965148926,
"step": 814
},
{
"epoch": 1.8672768878718535,
"grad_norm": 0.422817587852478,
"learning_rate": 5.097312817830913e-06,
"loss": 1.0952423810958862,
"step": 816
},
{
"epoch": 1.8718535469107551,
"grad_norm": 0.6157692670822144,
"learning_rate": 5.083939733247205e-06,
"loss": 0.9987185001373291,
"step": 818
},
{
"epoch": 1.8764302059496567,
"grad_norm": 0.43606990575790405,
"learning_rate": 5.07055685013462e-06,
"loss": 0.6182645559310913,
"step": 820
},
{
"epoch": 1.8810068649885583,
"grad_norm": 1.6980317831039429,
"learning_rate": 5.057164360224333e-06,
"loss": 0.8162880539894104,
"step": 822
},
{
"epoch": 1.88558352402746,
"grad_norm": 1.3754355907440186,
"learning_rate": 5.0437624553851465e-06,
"loss": 0.9700958728790283,
"step": 824
},
{
"epoch": 1.8901601830663615,
"grad_norm": 0.695946991443634,
"learning_rate": 5.03035132762075e-06,
"loss": 0.7681318521499634,
"step": 826
},
{
"epoch": 1.8947368421052633,
"grad_norm": 1.2942097187042236,
"learning_rate": 5.016931169066964e-06,
"loss": 0.7290353178977966,
"step": 828
},
{
"epoch": 1.8993135011441646,
"grad_norm": 0.5696609616279602,
"learning_rate": 5.003502171988991e-06,
"loss": 1.1246237754821777,
"step": 830
},
{
"epoch": 1.9038901601830664,
"grad_norm": 0.6707322597503662,
"learning_rate": 4.990064528778662e-06,
"loss": 1.0869580507278442,
"step": 832
},
{
"epoch": 1.9084668192219678,
"grad_norm": 1.6657345294952393,
"learning_rate": 4.976618431951673e-06,
"loss": 1.004643201828003,
"step": 834
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.7031993269920349,
"learning_rate": 4.963164074144831e-06,
"loss": 0.9117480516433716,
"step": 836
},
{
"epoch": 1.9176201372997712,
"grad_norm": 0.6919447779655457,
"learning_rate": 4.949701648113299e-06,
"loss": 1.0158711671829224,
"step": 838
},
{
"epoch": 1.9221967963386728,
"grad_norm": 1.0875664949417114,
"learning_rate": 4.93623134672783e-06,
"loss": 0.877010703086853,
"step": 840
},
{
"epoch": 1.9267734553775744,
"grad_norm": 0.5526543259620667,
"learning_rate": 4.922753362972e-06,
"loss": 0.7300304770469666,
"step": 842
},
{
"epoch": 1.931350114416476,
"grad_norm": 0.9483214020729065,
"learning_rate": 4.90926788993945e-06,
"loss": 0.9140143394470215,
"step": 844
},
{
"epoch": 1.9359267734553776,
"grad_norm": 0.37856608629226685,
"learning_rate": 4.895775120831117e-06,
"loss": 0.9656968116760254,
"step": 846
},
{
"epoch": 1.9405034324942791,
"grad_norm": 0.519496738910675,
"learning_rate": 4.8822752489524655e-06,
"loss": 0.7197529673576355,
"step": 848
},
{
"epoch": 1.9450800915331807,
"grad_norm": 1.0532692670822144,
"learning_rate": 4.868768467710718e-06,
"loss": 0.6470821499824524,
"step": 850
},
{
"epoch": 1.9496567505720823,
"grad_norm": 0.7968090176582336,
"learning_rate": 4.855254970612085e-06,
"loss": 0.8287680149078369,
"step": 852
},
{
"epoch": 1.9542334096109841,
"grad_norm": 0.3205246925354004,
"learning_rate": 4.841734951258991e-06,
"loss": 1.038404941558838,
"step": 854
},
{
"epoch": 1.9588100686498855,
"grad_norm": 0.5507828593254089,
"learning_rate": 4.828208603347306e-06,
"loss": 1.0363742113113403,
"step": 856
},
{
"epoch": 1.9633867276887873,
"grad_norm": 1.1318949460983276,
"learning_rate": 4.8146761206635635e-06,
"loss": 0.49166426062583923,
"step": 858
},
{
"epoch": 1.9679633867276887,
"grad_norm": 0.5373101234436035,
"learning_rate": 4.801137697082188e-06,
"loss": 0.9425434470176697,
"step": 860
},
{
"epoch": 1.9725400457665905,
"grad_norm": 0.41498398780822754,
"learning_rate": 4.787593526562718e-06,
"loss": 1.0520573854446411,
"step": 862
},
{
"epoch": 1.9771167048054918,
"grad_norm": 2.031755208969116,
"learning_rate": 4.774043803147023e-06,
"loss": 0.8243655562400818,
"step": 864
},
{
"epoch": 1.9816933638443937,
"grad_norm": 0.6944281458854675,
"learning_rate": 4.76048872095653e-06,
"loss": 1.229433536529541,
"step": 866
},
{
"epoch": 1.9862700228832952,
"grad_norm": 1.356571078300476,
"learning_rate": 4.746928474189438e-06,
"loss": 0.4065757691860199,
"step": 868
},
{
"epoch": 1.9908466819221968,
"grad_norm": 0.8086536526679993,
"learning_rate": 4.733363257117937e-06,
"loss": 1.044547200202942,
"step": 870
},
{
"epoch": 1.9954233409610984,
"grad_norm": 1.1679444313049316,
"learning_rate": 4.719793264085423e-06,
"loss": 0.7908722758293152,
"step": 872
},
{
"epoch": 2.0,
"grad_norm": 0.5113776922225952,
"learning_rate": 4.7062186895037155e-06,
"loss": 1.0065279006958008,
"step": 874
},
{
"epoch": 2.004576659038902,
"grad_norm": 1.2484664916992188,
"learning_rate": 4.692639727850277e-06,
"loss": 0.8650211095809937,
"step": 876
},
{
"epoch": 2.009153318077803,
"grad_norm": 0.19410383701324463,
"learning_rate": 4.679056573665413e-06,
"loss": 0.5260782837867737,
"step": 878
},
{
"epoch": 2.013729977116705,
"grad_norm": 0.6674540638923645,
"learning_rate": 4.6654694215495e-06,
"loss": 0.8760992884635925,
"step": 880
},
{
"epoch": 2.0183066361556063,
"grad_norm": 0.5311161875724792,
"learning_rate": 4.651878466160191e-06,
"loss": 0.5279438495635986,
"step": 882
},
{
"epoch": 2.022883295194508,
"grad_norm": 0.8211866617202759,
"learning_rate": 4.638283902209623e-06,
"loss": 0.7287083864212036,
"step": 884
},
{
"epoch": 2.0274599542334095,
"grad_norm": 1.3398568630218506,
"learning_rate": 4.624685924461638e-06,
"loss": 0.7964086532592773,
"step": 886
},
{
"epoch": 2.0320366132723113,
"grad_norm": 1.409980297088623,
"learning_rate": 4.611084727728979e-06,
"loss": 0.7654436230659485,
"step": 888
},
{
"epoch": 2.0366132723112127,
"grad_norm": 1.149587631225586,
"learning_rate": 4.59748050687051e-06,
"loss": 0.7754305601119995,
"step": 890
},
{
"epoch": 2.0411899313501145,
"grad_norm": 0.6738428473472595,
"learning_rate": 4.583873456788419e-06,
"loss": 0.6593428254127502,
"step": 892
},
{
"epoch": 2.045766590389016,
"grad_norm": 0.582455039024353,
"learning_rate": 4.570263772425429e-06,
"loss": 0.5679339170455933,
"step": 894
},
{
"epoch": 2.0503432494279177,
"grad_norm": 1.0730149745941162,
"learning_rate": 4.556651648762e-06,
"loss": 0.47743597626686096,
"step": 896
},
{
"epoch": 2.054919908466819,
"grad_norm": 0.6254774928092957,
"learning_rate": 4.543037280813544e-06,
"loss": 0.7976337671279907,
"step": 898
},
{
"epoch": 2.059496567505721,
"grad_norm": 0.6231757998466492,
"learning_rate": 4.52942086362762e-06,
"loss": 0.9103917479515076,
"step": 900
},
{
"epoch": 2.064073226544622,
"grad_norm": 0.7223190665245056,
"learning_rate": 4.515802592281151e-06,
"loss": 0.6968510746955872,
"step": 902
},
{
"epoch": 2.068649885583524,
"grad_norm": 1.1463390588760376,
"learning_rate": 4.50218266187762e-06,
"loss": 0.5448936820030212,
"step": 904
},
{
"epoch": 2.073226544622426,
"grad_norm": 1.5348836183547974,
"learning_rate": 4.4885612675442795e-06,
"loss": 0.1512419879436493,
"step": 906
},
{
"epoch": 2.077803203661327,
"grad_norm": 1.0921180248260498,
"learning_rate": 4.474938604429356e-06,
"loss": 0.3723929226398468,
"step": 908
},
{
"epoch": 2.082379862700229,
"grad_norm": 2.096736431121826,
"learning_rate": 4.4613148676992534e-06,
"loss": 0.5486660599708557,
"step": 910
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.9730957746505737,
"learning_rate": 4.447690252535757e-06,
"loss": 0.48607346415519714,
"step": 912
},
{
"epoch": 2.091533180778032,
"grad_norm": 0.9440922737121582,
"learning_rate": 4.434064954133233e-06,
"loss": 0.8132773041725159,
"step": 914
},
{
"epoch": 2.0961098398169336,
"grad_norm": 0.5760765671730042,
"learning_rate": 4.4204391676958456e-06,
"loss": 0.8012776374816895,
"step": 916
},
{
"epoch": 2.1006864988558354,
"grad_norm": 1.3130919933319092,
"learning_rate": 4.406813088434739e-06,
"loss": 0.5709822177886963,
"step": 918
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.4790267050266266,
"learning_rate": 4.393186911565262e-06,
"loss": 0.793640673160553,
"step": 920
},
{
"epoch": 2.1098398169336385,
"grad_norm": 1.7611640691757202,
"learning_rate": 4.379560832304155e-06,
"loss": 0.8374230861663818,
"step": 922
},
{
"epoch": 2.11441647597254,
"grad_norm": 0.5854694247245789,
"learning_rate": 4.365935045866765e-06,
"loss": 0.7813926935195923,
"step": 924
},
{
"epoch": 2.1189931350114417,
"grad_norm": 0.768767237663269,
"learning_rate": 4.352309747464244e-06,
"loss": 0.547953188419342,
"step": 926
},
{
"epoch": 2.123569794050343,
"grad_norm": 0.6132713556289673,
"learning_rate": 4.338685132300746e-06,
"loss": 0.805243968963623,
"step": 928
},
{
"epoch": 2.128146453089245,
"grad_norm": 0.4178018569946289,
"learning_rate": 4.325061395570644e-06,
"loss": 0.44079485535621643,
"step": 930
},
{
"epoch": 2.1327231121281462,
"grad_norm": 1.141923189163208,
"learning_rate": 4.311438732455722e-06,
"loss": 0.5015432238578796,
"step": 932
},
{
"epoch": 2.137299771167048,
"grad_norm": 0.764822781085968,
"learning_rate": 4.297817338122382e-06,
"loss": 0.4405544102191925,
"step": 934
},
{
"epoch": 2.14187643020595,
"grad_norm": 0.8727527260780334,
"learning_rate": 4.28419740771885e-06,
"loss": 0.7797104120254517,
"step": 936
},
{
"epoch": 2.1464530892448512,
"grad_norm": 0.47269201278686523,
"learning_rate": 4.27057913637238e-06,
"loss": 0.14955393970012665,
"step": 938
},
{
"epoch": 2.151029748283753,
"grad_norm": 0.574876606464386,
"learning_rate": 4.2569627191864566e-06,
"loss": 0.6506639719009399,
"step": 940
},
{
"epoch": 2.1556064073226544,
"grad_norm": 0.43111807107925415,
"learning_rate": 4.243348351238e-06,
"loss": 0.8360904455184937,
"step": 942
},
{
"epoch": 2.160183066361556,
"grad_norm": 0.6124604940414429,
"learning_rate": 4.229736227574573e-06,
"loss": 0.4769759178161621,
"step": 944
},
{
"epoch": 2.1647597254004576,
"grad_norm": 0.5514540672302246,
"learning_rate": 4.216126543211582e-06,
"loss": 0.3582886755466461,
"step": 946
},
{
"epoch": 2.1693363844393594,
"grad_norm": 0.6413321495056152,
"learning_rate": 4.2025194931294905e-06,
"loss": 0.6755134463310242,
"step": 948
},
{
"epoch": 2.1739130434782608,
"grad_norm": 1.2605396509170532,
"learning_rate": 4.188915272271021e-06,
"loss": 0.26411038637161255,
"step": 950
},
{
"epoch": 2.1784897025171626,
"grad_norm": 2.322275161743164,
"learning_rate": 4.175314075538362e-06,
"loss": 0.5067375302314758,
"step": 952
},
{
"epoch": 2.183066361556064,
"grad_norm": 0.8927626609802246,
"learning_rate": 4.1617160977903755e-06,
"loss": 0.763658344745636,
"step": 954
},
{
"epoch": 2.1876430205949657,
"grad_norm": 0.5768288969993591,
"learning_rate": 4.148121533839809e-06,
"loss": 0.5286126732826233,
"step": 956
},
{
"epoch": 2.192219679633867,
"grad_norm": 0.7886779308319092,
"learning_rate": 4.134530578450499e-06,
"loss": 0.5433465838432312,
"step": 958
},
{
"epoch": 2.196796338672769,
"grad_norm": 0.5103306770324707,
"learning_rate": 4.120943426334587e-06,
"loss": 0.781455397605896,
"step": 960
},
{
"epoch": 2.2013729977116703,
"grad_norm": 1.2081961631774902,
"learning_rate": 4.107360272149724e-06,
"loss": 0.44611290097236633,
"step": 962
},
{
"epoch": 2.205949656750572,
"grad_norm": 0.5938241481781006,
"learning_rate": 4.093781310496284e-06,
"loss": 0.5686028599739075,
"step": 964
},
{
"epoch": 2.2105263157894735,
"grad_norm": 1.259899377822876,
"learning_rate": 4.080206735914578e-06,
"loss": 0.5111551880836487,
"step": 966
},
{
"epoch": 2.2151029748283753,
"grad_norm": 0.6723149418830872,
"learning_rate": 4.066636742882064e-06,
"loss": 0.7907751798629761,
"step": 968
},
{
"epoch": 2.219679633867277,
"grad_norm": 0.7715116739273071,
"learning_rate": 4.053071525810562e-06,
"loss": 0.8536049127578735,
"step": 970
},
{
"epoch": 2.2242562929061784,
"grad_norm": 0.6745052933692932,
"learning_rate": 4.039511279043469e-06,
"loss": 0.49250656366348267,
"step": 972
},
{
"epoch": 2.2288329519450802,
"grad_norm": 5.653090476989746,
"learning_rate": 4.025956196852978e-06,
"loss": 0.9080660939216614,
"step": 974
},
{
"epoch": 2.2334096109839816,
"grad_norm": 0.39930716156959534,
"learning_rate": 4.0124064734372824e-06,
"loss": 0.5726509094238281,
"step": 976
},
{
"epoch": 2.2379862700228834,
"grad_norm": 0.7205491662025452,
"learning_rate": 3.998862302917812e-06,
"loss": 0.5259721279144287,
"step": 978
},
{
"epoch": 2.242562929061785,
"grad_norm": 0.5892427563667297,
"learning_rate": 3.985323879336437e-06,
"loss": 0.70969158411026,
"step": 980
},
{
"epoch": 2.2471395881006866,
"grad_norm": 0.8280232548713684,
"learning_rate": 3.9717913966526935e-06,
"loss": 0.7658937573432922,
"step": 982
},
{
"epoch": 2.251716247139588,
"grad_norm": 0.46589598059654236,
"learning_rate": 3.958265048741008e-06,
"loss": 0.6634811758995056,
"step": 984
},
{
"epoch": 2.2562929061784898,
"grad_norm": 1.1013447046279907,
"learning_rate": 3.944745029387916e-06,
"loss": 0.6400603652000427,
"step": 986
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.5064674019813538,
"learning_rate": 3.931231532289282e-06,
"loss": 0.7838316559791565,
"step": 988
},
{
"epoch": 2.265446224256293,
"grad_norm": 0.6013701558113098,
"learning_rate": 3.917724751047534e-06,
"loss": 0.5404883623123169,
"step": 990
},
{
"epoch": 2.2700228832951943,
"grad_norm": 1.0321763753890991,
"learning_rate": 3.904224879168882e-06,
"loss": 0.6090070605278015,
"step": 992
},
{
"epoch": 2.274599542334096,
"grad_norm": 0.4366181492805481,
"learning_rate": 3.89073211006055e-06,
"loss": 0.8297733068466187,
"step": 994
},
{
"epoch": 2.279176201372998,
"grad_norm": 0.6068903207778931,
"learning_rate": 3.877246637027999e-06,
"loss": 0.816207766532898,
"step": 996
},
{
"epoch": 2.2837528604118993,
"grad_norm": 0.9020872712135315,
"learning_rate": 3.863768653272171e-06,
"loss": 0.851694643497467,
"step": 998
},
{
"epoch": 2.288329519450801,
"grad_norm": 0.699506938457489,
"learning_rate": 3.850298351886699e-06,
"loss": 0.5200238823890686,
"step": 1000
},
{
"epoch": 2.2929061784897025,
"grad_norm": 1.1063220500946045,
"learning_rate": 3.836835925855168e-06,
"loss": 0.46105656027793884,
"step": 1002
},
{
"epoch": 2.2974828375286043,
"grad_norm": 0.7333200573921204,
"learning_rate": 3.823381568048329e-06,
"loss": 0.4054844081401825,
"step": 1004
},
{
"epoch": 2.3020594965675056,
"grad_norm": 1.0534933805465698,
"learning_rate": 3.8099354712213375e-06,
"loss": 0.786138653755188,
"step": 1006
},
{
"epoch": 2.3066361556064074,
"grad_norm": 0.7661805152893066,
"learning_rate": 3.7964978280110078e-06,
"loss": 0.5901062488555908,
"step": 1008
},
{
"epoch": 2.311212814645309,
"grad_norm": 0.5444358587265015,
"learning_rate": 3.783068830933037e-06,
"loss": 0.3989161550998688,
"step": 1010
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.8000374436378479,
"learning_rate": 3.7696486723792508e-06,
"loss": 0.46749627590179443,
"step": 1012
},
{
"epoch": 2.320366132723112,
"grad_norm": 0.7630489468574524,
"learning_rate": 3.756237544614853e-06,
"loss": 0.5220370292663574,
"step": 1014
},
{
"epoch": 2.324942791762014,
"grad_norm": 0.7297222018241882,
"learning_rate": 3.7428356397756672e-06,
"loss": 0.9120384454727173,
"step": 1016
},
{
"epoch": 2.329519450800915,
"grad_norm": 1.5217046737670898,
"learning_rate": 3.7294431498653792e-06,
"loss": 0.28965604305267334,
"step": 1018
},
{
"epoch": 2.334096109839817,
"grad_norm": 1.606251835823059,
"learning_rate": 3.7160602667527954e-06,
"loss": 0.46628880500793457,
"step": 1020
},
{
"epoch": 2.3386727688787188,
"grad_norm": 0.49629804491996765,
"learning_rate": 3.7026871821690877e-06,
"loss": 0.8182316422462463,
"step": 1022
},
{
"epoch": 2.34324942791762,
"grad_norm": 0.2531331479549408,
"learning_rate": 3.6893240877050467e-06,
"loss": 0.7435541152954102,
"step": 1024
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.48934581875801086,
"learning_rate": 3.6759711748083416e-06,
"loss": 0.8625474572181702,
"step": 1026
},
{
"epoch": 2.3524027459954233,
"grad_norm": 1.5544626712799072,
"learning_rate": 3.6626286347807753e-06,
"loss": 0.5781938433647156,
"step": 1028
},
{
"epoch": 2.356979405034325,
"grad_norm": 0.654556930065155,
"learning_rate": 3.6492966587755356e-06,
"loss": 0.6204836368560791,
"step": 1030
},
{
"epoch": 2.3615560640732265,
"grad_norm": 1.397822618484497,
"learning_rate": 3.6359754377944726e-06,
"loss": 0.4885711967945099,
"step": 1032
},
{
"epoch": 2.3661327231121283,
"grad_norm": 1.5980985164642334,
"learning_rate": 3.622665162685351e-06,
"loss": 0.6035375595092773,
"step": 1034
},
{
"epoch": 2.3707093821510297,
"grad_norm": 0.5939676761627197,
"learning_rate": 3.6093660241391134e-06,
"loss": 0.6923315525054932,
"step": 1036
},
{
"epoch": 2.3752860411899315,
"grad_norm": 2.990952730178833,
"learning_rate": 3.5960782126871588e-06,
"loss": 0.3957272171974182,
"step": 1038
},
{
"epoch": 2.379862700228833,
"grad_norm": 1.2808301448822021,
"learning_rate": 3.5828019186986076e-06,
"loss": 0.35686612129211426,
"step": 1040
},
{
"epoch": 2.3844393592677346,
"grad_norm": 0.537975013256073,
"learning_rate": 3.5695373323775694e-06,
"loss": 0.7028881311416626,
"step": 1042
},
{
"epoch": 2.389016018306636,
"grad_norm": 0.5620653033256531,
"learning_rate": 3.556284643760426e-06,
"loss": 0.7551029920578003,
"step": 1044
},
{
"epoch": 2.393592677345538,
"grad_norm": 0.9883461594581604,
"learning_rate": 3.5430440427131087e-06,
"loss": 0.4496672749519348,
"step": 1046
},
{
"epoch": 2.398169336384439,
"grad_norm": 0.68025803565979,
"learning_rate": 3.5298157189283673e-06,
"loss": 0.7668349742889404,
"step": 1048
},
{
"epoch": 2.402745995423341,
"grad_norm": 0.7667739987373352,
"learning_rate": 3.5165998619230683e-06,
"loss": 0.7979832291603088,
"step": 1050
},
{
"epoch": 2.4073226544622424,
"grad_norm": 0.6154050827026367,
"learning_rate": 3.5033966610354655e-06,
"loss": 0.7762300968170166,
"step": 1052
},
{
"epoch": 2.411899313501144,
"grad_norm": 1.798595666885376,
"learning_rate": 3.4902063054224966e-06,
"loss": 0.46241626143455505,
"step": 1054
},
{
"epoch": 2.416475972540046,
"grad_norm": 0.9119539260864258,
"learning_rate": 3.4770289840570693e-06,
"loss": 0.18137064576148987,
"step": 1056
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.567173957824707,
"learning_rate": 3.463864885725358e-06,
"loss": 0.5903480648994446,
"step": 1058
},
{
"epoch": 2.425629290617849,
"grad_norm": 0.6367594599723816,
"learning_rate": 3.450714199024089e-06,
"loss": 0.4741784930229187,
"step": 1060
},
{
"epoch": 2.4302059496567505,
"grad_norm": 0.7184843420982361,
"learning_rate": 3.4375771123578527e-06,
"loss": 0.8506764769554138,
"step": 1062
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.7258248925209045,
"learning_rate": 3.424453813936394e-06,
"loss": 0.6282183527946472,
"step": 1064
},
{
"epoch": 2.4393592677345537,
"grad_norm": 0.9743756651878357,
"learning_rate": 3.4113444917719206e-06,
"loss": 0.5036060810089111,
"step": 1066
},
{
"epoch": 2.4439359267734555,
"grad_norm": 1.7455062866210938,
"learning_rate": 3.3982493336764046e-06,
"loss": 0.7877315282821655,
"step": 1068
},
{
"epoch": 2.448512585812357,
"grad_norm": 0.5519979000091553,
"learning_rate": 3.3851685272588995e-06,
"loss": 0.7620252966880798,
"step": 1070
},
{
"epoch": 2.4530892448512587,
"grad_norm": 1.4960353374481201,
"learning_rate": 3.3721022599228455e-06,
"loss": 0.6300139427185059,
"step": 1072
},
{
"epoch": 2.45766590389016,
"grad_norm": 0.8801581859588623,
"learning_rate": 3.359050718863388e-06,
"loss": 0.7080258131027222,
"step": 1074
},
{
"epoch": 2.462242562929062,
"grad_norm": 1.3520958423614502,
"learning_rate": 3.3460140910646953e-06,
"loss": 0.5704418420791626,
"step": 1076
},
{
"epoch": 2.466819221967963,
"grad_norm": 0.9597600102424622,
"learning_rate": 3.332992563297279e-06,
"loss": 0.8418775200843811,
"step": 1078
},
{
"epoch": 2.471395881006865,
"grad_norm": 0.7216306328773499,
"learning_rate": 3.3199863221153194e-06,
"loss": 0.8978160619735718,
"step": 1080
},
{
"epoch": 2.475972540045767,
"grad_norm": 0.3640708029270172,
"learning_rate": 3.3069955538539898e-06,
"loss": 0.5792869329452515,
"step": 1082
},
{
"epoch": 2.480549199084668,
"grad_norm": 0.4946868121623993,
"learning_rate": 3.294020444626791e-06,
"loss": 0.7915354371070862,
"step": 1084
},
{
"epoch": 2.4851258581235696,
"grad_norm": 0.6146786212921143,
"learning_rate": 3.281061180322883e-06,
"loss": 0.6907005310058594,
"step": 1086
},
{
"epoch": 2.4897025171624714,
"grad_norm": 0.7421013116836548,
"learning_rate": 3.2681179466044234e-06,
"loss": 0.487212598323822,
"step": 1088
},
{
"epoch": 2.494279176201373,
"grad_norm": 0.41838371753692627,
"learning_rate": 3.2551909289039026e-06,
"loss": 0.5698995590209961,
"step": 1090
},
{
"epoch": 2.4988558352402745,
"grad_norm": 0.3784742057323456,
"learning_rate": 3.2422803124214938e-06,
"loss": 0.5521360039710999,
"step": 1092
},
{
"epoch": 2.5034324942791764,
"grad_norm": 3.2495124340057373,
"learning_rate": 3.2293862821223954e-06,
"loss": 0.25563108921051025,
"step": 1094
},
{
"epoch": 2.5080091533180777,
"grad_norm": 0.7314022183418274,
"learning_rate": 3.216509022734181e-06,
"loss": 0.7299931049346924,
"step": 1096
},
{
"epoch": 2.5125858123569795,
"grad_norm": 0.7065015435218811,
"learning_rate": 3.203648718744155e-06,
"loss": 0.697057843208313,
"step": 1098
},
{
"epoch": 2.517162471395881,
"grad_norm": 0.490582674741745,
"learning_rate": 3.1908055543967117e-06,
"loss": 0.5592939257621765,
"step": 1100
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.41221341490745544,
"learning_rate": 3.177979713690686e-06,
"loss": 0.8416492342948914,
"step": 1102
},
{
"epoch": 2.526315789473684,
"grad_norm": 1.0584267377853394,
"learning_rate": 3.1651713803767308e-06,
"loss": 0.7285148501396179,
"step": 1104
},
{
"epoch": 2.530892448512586,
"grad_norm": 0.5549395680427551,
"learning_rate": 3.152380737954674e-06,
"loss": 0.7810051441192627,
"step": 1106
},
{
"epoch": 2.5354691075514877,
"grad_norm": 0.6194555759429932,
"learning_rate": 3.1396079696708933e-06,
"loss": 0.8377099633216858,
"step": 1108
},
{
"epoch": 2.540045766590389,
"grad_norm": 0.48931312561035156,
"learning_rate": 3.126853258515692e-06,
"loss": 0.8033105731010437,
"step": 1110
},
{
"epoch": 2.5446224256292904,
"grad_norm": 3.8660051822662354,
"learning_rate": 3.114116787220676e-06,
"loss": 0.8483383059501648,
"step": 1112
},
{
"epoch": 2.5491990846681922,
"grad_norm": 0.508552074432373,
"learning_rate": 3.101398738256134e-06,
"loss": 0.8187400698661804,
"step": 1114
},
{
"epoch": 2.553775743707094,
"grad_norm": 0.7558878064155579,
"learning_rate": 3.0886992938284283e-06,
"loss": 0.5747739672660828,
"step": 1116
},
{
"epoch": 2.5583524027459954,
"grad_norm": 3.0800881385803223,
"learning_rate": 3.076018635877382e-06,
"loss": 0.6367532014846802,
"step": 1118
},
{
"epoch": 2.5629290617848968,
"grad_norm": 1.0385645627975464,
"learning_rate": 3.063356946073667e-06,
"loss": 0.6223986148834229,
"step": 1120
},
{
"epoch": 2.5675057208237986,
"grad_norm": 0.7576572895050049,
"learning_rate": 3.050714405816212e-06,
"loss": 0.716172456741333,
"step": 1122
},
{
"epoch": 2.5720823798627004,
"grad_norm": 1.1599595546722412,
"learning_rate": 3.038091196229594e-06,
"loss": 0.7232475280761719,
"step": 1124
},
{
"epoch": 2.5766590389016018,
"grad_norm": 0.6561689972877502,
"learning_rate": 3.025487498161449e-06,
"loss": 0.4857536554336548,
"step": 1126
},
{
"epoch": 2.5812356979405036,
"grad_norm": 0.6233944892883301,
"learning_rate": 3.0129034921798784e-06,
"loss": 0.8831377029418945,
"step": 1128
},
{
"epoch": 2.585812356979405,
"grad_norm": 0.6065633296966553,
"learning_rate": 3.000339358570864e-06,
"loss": 0.801806628704071,
"step": 1130
},
{
"epoch": 2.5903890160183067,
"grad_norm": 0.5193330645561218,
"learning_rate": 2.9877952773356835e-06,
"loss": 0.7977845668792725,
"step": 1132
},
{
"epoch": 2.594965675057208,
"grad_norm": 0.6371508240699768,
"learning_rate": 2.9752714281883338e-06,
"loss": 0.7237481474876404,
"step": 1134
},
{
"epoch": 2.59954233409611,
"grad_norm": 4.00014591217041,
"learning_rate": 2.9627679905529503e-06,
"loss": 0.6999197006225586,
"step": 1136
},
{
"epoch": 2.6041189931350113,
"grad_norm": 0.9605089426040649,
"learning_rate": 2.9502851435612474e-06,
"loss": 0.5667267441749573,
"step": 1138
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.8212308883666992,
"learning_rate": 2.937823066049941e-06,
"loss": 0.84372878074646,
"step": 1140
},
{
"epoch": 2.613272311212815,
"grad_norm": 0.6663596034049988,
"learning_rate": 2.9253819365581923e-06,
"loss": 0.7569270133972168,
"step": 1142
},
{
"epoch": 2.6178489702517163,
"grad_norm": 1.229351282119751,
"learning_rate": 2.9129619333250482e-06,
"loss": 0.4845171868801117,
"step": 1144
},
{
"epoch": 2.6224256292906176,
"grad_norm": 1.0321894884109497,
"learning_rate": 2.900563234286891e-06,
"loss": 0.47281792759895325,
"step": 1146
},
{
"epoch": 2.6270022883295194,
"grad_norm": 2.871659994125366,
"learning_rate": 2.888186017074882e-06,
"loss": 0.6280223727226257,
"step": 1148
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.3605870306491852,
"learning_rate": 2.875830459012424e-06,
"loss": 0.07345299422740936,
"step": 1150
},
{
"epoch": 2.6361556064073226,
"grad_norm": 0.5999372005462646,
"learning_rate": 2.8634967371126165e-06,
"loss": 0.7574805617332458,
"step": 1152
},
{
"epoch": 2.6407322654462244,
"grad_norm": 0.6183683276176453,
"learning_rate": 2.851185028075723e-06,
"loss": 0.4930267333984375,
"step": 1154
},
{
"epoch": 2.645308924485126,
"grad_norm": 0.5491237640380859,
"learning_rate": 2.8388955082866333e-06,
"loss": 0.6228328943252563,
"step": 1156
},
{
"epoch": 2.6498855835240276,
"grad_norm": 0.5937864184379578,
"learning_rate": 2.826628353812348e-06,
"loss": 0.8555384278297424,
"step": 1158
},
{
"epoch": 2.654462242562929,
"grad_norm": 1.6247820854187012,
"learning_rate": 2.8143837403994396e-06,
"loss": 0.5580495595932007,
"step": 1160
},
{
"epoch": 2.6590389016018308,
"grad_norm": 7.112198352813721,
"learning_rate": 2.8021618434715545e-06,
"loss": 0.478378564119339,
"step": 1162
},
{
"epoch": 2.663615560640732,
"grad_norm": 0.6588612794876099,
"learning_rate": 2.7899628381268805e-06,
"loss": 0.46248742938041687,
"step": 1164
},
{
"epoch": 2.668192219679634,
"grad_norm": 0.46749043464660645,
"learning_rate": 2.777786899135649e-06,
"loss": 0.5418421030044556,
"step": 1166
},
{
"epoch": 2.6727688787185357,
"grad_norm": 0.7305618524551392,
"learning_rate": 2.765634200937632e-06,
"loss": 0.4692259728908539,
"step": 1168
},
{
"epoch": 2.677345537757437,
"grad_norm": 1.3373067378997803,
"learning_rate": 2.753504917639637e-06,
"loss": 0.6015383005142212,
"step": 1170
},
{
"epoch": 2.6819221967963385,
"grad_norm": 1.2775124311447144,
"learning_rate": 2.741399223013018e-06,
"loss": 0.6598130464553833,
"step": 1172
},
{
"epoch": 2.6864988558352403,
"grad_norm": 0.6947876214981079,
"learning_rate": 2.72931729049118e-06,
"loss": 0.5946341156959534,
"step": 1174
},
{
"epoch": 2.691075514874142,
"grad_norm": 0.5818822383880615,
"learning_rate": 2.7172592931671033e-06,
"loss": 0.8561844229698181,
"step": 1176
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.40154317021369934,
"learning_rate": 2.705225403790853e-06,
"loss": 0.5496221780776978,
"step": 1178
},
{
"epoch": 2.700228832951945,
"grad_norm": 0.6244839429855347,
"learning_rate": 2.693215794767111e-06,
"loss": 0.7859920859336853,
"step": 1180
},
{
"epoch": 2.7048054919908466,
"grad_norm": 1.3127132654190063,
"learning_rate": 2.6812306381527084e-06,
"loss": 0.7597426176071167,
"step": 1182
},
{
"epoch": 2.7093821510297484,
"grad_norm": 5.444507598876953,
"learning_rate": 2.6692701056541486e-06,
"loss": 0.199199840426445,
"step": 1184
},
{
"epoch": 2.71395881006865,
"grad_norm": 0.6784061789512634,
"learning_rate": 2.657334368625163e-06,
"loss": 0.6887036561965942,
"step": 1186
},
{
"epoch": 2.7185354691075516,
"grad_norm": 0.5293588638305664,
"learning_rate": 2.6454235980642436e-06,
"loss": 0.8016695380210876,
"step": 1188
},
{
"epoch": 2.723112128146453,
"grad_norm": 0.8115622997283936,
"learning_rate": 2.6335379646121993e-06,
"loss": 0.6355752944946289,
"step": 1190
},
{
"epoch": 2.727688787185355,
"grad_norm": 0.6038773059844971,
"learning_rate": 2.6216776385497098e-06,
"loss": 0.6571699380874634,
"step": 1192
},
{
"epoch": 2.732265446224256,
"grad_norm": 0.559846043586731,
"learning_rate": 2.6098427897948867e-06,
"loss": 0.7960324883460999,
"step": 1194
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.40631529688835144,
"learning_rate": 2.5980335879008364e-06,
"loss": 0.7562568783760071,
"step": 1196
},
{
"epoch": 2.7414187643020593,
"grad_norm": 1.7513792514801025,
"learning_rate": 2.586250202053233e-06,
"loss": 0.48247501254081726,
"step": 1198
},
{
"epoch": 2.745995423340961,
"grad_norm": 0.474061518907547,
"learning_rate": 2.574492801067902e-06,
"loss": 0.8583283424377441,
"step": 1200
},
{
"epoch": 2.750572082379863,
"grad_norm": 0.46135658025741577,
"learning_rate": 2.5627615533883803e-06,
"loss": 0.774861216545105,
"step": 1202
},
{
"epoch": 2.7551487414187643,
"grad_norm": 1.1955369710922241,
"learning_rate": 2.55105662708353e-06,
"loss": 0.5863184332847595,
"step": 1204
},
{
"epoch": 2.7597254004576657,
"grad_norm": 5.386921405792236,
"learning_rate": 2.539378189845112e-06,
"loss": 0.7277770638465881,
"step": 1206
},
{
"epoch": 2.7643020594965675,
"grad_norm": 0.7777206301689148,
"learning_rate": 2.5277264089853852e-06,
"loss": 0.32458746433258057,
"step": 1208
},
{
"epoch": 2.7688787185354693,
"grad_norm": 3.4855971336364746,
"learning_rate": 2.5161014514347212e-06,
"loss": 0.48918047547340393,
"step": 1210
},
{
"epoch": 2.7734553775743707,
"grad_norm": 0.9984601736068726,
"learning_rate": 2.5045034837392e-06,
"loss": 0.5557206869125366,
"step": 1212
},
{
"epoch": 2.7780320366132725,
"grad_norm": 0.3992338180541992,
"learning_rate": 2.492932672058231e-06,
"loss": 0.5206654071807861,
"step": 1214
},
{
"epoch": 2.782608695652174,
"grad_norm": 1.431427240371704,
"learning_rate": 2.4813891821621653e-06,
"loss": 0.28042441606521606,
"step": 1216
},
{
"epoch": 2.7871853546910756,
"grad_norm": 2.218466281890869,
"learning_rate": 2.4698731794299354e-06,
"loss": 0.3877430856227875,
"step": 1218
},
{
"epoch": 2.791762013729977,
"grad_norm": 0.46863940358161926,
"learning_rate": 2.4583848288466662e-06,
"loss": 0.493195503950119,
"step": 1220
},
{
"epoch": 2.796338672768879,
"grad_norm": 0.6733099222183228,
"learning_rate": 2.446924295001326e-06,
"loss": 0.7823275923728943,
"step": 1222
},
{
"epoch": 2.80091533180778,
"grad_norm": 1.4674835205078125,
"learning_rate": 2.435491742084368e-06,
"loss": 0.6491841077804565,
"step": 1224
},
{
"epoch": 2.805491990846682,
"grad_norm": 0.47652876377105713,
"learning_rate": 2.4240873338853628e-06,
"loss": 0.7960876226425171,
"step": 1226
},
{
"epoch": 2.8100686498855834,
"grad_norm": 0.4711756110191345,
"learning_rate": 2.4127112337906754e-06,
"loss": 0.536849319934845,
"step": 1228
},
{
"epoch": 2.814645308924485,
"grad_norm": 0.7543594241142273,
"learning_rate": 2.401363604781104e-06,
"loss": 0.6167381405830383,
"step": 1230
},
{
"epoch": 2.8192219679633865,
"grad_norm": 0.5257206559181213,
"learning_rate": 2.390044609429556e-06,
"loss": 0.8745390176773071,
"step": 1232
},
{
"epoch": 2.8237986270022883,
"grad_norm": 1.4748421907424927,
"learning_rate": 2.3787544098987148e-06,
"loss": 0.5339797139167786,
"step": 1234
},
{
"epoch": 2.82837528604119,
"grad_norm": 0.7052022218704224,
"learning_rate": 2.3674931679387184e-06,
"loss": 0.797744631767273,
"step": 1236
},
{
"epoch": 2.8329519450800915,
"grad_norm": 0.27761203050613403,
"learning_rate": 2.3562610448848415e-06,
"loss": 0.49625930190086365,
"step": 1238
},
{
"epoch": 2.837528604118993,
"grad_norm": 0.5554156303405762,
"learning_rate": 2.3450582016551826e-06,
"loss": 0.7721536159515381,
"step": 1240
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.4830935597419739,
"learning_rate": 2.3338847987483645e-06,
"loss": 0.810258686542511,
"step": 1242
},
{
"epoch": 2.8466819221967965,
"grad_norm": 0.7139240503311157,
"learning_rate": 2.3227409962412204e-06,
"loss": 0.8331779837608337,
"step": 1244
},
{
"epoch": 2.851258581235698,
"grad_norm": 1.6733156442642212,
"learning_rate": 2.3116269537865233e-06,
"loss": 0.6354835629463196,
"step": 1246
},
{
"epoch": 2.8558352402745997,
"grad_norm": 0.8812215328216553,
"learning_rate": 2.3005428306106773e-06,
"loss": 0.7057042121887207,
"step": 1248
},
{
"epoch": 2.860411899313501,
"grad_norm": 0.784724235534668,
"learning_rate": 2.2894887855114463e-06,
"loss": 0.843634843826294,
"step": 1250
},
{
"epoch": 2.864988558352403,
"grad_norm": 0.49218860268592834,
"learning_rate": 2.27846497685568e-06,
"loss": 0.7869199514389038,
"step": 1252
},
{
"epoch": 2.869565217391304,
"grad_norm": 2.8198912143707275,
"learning_rate": 2.2674715625770415e-06,
"loss": 0.549948513507843,
"step": 1254
},
{
"epoch": 2.874141876430206,
"grad_norm": 0.4654172956943512,
"learning_rate": 2.256508700173745e-06,
"loss": 0.5740110874176025,
"step": 1256
},
{
"epoch": 2.8787185354691074,
"grad_norm": 0.37810707092285156,
"learning_rate": 2.245576546706301e-06,
"loss": 0.5503138303756714,
"step": 1258
},
{
"epoch": 2.883295194508009,
"grad_norm": 0.8275235891342163,
"learning_rate": 2.234675258795269e-06,
"loss": 0.5749093890190125,
"step": 1260
},
{
"epoch": 2.887871853546911,
"grad_norm": 0.5841154456138611,
"learning_rate": 2.2238049926190025e-06,
"loss": 0.714622437953949,
"step": 1262
},
{
"epoch": 2.8924485125858124,
"grad_norm": 0.5689151287078857,
"learning_rate": 2.2129659039114243e-06,
"loss": 0.46536874771118164,
"step": 1264
},
{
"epoch": 2.8970251716247137,
"grad_norm": 0.5501039028167725,
"learning_rate": 2.2021581479597927e-06,
"loss": 0.7893953919410706,
"step": 1266
},
{
"epoch": 2.9016018306636155,
"grad_norm": 0.8494502902030945,
"learning_rate": 2.191381879602466e-06,
"loss": 0.6205670237541199,
"step": 1268
},
{
"epoch": 2.9061784897025174,
"grad_norm": 0.6330673098564148,
"learning_rate": 2.1806372532267006e-06,
"loss": 0.5450186133384705,
"step": 1270
},
{
"epoch": 2.9107551487414187,
"grad_norm": 0.5317128896713257,
"learning_rate": 2.1699244227664272e-06,
"loss": 0.4947490096092224,
"step": 1272
},
{
"epoch": 2.9153318077803205,
"grad_norm": 0.42626556754112244,
"learning_rate": 2.1592435417000485e-06,
"loss": 0.5672276020050049,
"step": 1274
},
{
"epoch": 2.919908466819222,
"grad_norm": 0.8525965213775635,
"learning_rate": 2.1485947630482434e-06,
"loss": 0.36214679479599,
"step": 1276
},
{
"epoch": 2.9244851258581237,
"grad_norm": 0.5449546575546265,
"learning_rate": 2.137978239371771e-06,
"loss": 0.8090663552284241,
"step": 1278
},
{
"epoch": 2.929061784897025,
"grad_norm": 1.8602927923202515,
"learning_rate": 2.127394122769286e-06,
"loss": 0.5202322006225586,
"step": 1280
},
{
"epoch": 2.933638443935927,
"grad_norm": 1.372131109237671,
"learning_rate": 2.11684256487516e-06,
"loss": 0.7397865056991577,
"step": 1282
},
{
"epoch": 2.9382151029748282,
"grad_norm": 0.4317033886909485,
"learning_rate": 2.1063237168573135e-06,
"loss": 0.8207894563674927,
"step": 1284
},
{
"epoch": 2.94279176201373,
"grad_norm": 1.2322509288787842,
"learning_rate": 2.0958377294150375e-06,
"loss": 0.6003591418266296,
"step": 1286
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.511931300163269,
"learning_rate": 2.085384752776851e-06,
"loss": 0.5409091114997864,
"step": 1288
},
{
"epoch": 2.9519450800915332,
"grad_norm": 0.4843280017375946,
"learning_rate": 2.074964936698335e-06,
"loss": 0.5519256591796875,
"step": 1290
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.8314401507377625,
"learning_rate": 2.0645784304599952e-06,
"loss": 0.7691973447799683,
"step": 1292
},
{
"epoch": 2.9610983981693364,
"grad_norm": 1.611412763595581,
"learning_rate": 2.0542253828651193e-06,
"loss": 0.46346092224121094,
"step": 1294
},
{
"epoch": 2.965675057208238,
"grad_norm": 0.45376840233802795,
"learning_rate": 2.0439059422376476e-06,
"loss": 0.7660473585128784,
"step": 1296
},
{
"epoch": 2.9702517162471396,
"grad_norm": 0.5917079448699951,
"learning_rate": 2.033620256420046e-06,
"loss": 0.7450892329216003,
"step": 1298
},
{
"epoch": 2.974828375286041,
"grad_norm": 0.5337526798248291,
"learning_rate": 2.0233684727711883e-06,
"loss": 0.7948495149612427,
"step": 1300
},
{
"epoch": 2.9794050343249427,
"grad_norm": 2.367062568664551,
"learning_rate": 2.0131507381642506e-06,
"loss": 0.47295913100242615,
"step": 1302
},
{
"epoch": 2.9839816933638446,
"grad_norm": 1.1392394304275513,
"learning_rate": 2.0029671989845923e-06,
"loss": 0.750167965888977,
"step": 1304
},
{
"epoch": 2.988558352402746,
"grad_norm": 0.531143307685852,
"learning_rate": 1.992818001127678e-06,
"loss": 0.39104270935058594,
"step": 1306
},
{
"epoch": 2.9931350114416477,
"grad_norm": 0.7250871062278748,
"learning_rate": 1.9827032899969756e-06,
"loss": 0.22440141439437866,
"step": 1308
},
{
"epoch": 2.997711670480549,
"grad_norm": 0.703926682472229,
"learning_rate": 1.9726232105018697e-06,
"loss": 0.7429046630859375,
"step": 1310
},
{
"epoch": 3.002288329519451,
"grad_norm": 0.43497398495674133,
"learning_rate": 1.9625779070556e-06,
"loss": 0.7082179188728333,
"step": 1312
},
{
"epoch": 3.0068649885583523,
"grad_norm": 0.5466197729110718,
"learning_rate": 1.9525675235731793e-06,
"loss": 0.3564700484275818,
"step": 1314
},
{
"epoch": 3.011441647597254,
"grad_norm": 0.5676448345184326,
"learning_rate": 1.9425922034693363e-06,
"loss": 0.6031548380851746,
"step": 1316
},
{
"epoch": 3.0160183066361554,
"grad_norm": 0.6110860109329224,
"learning_rate": 1.9326520896564614e-06,
"loss": 0.5565021634101868,
"step": 1318
},
{
"epoch": 3.0205949656750573,
"grad_norm": 0.7073854207992554,
"learning_rate": 1.9227473245425584e-06,
"loss": 0.6478375196456909,
"step": 1320
},
{
"epoch": 3.0251716247139586,
"grad_norm": 0.7119851112365723,
"learning_rate": 1.912878050029205e-06,
"loss": 0.3045092821121216,
"step": 1322
},
{
"epoch": 3.0297482837528604,
"grad_norm": 0.668566882610321,
"learning_rate": 1.9030444075095169e-06,
"loss": 0.5307950973510742,
"step": 1324
},
{
"epoch": 3.034324942791762,
"grad_norm": 0.6608180403709412,
"learning_rate": 1.8932465378661315e-06,
"loss": 0.28795385360717773,
"step": 1326
},
{
"epoch": 3.0389016018306636,
"grad_norm": 1.2194944620132446,
"learning_rate": 1.8834845814691727e-06,
"loss": 0.6476283073425293,
"step": 1328
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.6446576118469238,
"learning_rate": 1.873758678174258e-06,
"loss": 0.6247165203094482,
"step": 1330
},
{
"epoch": 3.0480549199084668,
"grad_norm": 0.10556092858314514,
"learning_rate": 1.864068967320483e-06,
"loss": 0.07942181080579758,
"step": 1332
},
{
"epoch": 3.0526315789473686,
"grad_norm": 1.9154139757156372,
"learning_rate": 1.8544155877284292e-06,
"loss": 0.2606959939002991,
"step": 1334
},
{
"epoch": 3.05720823798627,
"grad_norm": 0.7502045035362244,
"learning_rate": 1.8447986776981746e-06,
"loss": 0.6145456433296204,
"step": 1336
},
{
"epoch": 3.0617848970251718,
"grad_norm": 0.7055021524429321,
"learning_rate": 1.8352183750073134e-06,
"loss": 0.43598175048828125,
"step": 1338
},
{
"epoch": 3.066361556064073,
"grad_norm": 0.5926067233085632,
"learning_rate": 1.8256748169089803e-06,
"loss": 0.2945636212825775,
"step": 1340
},
{
"epoch": 3.070938215102975,
"grad_norm": 1.8550186157226562,
"learning_rate": 1.8161681401298842e-06,
"loss": 0.43354955315589905,
"step": 1342
},
{
"epoch": 3.0755148741418763,
"grad_norm": 2.016162395477295,
"learning_rate": 1.8066984808683547e-06,
"loss": 0.35298022627830505,
"step": 1344
},
{
"epoch": 3.080091533180778,
"grad_norm": 0.677463948726654,
"learning_rate": 1.7972659747923785e-06,
"loss": 0.358053982257843,
"step": 1346
},
{
"epoch": 3.0846681922196795,
"grad_norm": 1.3669031858444214,
"learning_rate": 1.787870757037672e-06,
"loss": 0.30859601497650146,
"step": 1348
},
{
"epoch": 3.0892448512585813,
"grad_norm": 1.2701737880706787,
"learning_rate": 1.7785129622057312e-06,
"loss": 0.5989816188812256,
"step": 1350
},
{
"epoch": 3.0938215102974826,
"grad_norm": 0.4244588613510132,
"learning_rate": 1.7691927243619105e-06,
"loss": 0.3882506191730499,
"step": 1352
},
{
"epoch": 3.0983981693363845,
"grad_norm": 4.394486427307129,
"learning_rate": 1.7599101770335015e-06,
"loss": 0.13407155871391296,
"step": 1354
},
{
"epoch": 3.1029748283752863,
"grad_norm": 0.7348482012748718,
"learning_rate": 1.7506654532078176e-06,
"loss": 0.1811959147453308,
"step": 1356
},
{
"epoch": 3.1075514874141876,
"grad_norm": 0.8721897602081299,
"learning_rate": 1.7414586853302909e-06,
"loss": 0.42143383622169495,
"step": 1358
},
{
"epoch": 3.1121281464530894,
"grad_norm": 5.011177062988281,
"learning_rate": 1.732290005302572e-06,
"loss": 0.31767553091049194,
"step": 1360
},
{
"epoch": 3.116704805491991,
"grad_norm": 0.8464891910552979,
"learning_rate": 1.7231595444806483e-06,
"loss": 0.6350334882736206,
"step": 1362
},
{
"epoch": 3.1212814645308926,
"grad_norm": 0.7363600134849548,
"learning_rate": 1.7140674336729477e-06,
"loss": 0.34634676575660706,
"step": 1364
},
{
"epoch": 3.125858123569794,
"grad_norm": 0.6495455503463745,
"learning_rate": 1.7050138031384776e-06,
"loss": 0.5073356032371521,
"step": 1366
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.8677927851676941,
"learning_rate": 1.6959987825849548e-06,
"loss": 0.41521307826042175,
"step": 1368
},
{
"epoch": 3.135011441647597,
"grad_norm": 0.6031743288040161,
"learning_rate": 1.6870225011669433e-06,
"loss": 0.5051814913749695,
"step": 1370
},
{
"epoch": 3.139588100686499,
"grad_norm": 0.5029802918434143,
"learning_rate": 1.67808508748401e-06,
"loss": 0.060064904391765594,
"step": 1372
},
{
"epoch": 3.1441647597254003,
"grad_norm": 0.49608922004699707,
"learning_rate": 1.6691866695788778e-06,
"loss": 0.5580463409423828,
"step": 1374
},
{
"epoch": 3.148741418764302,
"grad_norm": 0.5855568051338196,
"learning_rate": 1.6603273749355932e-06,
"loss": 0.5613203644752502,
"step": 1376
},
{
"epoch": 3.1533180778032035,
"grad_norm": 0.6027638912200928,
"learning_rate": 1.6515073304776996e-06,
"loss": 0.3589206635951996,
"step": 1378
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.7994204759597778,
"learning_rate": 1.642726662566419e-06,
"loss": 0.4340493083000183,
"step": 1380
},
{
"epoch": 3.1624713958810067,
"grad_norm": 0.8440326452255249,
"learning_rate": 1.6339854969988412e-06,
"loss": 0.5287958383560181,
"step": 1382
},
{
"epoch": 3.1670480549199085,
"grad_norm": 1.337528109550476,
"learning_rate": 1.6252839590061203e-06,
"loss": 0.48380246758461,
"step": 1384
},
{
"epoch": 3.17162471395881,
"grad_norm": 0.18262973427772522,
"learning_rate": 1.6166221732516876e-06,
"loss": 0.029057124629616737,
"step": 1386
},
{
"epoch": 3.1762013729977117,
"grad_norm": 1.918377161026001,
"learning_rate": 1.6080002638294516e-06,
"loss": 0.09718252718448639,
"step": 1388
},
{
"epoch": 3.1807780320366135,
"grad_norm": 0.17276065051555634,
"learning_rate": 1.599418354262036e-06,
"loss": 0.17694096267223358,
"step": 1390
},
{
"epoch": 3.185354691075515,
"grad_norm": 0.8832252621650696,
"learning_rate": 1.5908765674989995e-06,
"loss": 0.4334821403026581,
"step": 1392
},
{
"epoch": 3.1899313501144166,
"grad_norm": 6.330790996551514,
"learning_rate": 1.582375025915078e-06,
"loss": 0.2820512652397156,
"step": 1394
},
{
"epoch": 3.194508009153318,
"grad_norm": 0.6005669832229614,
"learning_rate": 1.573913851308431e-06,
"loss": 0.22479744255542755,
"step": 1396
},
{
"epoch": 3.19908466819222,
"grad_norm": 1.287660002708435,
"learning_rate": 1.5654931648988962e-06,
"loss": 0.4539932906627655,
"step": 1398
},
{
"epoch": 3.203661327231121,
"grad_norm": 0.5233741402626038,
"learning_rate": 1.5571130873262542e-06,
"loss": 0.17698650062084198,
"step": 1400
},
{
"epoch": 3.208237986270023,
"grad_norm": 0.9421582818031311,
"learning_rate": 1.5487737386484966e-06,
"loss": 0.4471212327480316,
"step": 1402
},
{
"epoch": 3.2128146453089244,
"grad_norm": 1.1425485610961914,
"learning_rate": 1.5404752383401145e-06,
"loss": 0.49938952922821045,
"step": 1404
},
{
"epoch": 3.217391304347826,
"grad_norm": 1.4222649335861206,
"learning_rate": 1.5322177052903725e-06,
"loss": 0.2908313572406769,
"step": 1406
},
{
"epoch": 3.2219679633867275,
"grad_norm": 0.6182151436805725,
"learning_rate": 1.5240012578016205e-06,
"loss": 0.45964670181274414,
"step": 1408
},
{
"epoch": 3.2265446224256293,
"grad_norm": 0.7475869059562683,
"learning_rate": 1.5158260135875908e-06,
"loss": 0.4192732572555542,
"step": 1410
},
{
"epoch": 3.2311212814645307,
"grad_norm": 0.8350936770439148,
"learning_rate": 1.507692089771708e-06,
"loss": 0.6540165543556213,
"step": 1412
},
{
"epoch": 3.2356979405034325,
"grad_norm": 1.6320070028305054,
"learning_rate": 1.4995996028854237e-06,
"loss": 0.42058131098747253,
"step": 1414
},
{
"epoch": 3.2402745995423343,
"grad_norm": 0.9828153252601624,
"learning_rate": 1.4915486688665344e-06,
"loss": 0.4954679012298584,
"step": 1416
},
{
"epoch": 3.2448512585812357,
"grad_norm": 0.4938318431377411,
"learning_rate": 1.4835394030575266e-06,
"loss": 0.5675240159034729,
"step": 1418
},
{
"epoch": 3.2494279176201375,
"grad_norm": 0.8090194463729858,
"learning_rate": 1.475571920203923e-06,
"loss": 0.5262541174888611,
"step": 1420
},
{
"epoch": 3.254004576659039,
"grad_norm": 0.9066356420516968,
"learning_rate": 1.4676463344526395e-06,
"loss": 0.5531529188156128,
"step": 1422
},
{
"epoch": 3.2585812356979407,
"grad_norm": 0.8314560651779175,
"learning_rate": 1.4597627593503473e-06,
"loss": 0.4790411591529846,
"step": 1424
},
{
"epoch": 3.263157894736842,
"grad_norm": 2.9613871574401855,
"learning_rate": 1.4519213078418494e-06,
"loss": 0.13854338228702545,
"step": 1426
},
{
"epoch": 3.267734553775744,
"grad_norm": 1.013364315032959,
"learning_rate": 1.4441220922684637e-06,
"loss": 0.49466753005981445,
"step": 1428
},
{
"epoch": 3.272311212814645,
"grad_norm": 0.32285788655281067,
"learning_rate": 1.4363652243664036e-06,
"loss": 0.045904774218797684,
"step": 1430
},
{
"epoch": 3.276887871853547,
"grad_norm": 1.0177583694458008,
"learning_rate": 1.4286508152651916e-06,
"loss": 0.43011364340782166,
"step": 1432
},
{
"epoch": 3.2814645308924484,
"grad_norm": 0.6156055927276611,
"learning_rate": 1.4209789754860566e-06,
"loss": 0.26439258456230164,
"step": 1434
},
{
"epoch": 3.28604118993135,
"grad_norm": 0.535756528377533,
"learning_rate": 1.4133498149403554e-06,
"loss": 0.3485221862792969,
"step": 1436
},
{
"epoch": 3.2906178489702516,
"grad_norm": 0.6638374924659729,
"learning_rate": 1.405763442927995e-06,
"loss": 0.43791279196739197,
"step": 1438
},
{
"epoch": 3.2951945080091534,
"grad_norm": 0.6146953701972961,
"learning_rate": 1.3982199681358703e-06,
"loss": 0.43667203187942505,
"step": 1440
},
{
"epoch": 3.2997711670480547,
"grad_norm": 0.8798004388809204,
"learning_rate": 1.3907194986363029e-06,
"loss": 0.679145872592926,
"step": 1442
},
{
"epoch": 3.3043478260869565,
"grad_norm": 2.4789793491363525,
"learning_rate": 1.383262141885496e-06,
"loss": 0.46688565611839294,
"step": 1444
},
{
"epoch": 3.308924485125858,
"grad_norm": 0.7254453897476196,
"learning_rate": 1.3758480047219964e-06,
"loss": 0.4225999414920807,
"step": 1446
},
{
"epoch": 3.3135011441647597,
"grad_norm": 1.1638004779815674,
"learning_rate": 1.3684771933651547e-06,
"loss": 0.5533671975135803,
"step": 1448
},
{
"epoch": 3.3180778032036615,
"grad_norm": 1.8972636461257935,
"learning_rate": 1.3611498134136171e-06,
"loss": 0.49239760637283325,
"step": 1450
},
{
"epoch": 3.322654462242563,
"grad_norm": 0.7231550216674805,
"learning_rate": 1.353865969843803e-06,
"loss": 0.6097209453582764,
"step": 1452
},
{
"epoch": 3.3272311212814647,
"grad_norm": 0.6900179982185364,
"learning_rate": 1.3466257670084006e-06,
"loss": 0.5563924312591553,
"step": 1454
},
{
"epoch": 3.331807780320366,
"grad_norm": 0.657533586025238,
"learning_rate": 1.3394293086348796e-06,
"loss": 0.5384810566902161,
"step": 1456
},
{
"epoch": 3.336384439359268,
"grad_norm": 0.20614972710609436,
"learning_rate": 1.3322766978239977e-06,
"loss": 0.21752725541591644,
"step": 1458
},
{
"epoch": 3.3409610983981692,
"grad_norm": 0.43330979347229004,
"learning_rate": 1.325168037048327e-06,
"loss": 0.1562982201576233,
"step": 1460
},
{
"epoch": 3.345537757437071,
"grad_norm": 0.8868097066879272,
"learning_rate": 1.3181034281507846e-06,
"loss": 0.039067141711711884,
"step": 1462
},
{
"epoch": 3.3501144164759724,
"grad_norm": 0.39520424604415894,
"learning_rate": 1.3110829723431763e-06,
"loss": 0.31014665961265564,
"step": 1464
},
{
"epoch": 3.354691075514874,
"grad_norm": 1.6159244775772095,
"learning_rate": 1.3041067702047407e-06,
"loss": 0.07728109508752823,
"step": 1466
},
{
"epoch": 3.3592677345537756,
"grad_norm": 0.7340835928916931,
"learning_rate": 1.297174921680714e-06,
"loss": 0.30891698598861694,
"step": 1468
},
{
"epoch": 3.3638443935926774,
"grad_norm": 1.7658299207687378,
"learning_rate": 1.2902875260808978e-06,
"loss": 0.07568443566560745,
"step": 1470
},
{
"epoch": 3.3684210526315788,
"grad_norm": 1.2565313577651978,
"learning_rate": 1.28344468207823e-06,
"loss": 0.2796335220336914,
"step": 1472
},
{
"epoch": 3.3729977116704806,
"grad_norm": 0.7257245182991028,
"learning_rate": 1.2766464877073805e-06,
"loss": 0.46368178725242615,
"step": 1474
},
{
"epoch": 3.3775743707093824,
"grad_norm": 1.0831571817398071,
"learning_rate": 1.2698930403633389e-06,
"loss": 0.39964616298675537,
"step": 1476
},
{
"epoch": 3.3821510297482837,
"grad_norm": 0.8762648105621338,
"learning_rate": 1.2631844368000236e-06,
"loss": 0.022859321907162666,
"step": 1478
},
{
"epoch": 3.386727688787185,
"grad_norm": 0.7149327993392944,
"learning_rate": 1.256520773128893e-06,
"loss": 0.29580366611480713,
"step": 1480
},
{
"epoch": 3.391304347826087,
"grad_norm": 5.417656898498535,
"learning_rate": 1.2499021448175713e-06,
"loss": 0.3861583173274994,
"step": 1482
},
{
"epoch": 3.3958810068649887,
"grad_norm": 0.5899186730384827,
"learning_rate": 1.2433286466884783e-06,
"loss": 0.5882078409194946,
"step": 1484
},
{
"epoch": 3.40045766590389,
"grad_norm": 0.6683065295219421,
"learning_rate": 1.2368003729174708e-06,
"loss": 0.5233013033866882,
"step": 1486
},
{
"epoch": 3.405034324942792,
"grad_norm": 0.7072110772132874,
"learning_rate": 1.2303174170324984e-06,
"loss": 0.5888646245002747,
"step": 1488
},
{
"epoch": 3.4096109839816933,
"grad_norm": 1.6215583086013794,
"learning_rate": 1.223879871912254e-06,
"loss": 0.04785463213920593,
"step": 1490
},
{
"epoch": 3.414187643020595,
"grad_norm": 0.6127355098724365,
"learning_rate": 1.2174878297848537e-06,
"loss": 0.0625072568655014,
"step": 1492
},
{
"epoch": 3.4187643020594964,
"grad_norm": 0.48454123735427856,
"learning_rate": 1.2111413822265077e-06,
"loss": 0.37776947021484375,
"step": 1494
},
{
"epoch": 3.4233409610983982,
"grad_norm": 0.6533837914466858,
"learning_rate": 1.2048406201602123e-06,
"loss": 0.17243488132953644,
"step": 1496
},
{
"epoch": 3.4279176201372996,
"grad_norm": 1.2736353874206543,
"learning_rate": 1.1985856338544457e-06,
"loss": 0.3051704466342926,
"step": 1498
},
{
"epoch": 3.4324942791762014,
"grad_norm": 0.9836899042129517,
"learning_rate": 1.1923765129218759e-06,
"loss": 0.34713953733444214,
"step": 1500
},
{
"epoch": 3.437070938215103,
"grad_norm": 1.0902957916259766,
"learning_rate": 1.1862133463180752e-06,
"loss": 0.4851178228855133,
"step": 1502
},
{
"epoch": 3.4416475972540046,
"grad_norm": 0.7768452167510986,
"learning_rate": 1.1800962223402466e-06,
"loss": 0.23431611061096191,
"step": 1504
},
{
"epoch": 3.446224256292906,
"grad_norm": 0.789863646030426,
"learning_rate": 1.174025228625962e-06,
"loss": 0.4964331090450287,
"step": 1506
},
{
"epoch": 3.4508009153318078,
"grad_norm": 0.9043194055557251,
"learning_rate": 1.168000452151899e-06,
"loss": 0.5745983719825745,
"step": 1508
},
{
"epoch": 3.4553775743707096,
"grad_norm": 0.6853423714637756,
"learning_rate": 1.1620219792326019e-06,
"loss": 0.3494628071784973,
"step": 1510
},
{
"epoch": 3.459954233409611,
"grad_norm": 0.7719877362251282,
"learning_rate": 1.1560898955192442e-06,
"loss": 0.07691881060600281,
"step": 1512
},
{
"epoch": 3.4645308924485128,
"grad_norm": 0.5891416072845459,
"learning_rate": 1.1502042859983956e-06,
"loss": 0.3369552493095398,
"step": 1514
},
{
"epoch": 3.469107551487414,
"grad_norm": 0.7656334638595581,
"learning_rate": 1.144365234990813e-06,
"loss": 0.5648132562637329,
"step": 1516
},
{
"epoch": 3.473684210526316,
"grad_norm": 0.8338443636894226,
"learning_rate": 1.1385728261502265e-06,
"loss": 0.5203874707221985,
"step": 1518
},
{
"epoch": 3.4782608695652173,
"grad_norm": 1.7429405450820923,
"learning_rate": 1.1328271424621426e-06,
"loss": 0.4457243084907532,
"step": 1520
},
{
"epoch": 3.482837528604119,
"grad_norm": 0.6419166922569275,
"learning_rate": 1.127128266242655e-06,
"loss": 0.5154409408569336,
"step": 1522
},
{
"epoch": 3.4874141876430205,
"grad_norm": 0.7125447988510132,
"learning_rate": 1.1214762791372668e-06,
"loss": 0.38710278272628784,
"step": 1524
},
{
"epoch": 3.4919908466819223,
"grad_norm": 0.5910897850990295,
"learning_rate": 1.1158712621197187e-06,
"loss": 0.5922369956970215,
"step": 1526
},
{
"epoch": 3.4965675057208236,
"grad_norm": 0.42722317576408386,
"learning_rate": 1.1103132954908296e-06,
"loss": 0.03717589005827904,
"step": 1528
},
{
"epoch": 3.5011441647597255,
"grad_norm": 0.7920047044754028,
"learning_rate": 1.1048024588773493e-06,
"loss": 0.5138665437698364,
"step": 1530
},
{
"epoch": 3.505720823798627,
"grad_norm": 0.6985714435577393,
"learning_rate": 1.09933883123081e-06,
"loss": 0.5611461997032166,
"step": 1532
},
{
"epoch": 3.5102974828375286,
"grad_norm": 0.5378302931785583,
"learning_rate": 1.0939224908264042e-06,
"loss": 0.5791317820549011,
"step": 1534
},
{
"epoch": 3.5148741418764304,
"grad_norm": 0.10158717632293701,
"learning_rate": 1.0885535152618574e-06,
"loss": 0.2312551885843277,
"step": 1536
},
{
"epoch": 3.519450800915332,
"grad_norm": 0.9983445405960083,
"learning_rate": 1.0832319814563188e-06,
"loss": 0.32253575325012207,
"step": 1538
},
{
"epoch": 3.524027459954233,
"grad_norm": 1.0612983703613281,
"learning_rate": 1.0779579656492575e-06,
"loss": 0.3236549496650696,
"step": 1540
},
{
"epoch": 3.528604118993135,
"grad_norm": 0.7103164792060852,
"learning_rate": 1.072731543399372e-06,
"loss": 0.3252106308937073,
"step": 1542
},
{
"epoch": 3.533180778032037,
"grad_norm": 0.2725476622581482,
"learning_rate": 1.067552789583508e-06,
"loss": 0.23804107308387756,
"step": 1544
},
{
"epoch": 3.537757437070938,
"grad_norm": 1.5089448690414429,
"learning_rate": 1.0624217783955839e-06,
"loss": 0.38195565342903137,
"step": 1546
},
{
"epoch": 3.54233409610984,
"grad_norm": 0.6526052355766296,
"learning_rate": 1.0573385833455275e-06,
"loss": 0.380737841129303,
"step": 1548
},
{
"epoch": 3.5469107551487413,
"grad_norm": 1.1080514192581177,
"learning_rate": 1.0523032772582262e-06,
"loss": 0.38047873973846436,
"step": 1550
},
{
"epoch": 3.551487414187643,
"grad_norm": 0.7343178391456604,
"learning_rate": 1.047315932272482e-06,
"loss": 0.4824707508087158,
"step": 1552
},
{
"epoch": 3.5560640732265445,
"grad_norm": 0.633358359336853,
"learning_rate": 1.0423766198399744e-06,
"loss": 0.6635564565658569,
"step": 1554
},
{
"epoch": 3.5606407322654463,
"grad_norm": 0.5067686438560486,
"learning_rate": 1.0374854107242416e-06,
"loss": 0.3814176023006439,
"step": 1556
},
{
"epoch": 3.5652173913043477,
"grad_norm": 0.9163781404495239,
"learning_rate": 1.032642374999667e-06,
"loss": 0.37396469712257385,
"step": 1558
},
{
"epoch": 3.5697940503432495,
"grad_norm": 0.7029092311859131,
"learning_rate": 1.0278475820504685e-06,
"loss": 0.35979732871055603,
"step": 1560
},
{
"epoch": 3.5743707093821513,
"grad_norm": 0.5607945322990417,
"learning_rate": 1.0231011005697145e-06,
"loss": 0.5650622248649597,
"step": 1562
},
{
"epoch": 3.5789473684210527,
"grad_norm": 1.1728614568710327,
"learning_rate": 1.0184029985583304e-06,
"loss": 0.5536704063415527,
"step": 1564
},
{
"epoch": 3.583524027459954,
"grad_norm": 0.7152750492095947,
"learning_rate": 1.013753343324131e-06,
"loss": 0.594580888748169,
"step": 1566
},
{
"epoch": 3.588100686498856,
"grad_norm": 1.2149173021316528,
"learning_rate": 1.009152201480852e-06,
"loss": 0.2382848709821701,
"step": 1568
},
{
"epoch": 3.5926773455377576,
"grad_norm": 0.7747706770896912,
"learning_rate": 1.0045996389471982e-06,
"loss": 0.26458844542503357,
"step": 1570
},
{
"epoch": 3.597254004576659,
"grad_norm": 0.8040603995323181,
"learning_rate": 1.000095720945898e-06,
"loss": 0.5273915529251099,
"step": 1572
},
{
"epoch": 3.6018306636155604,
"grad_norm": 0.7427345514297485,
"learning_rate": 9.956405120027684e-07,
"loss": 0.630144476890564,
"step": 1574
},
{
"epoch": 3.606407322654462,
"grad_norm": 0.7674996852874756,
"learning_rate": 9.912340759457942e-07,
"loss": 0.3695995509624481,
"step": 1576
},
{
"epoch": 3.610983981693364,
"grad_norm": 0.6453800797462463,
"learning_rate": 9.868764759042061e-07,
"loss": 0.3349326550960541,
"step": 1578
},
{
"epoch": 3.6155606407322654,
"grad_norm": 0.8265721201896667,
"learning_rate": 9.82567774307585e-07,
"loss": 0.22504082322120667,
"step": 1580
},
{
"epoch": 3.620137299771167,
"grad_norm": 0.905486524105072,
"learning_rate": 9.783080328849617e-07,
"loss": 0.3136463165283203,
"step": 1582
},
{
"epoch": 3.6247139588100685,
"grad_norm": 0.6223527789115906,
"learning_rate": 9.740973126639342e-07,
"loss": 0.5206019282341003,
"step": 1584
},
{
"epoch": 3.6292906178489703,
"grad_norm": 0.7490825057029724,
"learning_rate": 9.699356739697942e-07,
"loss": 0.2996978163719177,
"step": 1586
},
{
"epoch": 3.6338672768878717,
"grad_norm": 1.3377209901809692,
"learning_rate": 9.658231764246612e-07,
"loss": 0.49734920263290405,
"step": 1588
},
{
"epoch": 3.6384439359267735,
"grad_norm": 0.6160526275634766,
"learning_rate": 9.617598789466309e-07,
"loss": 0.4488961696624756,
"step": 1590
},
{
"epoch": 3.643020594965675,
"grad_norm": 0.6402332782745361,
"learning_rate": 9.577458397489267e-07,
"loss": 0.34510815143585205,
"step": 1592
},
{
"epoch": 3.6475972540045767,
"grad_norm": 2.2822258472442627,
"learning_rate": 9.537811163390726e-07,
"loss": 0.29307034611701965,
"step": 1594
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.6280086636543274,
"learning_rate": 9.498657655180603e-07,
"loss": 0.7618148326873779,
"step": 1596
},
{
"epoch": 3.65675057208238,
"grad_norm": 1.1775965690612793,
"learning_rate": 9.459998433795451e-07,
"loss": 0.5977869033813477,
"step": 1598
},
{
"epoch": 3.6613272311212812,
"grad_norm": 1.0905108451843262,
"learning_rate": 9.421834053090337e-07,
"loss": 0.19339358806610107,
"step": 1600
},
{
"epoch": 3.665903890160183,
"grad_norm": 0.7036289572715759,
"learning_rate": 9.384165059830962e-07,
"loss": 0.3755697011947632,
"step": 1602
},
{
"epoch": 3.670480549199085,
"grad_norm": 1.6435350179672241,
"learning_rate": 9.346991993685812e-07,
"loss": 0.23423030972480774,
"step": 1604
},
{
"epoch": 3.675057208237986,
"grad_norm": 0.6723319888114929,
"learning_rate": 9.310315387218422e-07,
"loss": 0.3673589825630188,
"step": 1606
},
{
"epoch": 3.679633867276888,
"grad_norm": 0.6365793943405151,
"learning_rate": 9.274135765879747e-07,
"loss": 0.5359373092651367,
"step": 1608
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.7959802150726318,
"learning_rate": 9.238453648000641e-07,
"loss": 0.1777590662240982,
"step": 1610
},
{
"epoch": 3.688787185354691,
"grad_norm": 0.6853817105293274,
"learning_rate": 9.203269544784425e-07,
"loss": 0.38532423973083496,
"step": 1612
},
{
"epoch": 3.6933638443935926,
"grad_norm": 1.2843016386032104,
"learning_rate": 9.168583960299554e-07,
"loss": 0.08693390339612961,
"step": 1614
},
{
"epoch": 3.6979405034324944,
"grad_norm": 0.7138749361038208,
"learning_rate": 9.134397391472428e-07,
"loss": 0.39513278007507324,
"step": 1616
},
{
"epoch": 3.7025171624713957,
"grad_norm": 0.6505438685417175,
"learning_rate": 9.100710328080235e-07,
"loss": 0.6121611595153809,
"step": 1618
},
{
"epoch": 3.7070938215102975,
"grad_norm": 0.9069646596908569,
"learning_rate": 9.06752325274395e-07,
"loss": 0.5526926517486572,
"step": 1620
},
{
"epoch": 3.7116704805491993,
"grad_norm": 1.1165289878845215,
"learning_rate": 9.034836640921429e-07,
"loss": 0.07703058421611786,
"step": 1622
},
{
"epoch": 3.7162471395881007,
"grad_norm": 0.7630195021629333,
"learning_rate": 9.00265096090058e-07,
"loss": 0.46056246757507324,
"step": 1624
},
{
"epoch": 3.720823798627002,
"grad_norm": 0.6535618305206299,
"learning_rate": 8.970966673792673e-07,
"loss": 0.42857468128204346,
"step": 1626
},
{
"epoch": 3.725400457665904,
"grad_norm": 0.8734167218208313,
"learning_rate": 8.939784233525715e-07,
"loss": 0.256624311208725,
"step": 1628
},
{
"epoch": 3.7299771167048057,
"grad_norm": 0.7594066858291626,
"learning_rate": 8.909104086837956e-07,
"loss": 0.2708790898323059,
"step": 1630
},
{
"epoch": 3.734553775743707,
"grad_norm": 0.7039399743080139,
"learning_rate": 8.878926673271494e-07,
"loss": 0.24956341087818146,
"step": 1632
},
{
"epoch": 3.7391304347826084,
"grad_norm": 0.5838392376899719,
"learning_rate": 8.849252425165964e-07,
"loss": 0.15410839021205902,
"step": 1634
},
{
"epoch": 3.7437070938215102,
"grad_norm": 0.9392322301864624,
"learning_rate": 8.82008176765237e-07,
"loss": 0.28980621695518494,
"step": 1636
},
{
"epoch": 3.748283752860412,
"grad_norm": 0.7467678785324097,
"learning_rate": 8.791415118646951e-07,
"loss": 0.43078869581222534,
"step": 1638
},
{
"epoch": 3.7528604118993134,
"grad_norm": 0.9660971760749817,
"learning_rate": 8.763252888845239e-07,
"loss": 0.047181982547044754,
"step": 1640
},
{
"epoch": 3.757437070938215,
"grad_norm": 0.8727622032165527,
"learning_rate": 8.735595481716144e-07,
"loss": 0.696696937084198,
"step": 1642
},
{
"epoch": 3.7620137299771166,
"grad_norm": 0.6368957757949829,
"learning_rate": 8.708443293496197e-07,
"loss": 0.5460187196731567,
"step": 1644
},
{
"epoch": 3.7665903890160184,
"grad_norm": 0.7762473821640015,
"learning_rate": 8.681796713183851e-07,
"loss": 0.5858830809593201,
"step": 1646
},
{
"epoch": 3.7711670480549198,
"grad_norm": 0.8503757119178772,
"learning_rate": 8.655656122533918e-07,
"loss": 0.33725491166114807,
"step": 1648
},
{
"epoch": 3.7757437070938216,
"grad_norm": 0.9335038661956787,
"learning_rate": 8.630021896052107e-07,
"loss": 0.3240436017513275,
"step": 1650
},
{
"epoch": 3.780320366132723,
"grad_norm": 0.6535862684249878,
"learning_rate": 8.604894400989643e-07,
"loss": 0.5961631536483765,
"step": 1652
},
{
"epoch": 3.7848970251716247,
"grad_norm": 0.6312614679336548,
"learning_rate": 8.580273997338029e-07,
"loss": 0.5986257791519165,
"step": 1654
},
{
"epoch": 3.7894736842105265,
"grad_norm": 1.371013879776001,
"learning_rate": 8.556161037823857e-07,
"loss": 0.1755972057580948,
"step": 1656
},
{
"epoch": 3.794050343249428,
"grad_norm": 0.6041772961616516,
"learning_rate": 8.532555867903774e-07,
"loss": 0.4229702055454254,
"step": 1658
},
{
"epoch": 3.7986270022883293,
"grad_norm": 0.7439691424369812,
"learning_rate": 8.509458825759552e-07,
"loss": 0.42115482687950134,
"step": 1660
},
{
"epoch": 3.803203661327231,
"grad_norm": 0.712613046169281,
"learning_rate": 8.486870242293181e-07,
"loss": 0.27668675780296326,
"step": 1662
},
{
"epoch": 3.807780320366133,
"grad_norm": 0.6914879679679871,
"learning_rate": 8.46479044112221e-07,
"loss": 0.48982763290405273,
"step": 1664
},
{
"epoch": 3.8123569794050343,
"grad_norm": 0.22564047574996948,
"learning_rate": 8.443219738575045e-07,
"loss": 0.03790595009922981,
"step": 1666
},
{
"epoch": 3.816933638443936,
"grad_norm": 1.0601603984832764,
"learning_rate": 8.422158443686438e-07,
"loss": 0.29513728618621826,
"step": 1668
},
{
"epoch": 3.8215102974828374,
"grad_norm": 0.8904151320457458,
"learning_rate": 8.401606858193082e-07,
"loss": 0.5875513553619385,
"step": 1670
},
{
"epoch": 3.8260869565217392,
"grad_norm": 2.976503610610962,
"learning_rate": 8.381565276529259e-07,
"loss": 0.6087798476219177,
"step": 1672
},
{
"epoch": 3.8306636155606406,
"grad_norm": 0.632571280002594,
"learning_rate": 8.362033985822622e-07,
"loss": 0.32114294171333313,
"step": 1674
},
{
"epoch": 3.8352402745995424,
"grad_norm": 0.601119339466095,
"learning_rate": 8.343013265890103e-07,
"loss": 0.5873348712921143,
"step": 1676
},
{
"epoch": 3.839816933638444,
"grad_norm": 0.9582832455635071,
"learning_rate": 8.324503389233897e-07,
"loss": 0.3884204924106598,
"step": 1678
},
{
"epoch": 3.8443935926773456,
"grad_norm": 0.6112123727798462,
"learning_rate": 8.306504621037538e-07,
"loss": 0.5339500308036804,
"step": 1680
},
{
"epoch": 3.8489702517162474,
"grad_norm": 1.0593942403793335,
"learning_rate": 8.289017219162127e-07,
"loss": 0.5693700909614563,
"step": 1682
},
{
"epoch": 3.8535469107551488,
"grad_norm": 0.12832017242908478,
"learning_rate": 8.27204143414262e-07,
"loss": 0.35359063744544983,
"step": 1684
},
{
"epoch": 3.85812356979405,
"grad_norm": 0.6210229992866516,
"learning_rate": 8.25557750918425e-07,
"loss": 0.5230538845062256,
"step": 1686
},
{
"epoch": 3.862700228832952,
"grad_norm": 0.16382178664207458,
"learning_rate": 8.239625680159025e-07,
"loss": 0.26382216811180115,
"step": 1688
},
{
"epoch": 3.8672768878718538,
"grad_norm": 2.582536220550537,
"learning_rate": 8.224186175602379e-07,
"loss": 0.4692237079143524,
"step": 1690
},
{
"epoch": 3.871853546910755,
"grad_norm": 0.19737273454666138,
"learning_rate": 8.209259216709867e-07,
"loss": 0.20311836898326874,
"step": 1692
},
{
"epoch": 3.8764302059496565,
"grad_norm": 0.6413448452949524,
"learning_rate": 8.19484501733401e-07,
"loss": 0.3281576633453369,
"step": 1694
},
{
"epoch": 3.8810068649885583,
"grad_norm": 0.5512455701828003,
"learning_rate": 8.180943783981235e-07,
"loss": 0.43348217010498047,
"step": 1696
},
{
"epoch": 3.88558352402746,
"grad_norm": 0.6660462021827698,
"learning_rate": 8.167555715808909e-07,
"loss": 0.2919246256351471,
"step": 1698
},
{
"epoch": 3.8901601830663615,
"grad_norm": 0.6395224928855896,
"learning_rate": 8.154681004622488e-07,
"loss": 0.6133857369422913,
"step": 1700
},
{
"epoch": 3.8947368421052633,
"grad_norm": 0.9475614428520203,
"learning_rate": 8.142319834872765e-07,
"loss": 0.11332155019044876,
"step": 1702
},
{
"epoch": 3.8993135011441646,
"grad_norm": 0.9015949368476868,
"learning_rate": 8.130472383653242e-07,
"loss": 0.46062496304512024,
"step": 1704
},
{
"epoch": 3.9038901601830664,
"grad_norm": 1.1249760389328003,
"learning_rate": 8.119138820697578e-07,
"loss": 0.30429723858833313,
"step": 1706
},
{
"epoch": 3.908466819221968,
"grad_norm": 0.6844801306724548,
"learning_rate": 8.108319308377159e-07,
"loss": 0.2837408483028412,
"step": 1708
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.6176382303237915,
"learning_rate": 8.098014001698775e-07,
"loss": 0.3056495487689972,
"step": 1710
},
{
"epoch": 3.917620137299771,
"grad_norm": 0.6619205474853516,
"learning_rate": 8.088223048302401e-07,
"loss": 0.3151998519897461,
"step": 1712
},
{
"epoch": 3.922196796338673,
"grad_norm": 0.6511373519897461,
"learning_rate": 8.078946588459083e-07,
"loss": 0.31710049510002136,
"step": 1714
},
{
"epoch": 3.9267734553775746,
"grad_norm": 0.45258191227912903,
"learning_rate": 8.070184755068918e-07,
"loss": 0.49992504715919495,
"step": 1716
},
{
"epoch": 3.931350114416476,
"grad_norm": 1.0866183042526245,
"learning_rate": 8.061937673659166e-07,
"loss": 0.40558746457099915,
"step": 1718
},
{
"epoch": 3.9359267734553773,
"grad_norm": 0.6440381407737732,
"learning_rate": 8.054205462382437e-07,
"loss": 0.3771609663963318,
"step": 1720
},
{
"epoch": 3.940503432494279,
"grad_norm": 0.5777904987335205,
"learning_rate": 8.046988232015002e-07,
"loss": 0.5135948061943054,
"step": 1722
},
{
"epoch": 3.945080091533181,
"grad_norm": 0.6155217885971069,
"learning_rate": 8.040286085955212e-07,
"loss": 0.5833529233932495,
"step": 1724
},
{
"epoch": 3.9496567505720823,
"grad_norm": 0.9023070335388184,
"learning_rate": 8.034099120222018e-07,
"loss": 0.33015888929367065,
"step": 1726
},
{
"epoch": 3.954233409610984,
"grad_norm": 0.558512806892395,
"learning_rate": 8.028427423453575e-07,
"loss": 0.3327302932739258,
"step": 1728
},
{
"epoch": 3.9588100686498855,
"grad_norm": 1.3595865964889526,
"learning_rate": 8.023271076906006e-07,
"loss": 0.33213916420936584,
"step": 1730
},
{
"epoch": 3.9633867276887873,
"grad_norm": 0.9910792708396912,
"learning_rate": 8.018630154452202e-07,
"loss": 0.303692489862442,
"step": 1732
},
{
"epoch": 3.9679633867276887,
"grad_norm": 0.8309425711631775,
"learning_rate": 8.01450472258079e-07,
"loss": 0.5128837823867798,
"step": 1734
},
{
"epoch": 3.9725400457665905,
"grad_norm": 0.7355870604515076,
"learning_rate": 8.010894840395169e-07,
"loss": 0.3522130846977234,
"step": 1736
},
{
"epoch": 3.977116704805492,
"grad_norm": 0.762816846370697,
"learning_rate": 8.007800559612672e-07,
"loss": 0.6588464379310608,
"step": 1738
},
{
"epoch": 3.9816933638443937,
"grad_norm": 0.8466113209724426,
"learning_rate": 8.005221924563803e-07,
"loss": 0.7762544751167297,
"step": 1740
},
{
"epoch": 3.9862700228832955,
"grad_norm": 0.7509016990661621,
"learning_rate": 8.003158972191635e-07,
"loss": 0.39617398381233215,
"step": 1742
},
{
"epoch": 3.990846681922197,
"grad_norm": 0.9751285910606384,
"learning_rate": 8.001611732051253e-07,
"loss": 0.3054667115211487,
"step": 1744
},
{
"epoch": 3.995423340961098,
"grad_norm": 1.1020561456680298,
"learning_rate": 8.000580226309339e-07,
"loss": 0.30580854415893555,
"step": 1746
},
{
"epoch": 4.0,
"grad_norm": 0.5312338471412659,
"learning_rate": 8.000064469743863e-07,
"loss": 0.5320942401885986,
"step": 1748
},
{
"epoch": 4.0,
"step": 1748,
"total_flos": 3.454020596210336e+18,
"train_loss": 0.7742519322697508,
"train_runtime": 25098.6052,
"train_samples_per_second": 2.089,
"train_steps_per_second": 0.07
}
],
"logging_steps": 2,
"max_steps": 1748,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.454020596210336e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}