9b-113 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
8a6e3a2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1748,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004576659038901602,
"grad_norm": 18.951799392700195,
"learning_rate": 1.1363636363636364e-07,
"loss": 2.251408100128174,
"step": 2
},
{
"epoch": 0.009153318077803204,
"grad_norm": 3.432992696762085,
"learning_rate": 3.409090909090909e-07,
"loss": 2.022919178009033,
"step": 4
},
{
"epoch": 0.013729977116704805,
"grad_norm": 4.982711315155029,
"learning_rate": 5.681818181818182e-07,
"loss": 2.143446207046509,
"step": 6
},
{
"epoch": 0.018306636155606407,
"grad_norm": 12.981165885925293,
"learning_rate": 7.954545454545455e-07,
"loss": 2.0866191387176514,
"step": 8
},
{
"epoch": 0.02288329519450801,
"grad_norm": 5.714666843414307,
"learning_rate": 1.0227272727272729e-06,
"loss": 1.896759033203125,
"step": 10
},
{
"epoch": 0.02745995423340961,
"grad_norm": 12.909541130065918,
"learning_rate": 1.25e-06,
"loss": 2.020211696624756,
"step": 12
},
{
"epoch": 0.032036613272311214,
"grad_norm": 16.100811004638672,
"learning_rate": 1.4772727272727275e-06,
"loss": 1.792801022529602,
"step": 14
},
{
"epoch": 0.036613272311212815,
"grad_norm": 2.436553955078125,
"learning_rate": 1.7045454545454546e-06,
"loss": 1.8900394439697266,
"step": 16
},
{
"epoch": 0.041189931350114416,
"grad_norm": 5.285153865814209,
"learning_rate": 1.931818181818182e-06,
"loss": 1.5955464839935303,
"step": 18
},
{
"epoch": 0.04576659038901602,
"grad_norm": 12.234434127807617,
"learning_rate": 2.1590909090909092e-06,
"loss": 1.2172309160232544,
"step": 20
},
{
"epoch": 0.05034324942791762,
"grad_norm": 12.644134521484375,
"learning_rate": 2.3863636363636367e-06,
"loss": 0.9669137597084045,
"step": 22
},
{
"epoch": 0.05491990846681922,
"grad_norm": 3.1600699424743652,
"learning_rate": 2.6136363636363637e-06,
"loss": 1.7239738702774048,
"step": 24
},
{
"epoch": 0.059496567505720827,
"grad_norm": 13.365952491760254,
"learning_rate": 2.8409090909090916e-06,
"loss": 1.436122179031372,
"step": 26
},
{
"epoch": 0.06407322654462243,
"grad_norm": 2.321202278137207,
"learning_rate": 3.0681818181818186e-06,
"loss": 1.8740382194519043,
"step": 28
},
{
"epoch": 0.06864988558352403,
"grad_norm": 8.490443229675293,
"learning_rate": 3.2954545454545456e-06,
"loss": 1.5721473693847656,
"step": 30
},
{
"epoch": 0.07322654462242563,
"grad_norm": 3.1264488697052,
"learning_rate": 3.522727272727273e-06,
"loss": 1.5784003734588623,
"step": 32
},
{
"epoch": 0.07780320366132723,
"grad_norm": 2.265772581100464,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.4820594787597656,
"step": 34
},
{
"epoch": 0.08237986270022883,
"grad_norm": 2.990844964981079,
"learning_rate": 3.9772727272727275e-06,
"loss": 1.2070374488830566,
"step": 36
},
{
"epoch": 0.08695652173913043,
"grad_norm": 12.784393310546875,
"learning_rate": 4.204545454545455e-06,
"loss": 1.3093262910842896,
"step": 38
},
{
"epoch": 0.09153318077803203,
"grad_norm": 3.836043357849121,
"learning_rate": 4.4318181818181824e-06,
"loss": 1.4149296283721924,
"step": 40
},
{
"epoch": 0.09610983981693363,
"grad_norm": 1.9667078256607056,
"learning_rate": 4.6590909090909095e-06,
"loss": 1.117027997970581,
"step": 42
},
{
"epoch": 0.10068649885583524,
"grad_norm": 3.5431325435638428,
"learning_rate": 4.8863636363636365e-06,
"loss": 0.7506925463676453,
"step": 44
},
{
"epoch": 0.10526315789473684,
"grad_norm": 2.191105604171753,
"learning_rate": 5.113636363636364e-06,
"loss": 1.2588834762573242,
"step": 46
},
{
"epoch": 0.10983981693363844,
"grad_norm": 2.621471643447876,
"learning_rate": 5.340909090909091e-06,
"loss": 0.9852038621902466,
"step": 48
},
{
"epoch": 0.11441647597254005,
"grad_norm": 2.538278341293335,
"learning_rate": 5.568181818181818e-06,
"loss": 1.0315567255020142,
"step": 50
},
{
"epoch": 0.11899313501144165,
"grad_norm": 3.634997606277466,
"learning_rate": 5.795454545454546e-06,
"loss": 0.7927528619766235,
"step": 52
},
{
"epoch": 0.12356979405034325,
"grad_norm": 5.6927170753479,
"learning_rate": 6.022727272727273e-06,
"loss": 0.6859608888626099,
"step": 54
},
{
"epoch": 0.12814645308924486,
"grad_norm": 2.9429779052734375,
"learning_rate": 6.25e-06,
"loss": 1.399317741394043,
"step": 56
},
{
"epoch": 0.13272311212814644,
"grad_norm": 3.453831434249878,
"learning_rate": 6.477272727272727e-06,
"loss": 0.9231398105621338,
"step": 58
},
{
"epoch": 0.13729977116704806,
"grad_norm": 3.813654899597168,
"learning_rate": 6.704545454545454e-06,
"loss": 1.1301286220550537,
"step": 60
},
{
"epoch": 0.14187643020594964,
"grad_norm": 2.4758615493774414,
"learning_rate": 6.931818181818183e-06,
"loss": 1.120086669921875,
"step": 62
},
{
"epoch": 0.14645308924485126,
"grad_norm": 5.501305103302002,
"learning_rate": 7.15909090909091e-06,
"loss": 1.4239763021469116,
"step": 64
},
{
"epoch": 0.15102974828375287,
"grad_norm": 2.2261505126953125,
"learning_rate": 7.386363636363637e-06,
"loss": 0.883802056312561,
"step": 66
},
{
"epoch": 0.15560640732265446,
"grad_norm": 2.1745944023132324,
"learning_rate": 7.613636363636364e-06,
"loss": 0.9119312763214111,
"step": 68
},
{
"epoch": 0.16018306636155608,
"grad_norm": 2.2609024047851562,
"learning_rate": 7.840909090909091e-06,
"loss": 1.3790769577026367,
"step": 70
},
{
"epoch": 0.16475972540045766,
"grad_norm": 2.078526496887207,
"learning_rate": 8.068181818181819e-06,
"loss": 1.247062087059021,
"step": 72
},
{
"epoch": 0.16933638443935928,
"grad_norm": 1.86798095703125,
"learning_rate": 8.295454545454547e-06,
"loss": 1.320522665977478,
"step": 74
},
{
"epoch": 0.17391304347826086,
"grad_norm": 3.6566405296325684,
"learning_rate": 8.522727272727273e-06,
"loss": 1.1991426944732666,
"step": 76
},
{
"epoch": 0.17848970251716248,
"grad_norm": 2.2346911430358887,
"learning_rate": 8.750000000000001e-06,
"loss": 1.1312178373336792,
"step": 78
},
{
"epoch": 0.18306636155606407,
"grad_norm": 4.466203212738037,
"learning_rate": 8.977272727272727e-06,
"loss": 1.2709550857543945,
"step": 80
},
{
"epoch": 0.18764302059496568,
"grad_norm": 4.847050666809082,
"learning_rate": 9.204545454545455e-06,
"loss": 1.1973122358322144,
"step": 82
},
{
"epoch": 0.19221967963386727,
"grad_norm": 15.698393821716309,
"learning_rate": 9.431818181818183e-06,
"loss": 0.9569052457809448,
"step": 84
},
{
"epoch": 0.19679633867276888,
"grad_norm": 1.8609542846679688,
"learning_rate": 9.65909090909091e-06,
"loss": 1.2557601928710938,
"step": 86
},
{
"epoch": 0.20137299771167047,
"grad_norm": 2.1308515071868896,
"learning_rate": 9.886363636363637e-06,
"loss": 1.3304195404052734,
"step": 88
},
{
"epoch": 0.20594965675057209,
"grad_norm": 3.0522403717041016,
"learning_rate": 9.999991941282018e-06,
"loss": 1.2789955139160156,
"step": 90
},
{
"epoch": 0.21052631578947367,
"grad_norm": 2.302807569503784,
"learning_rate": 9.999927471711333e-06,
"loss": 0.9662280082702637,
"step": 92
},
{
"epoch": 0.2151029748283753,
"grad_norm": 1.821358323097229,
"learning_rate": 9.999798533493595e-06,
"loss": 1.3702021837234497,
"step": 94
},
{
"epoch": 0.21967963386727687,
"grad_norm": 2.9049158096313477,
"learning_rate": 9.999605128476047e-06,
"loss": 1.2553328275680542,
"step": 96
},
{
"epoch": 0.2242562929061785,
"grad_norm": 4.430267810821533,
"learning_rate": 9.999347259429527e-06,
"loss": 1.2501583099365234,
"step": 98
},
{
"epoch": 0.2288329519450801,
"grad_norm": 1.757613182067871,
"learning_rate": 9.999024930048416e-06,
"loss": 0.9878523945808411,
"step": 100
},
{
"epoch": 0.2334096109839817,
"grad_norm": 1.882934808731079,
"learning_rate": 9.998638144950604e-06,
"loss": 1.3420817852020264,
"step": 102
},
{
"epoch": 0.2379862700228833,
"grad_norm": 2.3838956356048584,
"learning_rate": 9.998186909677402e-06,
"loss": 0.6844009160995483,
"step": 104
},
{
"epoch": 0.2425629290617849,
"grad_norm": 1.2917922735214233,
"learning_rate": 9.997671230693475e-06,
"loss": 1.2744860649108887,
"step": 106
},
{
"epoch": 0.2471395881006865,
"grad_norm": 6.209701061248779,
"learning_rate": 9.997091115386751e-06,
"loss": 0.9371986985206604,
"step": 108
},
{
"epoch": 0.2517162471395881,
"grad_norm": 3.551896333694458,
"learning_rate": 9.996446572068303e-06,
"loss": 0.993743896484375,
"step": 110
},
{
"epoch": 0.2562929061784897,
"grad_norm": 2.348245859146118,
"learning_rate": 9.995737609972248e-06,
"loss": 1.222794771194458,
"step": 112
},
{
"epoch": 0.2608695652173913,
"grad_norm": 2.0239150524139404,
"learning_rate": 9.9949642392556e-06,
"loss": 1.301463007926941,
"step": 114
},
{
"epoch": 0.2654462242562929,
"grad_norm": 15.598209381103516,
"learning_rate": 9.994126470998126e-06,
"loss": 0.9942538738250732,
"step": 116
},
{
"epoch": 0.2700228832951945,
"grad_norm": 3.621354103088379,
"learning_rate": 9.993224317202196e-06,
"loss": 0.9668116569519043,
"step": 118
},
{
"epoch": 0.2745995423340961,
"grad_norm": 1.4748951196670532,
"learning_rate": 9.992257790792606e-06,
"loss": 1.1951305866241455,
"step": 120
},
{
"epoch": 0.2791762013729977,
"grad_norm": 3.5889358520507812,
"learning_rate": 9.991226905616387e-06,
"loss": 1.4021278619766235,
"step": 122
},
{
"epoch": 0.2837528604118993,
"grad_norm": 2.6599173545837402,
"learning_rate": 9.990131676442615e-06,
"loss": 1.2020851373672485,
"step": 124
},
{
"epoch": 0.28832951945080093,
"grad_norm": 6.152349948883057,
"learning_rate": 9.9889721189622e-06,
"loss": 1.3222763538360596,
"step": 126
},
{
"epoch": 0.2929061784897025,
"grad_norm": 3.790332555770874,
"learning_rate": 9.987748249787654e-06,
"loss": 0.996853232383728,
"step": 128
},
{
"epoch": 0.2974828375286041,
"grad_norm": 3.391437292098999,
"learning_rate": 9.986460086452857e-06,
"loss": 1.0844181776046753,
"step": 130
},
{
"epoch": 0.30205949656750575,
"grad_norm": 5.271035194396973,
"learning_rate": 9.985107647412804e-06,
"loss": 1.159407138824463,
"step": 132
},
{
"epoch": 0.30663615560640733,
"grad_norm": 1.60601007938385,
"learning_rate": 9.983690952043345e-06,
"loss": 1.3746013641357422,
"step": 134
},
{
"epoch": 0.3112128146453089,
"grad_norm": 7.673926830291748,
"learning_rate": 9.982210020640905e-06,
"loss": 1.0017273426055908,
"step": 136
},
{
"epoch": 0.3157894736842105,
"grad_norm": 2.7053334712982178,
"learning_rate": 9.98066487442219e-06,
"loss": 1.3173471689224243,
"step": 138
},
{
"epoch": 0.32036613272311215,
"grad_norm": 2.064483404159546,
"learning_rate": 9.979055535523887e-06,
"loss": 1.0609164237976074,
"step": 140
},
{
"epoch": 0.32494279176201374,
"grad_norm": 2.2231831550598145,
"learning_rate": 9.977382027002348e-06,
"loss": 1.304241418838501,
"step": 142
},
{
"epoch": 0.3295194508009153,
"grad_norm": 1.224456787109375,
"learning_rate": 9.97564437283325e-06,
"loss": 1.1332193613052368,
"step": 144
},
{
"epoch": 0.3340961098398169,
"grad_norm": 1.458966851234436,
"learning_rate": 9.973842597911268e-06,
"loss": 1.2001259326934814,
"step": 146
},
{
"epoch": 0.33867276887871856,
"grad_norm": 2.087167501449585,
"learning_rate": 9.971976728049704e-06,
"loss": 1.2756290435791016,
"step": 148
},
{
"epoch": 0.34324942791762014,
"grad_norm": 1.9153831005096436,
"learning_rate": 9.970046789980122e-06,
"loss": 1.2948122024536133,
"step": 150
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.9743618965148926,
"learning_rate": 9.96805281135197e-06,
"loss": 1.3063007593154907,
"step": 152
},
{
"epoch": 0.3524027459954233,
"grad_norm": 2.7115976810455322,
"learning_rate": 9.965994820732174e-06,
"loss": 0.9853567481040955,
"step": 154
},
{
"epoch": 0.35697940503432496,
"grad_norm": 1.8446264266967773,
"learning_rate": 9.963872847604735e-06,
"loss": 0.8984273076057434,
"step": 156
},
{
"epoch": 0.36155606407322655,
"grad_norm": 2.8307297229766846,
"learning_rate": 9.961686922370309e-06,
"loss": 1.1862984895706177,
"step": 158
},
{
"epoch": 0.36613272311212813,
"grad_norm": 1.9325522184371948,
"learning_rate": 9.959437076345764e-06,
"loss": 1.1266686916351318,
"step": 160
},
{
"epoch": 0.3707093821510298,
"grad_norm": 2.4998505115509033,
"learning_rate": 9.957123341763736e-06,
"loss": 1.0668294429779053,
"step": 162
},
{
"epoch": 0.37528604118993136,
"grad_norm": 1.6686221361160278,
"learning_rate": 9.954745751772172e-06,
"loss": 1.3585450649261475,
"step": 164
},
{
"epoch": 0.37986270022883295,
"grad_norm": 3.186450958251953,
"learning_rate": 9.952304340433845e-06,
"loss": 1.062609314918518,
"step": 166
},
{
"epoch": 0.38443935926773454,
"grad_norm": 4.109751224517822,
"learning_rate": 9.949799142725866e-06,
"loss": 1.1804240942001343,
"step": 168
},
{
"epoch": 0.3890160183066362,
"grad_norm": 1.3450946807861328,
"learning_rate": 9.947230194539196e-06,
"loss": 1.2886464595794678,
"step": 170
},
{
"epoch": 0.39359267734553777,
"grad_norm": 2.0440635681152344,
"learning_rate": 9.94459753267812e-06,
"loss": 1.0574209690093994,
"step": 172
},
{
"epoch": 0.39816933638443935,
"grad_norm": 3.0480129718780518,
"learning_rate": 9.941901194859726e-06,
"loss": 1.2568514347076416,
"step": 174
},
{
"epoch": 0.40274599542334094,
"grad_norm": 7.81062650680542,
"learning_rate": 9.939141219713353e-06,
"loss": 0.9926815032958984,
"step": 176
},
{
"epoch": 0.4073226544622426,
"grad_norm": 2.2387683391571045,
"learning_rate": 9.936317646780057e-06,
"loss": 1.392266869544983,
"step": 178
},
{
"epoch": 0.41189931350114417,
"grad_norm": 1.5428338050842285,
"learning_rate": 9.933430516512029e-06,
"loss": 1.0441172122955322,
"step": 180
},
{
"epoch": 0.41647597254004576,
"grad_norm": 1.78026282787323,
"learning_rate": 9.930479870272018e-06,
"loss": 1.2641940116882324,
"step": 182
},
{
"epoch": 0.42105263157894735,
"grad_norm": 3.4682302474975586,
"learning_rate": 9.927465750332747e-06,
"loss": 1.2930469512939453,
"step": 184
},
{
"epoch": 0.425629290617849,
"grad_norm": 2.080035448074341,
"learning_rate": 9.924388199876294e-06,
"loss": 1.3206355571746826,
"step": 186
},
{
"epoch": 0.4302059496567506,
"grad_norm": 2.443369150161743,
"learning_rate": 9.921247262993487e-06,
"loss": 1.131381630897522,
"step": 188
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.6168911457061768,
"learning_rate": 9.918042984683262e-06,
"loss": 0.9173140525817871,
"step": 190
},
{
"epoch": 0.43935926773455375,
"grad_norm": 2.228675365447998,
"learning_rate": 9.91477541085202e-06,
"loss": 1.298048734664917,
"step": 192
},
{
"epoch": 0.4439359267734554,
"grad_norm": 5.84655237197876,
"learning_rate": 9.911444588312976e-06,
"loss": 0.8752405643463135,
"step": 194
},
{
"epoch": 0.448512585812357,
"grad_norm": 1.5287593603134155,
"learning_rate": 9.908050564785481e-06,
"loss": 1.25732421875,
"step": 196
},
{
"epoch": 0.45308924485125857,
"grad_norm": 1.5870686769485474,
"learning_rate": 9.904593388894347e-06,
"loss": 1.1686997413635254,
"step": 198
},
{
"epoch": 0.4576659038901602,
"grad_norm": 2.4326746463775635,
"learning_rate": 9.901073110169132e-06,
"loss": 1.0250574350357056,
"step": 200
},
{
"epoch": 0.4622425629290618,
"grad_norm": 3.7590527534484863,
"learning_rate": 9.897489779043454e-06,
"loss": 1.367837905883789,
"step": 202
},
{
"epoch": 0.4668192219679634,
"grad_norm": 5.2029924392700195,
"learning_rate": 9.893843446854255e-06,
"loss": 1.2215626239776611,
"step": 204
},
{
"epoch": 0.47139588100686497,
"grad_norm": 2.076643466949463,
"learning_rate": 9.890134165841064e-06,
"loss": 1.1037054061889648,
"step": 206
},
{
"epoch": 0.4759725400457666,
"grad_norm": 6.4338531494140625,
"learning_rate": 9.886361989145256e-06,
"loss": 1.1291370391845703,
"step": 208
},
{
"epoch": 0.4805491990846682,
"grad_norm": 4.497616291046143,
"learning_rate": 9.882526970809286e-06,
"loss": 1.289217233657837,
"step": 210
},
{
"epoch": 0.4851258581235698,
"grad_norm": 4.023069381713867,
"learning_rate": 9.878629165775916e-06,
"loss": 1.3998191356658936,
"step": 212
},
{
"epoch": 0.4897025171624714,
"grad_norm": 2.4669904708862305,
"learning_rate": 9.874668629887428e-06,
"loss": 1.2304133176803589,
"step": 214
},
{
"epoch": 0.494279176201373,
"grad_norm": 3.409818410873413,
"learning_rate": 9.870645419884821e-06,
"loss": 1.0029253959655762,
"step": 216
},
{
"epoch": 0.4988558352402746,
"grad_norm": 3.6900084018707275,
"learning_rate": 9.866559593407006e-06,
"loss": 1.1414201259613037,
"step": 218
},
{
"epoch": 0.5034324942791762,
"grad_norm": 2.412198781967163,
"learning_rate": 9.862411208989971e-06,
"loss": 1.0081186294555664,
"step": 220
},
{
"epoch": 0.5080091533180778,
"grad_norm": 4.239388465881348,
"learning_rate": 9.858200326065948e-06,
"loss": 1.2810657024383545,
"step": 222
},
{
"epoch": 0.5125858123569794,
"grad_norm": 6.374491214752197,
"learning_rate": 9.853927004962557e-06,
"loss": 1.068434715270996,
"step": 224
},
{
"epoch": 0.517162471395881,
"grad_norm": 2.322037696838379,
"learning_rate": 9.849591306901948e-06,
"loss": 1.2551425695419312,
"step": 226
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.8771734237670898,
"learning_rate": 9.845193293999921e-06,
"loss": 0.7825489044189453,
"step": 228
},
{
"epoch": 0.5263157894736842,
"grad_norm": 1.1865154504776,
"learning_rate": 9.840733029265033e-06,
"loss": 1.1792237758636475,
"step": 230
},
{
"epoch": 0.5308924485125858,
"grad_norm": 1.4317781925201416,
"learning_rate": 9.836210576597699e-06,
"loss": 1.24580717086792,
"step": 232
},
{
"epoch": 0.5354691075514875,
"grad_norm": 2.155020236968994,
"learning_rate": 9.831626000789274e-06,
"loss": 1.2573895454406738,
"step": 234
},
{
"epoch": 0.540045766590389,
"grad_norm": 3.024273157119751,
"learning_rate": 9.826979367521131e-06,
"loss": 0.9660389423370361,
"step": 236
},
{
"epoch": 0.5446224256292906,
"grad_norm": 2.3436639308929443,
"learning_rate": 9.82227074336371e-06,
"loss": 0.9024907350540161,
"step": 238
},
{
"epoch": 0.5491990846681922,
"grad_norm": 1.5992021560668945,
"learning_rate": 9.81750019577557e-06,
"loss": 1.2666064500808716,
"step": 240
},
{
"epoch": 0.5537757437070938,
"grad_norm": 17.894210815429688,
"learning_rate": 9.812667793102425e-06,
"loss": 0.9698783159255981,
"step": 242
},
{
"epoch": 0.5583524027459954,
"grad_norm": 1.664934515953064,
"learning_rate": 9.80777360457616e-06,
"loss": 1.1118632555007935,
"step": 244
},
{
"epoch": 0.562929061784897,
"grad_norm": 2.275209665298462,
"learning_rate": 9.802817700313842e-06,
"loss": 1.523911714553833,
"step": 246
},
{
"epoch": 0.5675057208237986,
"grad_norm": 4.4243998527526855,
"learning_rate": 9.797800151316711e-06,
"loss": 0.8372693061828613,
"step": 248
},
{
"epoch": 0.5720823798627003,
"grad_norm": 4.079118728637695,
"learning_rate": 9.792721029469173e-06,
"loss": 0.6180707216262817,
"step": 250
},
{
"epoch": 0.5766590389016019,
"grad_norm": 11.111287117004395,
"learning_rate": 9.787580407537759e-06,
"loss": 0.893789529800415,
"step": 252
},
{
"epoch": 0.5812356979405034,
"grad_norm": 15.717961311340332,
"learning_rate": 9.782378359170082e-06,
"loss": 0.946647584438324,
"step": 254
},
{
"epoch": 0.585812356979405,
"grad_norm": 1.3548306226730347,
"learning_rate": 9.777114958893799e-06,
"loss": 1.598212480545044,
"step": 256
},
{
"epoch": 0.5903890160183066,
"grad_norm": 3.011941909790039,
"learning_rate": 9.77179028211552e-06,
"loss": 1.2696183919906616,
"step": 258
},
{
"epoch": 0.5949656750572082,
"grad_norm": 4.57606315612793,
"learning_rate": 9.766404405119742e-06,
"loss": 0.5768990516662598,
"step": 260
},
{
"epoch": 0.5995423340961098,
"grad_norm": 2.027827024459839,
"learning_rate": 9.760957405067758e-06,
"loss": 1.0868414640426636,
"step": 262
},
{
"epoch": 0.6041189931350115,
"grad_norm": 1.9111216068267822,
"learning_rate": 9.75544935999654e-06,
"loss": 1.313867449760437,
"step": 264
},
{
"epoch": 0.6086956521739131,
"grad_norm": 1.7331411838531494,
"learning_rate": 9.749880348817629e-06,
"loss": 1.0849635601043701,
"step": 266
},
{
"epoch": 0.6132723112128147,
"grad_norm": 2.179687976837158,
"learning_rate": 9.744250451316003e-06,
"loss": 1.2717711925506592,
"step": 268
},
{
"epoch": 0.6178489702517163,
"grad_norm": 1.9285099506378174,
"learning_rate": 9.738559748148937e-06,
"loss": 1.4478249549865723,
"step": 270
},
{
"epoch": 0.6224256292906178,
"grad_norm": 2.466549873352051,
"learning_rate": 9.732808320844838e-06,
"loss": 1.247833490371704,
"step": 272
},
{
"epoch": 0.6270022883295194,
"grad_norm": 1.4353868961334229,
"learning_rate": 9.726996251802088e-06,
"loss": 0.9141141176223755,
"step": 274
},
{
"epoch": 0.631578947368421,
"grad_norm": 6.866336822509766,
"learning_rate": 9.721123624287858e-06,
"loss": 1.126939058303833,
"step": 276
},
{
"epoch": 0.6361556064073226,
"grad_norm": 5.548222064971924,
"learning_rate": 9.715190522436916e-06,
"loss": 1.0435082912445068,
"step": 278
},
{
"epoch": 0.6407322654462243,
"grad_norm": 1.6613945960998535,
"learning_rate": 9.709197031250419e-06,
"loss": 0.8629664182662964,
"step": 280
},
{
"epoch": 0.6453089244851259,
"grad_norm": 3.140124797821045,
"learning_rate": 9.7031432365947e-06,
"loss": 1.2095342874526978,
"step": 282
},
{
"epoch": 0.6498855835240275,
"grad_norm": 1.81770920753479,
"learning_rate": 9.697029225200033e-06,
"loss": 1.2224023342132568,
"step": 284
},
{
"epoch": 0.6544622425629291,
"grad_norm": 3.419079303741455,
"learning_rate": 9.690855084659399e-06,
"loss": 1.1394864320755005,
"step": 286
},
{
"epoch": 0.6590389016018307,
"grad_norm": 4.641355514526367,
"learning_rate": 9.684620903427217e-06,
"loss": 0.603121280670166,
"step": 288
},
{
"epoch": 0.6636155606407322,
"grad_norm": 2.0329110622406006,
"learning_rate": 9.678326770818091e-06,
"loss": 1.2675936222076416,
"step": 290
},
{
"epoch": 0.6681922196796338,
"grad_norm": 2.861783981323242,
"learning_rate": 9.671972777005522e-06,
"loss": 1.2691437005996704,
"step": 292
},
{
"epoch": 0.6727688787185355,
"grad_norm": 3.443868398666382,
"learning_rate": 9.665559013020615e-06,
"loss": 0.9782974720001221,
"step": 294
},
{
"epoch": 0.6773455377574371,
"grad_norm": 4.7333807945251465,
"learning_rate": 9.659085570750786e-06,
"loss": 1.1612757444381714,
"step": 296
},
{
"epoch": 0.6819221967963387,
"grad_norm": 10.009819030761719,
"learning_rate": 9.652552542938428e-06,
"loss": 1.2675966024398804,
"step": 298
},
{
"epoch": 0.6864988558352403,
"grad_norm": 1.4802839756011963,
"learning_rate": 9.645960023179601e-06,
"loss": 1.2351771593093872,
"step": 300
},
{
"epoch": 0.6910755148741419,
"grad_norm": 2.1872692108154297,
"learning_rate": 9.63930810592268e-06,
"loss": 1.2081533670425415,
"step": 302
},
{
"epoch": 0.6956521739130435,
"grad_norm": 4.144794940948486,
"learning_rate": 9.632596886466995e-06,
"loss": 1.019889235496521,
"step": 304
},
{
"epoch": 0.700228832951945,
"grad_norm": 2.8666250705718994,
"learning_rate": 9.625826460961488e-06,
"loss": 0.9050840139389038,
"step": 306
},
{
"epoch": 0.7048054919908466,
"grad_norm": 1.1820842027664185,
"learning_rate": 9.618996926403314e-06,
"loss": 0.5790635347366333,
"step": 308
},
{
"epoch": 0.7093821510297483,
"grad_norm": 3.2699780464172363,
"learning_rate": 9.612108380636463e-06,
"loss": 1.002554178237915,
"step": 310
},
{
"epoch": 0.7139588100686499,
"grad_norm": 2.4456770420074463,
"learning_rate": 9.605160922350351e-06,
"loss": 1.2322945594787598,
"step": 312
},
{
"epoch": 0.7185354691075515,
"grad_norm": 1.6210792064666748,
"learning_rate": 9.598154651078419e-06,
"loss": 1.235492467880249,
"step": 314
},
{
"epoch": 0.7231121281464531,
"grad_norm": 7.475627899169922,
"learning_rate": 9.591089667196682e-06,
"loss": 0.9872145652770996,
"step": 316
},
{
"epoch": 0.7276887871853547,
"grad_norm": 1.6771211624145508,
"learning_rate": 9.583966071922322e-06,
"loss": 1.2031590938568115,
"step": 318
},
{
"epoch": 0.7322654462242563,
"grad_norm": 5.789905071258545,
"learning_rate": 9.576783967312218e-06,
"loss": 1.190985918045044,
"step": 320
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.9887858629226685,
"learning_rate": 9.569543456261485e-06,
"loss": 1.188122272491455,
"step": 322
},
{
"epoch": 0.7414187643020596,
"grad_norm": 2.4599740505218506,
"learning_rate": 9.562244642502007e-06,
"loss": 0.8889155387878418,
"step": 324
},
{
"epoch": 0.7459954233409611,
"grad_norm": 3.597360372543335,
"learning_rate": 9.554887630600945e-06,
"loss": 1.1945111751556396,
"step": 326
},
{
"epoch": 0.7505720823798627,
"grad_norm": 1.4225729703903198,
"learning_rate": 9.547472525959247e-06,
"loss": 1.2383577823638916,
"step": 328
},
{
"epoch": 0.7551487414187643,
"grad_norm": 1.7745798826217651,
"learning_rate": 9.539999434810127e-06,
"loss": 1.2871983051300049,
"step": 330
},
{
"epoch": 0.7597254004576659,
"grad_norm": 2.066685914993286,
"learning_rate": 9.532468464217548e-06,
"loss": 1.271721601486206,
"step": 332
},
{
"epoch": 0.7643020594965675,
"grad_norm": 5.190197467803955,
"learning_rate": 9.524879722074691e-06,
"loss": 1.0039429664611816,
"step": 334
},
{
"epoch": 0.7688787185354691,
"grad_norm": 4.809805393218994,
"learning_rate": 9.517233317102406e-06,
"loss": 1.159362554550171,
"step": 336
},
{
"epoch": 0.7734553775743707,
"grad_norm": 1.4193916320800781,
"learning_rate": 9.509529358847655e-06,
"loss": 1.2524155378341675,
"step": 338
},
{
"epoch": 0.7780320366132724,
"grad_norm": 5.768612861633301,
"learning_rate": 9.501767957681943e-06,
"loss": 1.1373052597045898,
"step": 340
},
{
"epoch": 0.782608695652174,
"grad_norm": 2.3782432079315186,
"learning_rate": 9.493949224799735e-06,
"loss": 1.197812795639038,
"step": 342
},
{
"epoch": 0.7871853546910755,
"grad_norm": 2.0369019508361816,
"learning_rate": 9.486073272216867e-06,
"loss": 0.8322545886039734,
"step": 344
},
{
"epoch": 0.7917620137299771,
"grad_norm": 3.6057472229003906,
"learning_rate": 9.478140212768935e-06,
"loss": 1.1050453186035156,
"step": 346
},
{
"epoch": 0.7963386727688787,
"grad_norm": 3.5964956283569336,
"learning_rate": 9.470150160109682e-06,
"loss": 1.0718210935592651,
"step": 348
},
{
"epoch": 0.8009153318077803,
"grad_norm": 12.48847770690918,
"learning_rate": 9.462103228709379e-06,
"loss": 1.3313639163970947,
"step": 350
},
{
"epoch": 0.8054919908466819,
"grad_norm": 1.933138370513916,
"learning_rate": 9.453999533853162e-06,
"loss": 1.4489710330963135,
"step": 352
},
{
"epoch": 0.8100686498855835,
"grad_norm": 2.0144715309143066,
"learning_rate": 9.445839191639404e-06,
"loss": 1.2176668643951416,
"step": 354
},
{
"epoch": 0.8146453089244852,
"grad_norm": 3.0317883491516113,
"learning_rate": 9.437622318978037e-06,
"loss": 0.6330467462539673,
"step": 356
},
{
"epoch": 0.8192219679633868,
"grad_norm": 6.440718650817871,
"learning_rate": 9.429349033588884e-06,
"loss": 0.8626018762588501,
"step": 358
},
{
"epoch": 0.8237986270022883,
"grad_norm": 2.311493396759033,
"learning_rate": 9.421019453999972e-06,
"loss": 1.0874342918395996,
"step": 360
},
{
"epoch": 0.8283752860411899,
"grad_norm": 2.266531467437744,
"learning_rate": 9.412633699545828e-06,
"loss": 1.2565999031066895,
"step": 362
},
{
"epoch": 0.8329519450800915,
"grad_norm": 1.8435766696929932,
"learning_rate": 9.404191890365775e-06,
"loss": 0.9089647531509399,
"step": 364
},
{
"epoch": 0.8375286041189931,
"grad_norm": 3.46240496635437,
"learning_rate": 9.395694147402214e-06,
"loss": 1.1782324314117432,
"step": 366
},
{
"epoch": 0.8421052631578947,
"grad_norm": 4.988228797912598,
"learning_rate": 9.387140592398878e-06,
"loss": 1.0270354747772217,
"step": 368
},
{
"epoch": 0.8466819221967964,
"grad_norm": 4.704240798950195,
"learning_rate": 9.378531347899108e-06,
"loss": 0.7700833082199097,
"step": 370
},
{
"epoch": 0.851258581235698,
"grad_norm": 7.564844131469727,
"learning_rate": 9.369866537244076e-06,
"loss": 0.8138679265975952,
"step": 372
},
{
"epoch": 0.8558352402745996,
"grad_norm": 4.306687355041504,
"learning_rate": 9.36114628457103e-06,
"loss": 1.4564876556396484,
"step": 374
},
{
"epoch": 0.8604118993135011,
"grad_norm": 3.528275966644287,
"learning_rate": 9.352370714811518e-06,
"loss": 0.7638095021247864,
"step": 376
},
{
"epoch": 0.8649885583524027,
"grad_norm": 2.6173832416534424,
"learning_rate": 9.343539953689592e-06,
"loss": 0.62486332654953,
"step": 378
},
{
"epoch": 0.8695652173913043,
"grad_norm": 3.4161272048950195,
"learning_rate": 9.334654127720005e-06,
"loss": 0.9487941265106201,
"step": 380
},
{
"epoch": 0.8741418764302059,
"grad_norm": 2.358881711959839,
"learning_rate": 9.325713364206402e-06,
"loss": 1.1284657716751099,
"step": 382
},
{
"epoch": 0.8787185354691075,
"grad_norm": 2.407302141189575,
"learning_rate": 9.3167177912395e-06,
"loss": 1.1534430980682373,
"step": 384
},
{
"epoch": 0.8832951945080092,
"grad_norm": 2.1035103797912598,
"learning_rate": 9.307667537695248e-06,
"loss": 1.2289859056472778,
"step": 386
},
{
"epoch": 0.8878718535469108,
"grad_norm": 2.478761911392212,
"learning_rate": 9.298562733232979e-06,
"loss": 1.2465143203735352,
"step": 388
},
{
"epoch": 0.8924485125858124,
"grad_norm": 2.6959574222564697,
"learning_rate": 9.289403508293558e-06,
"loss": 1.225327730178833,
"step": 390
},
{
"epoch": 0.897025171624714,
"grad_norm": 4.03607702255249,
"learning_rate": 9.280189994097507e-06,
"loss": 1.2250657081604004,
"step": 392
},
{
"epoch": 0.9016018306636155,
"grad_norm": 2.1316354274749756,
"learning_rate": 9.27092232264313e-06,
"loss": 1.065427303314209,
"step": 394
},
{
"epoch": 0.9061784897025171,
"grad_norm": 2.279680013656616,
"learning_rate": 9.261600626704622e-06,
"loss": 1.302757978439331,
"step": 396
},
{
"epoch": 0.9107551487414187,
"grad_norm": 2.0204954147338867,
"learning_rate": 9.252225039830163e-06,
"loss": 1.316508412361145,
"step": 398
},
{
"epoch": 0.9153318077803204,
"grad_norm": 2.101081609725952,
"learning_rate": 9.242795696340008e-06,
"loss": 1.1978795528411865,
"step": 400
},
{
"epoch": 0.919908466819222,
"grad_norm": 1.5630186796188354,
"learning_rate": 9.233312731324557e-06,
"loss": 0.9110370874404907,
"step": 402
},
{
"epoch": 0.9244851258581236,
"grad_norm": 3.5780534744262695,
"learning_rate": 9.22377628064243e-06,
"loss": 1.2253358364105225,
"step": 404
},
{
"epoch": 0.9290617848970252,
"grad_norm": 0.4819481372833252,
"learning_rate": 9.214186480918511e-06,
"loss": 1.119720697402954,
"step": 406
},
{
"epoch": 0.9336384439359268,
"grad_norm": 1.6182817220687866,
"learning_rate": 9.204543469541997e-06,
"loss": 1.2026877403259277,
"step": 408
},
{
"epoch": 0.9382151029748284,
"grad_norm": 5.046341896057129,
"learning_rate": 9.194847384664422e-06,
"loss": 1.0480332374572754,
"step": 410
},
{
"epoch": 0.9427917620137299,
"grad_norm": 1.5035834312438965,
"learning_rate": 9.185098365197688e-06,
"loss": 1.1682276725769043,
"step": 412
},
{
"epoch": 0.9473684210526315,
"grad_norm": 1.4273966550827026,
"learning_rate": 9.175296550812067e-06,
"loss": 1.2405450344085693,
"step": 414
},
{
"epoch": 0.9519450800915332,
"grad_norm": 7.252469539642334,
"learning_rate": 9.165442081934202e-06,
"loss": 1.122786045074463,
"step": 416
},
{
"epoch": 0.9565217391304348,
"grad_norm": 2.6789910793304443,
"learning_rate": 9.155535099745097e-06,
"loss": 1.2588951587677002,
"step": 418
},
{
"epoch": 0.9610983981693364,
"grad_norm": 3.5507678985595703,
"learning_rate": 9.145575746178092e-06,
"loss": 1.0224729776382446,
"step": 420
},
{
"epoch": 0.965675057208238,
"grad_norm": 7.071379661560059,
"learning_rate": 9.135564163916833e-06,
"loss": 1.204231858253479,
"step": 422
},
{
"epoch": 0.9702517162471396,
"grad_norm": 1.381493330001831,
"learning_rate": 9.125500496393221e-06,
"loss": 0.7406469583511353,
"step": 424
},
{
"epoch": 0.9748283752860412,
"grad_norm": 2.6033928394317627,
"learning_rate": 9.115384887785366e-06,
"loss": 1.4214835166931152,
"step": 426
},
{
"epoch": 0.9794050343249427,
"grad_norm": 50.466766357421875,
"learning_rate": 9.105217483015514e-06,
"loss": 0.8424577713012695,
"step": 428
},
{
"epoch": 0.9839816933638444,
"grad_norm": 2.872386932373047,
"learning_rate": 9.094998427747974e-06,
"loss": 1.2860726118087769,
"step": 430
},
{
"epoch": 0.988558352402746,
"grad_norm": 3.8774938583374023,
"learning_rate": 9.084727868387036e-06,
"loss": 1.2441885471343994,
"step": 432
},
{
"epoch": 0.9931350114416476,
"grad_norm": 1.6605303287506104,
"learning_rate": 9.074405952074858e-06,
"loss": 1.3228825330734253,
"step": 434
},
{
"epoch": 0.9977116704805492,
"grad_norm": 2.563741683959961,
"learning_rate": 9.064032826689378e-06,
"loss": 1.1689465045928955,
"step": 436
},
{
"epoch": 1.002288329519451,
"grad_norm": 1.281950831413269,
"learning_rate": 9.053608640842183e-06,
"loss": 1.0966238975524902,
"step": 438
},
{
"epoch": 1.0068649885583525,
"grad_norm": 3.721905469894409,
"learning_rate": 9.04313354387638e-06,
"loss": 1.159909963607788,
"step": 440
},
{
"epoch": 1.011441647597254,
"grad_norm": 2.1142349243164062,
"learning_rate": 9.032607685864463e-06,
"loss": 0.7140793800354004,
"step": 442
},
{
"epoch": 1.0160183066361557,
"grad_norm": 1.9991384744644165,
"learning_rate": 9.022031217606153e-06,
"loss": 0.6476885080337524,
"step": 444
},
{
"epoch": 1.0205949656750573,
"grad_norm": 8.886889457702637,
"learning_rate": 9.011404290626251e-06,
"loss": 0.5267953872680664,
"step": 446
},
{
"epoch": 1.0251716247139588,
"grad_norm": 6.78672456741333,
"learning_rate": 9.000727057172456e-06,
"loss": 0.8036065101623535,
"step": 448
},
{
"epoch": 1.0297482837528604,
"grad_norm": 2.5695552825927734,
"learning_rate": 8.989999670213186e-06,
"loss": 0.8730241060256958,
"step": 450
},
{
"epoch": 1.034324942791762,
"grad_norm": 1.9509334564208984,
"learning_rate": 8.979222283435392e-06,
"loss": 1.1160993576049805,
"step": 452
},
{
"epoch": 1.0389016018306636,
"grad_norm": 3.5826354026794434,
"learning_rate": 8.96839505124235e-06,
"loss": 1.0545172691345215,
"step": 454
},
{
"epoch": 1.0434782608695652,
"grad_norm": 2.619070053100586,
"learning_rate": 8.95751812875145e-06,
"loss": 0.4537786543369293,
"step": 456
},
{
"epoch": 1.0480549199084668,
"grad_norm": 1.5989960432052612,
"learning_rate": 8.946591671791977e-06,
"loss": 1.024822473526001,
"step": 458
},
{
"epoch": 1.0526315789473684,
"grad_norm": 2.2013661861419678,
"learning_rate": 8.935615836902876e-06,
"loss": 0.6335904598236084,
"step": 460
},
{
"epoch": 1.05720823798627,
"grad_norm": 2.2854695320129395,
"learning_rate": 8.92459078133051e-06,
"loss": 1.0793886184692383,
"step": 462
},
{
"epoch": 1.0617848970251715,
"grad_norm": 3.123532772064209,
"learning_rate": 8.913516663026404e-06,
"loss": 0.9790216088294983,
"step": 464
},
{
"epoch": 1.0663615560640731,
"grad_norm": 2.568563222885132,
"learning_rate": 8.902393640644988e-06,
"loss": 0.5305502414703369,
"step": 466
},
{
"epoch": 1.070938215102975,
"grad_norm": 1.6813656091690063,
"learning_rate": 8.89122187354132e-06,
"loss": 0.9657827019691467,
"step": 468
},
{
"epoch": 1.0755148741418765,
"grad_norm": 3.9278571605682373,
"learning_rate": 8.880001521768808e-06,
"loss": 0.8472052216529846,
"step": 470
},
{
"epoch": 1.080091533180778,
"grad_norm": 0.806633710861206,
"learning_rate": 8.868732746076904e-06,
"loss": 0.6929956674575806,
"step": 472
},
{
"epoch": 1.0846681922196797,
"grad_norm": 7.49754524230957,
"learning_rate": 8.857415707908818e-06,
"loss": 0.7502920627593994,
"step": 474
},
{
"epoch": 1.0892448512585813,
"grad_norm": 1.6636337041854858,
"learning_rate": 8.846050569399191e-06,
"loss": 1.094468593597412,
"step": 476
},
{
"epoch": 1.0938215102974829,
"grad_norm": 1.5605403184890747,
"learning_rate": 8.834637493371785e-06,
"loss": 1.0230355262756348,
"step": 478
},
{
"epoch": 1.0983981693363845,
"grad_norm": 2.4448082447052,
"learning_rate": 8.823176643337137e-06,
"loss": 1.1374318599700928,
"step": 480
},
{
"epoch": 1.102974828375286,
"grad_norm": 1.8032822608947754,
"learning_rate": 8.811668183490228e-06,
"loss": 0.8566664457321167,
"step": 482
},
{
"epoch": 1.1075514874141876,
"grad_norm": 4.632296562194824,
"learning_rate": 8.800112278708124e-06,
"loss": 0.8171731233596802,
"step": 484
},
{
"epoch": 1.1121281464530892,
"grad_norm": 0.4392177164554596,
"learning_rate": 8.788509094547612e-06,
"loss": 0.5787323713302612,
"step": 486
},
{
"epoch": 1.1167048054919908,
"grad_norm": 1.527347207069397,
"learning_rate": 8.776858797242837e-06,
"loss": 0.9281891584396362,
"step": 488
},
{
"epoch": 1.1212814645308924,
"grad_norm": 2.3181469440460205,
"learning_rate": 8.76516155370291e-06,
"loss": 0.5847245454788208,
"step": 490
},
{
"epoch": 1.125858123569794,
"grad_norm": 1.7468464374542236,
"learning_rate": 8.753417531509527e-06,
"loss": 1.0287659168243408,
"step": 492
},
{
"epoch": 1.1304347826086956,
"grad_norm": 8.322272300720215,
"learning_rate": 8.741626898914558e-06,
"loss": 0.62440025806427,
"step": 494
},
{
"epoch": 1.1350114416475972,
"grad_norm": 1.4653323888778687,
"learning_rate": 8.729789824837644e-06,
"loss": 0.5702868700027466,
"step": 496
},
{
"epoch": 1.139588100686499,
"grad_norm": 2.8388781547546387,
"learning_rate": 8.717906478863776e-06,
"loss": 0.7256041765213013,
"step": 498
},
{
"epoch": 1.1441647597254005,
"grad_norm": 3.141469717025757,
"learning_rate": 8.70597703124086e-06,
"loss": 0.9217201471328735,
"step": 500
},
{
"epoch": 1.1487414187643021,
"grad_norm": 16.2866153717041,
"learning_rate": 8.694001652877283e-06,
"loss": 1.0957762002944946,
"step": 502
},
{
"epoch": 1.1533180778032037,
"grad_norm": 21.44241714477539,
"learning_rate": 8.681980515339464e-06,
"loss": 1.2868103981018066,
"step": 504
},
{
"epoch": 1.1578947368421053,
"grad_norm": 26.603134155273438,
"learning_rate": 8.669913790849396e-06,
"loss": 0.899326741695404,
"step": 506
},
{
"epoch": 1.162471395881007,
"grad_norm": 2.4431211948394775,
"learning_rate": 8.657801652282178e-06,
"loss": 0.8970417976379395,
"step": 508
},
{
"epoch": 1.1670480549199085,
"grad_norm": 2.117124080657959,
"learning_rate": 8.645644273163536e-06,
"loss": 0.9268218278884888,
"step": 510
},
{
"epoch": 1.17162471395881,
"grad_norm": 3.660663604736328,
"learning_rate": 8.633441827667338e-06,
"loss": 1.3189082145690918,
"step": 512
},
{
"epoch": 1.1762013729977117,
"grad_norm": 2.381092071533203,
"learning_rate": 8.621194490613104e-06,
"loss": 1.006082534790039,
"step": 514
},
{
"epoch": 1.1807780320366132,
"grad_norm": 1.8072335720062256,
"learning_rate": 8.608902437463495e-06,
"loss": 1.0185256004333496,
"step": 516
},
{
"epoch": 1.1853546910755148,
"grad_norm": 1.9344371557235718,
"learning_rate": 8.596565844321804e-06,
"loss": 0.7876001596450806,
"step": 518
},
{
"epoch": 1.1899313501144164,
"grad_norm": 21.985254287719727,
"learning_rate": 8.584184887929424e-06,
"loss": 0.8519538640975952,
"step": 520
},
{
"epoch": 1.194508009153318,
"grad_norm": 7.320517063140869,
"learning_rate": 8.57175974566333e-06,
"loss": 0.9020718336105347,
"step": 522
},
{
"epoch": 1.1990846681922196,
"grad_norm": 1.7533776760101318,
"learning_rate": 8.559290595533528e-06,
"loss": 0.7076669931411743,
"step": 524
},
{
"epoch": 1.2036613272311212,
"grad_norm": 1.601244568824768,
"learning_rate": 8.5467776161805e-06,
"loss": 1.0420892238616943,
"step": 526
},
{
"epoch": 1.208237986270023,
"grad_norm": 4.683840751647949,
"learning_rate": 8.534220986872664e-06,
"loss": 0.9078390002250671,
"step": 528
},
{
"epoch": 1.2128146453089246,
"grad_norm": 2.8337855339050293,
"learning_rate": 8.521620887503783e-06,
"loss": 0.9289965629577637,
"step": 530
},
{
"epoch": 1.2173913043478262,
"grad_norm": 2.4027771949768066,
"learning_rate": 8.508977498590404e-06,
"loss": 0.7684561610221863,
"step": 532
},
{
"epoch": 1.2219679633867278,
"grad_norm": 5.679591178894043,
"learning_rate": 8.496291001269261e-06,
"loss": 1.1440486907958984,
"step": 534
},
{
"epoch": 1.2265446224256293,
"grad_norm": 2.527660608291626,
"learning_rate": 8.483561577294688e-06,
"loss": 0.6309778690338135,
"step": 536
},
{
"epoch": 1.231121281464531,
"grad_norm": 1.9118728637695312,
"learning_rate": 8.470789409036014e-06,
"loss": 1.0466161966323853,
"step": 538
},
{
"epoch": 1.2356979405034325,
"grad_norm": 4.447290897369385,
"learning_rate": 8.457974679474944e-06,
"loss": 1.0474622249603271,
"step": 540
},
{
"epoch": 1.240274599542334,
"grad_norm": 2.7177319526672363,
"learning_rate": 8.445117572202943e-06,
"loss": 1.147586464881897,
"step": 542
},
{
"epoch": 1.2448512585812357,
"grad_norm": 2.1759936809539795,
"learning_rate": 8.432218271418602e-06,
"loss": 1.1140575408935547,
"step": 544
},
{
"epoch": 1.2494279176201373,
"grad_norm": 2.892226219177246,
"learning_rate": 8.419276961925006e-06,
"loss": 1.0437395572662354,
"step": 546
},
{
"epoch": 1.2540045766590389,
"grad_norm": 7.295011520385742,
"learning_rate": 8.406293829127083e-06,
"loss": 0.7300729751586914,
"step": 548
},
{
"epoch": 1.2585812356979404,
"grad_norm": 0.512110710144043,
"learning_rate": 8.393269059028937e-06,
"loss": 0.7643875479698181,
"step": 550
},
{
"epoch": 1.263157894736842,
"grad_norm": 6.191455364227295,
"learning_rate": 8.380202838231205e-06,
"loss": 0.8880730867385864,
"step": 552
},
{
"epoch": 1.2677345537757438,
"grad_norm": 1.766752004623413,
"learning_rate": 8.367095353928361e-06,
"loss": 0.906735360622406,
"step": 554
},
{
"epoch": 1.2723112128146452,
"grad_norm": 4.45707368850708,
"learning_rate": 8.35394679390605e-06,
"loss": 1.1020989418029785,
"step": 556
},
{
"epoch": 1.276887871853547,
"grad_norm": 1.6555581092834473,
"learning_rate": 8.340757346538394e-06,
"loss": 1.121458888053894,
"step": 558
},
{
"epoch": 1.2814645308924484,
"grad_norm": 2.084331750869751,
"learning_rate": 8.32752720078529e-06,
"loss": 0.7600382566452026,
"step": 560
},
{
"epoch": 1.2860411899313502,
"grad_norm": 2.9418492317199707,
"learning_rate": 8.314256546189696e-06,
"loss": 0.8527880907058716,
"step": 562
},
{
"epoch": 1.2906178489702518,
"grad_norm": 6.980574607849121,
"learning_rate": 8.30094557287494e-06,
"loss": 0.7204505205154419,
"step": 564
},
{
"epoch": 1.2951945080091534,
"grad_norm": 78.67940521240234,
"learning_rate": 8.287594471541966e-06,
"loss": 1.0420033931732178,
"step": 566
},
{
"epoch": 1.299771167048055,
"grad_norm": 7.022028923034668,
"learning_rate": 8.274203433466625e-06,
"loss": 0.9533605575561523,
"step": 568
},
{
"epoch": 1.3043478260869565,
"grad_norm": 3.6711020469665527,
"learning_rate": 8.260772650496918e-06,
"loss": 1.0180366039276123,
"step": 570
},
{
"epoch": 1.3089244851258581,
"grad_norm": 3.5704965591430664,
"learning_rate": 8.247302315050261e-06,
"loss": 0.6935830116271973,
"step": 572
},
{
"epoch": 1.3135011441647597,
"grad_norm": 6.118966102600098,
"learning_rate": 8.23379262011072e-06,
"loss": 0.9882407784461975,
"step": 574
},
{
"epoch": 1.3180778032036613,
"grad_norm": 2.1732988357543945,
"learning_rate": 8.220243759226248e-06,
"loss": 1.063117504119873,
"step": 576
},
{
"epoch": 1.322654462242563,
"grad_norm": 6.698834419250488,
"learning_rate": 8.206655926505916e-06,
"loss": 0.732232391834259,
"step": 578
},
{
"epoch": 1.3272311212814645,
"grad_norm": 2.403120756149292,
"learning_rate": 8.193029316617123e-06,
"loss": 1.0798766613006592,
"step": 580
},
{
"epoch": 1.331807780320366,
"grad_norm": 1.4961130619049072,
"learning_rate": 8.17936412478282e-06,
"loss": 1.0436818599700928,
"step": 582
},
{
"epoch": 1.3363844393592679,
"grad_norm": 1.0409057140350342,
"learning_rate": 8.1656605467787e-06,
"loss": 0.5859847068786621,
"step": 584
},
{
"epoch": 1.3409610983981692,
"grad_norm": 8.445878982543945,
"learning_rate": 8.1519187789304e-06,
"loss": 0.9882102012634277,
"step": 586
},
{
"epoch": 1.345537757437071,
"grad_norm": 5.030638694763184,
"learning_rate": 8.138139018110694e-06,
"loss": 0.825863778591156,
"step": 588
},
{
"epoch": 1.3501144164759724,
"grad_norm": 2.1235110759735107,
"learning_rate": 8.124321461736655e-06,
"loss": 0.8253368139266968,
"step": 590
},
{
"epoch": 1.3546910755148742,
"grad_norm": 2.607447624206543,
"learning_rate": 8.110466307766845e-06,
"loss": 1.1002779006958008,
"step": 592
},
{
"epoch": 1.3592677345537758,
"grad_norm": 2.3491551876068115,
"learning_rate": 8.096573754698473e-06,
"loss": 1.1305601596832275,
"step": 594
},
{
"epoch": 1.3638443935926774,
"grad_norm": 1.7016195058822632,
"learning_rate": 8.082644001564548e-06,
"loss": 1.1057755947113037,
"step": 596
},
{
"epoch": 1.368421052631579,
"grad_norm": 3.3334386348724365,
"learning_rate": 8.068677247931021e-06,
"loss": 1.200844645500183,
"step": 598
},
{
"epoch": 1.3729977116704806,
"grad_norm": 2.949726104736328,
"learning_rate": 8.054673693893948e-06,
"loss": 1.1354503631591797,
"step": 600
},
{
"epoch": 1.3775743707093822,
"grad_norm": 1.6623036861419678,
"learning_rate": 8.040633540076604e-06,
"loss": 1.0025185346603394,
"step": 602
},
{
"epoch": 1.3821510297482837,
"grad_norm": 1.9049066305160522,
"learning_rate": 8.026556987626606e-06,
"loss": 0.38326674699783325,
"step": 604
},
{
"epoch": 1.3867276887871853,
"grad_norm": 1.9863322973251343,
"learning_rate": 8.012444238213056e-06,
"loss": 1.0257573127746582,
"step": 606
},
{
"epoch": 1.391304347826087,
"grad_norm": 8.692577362060547,
"learning_rate": 7.99829549402362e-06,
"loss": 0.9867333173751831,
"step": 608
},
{
"epoch": 1.3958810068649885,
"grad_norm": 1.6721243858337402,
"learning_rate": 7.984110957761657e-06,
"loss": 0.8433778285980225,
"step": 610
},
{
"epoch": 1.40045766590389,
"grad_norm": 8.466538429260254,
"learning_rate": 7.969890832643296e-06,
"loss": 0.9302389621734619,
"step": 612
},
{
"epoch": 1.4050343249427917,
"grad_norm": 1.992497444152832,
"learning_rate": 7.955635322394543e-06,
"loss": 0.7917460203170776,
"step": 614
},
{
"epoch": 1.4096109839816933,
"grad_norm": 38.11425018310547,
"learning_rate": 7.941344631248343e-06,
"loss": 0.743791937828064,
"step": 616
},
{
"epoch": 1.414187643020595,
"grad_norm": 1.7407969236373901,
"learning_rate": 7.927018963941668e-06,
"loss": 1.0373704433441162,
"step": 618
},
{
"epoch": 1.4187643020594964,
"grad_norm": 1.8311736583709717,
"learning_rate": 7.912658525712582e-06,
"loss": 1.042643427848816,
"step": 620
},
{
"epoch": 1.4233409610983982,
"grad_norm": 4.667344570159912,
"learning_rate": 7.898263522297294e-06,
"loss": 0.8382468223571777,
"step": 622
},
{
"epoch": 1.4279176201372998,
"grad_norm": 1.3862837553024292,
"learning_rate": 7.883834159927212e-06,
"loss": 0.8047330379486084,
"step": 624
},
{
"epoch": 1.4324942791762014,
"grad_norm": 2.3761935234069824,
"learning_rate": 7.869370645326e-06,
"loss": 1.0753339529037476,
"step": 626
},
{
"epoch": 1.437070938215103,
"grad_norm": 10.085698127746582,
"learning_rate": 7.854873185706598e-06,
"loss": 0.7506072521209717,
"step": 628
},
{
"epoch": 1.4416475972540046,
"grad_norm": 8.790916442871094,
"learning_rate": 7.840341988768269e-06,
"loss": 1.1122334003448486,
"step": 630
},
{
"epoch": 1.4462242562929062,
"grad_norm": 3.126335382461548,
"learning_rate": 7.825777262693612e-06,
"loss": 0.5835685133934021,
"step": 632
},
{
"epoch": 1.4508009153318078,
"grad_norm": 2.1324803829193115,
"learning_rate": 7.811179216145588e-06,
"loss": 1.0125725269317627,
"step": 634
},
{
"epoch": 1.4553775743707094,
"grad_norm": 7.664412498474121,
"learning_rate": 7.796548058264525e-06,
"loss": 0.8314673900604248,
"step": 636
},
{
"epoch": 1.459954233409611,
"grad_norm": 1.4939762353897095,
"learning_rate": 7.781883998665126e-06,
"loss": 1.0299837589263916,
"step": 638
},
{
"epoch": 1.4645308924485125,
"grad_norm": 125.39936828613281,
"learning_rate": 7.767187247433459e-06,
"loss": 0.8142813444137573,
"step": 640
},
{
"epoch": 1.4691075514874141,
"grad_norm": 3.3667984008789062,
"learning_rate": 7.752458015123955e-06,
"loss": 0.7184183597564697,
"step": 642
},
{
"epoch": 1.4736842105263157,
"grad_norm": 8.629426002502441,
"learning_rate": 7.737696512756393e-06,
"loss": 0.7312871217727661,
"step": 644
},
{
"epoch": 1.4782608695652173,
"grad_norm": 20.76643943786621,
"learning_rate": 7.722902951812863e-06,
"loss": 0.8988088965415955,
"step": 646
},
{
"epoch": 1.482837528604119,
"grad_norm": 1.4531633853912354,
"learning_rate": 7.70807754423475e-06,
"loss": 0.8973408937454224,
"step": 648
},
{
"epoch": 1.4874141876430205,
"grad_norm": 2.0992820262908936,
"learning_rate": 7.693220502419696e-06,
"loss": 0.918885350227356,
"step": 650
},
{
"epoch": 1.4919908466819223,
"grad_norm": 1.0393502712249756,
"learning_rate": 7.678332039218549e-06,
"loss": 1.0323870182037354,
"step": 652
},
{
"epoch": 1.4965675057208239,
"grad_norm": 2.6336605548858643,
"learning_rate": 7.663412367932315e-06,
"loss": 1.0499887466430664,
"step": 654
},
{
"epoch": 1.5011441647597255,
"grad_norm": 7.964874267578125,
"learning_rate": 7.648461702309116e-06,
"loss": 0.7312684059143066,
"step": 656
},
{
"epoch": 1.505720823798627,
"grad_norm": 3.785597801208496,
"learning_rate": 7.633480256541112e-06,
"loss": 1.2282322645187378,
"step": 658
},
{
"epoch": 1.5102974828375286,
"grad_norm": 2.3747780323028564,
"learning_rate": 7.618468245261436e-06,
"loss": 0.7873207330703735,
"step": 660
},
{
"epoch": 1.5148741418764302,
"grad_norm": 2.2096705436706543,
"learning_rate": 7.603425883541123e-06,
"loss": 0.7644495964050293,
"step": 662
},
{
"epoch": 1.5194508009153318,
"grad_norm": 1.7680861949920654,
"learning_rate": 7.588353386886026e-06,
"loss": 1.152151346206665,
"step": 664
},
{
"epoch": 1.5240274599542334,
"grad_norm": 14.067296028137207,
"learning_rate": 7.573250971233729e-06,
"loss": 0.8791661262512207,
"step": 666
},
{
"epoch": 1.528604118993135,
"grad_norm": 1.3611705303192139,
"learning_rate": 7.5581188529504556e-06,
"loss": 0.40379875898361206,
"step": 668
},
{
"epoch": 1.5331807780320366,
"grad_norm": 1.303026795387268,
"learning_rate": 7.5429572488279615e-06,
"loss": 1.0647927522659302,
"step": 670
},
{
"epoch": 1.5377574370709381,
"grad_norm": 1.101729393005371,
"learning_rate": 7.5277663760804395e-06,
"loss": 1.0676069259643555,
"step": 672
},
{
"epoch": 1.54233409610984,
"grad_norm": 4.634459972381592,
"learning_rate": 7.512546452341402e-06,
"loss": 0.5080143809318542,
"step": 674
},
{
"epoch": 1.5469107551487413,
"grad_norm": 2.555481433868408,
"learning_rate": 7.497297695660558e-06,
"loss": 0.9243414402008057,
"step": 676
},
{
"epoch": 1.5514874141876431,
"grad_norm": 2.7285444736480713,
"learning_rate": 7.482020324500699e-06,
"loss": 0.9102246165275574,
"step": 678
},
{
"epoch": 1.5560640732265445,
"grad_norm": 28.053659439086914,
"learning_rate": 7.466714557734567e-06,
"loss": 0.8449078798294067,
"step": 680
},
{
"epoch": 1.5606407322654463,
"grad_norm": 1.8448493480682373,
"learning_rate": 7.451380614641709e-06,
"loss": 1.1165541410446167,
"step": 682
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.3494691848754883,
"learning_rate": 7.436018714905347e-06,
"loss": 1.132646083831787,
"step": 684
},
{
"epoch": 1.5697940503432495,
"grad_norm": 1.6197850704193115,
"learning_rate": 7.4206290786092305e-06,
"loss": 1.129209280014038,
"step": 686
},
{
"epoch": 1.574370709382151,
"grad_norm": 7.3508620262146,
"learning_rate": 7.405211926234472e-06,
"loss": 0.6533366441726685,
"step": 688
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.5411512851715088,
"learning_rate": 7.389767478656399e-06,
"loss": 0.9583989381790161,
"step": 690
},
{
"epoch": 1.5835240274599542,
"grad_norm": 2.164099931716919,
"learning_rate": 7.374295957141387e-06,
"loss": 0.6867862939834595,
"step": 692
},
{
"epoch": 1.5881006864988558,
"grad_norm": 3.5565309524536133,
"learning_rate": 7.358797583343691e-06,
"loss": 1.251814603805542,
"step": 694
},
{
"epoch": 1.5926773455377574,
"grad_norm": 2.2329952716827393,
"learning_rate": 7.34327257930226e-06,
"loss": 0.7910688519477844,
"step": 696
},
{
"epoch": 1.597254004576659,
"grad_norm": 1.991982102394104,
"learning_rate": 7.327721167437575e-06,
"loss": 1.0751738548278809,
"step": 698
},
{
"epoch": 1.6018306636155606,
"grad_norm": 2.1958391666412354,
"learning_rate": 7.312143570548441e-06,
"loss": 1.0445199012756348,
"step": 700
},
{
"epoch": 1.6064073226544622,
"grad_norm": 1.833368182182312,
"learning_rate": 7.296540011808814e-06,
"loss": 1.0390658378601074,
"step": 702
},
{
"epoch": 1.610983981693364,
"grad_norm": 5.710247039794922,
"learning_rate": 7.280910714764584e-06,
"loss": 0.8262543678283691,
"step": 704
},
{
"epoch": 1.6155606407322654,
"grad_norm": 1.8299520015716553,
"learning_rate": 7.2652559033303974e-06,
"loss": 1.061065435409546,
"step": 706
},
{
"epoch": 1.6201372997711672,
"grad_norm": 2.613022565841675,
"learning_rate": 7.249575801786421e-06,
"loss": 1.0792577266693115,
"step": 708
},
{
"epoch": 1.6247139588100685,
"grad_norm": 1.213928461074829,
"learning_rate": 7.233870634775153e-06,
"loss": 0.6793702840805054,
"step": 710
},
{
"epoch": 1.6292906178489703,
"grad_norm": 2.241819143295288,
"learning_rate": 7.218140627298192e-06,
"loss": 0.7657841444015503,
"step": 712
},
{
"epoch": 1.6338672768878717,
"grad_norm": 2.9433786869049072,
"learning_rate": 7.202386004713008e-06,
"loss": 0.9588929414749146,
"step": 714
},
{
"epoch": 1.6384439359267735,
"grad_norm": 4.815126419067383,
"learning_rate": 7.1866069927297366e-06,
"loss": 0.5673887729644775,
"step": 716
},
{
"epoch": 1.643020594965675,
"grad_norm": 5.819674968719482,
"learning_rate": 7.170803817407917e-06,
"loss": 0.895261287689209,
"step": 718
},
{
"epoch": 1.6475972540045767,
"grad_norm": 1.3659911155700684,
"learning_rate": 7.154976705153274e-06,
"loss": 0.9793621301651001,
"step": 720
},
{
"epoch": 1.6521739130434783,
"grad_norm": 14.497611999511719,
"learning_rate": 7.139125882714465e-06,
"loss": 0.8763951063156128,
"step": 722
},
{
"epoch": 1.6567505720823799,
"grad_norm": 2.1780354976654053,
"learning_rate": 7.123251577179834e-06,
"loss": 0.7888288497924805,
"step": 724
},
{
"epoch": 1.6613272311212814,
"grad_norm": 1.8186107873916626,
"learning_rate": 7.107354015974156e-06,
"loss": 0.7892118096351624,
"step": 726
},
{
"epoch": 1.665903890160183,
"grad_norm": 5.142183303833008,
"learning_rate": 7.091433426855387e-06,
"loss": 0.8022271394729614,
"step": 728
},
{
"epoch": 1.6704805491990846,
"grad_norm": 2.999373435974121,
"learning_rate": 7.075490037911384e-06,
"loss": 0.48615947365760803,
"step": 730
},
{
"epoch": 1.6750572082379862,
"grad_norm": 13.79496955871582,
"learning_rate": 7.059524077556659e-06,
"loss": 0.46404361724853516,
"step": 732
},
{
"epoch": 1.679633867276888,
"grad_norm": 1.3626662492752075,
"learning_rate": 7.043535774529088e-06,
"loss": 0.9120252132415771,
"step": 734
},
{
"epoch": 1.6842105263157894,
"grad_norm": 2.496614933013916,
"learning_rate": 7.027525357886644e-06,
"loss": 0.6731216311454773,
"step": 736
},
{
"epoch": 1.6887871853546912,
"grad_norm": 3.0987555980682373,
"learning_rate": 7.011493057004113e-06,
"loss": 0.6688947677612305,
"step": 738
},
{
"epoch": 1.6933638443935926,
"grad_norm": 8.601667404174805,
"learning_rate": 6.995439101569808e-06,
"loss": 1.0317034721374512,
"step": 740
},
{
"epoch": 1.6979405034324944,
"grad_norm": 10.779264450073242,
"learning_rate": 6.9793637215822755e-06,
"loss": 0.8653741478919983,
"step": 742
},
{
"epoch": 1.7025171624713957,
"grad_norm": 2.992541790008545,
"learning_rate": 6.963267147347007e-06,
"loss": 0.9310321807861328,
"step": 744
},
{
"epoch": 1.7070938215102975,
"grad_norm": 12.640564918518066,
"learning_rate": 6.947149609473134e-06,
"loss": 0.6817935109138489,
"step": 746
},
{
"epoch": 1.7116704805491991,
"grad_norm": 1.5293439626693726,
"learning_rate": 6.931011338870123e-06,
"loss": 1.009579062461853,
"step": 748
},
{
"epoch": 1.7162471395881007,
"grad_norm": 1.235052227973938,
"learning_rate": 6.914852566744472e-06,
"loss": 0.5917520523071289,
"step": 750
},
{
"epoch": 1.7208237986270023,
"grad_norm": 3.1631014347076416,
"learning_rate": 6.8986735245963965e-06,
"loss": 1.1671645641326904,
"step": 752
},
{
"epoch": 1.7254004576659039,
"grad_norm": 11.111958503723145,
"learning_rate": 6.8824744442165124e-06,
"loss": 1.0235098600387573,
"step": 754
},
{
"epoch": 1.7299771167048055,
"grad_norm": 4.900933742523193,
"learning_rate": 6.866255557682513e-06,
"loss": 1.1278393268585205,
"step": 756
},
{
"epoch": 1.734553775743707,
"grad_norm": 1.4825588464736938,
"learning_rate": 6.850017097355852e-06,
"loss": 0.6875651478767395,
"step": 758
},
{
"epoch": 1.7391304347826086,
"grad_norm": 6.850196361541748,
"learning_rate": 6.833759295878403e-06,
"loss": 0.9467449188232422,
"step": 760
},
{
"epoch": 1.7437070938215102,
"grad_norm": 5.623671054840088,
"learning_rate": 6.817482386169131e-06,
"loss": 0.9277236461639404,
"step": 762
},
{
"epoch": 1.748283752860412,
"grad_norm": 1.6635899543762207,
"learning_rate": 6.801186601420766e-06,
"loss": 1.0011539459228516,
"step": 764
},
{
"epoch": 1.7528604118993134,
"grad_norm": 2.057003974914551,
"learning_rate": 6.7848721750964444e-06,
"loss": 1.0401999950408936,
"step": 766
},
{
"epoch": 1.7574370709382152,
"grad_norm": 1.9083833694458008,
"learning_rate": 6.768539340926376e-06,
"loss": 0.8578721284866333,
"step": 768
},
{
"epoch": 1.7620137299771166,
"grad_norm": 6.834735870361328,
"learning_rate": 6.752188332904495e-06,
"loss": 0.4599095582962036,
"step": 770
},
{
"epoch": 1.7665903890160184,
"grad_norm": 2.0236639976501465,
"learning_rate": 6.7358193852851006e-06,
"loss": 1.0432019233703613,
"step": 772
},
{
"epoch": 1.7711670480549198,
"grad_norm": 2.964568614959717,
"learning_rate": 6.719432732579509e-06,
"loss": 1.0962594747543335,
"step": 774
},
{
"epoch": 1.7757437070938216,
"grad_norm": 2.739694833755493,
"learning_rate": 6.7030286095526855e-06,
"loss": 1.00520658493042,
"step": 776
},
{
"epoch": 1.7803203661327232,
"grad_norm": 2.45100474357605,
"learning_rate": 6.6866072512198895e-06,
"loss": 0.8793066143989563,
"step": 778
},
{
"epoch": 1.7848970251716247,
"grad_norm": 2.1978166103363037,
"learning_rate": 6.670168892843304e-06,
"loss": 1.0643588304519653,
"step": 780
},
{
"epoch": 1.7894736842105263,
"grad_norm": 2.2212750911712646,
"learning_rate": 6.653713769928664e-06,
"loss": 0.9725496172904968,
"step": 782
},
{
"epoch": 1.794050343249428,
"grad_norm": 12.925155639648438,
"learning_rate": 6.6372421182218806e-06,
"loss": 0.5852289199829102,
"step": 784
},
{
"epoch": 1.7986270022883295,
"grad_norm": 2.02931547164917,
"learning_rate": 6.620754173705669e-06,
"loss": 0.4103405475616455,
"step": 786
},
{
"epoch": 1.803203661327231,
"grad_norm": 6.498274803161621,
"learning_rate": 6.604250172596166e-06,
"loss": 0.7299265265464783,
"step": 788
},
{
"epoch": 1.8077803203661327,
"grad_norm": 0.8046561479568481,
"learning_rate": 6.587730351339542e-06,
"loss": 0.5971012115478516,
"step": 790
},
{
"epoch": 1.8123569794050343,
"grad_norm": 2.3184573650360107,
"learning_rate": 6.571194946608615e-06,
"loss": 0.5819271802902222,
"step": 792
},
{
"epoch": 1.816933638443936,
"grad_norm": 2.6694231033325195,
"learning_rate": 6.554644195299467e-06,
"loss": 1.0282055139541626,
"step": 794
},
{
"epoch": 1.8215102974828374,
"grad_norm": 2.3228371143341064,
"learning_rate": 6.53807833452804e-06,
"loss": 0.8639980554580688,
"step": 796
},
{
"epoch": 1.8260869565217392,
"grad_norm": 3.077423095703125,
"learning_rate": 6.521497601626742e-06,
"loss": 1.0374181270599365,
"step": 798
},
{
"epoch": 1.8306636155606406,
"grad_norm": 3.502397060394287,
"learning_rate": 6.504902234141052e-06,
"loss": 0.8395485877990723,
"step": 800
},
{
"epoch": 1.8352402745995424,
"grad_norm": 1.694636583328247,
"learning_rate": 6.4882924698261086e-06,
"loss": 0.8845337629318237,
"step": 802
},
{
"epoch": 1.8398169336384438,
"grad_norm": 1.3276764154434204,
"learning_rate": 6.4716685466433125e-06,
"loss": 0.7552372217178345,
"step": 804
},
{
"epoch": 1.8443935926773456,
"grad_norm": 3.640122652053833,
"learning_rate": 6.455030702756909e-06,
"loss": 0.7707520723342896,
"step": 806
},
{
"epoch": 1.8489702517162472,
"grad_norm": 1.5462133884429932,
"learning_rate": 6.438379176530581e-06,
"loss": 0.9930239915847778,
"step": 808
},
{
"epoch": 1.8535469107551488,
"grad_norm": 1.9529536962509155,
"learning_rate": 6.421714206524032e-06,
"loss": 1.040754795074463,
"step": 810
},
{
"epoch": 1.8581235697940504,
"grad_norm": 3.2038698196411133,
"learning_rate": 6.405036031489573e-06,
"loss": 0.9112997055053711,
"step": 812
},
{
"epoch": 1.862700228832952,
"grad_norm": 2.7182154655456543,
"learning_rate": 6.3883448903686926e-06,
"loss": 1.0292009115219116,
"step": 814
},
{
"epoch": 1.8672768878718535,
"grad_norm": 1.7210997343063354,
"learning_rate": 6.371641022288642e-06,
"loss": 0.7663242816925049,
"step": 816
},
{
"epoch": 1.8718535469107551,
"grad_norm": 1.3438034057617188,
"learning_rate": 6.354924666559007e-06,
"loss": 0.9235577583312988,
"step": 818
},
{
"epoch": 1.8764302059496567,
"grad_norm": 6.343740940093994,
"learning_rate": 6.338196062668276e-06,
"loss": 0.9253222942352295,
"step": 820
},
{
"epoch": 1.8810068649885583,
"grad_norm": 10.784764289855957,
"learning_rate": 6.321455450280417e-06,
"loss": 1.0267045497894287,
"step": 822
},
{
"epoch": 1.88558352402746,
"grad_norm": 4.409283638000488,
"learning_rate": 6.304703069231434e-06,
"loss": 1.1460933685302734,
"step": 824
},
{
"epoch": 1.8901601830663615,
"grad_norm": 6.675430774688721,
"learning_rate": 6.287939159525939e-06,
"loss": 0.9188438653945923,
"step": 826
},
{
"epoch": 1.8947368421052633,
"grad_norm": 4.170138359069824,
"learning_rate": 6.271163961333706e-06,
"loss": 0.858944833278656,
"step": 828
},
{
"epoch": 1.8993135011441646,
"grad_norm": 1.5649385452270508,
"learning_rate": 6.25437771498624e-06,
"loss": 0.9133040904998779,
"step": 830
},
{
"epoch": 1.9038901601830664,
"grad_norm": 2.2200839519500732,
"learning_rate": 6.237580660973328e-06,
"loss": 1.1158314943313599,
"step": 832
},
{
"epoch": 1.9084668192219678,
"grad_norm": 8.802407264709473,
"learning_rate": 6.220773039939592e-06,
"loss": 0.8092963695526123,
"step": 834
},
{
"epoch": 1.9130434782608696,
"grad_norm": 3.916245222091675,
"learning_rate": 6.20395509268104e-06,
"loss": 0.9357765913009644,
"step": 836
},
{
"epoch": 1.9176201372997712,
"grad_norm": 2.1684670448303223,
"learning_rate": 6.1871270601416255e-06,
"loss": 1.0700328350067139,
"step": 838
},
{
"epoch": 1.9221967963386728,
"grad_norm": 1.537369728088379,
"learning_rate": 6.170289183409789e-06,
"loss": 1.0650542974472046,
"step": 840
},
{
"epoch": 1.9267734553775744,
"grad_norm": 1.5754413604736328,
"learning_rate": 6.153441703715e-06,
"loss": 1.18636155128479,
"step": 842
},
{
"epoch": 1.931350114416476,
"grad_norm": 2.923276901245117,
"learning_rate": 6.136584862424313e-06,
"loss": 1.0751769542694092,
"step": 844
},
{
"epoch": 1.9359267734553776,
"grad_norm": 2.0239417552948,
"learning_rate": 6.119718901038898e-06,
"loss": 1.0461246967315674,
"step": 846
},
{
"epoch": 1.9405034324942791,
"grad_norm": 5.281694412231445,
"learning_rate": 6.102844061190582e-06,
"loss": 0.9867255091667175,
"step": 848
},
{
"epoch": 1.9450800915331807,
"grad_norm": 2.6705641746520996,
"learning_rate": 6.0859605846383986e-06,
"loss": 1.0285825729370117,
"step": 850
},
{
"epoch": 1.9496567505720823,
"grad_norm": 2.842245578765869,
"learning_rate": 6.069068713265107e-06,
"loss": 0.8631390929222107,
"step": 852
},
{
"epoch": 1.9542334096109841,
"grad_norm": 12.086994171142578,
"learning_rate": 6.05216868907374e-06,
"loss": 0.9532708525657654,
"step": 854
},
{
"epoch": 1.9588100686498855,
"grad_norm": 6.184656620025635,
"learning_rate": 6.035260754184133e-06,
"loss": 0.7836180925369263,
"step": 856
},
{
"epoch": 1.9633867276887873,
"grad_norm": 0.9443724751472473,
"learning_rate": 6.0183451508294555e-06,
"loss": 0.505516767501831,
"step": 858
},
{
"epoch": 1.9679633867276887,
"grad_norm": 4.477334499359131,
"learning_rate": 6.001422121352736e-06,
"loss": 0.7708160877227783,
"step": 860
},
{
"epoch": 1.9725400457665905,
"grad_norm": 3.583055257797241,
"learning_rate": 5.984491908203398e-06,
"loss": 0.8886803388595581,
"step": 862
},
{
"epoch": 1.9771167048054918,
"grad_norm": 3.272289991378784,
"learning_rate": 5.96755475393378e-06,
"loss": 1.0874934196472168,
"step": 864
},
{
"epoch": 1.9816933638443937,
"grad_norm": 5.9545979499816895,
"learning_rate": 5.950610901195664e-06,
"loss": 0.8436876535415649,
"step": 866
},
{
"epoch": 1.9862700228832952,
"grad_norm": 3.9929182529449463,
"learning_rate": 5.933660592736798e-06,
"loss": 1.1498433351516724,
"step": 868
},
{
"epoch": 1.9908466819221968,
"grad_norm": 1.720807433128357,
"learning_rate": 5.9167040713974224e-06,
"loss": 1.0520302057266235,
"step": 870
},
{
"epoch": 1.9954233409610984,
"grad_norm": 1.747268557548523,
"learning_rate": 5.89974158010678e-06,
"loss": 0.8821731805801392,
"step": 872
},
{
"epoch": 2.0,
"grad_norm": 3.795553207397461,
"learning_rate": 5.8827733618796455e-06,
"loss": 0.7908928990364075,
"step": 874
},
{
"epoch": 2.004576659038902,
"grad_norm": 1.620730996131897,
"learning_rate": 5.865799659812846e-06,
"loss": 0.945500910282135,
"step": 876
},
{
"epoch": 2.009153318077803,
"grad_norm": 6.447601795196533,
"learning_rate": 5.848820717081767e-06,
"loss": 0.8182247877120972,
"step": 878
},
{
"epoch": 2.013729977116705,
"grad_norm": 1.7102856636047363,
"learning_rate": 5.831836776936876e-06,
"loss": 0.7212214469909668,
"step": 880
},
{
"epoch": 2.0183066361556063,
"grad_norm": 2.019994020462036,
"learning_rate": 5.81484808270024e-06,
"loss": 0.6066313982009888,
"step": 882
},
{
"epoch": 2.022883295194508,
"grad_norm": 11.098702430725098,
"learning_rate": 5.79785487776203e-06,
"loss": 0.5976270437240601,
"step": 884
},
{
"epoch": 2.0274599542334095,
"grad_norm": 2.936749219894409,
"learning_rate": 5.780857405577048e-06,
"loss": 0.7158992290496826,
"step": 886
},
{
"epoch": 2.0320366132723113,
"grad_norm": 0.6403222680091858,
"learning_rate": 5.7638559096612244e-06,
"loss": 0.5098772644996643,
"step": 888
},
{
"epoch": 2.0366132723112127,
"grad_norm": 6.3164143562316895,
"learning_rate": 5.746850633588138e-06,
"loss": 0.5138643980026245,
"step": 890
},
{
"epoch": 2.0411899313501145,
"grad_norm": 5.037423610687256,
"learning_rate": 5.729841820985525e-06,
"loss": 0.7996326684951782,
"step": 892
},
{
"epoch": 2.045766590389016,
"grad_norm": 4.635165691375732,
"learning_rate": 5.712829715531787e-06,
"loss": 0.5906950235366821,
"step": 894
},
{
"epoch": 2.0503432494279177,
"grad_norm": 4.191209316253662,
"learning_rate": 5.6958145609525005e-06,
"loss": 0.9277024269104004,
"step": 896
},
{
"epoch": 2.054919908466819,
"grad_norm": 2.6020865440368652,
"learning_rate": 5.67879660101693e-06,
"loss": 0.4953917860984802,
"step": 898
},
{
"epoch": 2.059496567505721,
"grad_norm": 8.963276863098145,
"learning_rate": 5.661776079534526e-06,
"loss": 0.7233725786209106,
"step": 900
},
{
"epoch": 2.064073226544622,
"grad_norm": 0.6874380111694336,
"learning_rate": 5.644753240351439e-06,
"loss": 0.34285467863082886,
"step": 902
},
{
"epoch": 2.068649885583524,
"grad_norm": 3.8761210441589355,
"learning_rate": 5.6277283273470255e-06,
"loss": 0.7474602460861206,
"step": 904
},
{
"epoch": 2.073226544622426,
"grad_norm": 4.701927185058594,
"learning_rate": 5.6107015844303505e-06,
"loss": 0.5753633975982666,
"step": 906
},
{
"epoch": 2.077803203661327,
"grad_norm": 3.259704113006592,
"learning_rate": 5.593673255536696e-06,
"loss": 0.6351042985916138,
"step": 908
},
{
"epoch": 2.082379862700229,
"grad_norm": 5.490569114685059,
"learning_rate": 5.5766435846240674e-06,
"loss": 0.8573955297470093,
"step": 910
},
{
"epoch": 2.0869565217391304,
"grad_norm": 2.4090561866760254,
"learning_rate": 5.559612815669697e-06,
"loss": 0.36256104707717896,
"step": 912
},
{
"epoch": 2.091533180778032,
"grad_norm": 2.0318257808685303,
"learning_rate": 5.5425811926665426e-06,
"loss": 0.6678656339645386,
"step": 914
},
{
"epoch": 2.0961098398169336,
"grad_norm": 2.644015073776245,
"learning_rate": 5.525548959619807e-06,
"loss": 0.8250109553337097,
"step": 916
},
{
"epoch": 2.1006864988558354,
"grad_norm": 2.946031332015991,
"learning_rate": 5.508516360543424e-06,
"loss": 0.6581653356552124,
"step": 918
},
{
"epoch": 2.1052631578947367,
"grad_norm": 3.0805141925811768,
"learning_rate": 5.491483639456577e-06,
"loss": 0.6303462982177734,
"step": 920
},
{
"epoch": 2.1098398169336385,
"grad_norm": 3.5895307064056396,
"learning_rate": 5.474451040380194e-06,
"loss": 0.6395201683044434,
"step": 922
},
{
"epoch": 2.11441647597254,
"grad_norm": 6.964833736419678,
"learning_rate": 5.457418807333458e-06,
"loss": 0.5622954368591309,
"step": 924
},
{
"epoch": 2.1189931350114417,
"grad_norm": 1.2632827758789062,
"learning_rate": 5.440387184330306e-06,
"loss": 0.6777645349502563,
"step": 926
},
{
"epoch": 2.123569794050343,
"grad_norm": 1.7766778469085693,
"learning_rate": 5.423356415375933e-06,
"loss": 0.5913289785385132,
"step": 928
},
{
"epoch": 2.128146453089245,
"grad_norm": 4.999666213989258,
"learning_rate": 5.406326744463305e-06,
"loss": 0.4775615930557251,
"step": 930
},
{
"epoch": 2.1327231121281462,
"grad_norm": 4.9804229736328125,
"learning_rate": 5.389298415569653e-06,
"loss": 0.5832971334457397,
"step": 932
},
{
"epoch": 2.137299771167048,
"grad_norm": 2.392467975616455,
"learning_rate": 5.372271672652978e-06,
"loss": 0.8872381448745728,
"step": 934
},
{
"epoch": 2.14187643020595,
"grad_norm": 1.6833393573760986,
"learning_rate": 5.355246759648563e-06,
"loss": 0.770750105381012,
"step": 936
},
{
"epoch": 2.1464530892448512,
"grad_norm": 4.981933116912842,
"learning_rate": 5.338223920465476e-06,
"loss": 0.46837982535362244,
"step": 938
},
{
"epoch": 2.151029748283753,
"grad_norm": 2.7053420543670654,
"learning_rate": 5.321203398983071e-06,
"loss": 0.7598111629486084,
"step": 940
},
{
"epoch": 2.1556064073226544,
"grad_norm": 19.10597801208496,
"learning_rate": 5.3041854390475e-06,
"loss": 0.653712272644043,
"step": 942
},
{
"epoch": 2.160183066361556,
"grad_norm": 6.7961201667785645,
"learning_rate": 5.287170284468216e-06,
"loss": 0.6563678979873657,
"step": 944
},
{
"epoch": 2.1647597254004576,
"grad_norm": 1.762424111366272,
"learning_rate": 5.2701581790144775e-06,
"loss": 0.6890889406204224,
"step": 946
},
{
"epoch": 2.1693363844393594,
"grad_norm": 3.5865209102630615,
"learning_rate": 5.253149366411864e-06,
"loss": 0.620913028717041,
"step": 948
},
{
"epoch": 2.1739130434782608,
"grad_norm": 2.4212989807128906,
"learning_rate": 5.236144090338777e-06,
"loss": 0.9455079436302185,
"step": 950
},
{
"epoch": 2.1784897025171626,
"grad_norm": 2.00994610786438,
"learning_rate": 5.219142594422953e-06,
"loss": 0.9673428535461426,
"step": 952
},
{
"epoch": 2.183066361556064,
"grad_norm": 1.9694055318832397,
"learning_rate": 5.20214512223797e-06,
"loss": 0.8950019478797913,
"step": 954
},
{
"epoch": 2.1876430205949657,
"grad_norm": 2.0830254554748535,
"learning_rate": 5.185151917299762e-06,
"loss": 0.832808256149292,
"step": 956
},
{
"epoch": 2.192219679633867,
"grad_norm": 2.595118761062622,
"learning_rate": 5.168163223063125e-06,
"loss": 0.6324760913848877,
"step": 958
},
{
"epoch": 2.196796338672769,
"grad_norm": 1.7887650728225708,
"learning_rate": 5.151179282918234e-06,
"loss": 0.8795846700668335,
"step": 960
},
{
"epoch": 2.2013729977116703,
"grad_norm": 2.8889565467834473,
"learning_rate": 5.134200340187155e-06,
"loss": 0.6858696341514587,
"step": 962
},
{
"epoch": 2.205949656750572,
"grad_norm": 1.9928938150405884,
"learning_rate": 5.117226638120356e-06,
"loss": 0.8538745045661926,
"step": 964
},
{
"epoch": 2.2105263157894735,
"grad_norm": 4.457608699798584,
"learning_rate": 5.100258419893223e-06,
"loss": 0.8814704418182373,
"step": 966
},
{
"epoch": 2.2151029748283753,
"grad_norm": 8.807114601135254,
"learning_rate": 5.083295928602581e-06,
"loss": 0.6599367260932922,
"step": 968
},
{
"epoch": 2.219679633867277,
"grad_norm": 4.574615955352783,
"learning_rate": 5.066339407263203e-06,
"loss": 0.8049777746200562,
"step": 970
},
{
"epoch": 2.2242562929061784,
"grad_norm": 1.0171655416488647,
"learning_rate": 5.049389098804337e-06,
"loss": 0.41160112619400024,
"step": 972
},
{
"epoch": 2.2288329519450802,
"grad_norm": 2.5793819427490234,
"learning_rate": 5.032445246066223e-06,
"loss": 0.9556428790092468,
"step": 974
},
{
"epoch": 2.2334096109839816,
"grad_norm": 14.489500999450684,
"learning_rate": 5.0155080917966035e-06,
"loss": 0.846240758895874,
"step": 976
},
{
"epoch": 2.2379862700228834,
"grad_norm": 1.3771382570266724,
"learning_rate": 4.998577878647265e-06,
"loss": 1.0038063526153564,
"step": 978
},
{
"epoch": 2.242562929061785,
"grad_norm": 2.4952244758605957,
"learning_rate": 4.981654849170546e-06,
"loss": 0.8787197470664978,
"step": 980
},
{
"epoch": 2.2471395881006866,
"grad_norm": 1.98478102684021,
"learning_rate": 4.964739245815867e-06,
"loss": 0.6971051692962646,
"step": 982
},
{
"epoch": 2.251716247139588,
"grad_norm": 10.939866065979004,
"learning_rate": 4.947831310926261e-06,
"loss": 0.8242970705032349,
"step": 984
},
{
"epoch": 2.2562929061784898,
"grad_norm": 3.1165544986724854,
"learning_rate": 4.930931286734896e-06,
"loss": 0.7920933961868286,
"step": 986
},
{
"epoch": 2.260869565217391,
"grad_norm": 4.3943095207214355,
"learning_rate": 4.914039415361604e-06,
"loss": 0.6728335618972778,
"step": 988
},
{
"epoch": 2.265446224256293,
"grad_norm": 4.197206497192383,
"learning_rate": 4.897155938809418e-06,
"loss": 0.5807865858078003,
"step": 990
},
{
"epoch": 2.2700228832951943,
"grad_norm": 4.6128363609313965,
"learning_rate": 4.880281098961104e-06,
"loss": 0.6835315227508545,
"step": 992
},
{
"epoch": 2.274599542334096,
"grad_norm": 3.85286808013916,
"learning_rate": 4.863415137575688e-06,
"loss": 0.8232854008674622,
"step": 994
},
{
"epoch": 2.279176201372998,
"grad_norm": 2.8220767974853516,
"learning_rate": 4.846558296285e-06,
"loss": 0.7026492357254028,
"step": 996
},
{
"epoch": 2.2837528604118993,
"grad_norm": 1.7302947044372559,
"learning_rate": 4.829710816590214e-06,
"loss": 0.8062256574630737,
"step": 998
},
{
"epoch": 2.288329519450801,
"grad_norm": 3.6414108276367188,
"learning_rate": 4.812872939858375e-06,
"loss": 0.45558106899261475,
"step": 1000
},
{
"epoch": 2.2929061784897025,
"grad_norm": 6.842305660247803,
"learning_rate": 4.796044907318961e-06,
"loss": 0.6691060066223145,
"step": 1002
},
{
"epoch": 2.2974828375286043,
"grad_norm": 2.9899022579193115,
"learning_rate": 4.7792269600604115e-06,
"loss": 0.8572442531585693,
"step": 1004
},
{
"epoch": 2.3020594965675056,
"grad_norm": 1.9516880512237549,
"learning_rate": 4.7624193390266725e-06,
"loss": 0.883170485496521,
"step": 1006
},
{
"epoch": 2.3066361556064074,
"grad_norm": 10.57199478149414,
"learning_rate": 4.74562228501376e-06,
"loss": 0.5891278982162476,
"step": 1008
},
{
"epoch": 2.311212814645309,
"grad_norm": 7.4861602783203125,
"learning_rate": 4.7288360386662965e-06,
"loss": 0.5752084255218506,
"step": 1010
},
{
"epoch": 2.3157894736842106,
"grad_norm": 4.013728141784668,
"learning_rate": 4.7120608404740644e-06,
"loss": 0.6811657547950745,
"step": 1012
},
{
"epoch": 2.320366132723112,
"grad_norm": 2.6324877738952637,
"learning_rate": 4.695296930768567e-06,
"loss": 0.8065996170043945,
"step": 1014
},
{
"epoch": 2.324942791762014,
"grad_norm": 8.653948783874512,
"learning_rate": 4.678544549719585e-06,
"loss": 1.0626581907272339,
"step": 1016
},
{
"epoch": 2.329519450800915,
"grad_norm": 2.111722707748413,
"learning_rate": 4.6618039373317245e-06,
"loss": 0.6138767004013062,
"step": 1018
},
{
"epoch": 2.334096109839817,
"grad_norm": 2.4597508907318115,
"learning_rate": 4.645075333440995e-06,
"loss": 0.7229528427124023,
"step": 1020
},
{
"epoch": 2.3386727688787188,
"grad_norm": 2.4709105491638184,
"learning_rate": 4.6283589777113605e-06,
"loss": 0.8089841604232788,
"step": 1022
},
{
"epoch": 2.34324942791762,
"grad_norm": 3.309113025665283,
"learning_rate": 4.611655109631309e-06,
"loss": 0.8193925023078918,
"step": 1024
},
{
"epoch": 2.3478260869565215,
"grad_norm": 1.6609846353530884,
"learning_rate": 4.594963968510428e-06,
"loss": 0.9571186900138855,
"step": 1026
},
{
"epoch": 2.3524027459954233,
"grad_norm": 3.329925537109375,
"learning_rate": 4.578285793475969e-06,
"loss": 0.6692005395889282,
"step": 1028
},
{
"epoch": 2.356979405034325,
"grad_norm": 0.6110982894897461,
"learning_rate": 4.56162082346942e-06,
"loss": 0.5079280138015747,
"step": 1030
},
{
"epoch": 2.3615560640732265,
"grad_norm": 5.090636253356934,
"learning_rate": 4.544969297243091e-06,
"loss": 0.32914191484451294,
"step": 1032
},
{
"epoch": 2.3661327231121283,
"grad_norm": 1.61818528175354,
"learning_rate": 4.528331453356689e-06,
"loss": 0.7983392477035522,
"step": 1034
},
{
"epoch": 2.3707093821510297,
"grad_norm": 1.8456085920333862,
"learning_rate": 4.511707530173892e-06,
"loss": 0.6458148956298828,
"step": 1036
},
{
"epoch": 2.3752860411899315,
"grad_norm": 3.5649495124816895,
"learning_rate": 4.495097765858949e-06,
"loss": 1.0234034061431885,
"step": 1038
},
{
"epoch": 2.379862700228833,
"grad_norm": 2.833116292953491,
"learning_rate": 4.47850239837326e-06,
"loss": 0.875385046005249,
"step": 1040
},
{
"epoch": 2.3844393592677346,
"grad_norm": 2.9960877895355225,
"learning_rate": 4.461921665471962e-06,
"loss": 0.65616375207901,
"step": 1042
},
{
"epoch": 2.389016018306636,
"grad_norm": 5.081766128540039,
"learning_rate": 4.445355804700533e-06,
"loss": 0.838226318359375,
"step": 1044
},
{
"epoch": 2.393592677345538,
"grad_norm": 5.546652793884277,
"learning_rate": 4.428805053391386e-06,
"loss": 0.39645886421203613,
"step": 1046
},
{
"epoch": 2.398169336384439,
"grad_norm": 3.0292372703552246,
"learning_rate": 4.41226964866046e-06,
"loss": 0.7210261821746826,
"step": 1048
},
{
"epoch": 2.402745995423341,
"grad_norm": 4.103295803070068,
"learning_rate": 4.395749827403835e-06,
"loss": 0.8829662203788757,
"step": 1050
},
{
"epoch": 2.4073226544622424,
"grad_norm": 6.127950668334961,
"learning_rate": 4.3792458262943324e-06,
"loss": 0.26367780566215515,
"step": 1052
},
{
"epoch": 2.411899313501144,
"grad_norm": 1.9098131656646729,
"learning_rate": 4.362757881778122e-06,
"loss": 0.7983935475349426,
"step": 1054
},
{
"epoch": 2.416475972540046,
"grad_norm": 1.961759328842163,
"learning_rate": 4.346286230071337e-06,
"loss": 0.8044694066047668,
"step": 1056
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.8351384401321411,
"learning_rate": 4.329831107156698e-06,
"loss": 0.4787551164627075,
"step": 1058
},
{
"epoch": 2.425629290617849,
"grad_norm": 2.753488779067993,
"learning_rate": 4.313392748780112e-06,
"loss": 0.7191129922866821,
"step": 1060
},
{
"epoch": 2.4302059496567505,
"grad_norm": 2.585822582244873,
"learning_rate": 4.296971390447317e-06,
"loss": 0.8327975273132324,
"step": 1062
},
{
"epoch": 2.4347826086956523,
"grad_norm": 2.122526168823242,
"learning_rate": 4.2805672674204935e-06,
"loss": 0.5977469086647034,
"step": 1064
},
{
"epoch": 2.4393592677345537,
"grad_norm": 2.236255645751953,
"learning_rate": 4.264180614714901e-06,
"loss": 0.7535611987113953,
"step": 1066
},
{
"epoch": 2.4439359267734555,
"grad_norm": 2.458059787750244,
"learning_rate": 4.247811667095506e-06,
"loss": 0.752835750579834,
"step": 1068
},
{
"epoch": 2.448512585812357,
"grad_norm": 1.7041510343551636,
"learning_rate": 4.2314606590736256e-06,
"loss": 0.463468998670578,
"step": 1070
},
{
"epoch": 2.4530892448512587,
"grad_norm": 1.9309899806976318,
"learning_rate": 4.215127824903558e-06,
"loss": 0.7822204232215881,
"step": 1072
},
{
"epoch": 2.45766590389016,
"grad_norm": 2.6464591026306152,
"learning_rate": 4.198813398579236e-06,
"loss": 0.6719791293144226,
"step": 1074
},
{
"epoch": 2.462242562929062,
"grad_norm": 2.6165504455566406,
"learning_rate": 4.1825176138308695e-06,
"loss": 0.7982878684997559,
"step": 1076
},
{
"epoch": 2.466819221967963,
"grad_norm": 3.580535411834717,
"learning_rate": 4.1662407041215995e-06,
"loss": 0.41490495204925537,
"step": 1078
},
{
"epoch": 2.471395881006865,
"grad_norm": 2.3776938915252686,
"learning_rate": 4.14998290264415e-06,
"loss": 0.5821819305419922,
"step": 1080
},
{
"epoch": 2.475972540045767,
"grad_norm": 2.160568952560425,
"learning_rate": 4.133744442317487e-06,
"loss": 0.8048266768455505,
"step": 1082
},
{
"epoch": 2.480549199084668,
"grad_norm": 16.509410858154297,
"learning_rate": 4.117525555783489e-06,
"loss": 0.9339464902877808,
"step": 1084
},
{
"epoch": 2.4851258581235696,
"grad_norm": 5.036529064178467,
"learning_rate": 4.101326475403604e-06,
"loss": 0.5219019651412964,
"step": 1086
},
{
"epoch": 2.4897025171624714,
"grad_norm": 8.633739471435547,
"learning_rate": 4.08514743325553e-06,
"loss": 0.8041296601295471,
"step": 1088
},
{
"epoch": 2.494279176201373,
"grad_norm": 7.70281457901001,
"learning_rate": 4.068988661129879e-06,
"loss": 0.8011189699172974,
"step": 1090
},
{
"epoch": 2.4988558352402745,
"grad_norm": 3.9831337928771973,
"learning_rate": 4.052850390526868e-06,
"loss": 0.6868438720703125,
"step": 1092
},
{
"epoch": 2.5034324942791764,
"grad_norm": 2.012017250061035,
"learning_rate": 4.036732852652995e-06,
"loss": 1.0341452360153198,
"step": 1094
},
{
"epoch": 2.5080091533180777,
"grad_norm": 2.237452268600464,
"learning_rate": 4.020636278417727e-06,
"loss": 0.8562324047088623,
"step": 1096
},
{
"epoch": 2.5125858123569795,
"grad_norm": 2.1376569271087646,
"learning_rate": 4.0045608984301945e-06,
"loss": 0.7588418126106262,
"step": 1098
},
{
"epoch": 2.517162471395881,
"grad_norm": 3.0613656044006348,
"learning_rate": 3.98850694299589e-06,
"loss": 0.7711232900619507,
"step": 1100
},
{
"epoch": 2.5217391304347827,
"grad_norm": 2.243285655975342,
"learning_rate": 3.972474642113357e-06,
"loss": 0.7679527401924133,
"step": 1102
},
{
"epoch": 2.526315789473684,
"grad_norm": 2.8805336952209473,
"learning_rate": 3.956464225470914e-06,
"loss": 0.5456798672676086,
"step": 1104
},
{
"epoch": 2.530892448512586,
"grad_norm": 3.419220209121704,
"learning_rate": 3.940475922443343e-06,
"loss": 0.8449265956878662,
"step": 1106
},
{
"epoch": 2.5354691075514877,
"grad_norm": 2.3951187133789062,
"learning_rate": 3.924509962088617e-06,
"loss": 0.5211961269378662,
"step": 1108
},
{
"epoch": 2.540045766590389,
"grad_norm": 0.6526300311088562,
"learning_rate": 3.9085665731446155e-06,
"loss": 0.5454580783843994,
"step": 1110
},
{
"epoch": 2.5446224256292904,
"grad_norm": 1.6130284070968628,
"learning_rate": 3.892645984025846e-06,
"loss": 0.8388891220092773,
"step": 1112
},
{
"epoch": 2.5491990846681922,
"grad_norm": 1.6545199155807495,
"learning_rate": 3.876748422820168e-06,
"loss": 0.8067798018455505,
"step": 1114
},
{
"epoch": 2.553775743707094,
"grad_norm": 1.631625771522522,
"learning_rate": 3.860874117285535e-06,
"loss": 0.7862741947174072,
"step": 1116
},
{
"epoch": 2.5583524027459954,
"grad_norm": 1.7587617635726929,
"learning_rate": 3.8450232948467285e-06,
"loss": 0.7546199560165405,
"step": 1118
},
{
"epoch": 2.5629290617848968,
"grad_norm": 3.396798610687256,
"learning_rate": 3.829196182592084e-06,
"loss": 0.8121140003204346,
"step": 1120
},
{
"epoch": 2.5675057208237986,
"grad_norm": 3.6833112239837646,
"learning_rate": 3.8133930072702653e-06,
"loss": 0.8952106237411499,
"step": 1122
},
{
"epoch": 2.5720823798627004,
"grad_norm": 3.539543390274048,
"learning_rate": 3.797613995286993e-06,
"loss": 0.9799838066101074,
"step": 1124
},
{
"epoch": 2.5766590389016018,
"grad_norm": 6.784366607666016,
"learning_rate": 3.7818593727018114e-06,
"loss": 0.7816819548606873,
"step": 1126
},
{
"epoch": 2.5812356979405036,
"grad_norm": 7.371065616607666,
"learning_rate": 3.7661293652248486e-06,
"loss": 0.6339356899261475,
"step": 1128
},
{
"epoch": 2.585812356979405,
"grad_norm": 4.603682994842529,
"learning_rate": 3.7504241982135802e-06,
"loss": 0.5646559000015259,
"step": 1130
},
{
"epoch": 2.5903890160183067,
"grad_norm": 10.729260444641113,
"learning_rate": 3.734744096669605e-06,
"loss": 0.3738934397697449,
"step": 1132
},
{
"epoch": 2.594965675057208,
"grad_norm": 2.632481813430786,
"learning_rate": 3.7190892852354177e-06,
"loss": 0.5772243738174438,
"step": 1134
},
{
"epoch": 2.59954233409611,
"grad_norm": 2.2666807174682617,
"learning_rate": 3.703459988191188e-06,
"loss": 0.8983527421951294,
"step": 1136
},
{
"epoch": 2.6041189931350113,
"grad_norm": 2.42862868309021,
"learning_rate": 3.6878564294515597e-06,
"loss": 0.41785839200019836,
"step": 1138
},
{
"epoch": 2.608695652173913,
"grad_norm": 2.7941179275512695,
"learning_rate": 3.672278832562427e-06,
"loss": 0.9465121030807495,
"step": 1140
},
{
"epoch": 2.613272311212815,
"grad_norm": 2.7512271404266357,
"learning_rate": 3.656727420697741e-06,
"loss": 0.9477014541625977,
"step": 1142
},
{
"epoch": 2.6178489702517163,
"grad_norm": 3.315143346786499,
"learning_rate": 3.641202416656311e-06,
"loss": 0.48098376393318176,
"step": 1144
},
{
"epoch": 2.6224256292906176,
"grad_norm": 0.8696630001068115,
"learning_rate": 3.6257040428586143e-06,
"loss": 0.4189261794090271,
"step": 1146
},
{
"epoch": 2.6270022883295194,
"grad_norm": 4.89910364151001,
"learning_rate": 3.610232521343603e-06,
"loss": 0.25716477632522583,
"step": 1148
},
{
"epoch": 2.6315789473684212,
"grad_norm": 1.6397091150283813,
"learning_rate": 3.5947880737655307e-06,
"loss": 0.5442467927932739,
"step": 1150
},
{
"epoch": 2.6361556064073226,
"grad_norm": 2.3286402225494385,
"learning_rate": 3.5793709213907713e-06,
"loss": 0.8711032867431641,
"step": 1152
},
{
"epoch": 2.6407322654462244,
"grad_norm": 6.281716346740723,
"learning_rate": 3.563981285094654e-06,
"loss": 0.6418501138687134,
"step": 1154
},
{
"epoch": 2.645308924485126,
"grad_norm": 1.7850065231323242,
"learning_rate": 3.5486193853582917e-06,
"loss": 0.5711671710014343,
"step": 1156
},
{
"epoch": 2.6498855835240276,
"grad_norm": 4.116028308868408,
"learning_rate": 3.533285442265435e-06,
"loss": 0.7774121165275574,
"step": 1158
},
{
"epoch": 2.654462242562929,
"grad_norm": 2.7807161808013916,
"learning_rate": 3.5179796754993e-06,
"loss": 0.6718448400497437,
"step": 1160
},
{
"epoch": 2.6590389016018308,
"grad_norm": 2.0300822257995605,
"learning_rate": 3.5027023043394436e-06,
"loss": 0.824101448059082,
"step": 1162
},
{
"epoch": 2.663615560640732,
"grad_norm": 3.844541072845459,
"learning_rate": 3.4874535476586014e-06,
"loss": 0.62496018409729,
"step": 1164
},
{
"epoch": 2.668192219679634,
"grad_norm": 2.7920563220977783,
"learning_rate": 3.4722336239195615e-06,
"loss": 0.4988476634025574,
"step": 1166
},
{
"epoch": 2.6727688787185357,
"grad_norm": 8.478647232055664,
"learning_rate": 3.45704275117204e-06,
"loss": 0.6625174283981323,
"step": 1168
},
{
"epoch": 2.677345537757437,
"grad_norm": 8.968935012817383,
"learning_rate": 3.4418811470495467e-06,
"loss": 0.6531594395637512,
"step": 1170
},
{
"epoch": 2.6819221967963385,
"grad_norm": 2.9643266201019287,
"learning_rate": 3.426749028766273e-06,
"loss": 0.33440902829170227,
"step": 1172
},
{
"epoch": 2.6864988558352403,
"grad_norm": 3.355323553085327,
"learning_rate": 3.411646613113976e-06,
"loss": 0.6681325435638428,
"step": 1174
},
{
"epoch": 2.691075514874142,
"grad_norm": 6.2293782234191895,
"learning_rate": 3.3965741164588796e-06,
"loss": 0.4832615852355957,
"step": 1176
},
{
"epoch": 2.6956521739130435,
"grad_norm": 4.777223587036133,
"learning_rate": 3.381531754738567e-06,
"loss": 0.6058177947998047,
"step": 1178
},
{
"epoch": 2.700228832951945,
"grad_norm": 6.559420585632324,
"learning_rate": 3.366519743458889e-06,
"loss": 0.6945388317108154,
"step": 1180
},
{
"epoch": 2.7048054919908466,
"grad_norm": 4.55678653717041,
"learning_rate": 3.351538297690886e-06,
"loss": 0.8977880477905273,
"step": 1182
},
{
"epoch": 2.7093821510297484,
"grad_norm": 1.9731976985931396,
"learning_rate": 3.336587632067686e-06,
"loss": 0.8714834451675415,
"step": 1184
},
{
"epoch": 2.71395881006865,
"grad_norm": 2.371523380279541,
"learning_rate": 3.321667960781454e-06,
"loss": 0.8558663129806519,
"step": 1186
},
{
"epoch": 2.7185354691075516,
"grad_norm": 23.87647819519043,
"learning_rate": 3.3067794975803047e-06,
"loss": 0.6330064535140991,
"step": 1188
},
{
"epoch": 2.723112128146453,
"grad_norm": 0.8517280220985413,
"learning_rate": 3.2919224557652494e-06,
"loss": 0.4448906481266022,
"step": 1190
},
{
"epoch": 2.727688787185355,
"grad_norm": 8.36446475982666,
"learning_rate": 3.2770970481871378e-06,
"loss": 0.6628245115280151,
"step": 1192
},
{
"epoch": 2.732265446224256,
"grad_norm": 7.5198235511779785,
"learning_rate": 3.262303487243609e-06,
"loss": 0.7516108751296997,
"step": 1194
},
{
"epoch": 2.736842105263158,
"grad_norm": 28.500797271728516,
"learning_rate": 3.247541984876046e-06,
"loss": 0.6165962815284729,
"step": 1196
},
{
"epoch": 2.7414187643020593,
"grad_norm": 2.045964479446411,
"learning_rate": 3.232812752566542e-06,
"loss": 0.5546071529388428,
"step": 1198
},
{
"epoch": 2.745995423340961,
"grad_norm": 1.7794232368469238,
"learning_rate": 3.218116001334878e-06,
"loss": 0.8487035036087036,
"step": 1200
},
{
"epoch": 2.750572082379863,
"grad_norm": 2.4322621822357178,
"learning_rate": 3.203451941735476e-06,
"loss": 0.7778904438018799,
"step": 1202
},
{
"epoch": 2.7551487414187643,
"grad_norm": 7.543785095214844,
"learning_rate": 3.1888207838544127e-06,
"loss": 0.5872822403907776,
"step": 1204
},
{
"epoch": 2.7597254004576657,
"grad_norm": 2.053680419921875,
"learning_rate": 3.1742227373063907e-06,
"loss": 0.7311505675315857,
"step": 1206
},
{
"epoch": 2.7643020594965675,
"grad_norm": 2.327242851257324,
"learning_rate": 3.159658011231732e-06,
"loss": 0.7317074537277222,
"step": 1208
},
{
"epoch": 2.7688787185354693,
"grad_norm": 1.6999424695968628,
"learning_rate": 3.1451268142934023e-06,
"loss": 0.6040608882904053,
"step": 1210
},
{
"epoch": 2.7734553775743707,
"grad_norm": 6.141793251037598,
"learning_rate": 3.1306293546740007e-06,
"loss": 0.5794140100479126,
"step": 1212
},
{
"epoch": 2.7780320366132725,
"grad_norm": 10.223762512207031,
"learning_rate": 3.116165840072789e-06,
"loss": 0.963721513748169,
"step": 1214
},
{
"epoch": 2.782608695652174,
"grad_norm": 3.224015474319458,
"learning_rate": 3.101736477702707e-06,
"loss": 0.8357968330383301,
"step": 1216
},
{
"epoch": 2.7871853546910756,
"grad_norm": 2.6462185382843018,
"learning_rate": 3.08734147428742e-06,
"loss": 0.5886989831924438,
"step": 1218
},
{
"epoch": 2.791762013729977,
"grad_norm": 2.715287208557129,
"learning_rate": 3.0729810360583333e-06,
"loss": 0.852825403213501,
"step": 1220
},
{
"epoch": 2.796338672768879,
"grad_norm": 2.657933473587036,
"learning_rate": 3.058655368751658e-06,
"loss": 0.8328732848167419,
"step": 1222
},
{
"epoch": 2.80091533180778,
"grad_norm": 3.437394142150879,
"learning_rate": 3.04436467760546e-06,
"loss": 0.8728150129318237,
"step": 1224
},
{
"epoch": 2.805491990846682,
"grad_norm": 5.744348049163818,
"learning_rate": 3.030109167356704e-06,
"loss": 0.7142363786697388,
"step": 1226
},
{
"epoch": 2.8100686498855834,
"grad_norm": 3.96525239944458,
"learning_rate": 3.0158890422383445e-06,
"loss": 0.6308979988098145,
"step": 1228
},
{
"epoch": 2.814645308924485,
"grad_norm": 1.1390433311462402,
"learning_rate": 3.00170450597638e-06,
"loss": 0.5205511450767517,
"step": 1230
},
{
"epoch": 2.8192219679633865,
"grad_norm": 2.2246572971343994,
"learning_rate": 2.9875557617869456e-06,
"loss": 0.794964075088501,
"step": 1232
},
{
"epoch": 2.8237986270022883,
"grad_norm": 3.394249677658081,
"learning_rate": 2.9734430123733937e-06,
"loss": 0.3426223397254944,
"step": 1234
},
{
"epoch": 2.82837528604119,
"grad_norm": 2.538475751876831,
"learning_rate": 2.9593664599233984e-06,
"loss": 0.8177708387374878,
"step": 1236
},
{
"epoch": 2.8329519450800915,
"grad_norm": 2.663276195526123,
"learning_rate": 2.9453263061060522e-06,
"loss": 0.607596755027771,
"step": 1238
},
{
"epoch": 2.837528604118993,
"grad_norm": 5.610261917114258,
"learning_rate": 2.9313227520689787e-06,
"loss": 0.610859751701355,
"step": 1240
},
{
"epoch": 2.8421052631578947,
"grad_norm": 3.410015821456909,
"learning_rate": 2.917355998435456e-06,
"loss": 0.5197435617446899,
"step": 1242
},
{
"epoch": 2.8466819221967965,
"grad_norm": 3.7659153938293457,
"learning_rate": 2.903426245301526e-06,
"loss": 0.8168070316314697,
"step": 1244
},
{
"epoch": 2.851258581235698,
"grad_norm": 7.588848114013672,
"learning_rate": 2.8895336922331546e-06,
"loss": 0.6579625010490417,
"step": 1246
},
{
"epoch": 2.8558352402745997,
"grad_norm": 2.474982738494873,
"learning_rate": 2.875678538263347e-06,
"loss": 0.3433322310447693,
"step": 1248
},
{
"epoch": 2.860411899313501,
"grad_norm": 7.560976982116699,
"learning_rate": 2.8618609818893082e-06,
"loss": 0.5438008308410645,
"step": 1250
},
{
"epoch": 2.864988558352403,
"grad_norm": 2.223369836807251,
"learning_rate": 2.8480812210696005e-06,
"loss": 0.8767250776290894,
"step": 1252
},
{
"epoch": 2.869565217391304,
"grad_norm": 3.2690696716308594,
"learning_rate": 2.834339453221302e-06,
"loss": 0.9083548784255981,
"step": 1254
},
{
"epoch": 2.874141876430206,
"grad_norm": 4.433350563049316,
"learning_rate": 2.8206358752171813e-06,
"loss": 0.597790002822876,
"step": 1256
},
{
"epoch": 2.8787185354691074,
"grad_norm": 2.7721469402313232,
"learning_rate": 2.8069706833828763e-06,
"loss": 0.772271990776062,
"step": 1258
},
{
"epoch": 2.883295194508009,
"grad_norm": 2.6030502319335938,
"learning_rate": 2.7933440734940863e-06,
"loss": 0.8379377126693726,
"step": 1260
},
{
"epoch": 2.887871853546911,
"grad_norm": 6.367020606994629,
"learning_rate": 2.7797562407737533e-06,
"loss": 1.0624537467956543,
"step": 1262
},
{
"epoch": 2.8924485125858124,
"grad_norm": 2.0598034858703613,
"learning_rate": 2.766207379889281e-06,
"loss": 0.8280332684516907,
"step": 1264
},
{
"epoch": 2.8970251716247137,
"grad_norm": 10.405440330505371,
"learning_rate": 2.752697684949741e-06,
"loss": 0.5287131071090698,
"step": 1266
},
{
"epoch": 2.9016018306636155,
"grad_norm": 2.383814811706543,
"learning_rate": 2.739227349503083e-06,
"loss": 0.85811847448349,
"step": 1268
},
{
"epoch": 2.9061784897025174,
"grad_norm": 4.059746265411377,
"learning_rate": 2.7257965665333765e-06,
"loss": 0.8082501888275146,
"step": 1270
},
{
"epoch": 2.9107551487414187,
"grad_norm": 0.8926740288734436,
"learning_rate": 2.712405528458034e-06,
"loss": 0.537039041519165,
"step": 1272
},
{
"epoch": 2.9153318077803205,
"grad_norm": 1.766548752784729,
"learning_rate": 2.6990544271250607e-06,
"loss": 0.8076680302619934,
"step": 1274
},
{
"epoch": 2.919908466819222,
"grad_norm": 2.0170223712921143,
"learning_rate": 2.6857434538103043e-06,
"loss": 0.5731369256973267,
"step": 1276
},
{
"epoch": 2.9244851258581237,
"grad_norm": 16.56928253173828,
"learning_rate": 2.672472799214714e-06,
"loss": 0.5599454641342163,
"step": 1278
},
{
"epoch": 2.929061784897025,
"grad_norm": 1.690929651260376,
"learning_rate": 2.659242653461608e-06,
"loss": 0.671285092830658,
"step": 1280
},
{
"epoch": 2.933638443935927,
"grad_norm": 0.7769700288772583,
"learning_rate": 2.64605320609395e-06,
"loss": 0.452056348323822,
"step": 1282
},
{
"epoch": 2.9382151029748282,
"grad_norm": 1.5757992267608643,
"learning_rate": 2.6329046460716424e-06,
"loss": 0.43658745288848877,
"step": 1284
},
{
"epoch": 2.94279176201373,
"grad_norm": 4.979938507080078,
"learning_rate": 2.6197971617687972e-06,
"loss": 0.504380464553833,
"step": 1286
},
{
"epoch": 2.9473684210526314,
"grad_norm": 8.039284706115723,
"learning_rate": 2.606730940971064e-06,
"loss": 0.754509687423706,
"step": 1288
},
{
"epoch": 2.9519450800915332,
"grad_norm": 12.317577362060547,
"learning_rate": 2.5937061708729187e-06,
"loss": 0.9081135988235474,
"step": 1290
},
{
"epoch": 2.9565217391304346,
"grad_norm": 7.108748912811279,
"learning_rate": 2.5807230380749942e-06,
"loss": 0.5429270267486572,
"step": 1292
},
{
"epoch": 2.9610983981693364,
"grad_norm": 2.071383476257324,
"learning_rate": 2.5677817285813996e-06,
"loss": 0.55026775598526,
"step": 1294
},
{
"epoch": 2.965675057208238,
"grad_norm": 2.826601266860962,
"learning_rate": 2.5548824277970595e-06,
"loss": 0.8305662274360657,
"step": 1296
},
{
"epoch": 2.9702517162471396,
"grad_norm": 6.209619522094727,
"learning_rate": 2.542025320525058e-06,
"loss": 0.9413694143295288,
"step": 1298
},
{
"epoch": 2.974828375286041,
"grad_norm": 41.0315055847168,
"learning_rate": 2.5292105909639857e-06,
"loss": 0.5638728737831116,
"step": 1300
},
{
"epoch": 2.9794050343249427,
"grad_norm": 0.5875497460365295,
"learning_rate": 2.5164384227053133e-06,
"loss": 0.5167251229286194,
"step": 1302
},
{
"epoch": 2.9839816933638446,
"grad_norm": 3.607098340988159,
"learning_rate": 2.5037089987307405e-06,
"loss": 0.8430502414703369,
"step": 1304
},
{
"epoch": 2.988558352402746,
"grad_norm": 6.2934064865112305,
"learning_rate": 2.491022501409598e-06,
"loss": 0.522983193397522,
"step": 1306
},
{
"epoch": 2.9931350114416477,
"grad_norm": 1.3800368309020996,
"learning_rate": 2.4783791124962197e-06,
"loss": 0.7301946878433228,
"step": 1308
},
{
"epoch": 2.997711670480549,
"grad_norm": 2.08162522315979,
"learning_rate": 2.4657790131273376e-06,
"loss": 0.7967828512191772,
"step": 1310
},
{
"epoch": 3.002288329519451,
"grad_norm": 1.4447431564331055,
"learning_rate": 2.4532223838195006e-06,
"loss": 0.5404419898986816,
"step": 1312
},
{
"epoch": 3.0068649885583523,
"grad_norm": 3.0550334453582764,
"learning_rate": 2.4407094044664746e-06,
"loss": 0.6030987501144409,
"step": 1314
},
{
"epoch": 3.011441647597254,
"grad_norm": 6.2352166175842285,
"learning_rate": 2.4282402543366706e-06,
"loss": 0.561785101890564,
"step": 1316
},
{
"epoch": 3.0160183066361554,
"grad_norm": 1.6362286806106567,
"learning_rate": 2.4158151120705773e-06,
"loss": 0.2637900412082672,
"step": 1318
},
{
"epoch": 3.0205949656750573,
"grad_norm": 2.0714526176452637,
"learning_rate": 2.4034341556781986e-06,
"loss": 0.6601877212524414,
"step": 1320
},
{
"epoch": 3.0251716247139586,
"grad_norm": 2.9864554405212402,
"learning_rate": 2.3910975625365066e-06,
"loss": 0.5502775311470032,
"step": 1322
},
{
"epoch": 3.0297482837528604,
"grad_norm": 7.837927341461182,
"learning_rate": 2.3788055093868962e-06,
"loss": 0.33134838938713074,
"step": 1324
},
{
"epoch": 3.034324942791762,
"grad_norm": 4.929914474487305,
"learning_rate": 2.366558172332665e-06,
"loss": 0.7175261974334717,
"step": 1326
},
{
"epoch": 3.0389016018306636,
"grad_norm": 4.021068572998047,
"learning_rate": 2.354355726836466e-06,
"loss": 0.5193214416503906,
"step": 1328
},
{
"epoch": 3.0434782608695654,
"grad_norm": 3.3567593097686768,
"learning_rate": 2.342198347717823e-06,
"loss": 0.7330721020698547,
"step": 1330
},
{
"epoch": 3.0480549199084668,
"grad_norm": 4.255941390991211,
"learning_rate": 2.330086209150604e-06,
"loss": 0.3949548602104187,
"step": 1332
},
{
"epoch": 3.0526315789473686,
"grad_norm": 3.187955379486084,
"learning_rate": 2.3180194846605367e-06,
"loss": 0.7485113739967346,
"step": 1334
},
{
"epoch": 3.05720823798627,
"grad_norm": 2.202007532119751,
"learning_rate": 2.3059983471227186e-06,
"loss": 0.3207942247390747,
"step": 1336
},
{
"epoch": 3.0617848970251718,
"grad_norm": 2.3094310760498047,
"learning_rate": 2.294022968759142e-06,
"loss": 0.7273589372634888,
"step": 1338
},
{
"epoch": 3.066361556064073,
"grad_norm": 2.190314292907715,
"learning_rate": 2.2820935211362256e-06,
"loss": 0.5962270498275757,
"step": 1340
},
{
"epoch": 3.070938215102975,
"grad_norm": 6.551016330718994,
"learning_rate": 2.2702101751623555e-06,
"loss": 0.6893465518951416,
"step": 1342
},
{
"epoch": 3.0755148741418763,
"grad_norm": 2.070547342300415,
"learning_rate": 2.2583731010854436e-06,
"loss": 0.730362057685852,
"step": 1344
},
{
"epoch": 3.080091533180778,
"grad_norm": 3.0296781063079834,
"learning_rate": 2.2465824684904737e-06,
"loss": 0.44114387035369873,
"step": 1346
},
{
"epoch": 3.0846681922196795,
"grad_norm": 2.1151537895202637,
"learning_rate": 2.23483844629709e-06,
"loss": 0.6440198421478271,
"step": 1348
},
{
"epoch": 3.0892448512585813,
"grad_norm": 2.34732723236084,
"learning_rate": 2.223141202757164e-06,
"loss": 0.6223734617233276,
"step": 1350
},
{
"epoch": 3.0938215102974826,
"grad_norm": 3.51031231880188,
"learning_rate": 2.2114909054523883e-06,
"loss": 0.65424644947052,
"step": 1352
},
{
"epoch": 3.0983981693363845,
"grad_norm": 4.628645896911621,
"learning_rate": 2.199887721291877e-06,
"loss": 0.6126776337623596,
"step": 1354
},
{
"epoch": 3.1029748283752863,
"grad_norm": 2.8772079944610596,
"learning_rate": 2.188331816509772e-06,
"loss": 0.7314285039901733,
"step": 1356
},
{
"epoch": 3.1075514874141876,
"grad_norm": 0.4025164842605591,
"learning_rate": 2.176823356662864e-06,
"loss": 0.3512212634086609,
"step": 1358
},
{
"epoch": 3.1121281464530894,
"grad_norm": 2.19573974609375,
"learning_rate": 2.1653625066282153e-06,
"loss": 0.6310482025146484,
"step": 1360
},
{
"epoch": 3.116704805491991,
"grad_norm": 7.4715495109558105,
"learning_rate": 2.153949430600811e-06,
"loss": 0.41319721937179565,
"step": 1362
},
{
"epoch": 3.1212814645308926,
"grad_norm": 3.5463063716888428,
"learning_rate": 2.142584292091185e-06,
"loss": 0.49941742420196533,
"step": 1364
},
{
"epoch": 3.125858123569794,
"grad_norm": 3.7344858646392822,
"learning_rate": 2.1312672539230973e-06,
"loss": 0.5115246772766113,
"step": 1366
},
{
"epoch": 3.130434782608696,
"grad_norm": 7.448498249053955,
"learning_rate": 2.119998478231194e-06,
"loss": 0.43049943447113037,
"step": 1368
},
{
"epoch": 3.135011441647597,
"grad_norm": 4.2311882972717285,
"learning_rate": 2.1087781264586795e-06,
"loss": 0.7117477655410767,
"step": 1370
},
{
"epoch": 3.139588100686499,
"grad_norm": 1.4386028051376343,
"learning_rate": 2.0976063593550126e-06,
"loss": 0.5673470497131348,
"step": 1372
},
{
"epoch": 3.1441647597254003,
"grad_norm": 2.9334120750427246,
"learning_rate": 2.0864833369735974e-06,
"loss": 0.4686206579208374,
"step": 1374
},
{
"epoch": 3.148741418764302,
"grad_norm": 2.7471981048583984,
"learning_rate": 2.0754092186694917e-06,
"loss": 0.4157622456550598,
"step": 1376
},
{
"epoch": 3.1533180778032035,
"grad_norm": 3.158262252807617,
"learning_rate": 2.064384163097125e-06,
"loss": 0.7791534066200256,
"step": 1378
},
{
"epoch": 3.1578947368421053,
"grad_norm": 13.052440643310547,
"learning_rate": 2.0534083282080243e-06,
"loss": 0.3141913414001465,
"step": 1380
},
{
"epoch": 3.1624713958810067,
"grad_norm": 11.294751167297363,
"learning_rate": 2.0424818712485516e-06,
"loss": 0.6616432070732117,
"step": 1382
},
{
"epoch": 3.1670480549199085,
"grad_norm": 25.06646728515625,
"learning_rate": 2.0316049487576505e-06,
"loss": 0.5768702030181885,
"step": 1384
},
{
"epoch": 3.17162471395881,
"grad_norm": 2.034757375717163,
"learning_rate": 2.0207777165646096e-06,
"loss": 0.40132611989974976,
"step": 1386
},
{
"epoch": 3.1762013729977117,
"grad_norm": 7.5245137214660645,
"learning_rate": 2.010000329786815e-06,
"loss": 0.4842037260532379,
"step": 1388
},
{
"epoch": 3.1807780320366135,
"grad_norm": 2.598806381225586,
"learning_rate": 1.9992729428275452e-06,
"loss": 0.4087521433830261,
"step": 1390
},
{
"epoch": 3.185354691075515,
"grad_norm": 3.6281611919403076,
"learning_rate": 1.9885957093737494e-06,
"loss": 0.6152184009552002,
"step": 1392
},
{
"epoch": 3.1899313501144166,
"grad_norm": 1.8896663188934326,
"learning_rate": 1.977968782393848e-06,
"loss": 0.2853028178215027,
"step": 1394
},
{
"epoch": 3.194508009153318,
"grad_norm": 14.76069164276123,
"learning_rate": 1.9673923141355387e-06,
"loss": 0.45738574862480164,
"step": 1396
},
{
"epoch": 3.19908466819222,
"grad_norm": 2.0312094688415527,
"learning_rate": 1.9568664561236208e-06,
"loss": 0.5310682654380798,
"step": 1398
},
{
"epoch": 3.203661327231121,
"grad_norm": 12.73653793334961,
"learning_rate": 1.946391359157818e-06,
"loss": 0.7988094091415405,
"step": 1400
},
{
"epoch": 3.208237986270023,
"grad_norm": 3.7427358627319336,
"learning_rate": 1.935967173310621e-06,
"loss": 0.48929834365844727,
"step": 1402
},
{
"epoch": 3.2128146453089244,
"grad_norm": 2.088498592376709,
"learning_rate": 1.9255940479251433e-06,
"loss": 0.5670595765113831,
"step": 1404
},
{
"epoch": 3.217391304347826,
"grad_norm": 3.3635458946228027,
"learning_rate": 1.915272131612966e-06,
"loss": 0.4398784637451172,
"step": 1406
},
{
"epoch": 3.2219679633867275,
"grad_norm": 2.194676637649536,
"learning_rate": 1.905001572252026e-06,
"loss": 0.7907838821411133,
"step": 1408
},
{
"epoch": 3.2265446224256293,
"grad_norm": 3.3996782302856445,
"learning_rate": 1.8947825169844886e-06,
"loss": 0.6364421844482422,
"step": 1410
},
{
"epoch": 3.2311212814645307,
"grad_norm": 5.239405632019043,
"learning_rate": 1.8846151122146353e-06,
"loss": 0.42004287242889404,
"step": 1412
},
{
"epoch": 3.2356979405034325,
"grad_norm": 0.6929391026496887,
"learning_rate": 1.8744995036067799e-06,
"loss": 0.26028335094451904,
"step": 1414
},
{
"epoch": 3.2402745995423343,
"grad_norm": 0.17362770438194275,
"learning_rate": 1.8644358360831683e-06,
"loss": 0.3416166305541992,
"step": 1416
},
{
"epoch": 3.2448512585812357,
"grad_norm": 1.93403959274292,
"learning_rate": 1.8544242538219084e-06,
"loss": 0.34089672565460205,
"step": 1418
},
{
"epoch": 3.2494279176201375,
"grad_norm": 3.117030620574951,
"learning_rate": 1.8444649002549042e-06,
"loss": 0.7176865339279175,
"step": 1420
},
{
"epoch": 3.254004576659039,
"grad_norm": 14.445137977600098,
"learning_rate": 1.8345579180657996e-06,
"loss": 0.5071847438812256,
"step": 1422
},
{
"epoch": 3.2585812356979407,
"grad_norm": 2.8883860111236572,
"learning_rate": 1.8247034491879346e-06,
"loss": 0.528769850730896,
"step": 1424
},
{
"epoch": 3.263157894736842,
"grad_norm": 6.8145036697387695,
"learning_rate": 1.8149016348023121e-06,
"loss": 0.6137822270393372,
"step": 1426
},
{
"epoch": 3.267734553775744,
"grad_norm": 3.661930561065674,
"learning_rate": 1.8051526153355797e-06,
"loss": 0.726434588432312,
"step": 1428
},
{
"epoch": 3.272311212814645,
"grad_norm": 2.532559871673584,
"learning_rate": 1.7954565304580046e-06,
"loss": 0.707175076007843,
"step": 1430
},
{
"epoch": 3.276887871853547,
"grad_norm": 13.103681564331055,
"learning_rate": 1.7858135190814896e-06,
"loss": 0.4193027913570404,
"step": 1432
},
{
"epoch": 3.2814645308924484,
"grad_norm": 21.0889949798584,
"learning_rate": 1.776223719357571e-06,
"loss": 0.4109644293785095,
"step": 1434
},
{
"epoch": 3.28604118993135,
"grad_norm": 1.8079733848571777,
"learning_rate": 1.7666872686754443e-06,
"loss": 0.34100282192230225,
"step": 1436
},
{
"epoch": 3.2906178489702516,
"grad_norm": 63.32174301147461,
"learning_rate": 1.757204303659994e-06,
"loss": 0.5300724506378174,
"step": 1438
},
{
"epoch": 3.2951945080091534,
"grad_norm": 3.9798078536987305,
"learning_rate": 1.747774960169838e-06,
"loss": 0.4038028120994568,
"step": 1440
},
{
"epoch": 3.2997711670480547,
"grad_norm": 2.6402554512023926,
"learning_rate": 1.738399373295379e-06,
"loss": 0.6060991287231445,
"step": 1442
},
{
"epoch": 3.3043478260869565,
"grad_norm": 5.125815391540527,
"learning_rate": 1.7290776773568701e-06,
"loss": 0.487943172454834,
"step": 1444
},
{
"epoch": 3.308924485125858,
"grad_norm": 2.9676544666290283,
"learning_rate": 1.7198100059024958e-06,
"loss": 0.6720585823059082,
"step": 1446
},
{
"epoch": 3.3135011441647597,
"grad_norm": 4.840664386749268,
"learning_rate": 1.7105964917064435e-06,
"loss": 0.5866726636886597,
"step": 1448
},
{
"epoch": 3.3180778032036615,
"grad_norm": 12.175851821899414,
"learning_rate": 1.7014372667670218e-06,
"loss": 0.62703537940979,
"step": 1450
},
{
"epoch": 3.322654462242563,
"grad_norm": 2.532794952392578,
"learning_rate": 1.692332462304754e-06,
"loss": 0.8183742761611938,
"step": 1452
},
{
"epoch": 3.3272311212814647,
"grad_norm": 5.84249210357666,
"learning_rate": 1.683282208760501e-06,
"loss": 0.4687821567058563,
"step": 1454
},
{
"epoch": 3.331807780320366,
"grad_norm": 5.44354248046875,
"learning_rate": 1.6742866357935997e-06,
"loss": 0.3934783935546875,
"step": 1456
},
{
"epoch": 3.336384439359268,
"grad_norm": 5.401495933532715,
"learning_rate": 1.6653458722799973e-06,
"loss": 0.6204153299331665,
"step": 1458
},
{
"epoch": 3.3409610983981692,
"grad_norm": 2.019944190979004,
"learning_rate": 1.656460046310409e-06,
"loss": 0.6797080039978027,
"step": 1460
},
{
"epoch": 3.345537757437071,
"grad_norm": 4.623880386352539,
"learning_rate": 1.6476292851884809e-06,
"loss": 0.640425443649292,
"step": 1462
},
{
"epoch": 3.3501144164759724,
"grad_norm": 2.2270355224609375,
"learning_rate": 1.6388537154289707e-06,
"loss": 0.6423420906066895,
"step": 1464
},
{
"epoch": 3.354691075514874,
"grad_norm": 11.633206367492676,
"learning_rate": 1.6301334627559262e-06,
"loss": 0.694199800491333,
"step": 1466
},
{
"epoch": 3.3592677345537756,
"grad_norm": 1.825758695602417,
"learning_rate": 1.6214686521008927e-06,
"loss": 0.6198365688323975,
"step": 1468
},
{
"epoch": 3.3638443935926774,
"grad_norm": 4.699729919433594,
"learning_rate": 1.6128594076011226e-06,
"loss": 0.5542856454849243,
"step": 1470
},
{
"epoch": 3.3684210526315788,
"grad_norm": 1.0216766595840454,
"learning_rate": 1.6043058525977879e-06,
"loss": 0.2599141001701355,
"step": 1472
},
{
"epoch": 3.3729977116704806,
"grad_norm": 2.5108420848846436,
"learning_rate": 1.5958081096342256e-06,
"loss": 0.734022319316864,
"step": 1474
},
{
"epoch": 3.3775743707093824,
"grad_norm": 2.6871957778930664,
"learning_rate": 1.5873663004541738e-06,
"loss": 0.7463738918304443,
"step": 1476
},
{
"epoch": 3.3821510297482837,
"grad_norm": 4.7424397468566895,
"learning_rate": 1.5789805460000296e-06,
"loss": 0.7995430827140808,
"step": 1478
},
{
"epoch": 3.386727688787185,
"grad_norm": 10.438132286071777,
"learning_rate": 1.5706509664111164e-06,
"loss": 0.3752162456512451,
"step": 1480
},
{
"epoch": 3.391304347826087,
"grad_norm": 2.6828954219818115,
"learning_rate": 1.5623776810219643e-06,
"loss": 0.3232945203781128,
"step": 1482
},
{
"epoch": 3.3958810068649887,
"grad_norm": 2.298041820526123,
"learning_rate": 1.554160808360598e-06,
"loss": 0.77318274974823,
"step": 1484
},
{
"epoch": 3.40045766590389,
"grad_norm": 4.018006801605225,
"learning_rate": 1.5460004661468386e-06,
"loss": 0.5600519180297852,
"step": 1486
},
{
"epoch": 3.405034324942792,
"grad_norm": 2.762150287628174,
"learning_rate": 1.537896771290623e-06,
"loss": 0.7082411050796509,
"step": 1488
},
{
"epoch": 3.4096109839816933,
"grad_norm": 9.372475624084473,
"learning_rate": 1.5298498398903178e-06,
"loss": 0.5185490846633911,
"step": 1490
},
{
"epoch": 3.414187643020595,
"grad_norm": 3.0372886657714844,
"learning_rate": 1.5218597872310673e-06,
"loss": 0.7110550403594971,
"step": 1492
},
{
"epoch": 3.4187643020594964,
"grad_norm": 2.260855197906494,
"learning_rate": 1.5139267277831348e-06,
"loss": 0.363142192363739,
"step": 1494
},
{
"epoch": 3.4233409610983982,
"grad_norm": 4.717143535614014,
"learning_rate": 1.5060507752002656e-06,
"loss": 0.42447251081466675,
"step": 1496
},
{
"epoch": 3.4279176201372996,
"grad_norm": 2.622420072555542,
"learning_rate": 1.4982320423180574e-06,
"loss": 0.4342220425605774,
"step": 1498
},
{
"epoch": 3.4324942791762014,
"grad_norm": 2.491084575653076,
"learning_rate": 1.490470641152345e-06,
"loss": 0.4623292088508606,
"step": 1500
},
{
"epoch": 3.437070938215103,
"grad_norm": 7.559942245483398,
"learning_rate": 1.4827666828975943e-06,
"loss": 0.40327316522598267,
"step": 1502
},
{
"epoch": 3.4416475972540046,
"grad_norm": 5.229639530181885,
"learning_rate": 1.4751202779253086e-06,
"loss": 0.3544307351112366,
"step": 1504
},
{
"epoch": 3.446224256292906,
"grad_norm": 3.293280601501465,
"learning_rate": 1.4675315357824527e-06,
"loss": 0.7979464530944824,
"step": 1506
},
{
"epoch": 3.4508009153318078,
"grad_norm": 1.9233644008636475,
"learning_rate": 1.4600005651898741e-06,
"loss": 0.6851720213890076,
"step": 1508
},
{
"epoch": 3.4553775743707096,
"grad_norm": 3.2330470085144043,
"learning_rate": 1.4525274740407524e-06,
"loss": 0.6885404586791992,
"step": 1510
},
{
"epoch": 3.459954233409611,
"grad_norm": 4.268771648406982,
"learning_rate": 1.4451123693990555e-06,
"loss": 0.4016433656215668,
"step": 1512
},
{
"epoch": 3.4645308924485128,
"grad_norm": 5.801273345947266,
"learning_rate": 1.4377553574979946e-06,
"loss": 0.2582213878631592,
"step": 1514
},
{
"epoch": 3.469107551487414,
"grad_norm": 1.9621074199676514,
"learning_rate": 1.4304565437385165e-06,
"loss": 0.7295342683792114,
"step": 1516
},
{
"epoch": 3.473684210526316,
"grad_norm": 4.003790378570557,
"learning_rate": 1.4232160326877832e-06,
"loss": 0.3042123317718506,
"step": 1518
},
{
"epoch": 3.4782608695652173,
"grad_norm": 2.3673927783966064,
"learning_rate": 1.4160339280776785e-06,
"loss": 0.5627752542495728,
"step": 1520
},
{
"epoch": 3.482837528604119,
"grad_norm": 3.853731632232666,
"learning_rate": 1.408910332803319e-06,
"loss": 0.3514673113822937,
"step": 1522
},
{
"epoch": 3.4874141876430205,
"grad_norm": 88.11868286132812,
"learning_rate": 1.4018453489215835e-06,
"loss": 0.5591588020324707,
"step": 1524
},
{
"epoch": 3.4919908466819223,
"grad_norm": 2.757547616958618,
"learning_rate": 1.3948390776496484e-06,
"loss": 0.47397273778915405,
"step": 1526
},
{
"epoch": 3.4965675057208236,
"grad_norm": 6.918557643890381,
"learning_rate": 1.3878916193635373e-06,
"loss": 0.33857864141464233,
"step": 1528
},
{
"epoch": 3.5011441647597255,
"grad_norm": 20.17970848083496,
"learning_rate": 1.3810030735966867e-06,
"loss": 0.6692100763320923,
"step": 1530
},
{
"epoch": 3.505720823798627,
"grad_norm": 1.748712420463562,
"learning_rate": 1.3741735390385128e-06,
"loss": 0.6161905527114868,
"step": 1532
},
{
"epoch": 3.5102974828375286,
"grad_norm": 12.148778915405273,
"learning_rate": 1.3674031135330054e-06,
"loss": 0.45425230264663696,
"step": 1534
},
{
"epoch": 3.5148741418764304,
"grad_norm": 2.459949254989624,
"learning_rate": 1.360691894077322e-06,
"loss": 0.7768383026123047,
"step": 1536
},
{
"epoch": 3.519450800915332,
"grad_norm": 1.3486472368240356,
"learning_rate": 1.3540399768203989e-06,
"loss": 0.7696608304977417,
"step": 1538
},
{
"epoch": 3.524027459954233,
"grad_norm": 7.643558979034424,
"learning_rate": 1.347447457061572e-06,
"loss": 0.7031220197677612,
"step": 1540
},
{
"epoch": 3.528604118993135,
"grad_norm": 2.340738534927368,
"learning_rate": 1.3409144292492152e-06,
"loss": 0.5920687913894653,
"step": 1542
},
{
"epoch": 3.533180778032037,
"grad_norm": 2.42668080329895,
"learning_rate": 1.3344409869793851e-06,
"loss": 0.6916133761405945,
"step": 1544
},
{
"epoch": 3.537757437070938,
"grad_norm": 3.409830093383789,
"learning_rate": 1.3280272229944799e-06,
"loss": 0.764272153377533,
"step": 1546
},
{
"epoch": 3.54233409610984,
"grad_norm": 1.986831545829773,
"learning_rate": 1.3216732291819096e-06,
"loss": 0.5565627813339233,
"step": 1548
},
{
"epoch": 3.5469107551487413,
"grad_norm": 6.892856597900391,
"learning_rate": 1.315379096572783e-06,
"loss": 0.1884869635105133,
"step": 1550
},
{
"epoch": 3.551487414187643,
"grad_norm": 4.558145999908447,
"learning_rate": 1.3091449153406024e-06,
"loss": 0.4693112373352051,
"step": 1552
},
{
"epoch": 3.5560640732265445,
"grad_norm": 6.389334201812744,
"learning_rate": 1.3029707747999681e-06,
"loss": 0.7146168947219849,
"step": 1554
},
{
"epoch": 3.5606407322654463,
"grad_norm": 12.014660835266113,
"learning_rate": 1.2968567634053023e-06,
"loss": 0.3005984127521515,
"step": 1556
},
{
"epoch": 3.5652173913043477,
"grad_norm": 2.894502639770508,
"learning_rate": 1.290802968749584e-06,
"loss": 0.6210153698921204,
"step": 1558
},
{
"epoch": 3.5697940503432495,
"grad_norm": 13.574015617370605,
"learning_rate": 1.2848094775630856e-06,
"loss": 0.7224968671798706,
"step": 1560
},
{
"epoch": 3.5743707093821513,
"grad_norm": 2.8741824626922607,
"learning_rate": 1.2788763757121433e-06,
"loss": 0.5549713373184204,
"step": 1562
},
{
"epoch": 3.5789473684210527,
"grad_norm": 3.075819730758667,
"learning_rate": 1.2730037481979132e-06,
"loss": 0.7967220544815063,
"step": 1564
},
{
"epoch": 3.583524027459954,
"grad_norm": 1.3015577793121338,
"learning_rate": 1.2671916791551638e-06,
"loss": 0.4095366597175598,
"step": 1566
},
{
"epoch": 3.588100686498856,
"grad_norm": 7.499904155731201,
"learning_rate": 1.2614402518510652e-06,
"loss": 0.7936575412750244,
"step": 1568
},
{
"epoch": 3.5926773455377576,
"grad_norm": 7.007381439208984,
"learning_rate": 1.255749548683998e-06,
"loss": 0.5722820162773132,
"step": 1570
},
{
"epoch": 3.597254004576659,
"grad_norm": 1.998162031173706,
"learning_rate": 1.2501196511823727e-06,
"loss": 0.40031328797340393,
"step": 1572
},
{
"epoch": 3.6018306636155604,
"grad_norm": 2.335953712463379,
"learning_rate": 1.2445506400034608e-06,
"loss": 0.5704180002212524,
"step": 1574
},
{
"epoch": 3.606407322654462,
"grad_norm": 3.6565961837768555,
"learning_rate": 1.239042594932243e-06,
"loss": 0.5407567620277405,
"step": 1576
},
{
"epoch": 3.610983981693364,
"grad_norm": 4.150671482086182,
"learning_rate": 1.2335955948802579e-06,
"loss": 0.7380489110946655,
"step": 1578
},
{
"epoch": 3.6155606407322654,
"grad_norm": 7.434592247009277,
"learning_rate": 1.2282097178844815e-06,
"loss": 0.36467939615249634,
"step": 1580
},
{
"epoch": 3.620137299771167,
"grad_norm": 2.6412289142608643,
"learning_rate": 1.2228850411062023e-06,
"loss": 0.4587993025779724,
"step": 1582
},
{
"epoch": 3.6247139588100685,
"grad_norm": 18.970722198486328,
"learning_rate": 1.217621640829918e-06,
"loss": 0.39366841316223145,
"step": 1584
},
{
"epoch": 3.6292906178489703,
"grad_norm": 3.4149601459503174,
"learning_rate": 1.2124195924622428e-06,
"loss": 0.3141392469406128,
"step": 1586
},
{
"epoch": 3.6338672768878717,
"grad_norm": 2.6209511756896973,
"learning_rate": 1.2072789705308267e-06,
"loss": 0.4500223398208618,
"step": 1588
},
{
"epoch": 3.6384439359267735,
"grad_norm": 8.329633712768555,
"learning_rate": 1.2021998486832888e-06,
"loss": 0.47874724864959717,
"step": 1590
},
{
"epoch": 3.643020594965675,
"grad_norm": 8.009090423583984,
"learning_rate": 1.1971822996861585e-06,
"loss": 0.6222150325775146,
"step": 1592
},
{
"epoch": 3.6475972540045767,
"grad_norm": 1.0920637845993042,
"learning_rate": 1.192226395423841e-06,
"loss": 0.39640748500823975,
"step": 1594
},
{
"epoch": 3.6521739130434785,
"grad_norm": 3.376007556915283,
"learning_rate": 1.1873322068975756e-06,
"loss": 0.3654574751853943,
"step": 1596
},
{
"epoch": 3.65675057208238,
"grad_norm": 4.961648941040039,
"learning_rate": 1.1824998042244316e-06,
"loss": 0.08938822150230408,
"step": 1598
},
{
"epoch": 3.6613272311212812,
"grad_norm": 9.972960472106934,
"learning_rate": 1.1777292566362922e-06,
"loss": 0.48825398087501526,
"step": 1600
},
{
"epoch": 3.665903890160183,
"grad_norm": 1.0946178436279297,
"learning_rate": 1.1730206324788704e-06,
"loss": 0.1946982443332672,
"step": 1602
},
{
"epoch": 3.670480549199085,
"grad_norm": 3.8373024463653564,
"learning_rate": 1.1683739992107267e-06,
"loss": 0.38114720582962036,
"step": 1604
},
{
"epoch": 3.675057208237986,
"grad_norm": 5.0955047607421875,
"learning_rate": 1.163789423402303e-06,
"loss": 0.628394365310669,
"step": 1606
},
{
"epoch": 3.679633867276888,
"grad_norm": 3.3337788581848145,
"learning_rate": 1.1592669707349685e-06,
"loss": 0.5962799787521362,
"step": 1608
},
{
"epoch": 3.6842105263157894,
"grad_norm": 4.6492509841918945,
"learning_rate": 1.1548067060000804e-06,
"loss": 0.6295109987258911,
"step": 1610
},
{
"epoch": 3.688787185354691,
"grad_norm": 18.938716888427734,
"learning_rate": 1.1504086930980533e-06,
"loss": 0.27833229303359985,
"step": 1612
},
{
"epoch": 3.6933638443935926,
"grad_norm": 5.88167667388916,
"learning_rate": 1.1460729950374445e-06,
"loss": 0.2558402419090271,
"step": 1614
},
{
"epoch": 3.6979405034324944,
"grad_norm": 3.370272636413574,
"learning_rate": 1.1417996739340537e-06,
"loss": 0.8182615637779236,
"step": 1616
},
{
"epoch": 3.7025171624713957,
"grad_norm": 2.657801389694214,
"learning_rate": 1.1375887910100295e-06,
"loss": 0.7889156341552734,
"step": 1618
},
{
"epoch": 3.7070938215102975,
"grad_norm": 1.4287097454071045,
"learning_rate": 1.1334404065929939e-06,
"loss": 0.625603437423706,
"step": 1620
},
{
"epoch": 3.7116704805491993,
"grad_norm": 5.524312973022461,
"learning_rate": 1.1293545801151788e-06,
"loss": 0.6062618494033813,
"step": 1622
},
{
"epoch": 3.7162471395881007,
"grad_norm": 3.7626378536224365,
"learning_rate": 1.1253313701125727e-06,
"loss": 0.5309630632400513,
"step": 1624
},
{
"epoch": 3.720823798627002,
"grad_norm": 3.515129566192627,
"learning_rate": 1.1213708342240843e-06,
"loss": 0.644262433052063,
"step": 1626
},
{
"epoch": 3.725400457665904,
"grad_norm": 2.292351007461548,
"learning_rate": 1.1174730291907145e-06,
"loss": 0.42081764340400696,
"step": 1628
},
{
"epoch": 3.7299771167048057,
"grad_norm": 0.5895558595657349,
"learning_rate": 1.1136380108547446e-06,
"loss": 0.1861320436000824,
"step": 1630
},
{
"epoch": 3.734553775743707,
"grad_norm": 1.528093934059143,
"learning_rate": 1.109865834158937e-06,
"loss": 0.11224275827407837,
"step": 1632
},
{
"epoch": 3.7391304347826084,
"grad_norm": 4.198037147521973,
"learning_rate": 1.1061565531457457e-06,
"loss": 0.41927570104599,
"step": 1634
},
{
"epoch": 3.7437070938215102,
"grad_norm": 2.7169034481048584,
"learning_rate": 1.1025102209565463e-06,
"loss": 0.31835800409317017,
"step": 1636
},
{
"epoch": 3.748283752860412,
"grad_norm": 2.142369270324707,
"learning_rate": 1.098926889830869e-06,
"loss": 0.8377334475517273,
"step": 1638
},
{
"epoch": 3.7528604118993134,
"grad_norm": 1.0328131914138794,
"learning_rate": 1.0954066111056552e-06,
"loss": 0.043129995465278625,
"step": 1640
},
{
"epoch": 3.757437070938215,
"grad_norm": 3.350853681564331,
"learning_rate": 1.091949435214518e-06,
"loss": 0.7317330837249756,
"step": 1642
},
{
"epoch": 3.7620137299771166,
"grad_norm": 2.704472541809082,
"learning_rate": 1.0885554116870248e-06,
"loss": 0.7968192100524902,
"step": 1644
},
{
"epoch": 3.7665903890160184,
"grad_norm": 0.1938525140285492,
"learning_rate": 1.0852245891479815e-06,
"loss": 0.03669770807027817,
"step": 1646
},
{
"epoch": 3.7711670480549198,
"grad_norm": 2.804135799407959,
"learning_rate": 1.08195701531674e-06,
"loss": 0.6800875067710876,
"step": 1648
},
{
"epoch": 3.7757437070938216,
"grad_norm": 0.6211162805557251,
"learning_rate": 1.0787527370065134e-06,
"loss": 0.49580419063568115,
"step": 1650
},
{
"epoch": 3.780320366132723,
"grad_norm": 9.704923629760742,
"learning_rate": 1.0756118001237055e-06,
"loss": 0.302669882774353,
"step": 1652
},
{
"epoch": 3.7848970251716247,
"grad_norm": 1.8114262819290161,
"learning_rate": 1.0725342496672537e-06,
"loss": 0.6123445630073547,
"step": 1654
},
{
"epoch": 3.7894736842105265,
"grad_norm": 1.53504478931427,
"learning_rate": 1.0695201297279822e-06,
"loss": 0.5894599556922913,
"step": 1656
},
{
"epoch": 3.794050343249428,
"grad_norm": 1.9072415828704834,
"learning_rate": 1.066569483487972e-06,
"loss": 0.45558661222457886,
"step": 1658
},
{
"epoch": 3.7986270022883293,
"grad_norm": 4.523164749145508,
"learning_rate": 1.063682353219944e-06,
"loss": 0.510258674621582,
"step": 1660
},
{
"epoch": 3.803203661327231,
"grad_norm": 3.0822136402130127,
"learning_rate": 1.0608587802866479e-06,
"loss": 0.8670095205307007,
"step": 1662
},
{
"epoch": 3.807780320366133,
"grad_norm": 1.9081244468688965,
"learning_rate": 1.0580988051402764e-06,
"loss": 0.7442867755889893,
"step": 1664
},
{
"epoch": 3.8123569794050343,
"grad_norm": 3.498737335205078,
"learning_rate": 1.0554024673218808e-06,
"loss": 0.48411673307418823,
"step": 1666
},
{
"epoch": 3.816933638443936,
"grad_norm": 5.711022853851318,
"learning_rate": 1.052769805460805e-06,
"loss": 0.41106265783309937,
"step": 1668
},
{
"epoch": 3.8215102974828374,
"grad_norm": 14.483606338500977,
"learning_rate": 1.0502008572741354e-06,
"loss": 0.6216870546340942,
"step": 1670
},
{
"epoch": 3.8260869565217392,
"grad_norm": 3.853814125061035,
"learning_rate": 1.0476956595661574e-06,
"loss": 0.5673307180404663,
"step": 1672
},
{
"epoch": 3.8306636155606406,
"grad_norm": 0.7420060038566589,
"learning_rate": 1.045254248227828e-06,
"loss": 0.2633235454559326,
"step": 1674
},
{
"epoch": 3.8352402745995424,
"grad_norm": 1.8191514015197754,
"learning_rate": 1.042876658236263e-06,
"loss": 0.41805195808410645,
"step": 1676
},
{
"epoch": 3.839816933638444,
"grad_norm": 3.2566258907318115,
"learning_rate": 1.0405629236542371e-06,
"loss": 0.4752364754676819,
"step": 1678
},
{
"epoch": 3.8443935926773456,
"grad_norm": 0.7727957963943481,
"learning_rate": 1.0383130776296923e-06,
"loss": 0.4213360846042633,
"step": 1680
},
{
"epoch": 3.8489702517162474,
"grad_norm": 20.79123878479004,
"learning_rate": 1.036127152395266e-06,
"loss": 0.6792758703231812,
"step": 1682
},
{
"epoch": 3.8535469107551488,
"grad_norm": 2.3199901580810547,
"learning_rate": 1.0340051792678276e-06,
"loss": 0.7548638582229614,
"step": 1684
},
{
"epoch": 3.85812356979405,
"grad_norm": 2.1454737186431885,
"learning_rate": 1.0319471886480315e-06,
"loss": 0.5545675754547119,
"step": 1686
},
{
"epoch": 3.862700228832952,
"grad_norm": 2.8614749908447266,
"learning_rate": 1.0299532100198784e-06,
"loss": 0.5754020810127258,
"step": 1688
},
{
"epoch": 3.8672768878718538,
"grad_norm": 5.313638687133789,
"learning_rate": 1.0280232719502975e-06,
"loss": 0.32507747411727905,
"step": 1690
},
{
"epoch": 3.871853546910755,
"grad_norm": 6.3688859939575195,
"learning_rate": 1.0261574020887336e-06,
"loss": 0.5391194820404053,
"step": 1692
},
{
"epoch": 3.8764302059496565,
"grad_norm": 2.4996461868286133,
"learning_rate": 1.0243556271667513e-06,
"loss": 0.6387747526168823,
"step": 1694
},
{
"epoch": 3.8810068649885583,
"grad_norm": 4.952239513397217,
"learning_rate": 1.0226179729976544e-06,
"loss": 0.23900987207889557,
"step": 1696
},
{
"epoch": 3.88558352402746,
"grad_norm": 2.5153167247772217,
"learning_rate": 1.0209444644761138e-06,
"loss": 0.7215524911880493,
"step": 1698
},
{
"epoch": 3.8901601830663615,
"grad_norm": 2.760258197784424,
"learning_rate": 1.0193351255778111e-06,
"loss": 0.6749666929244995,
"step": 1700
},
{
"epoch": 3.8947368421052633,
"grad_norm": 3.0027530193328857,
"learning_rate": 1.0177899793590958e-06,
"loss": 0.7268638610839844,
"step": 1702
},
{
"epoch": 3.8993135011441646,
"grad_norm": 236.1914520263672,
"learning_rate": 1.0163090479566553e-06,
"loss": 0.5174607634544373,
"step": 1704
},
{
"epoch": 3.9038901601830664,
"grad_norm": 3.04256534576416,
"learning_rate": 1.0148923525871973e-06,
"loss": 0.501115620136261,
"step": 1706
},
{
"epoch": 3.908466819221968,
"grad_norm": 5.1830973625183105,
"learning_rate": 1.0135399135471451e-06,
"loss": 0.37597358226776123,
"step": 1708
},
{
"epoch": 3.9130434782608696,
"grad_norm": 8.081040382385254,
"learning_rate": 1.012251750212347e-06,
"loss": 0.47812142968177795,
"step": 1710
},
{
"epoch": 3.917620137299771,
"grad_norm": 6.02420711517334,
"learning_rate": 1.0110278810378003e-06,
"loss": 0.3577365577220917,
"step": 1712
},
{
"epoch": 3.922196796338673,
"grad_norm": 9.104696273803711,
"learning_rate": 1.0098683235573856e-06,
"loss": 0.700224757194519,
"step": 1714
},
{
"epoch": 3.9267734553775746,
"grad_norm": 6.046388626098633,
"learning_rate": 1.0087730943836149e-06,
"loss": 0.8385029435157776,
"step": 1716
},
{
"epoch": 3.931350114416476,
"grad_norm": 7.305265426635742,
"learning_rate": 1.0077422092073958e-06,
"loss": 0.43619590997695923,
"step": 1718
},
{
"epoch": 3.9359267734553773,
"grad_norm": 7.582434177398682,
"learning_rate": 1.0067756827978048e-06,
"loss": 0.4124600887298584,
"step": 1720
},
{
"epoch": 3.940503432494279,
"grad_norm": 5.771373748779297,
"learning_rate": 1.0058735290018753e-06,
"loss": 0.5592625141143799,
"step": 1722
},
{
"epoch": 3.945080091533181,
"grad_norm": 2.8298215866088867,
"learning_rate": 1.0050357607444016e-06,
"loss": 0.6869655847549438,
"step": 1724
},
{
"epoch": 3.9496567505720823,
"grad_norm": 2.2002291679382324,
"learning_rate": 1.0042623900277524e-06,
"loss": 0.3992425799369812,
"step": 1726
},
{
"epoch": 3.954233409610984,
"grad_norm": 1.8750855922698975,
"learning_rate": 1.003553427931697e-06,
"loss": 0.6239903569221497,
"step": 1728
},
{
"epoch": 3.9588100686498855,
"grad_norm": 5.004032135009766,
"learning_rate": 1.0029088846132508e-06,
"loss": 0.3995712995529175,
"step": 1730
},
{
"epoch": 3.9633867276887873,
"grad_norm": 0.870651125907898,
"learning_rate": 1.0023287693065253e-06,
"loss": 0.429832398891449,
"step": 1732
},
{
"epoch": 3.9679633867276887,
"grad_norm": 6.058652400970459,
"learning_rate": 1.0018130903225988e-06,
"loss": 0.6913228034973145,
"step": 1734
},
{
"epoch": 3.9725400457665905,
"grad_norm": 3.251649856567383,
"learning_rate": 1.001361855049396e-06,
"loss": 0.3300914764404297,
"step": 1736
},
{
"epoch": 3.977116704805492,
"grad_norm": 7.703200340270996,
"learning_rate": 1.000975069951584e-06,
"loss": 0.24439480900764465,
"step": 1738
},
{
"epoch": 3.9816933638443937,
"grad_norm": 6.612419605255127,
"learning_rate": 1.0006527405704755e-06,
"loss": 0.501896858215332,
"step": 1740
},
{
"epoch": 3.9862700228832955,
"grad_norm": 2.503108024597168,
"learning_rate": 1.0003948715239546e-06,
"loss": 0.34206831455230713,
"step": 1742
},
{
"epoch": 3.990846681922197,
"grad_norm": 4.86019229888916,
"learning_rate": 1.0002014665064067e-06,
"loss": 0.30612480640411377,
"step": 1744
},
{
"epoch": 3.995423340961098,
"grad_norm": 2.529582977294922,
"learning_rate": 1.0000725282886676e-06,
"loss": 0.6712839603424072,
"step": 1746
},
{
"epoch": 4.0,
"grad_norm": 2.009540557861328,
"learning_rate": 1.000008058717983e-06,
"loss": 0.042355649173259735,
"step": 1748
},
{
"epoch": 4.0,
"step": 1748,
"total_flos": 3.4427459894320824e+18,
"train_loss": 0.8253325358783353,
"train_runtime": 4390.1543,
"train_samples_per_second": 11.945,
"train_steps_per_second": 0.398
}
],
"logging_steps": 2,
"max_steps": 1748,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.4427459894320824e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}