{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1779,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016863406408094434,
"grad_norm": 56.98304098027541,
"learning_rate": 2.8089887640449437e-07,
"loss": 10.9859,
"step": 1
},
{
"epoch": 0.003372681281618887,
"grad_norm": 55.57657460005535,
"learning_rate": 5.617977528089887e-07,
"loss": 11.1229,
"step": 2
},
{
"epoch": 0.00505902192242833,
"grad_norm": 56.33594802581801,
"learning_rate": 8.426966292134832e-07,
"loss": 11.0837,
"step": 3
},
{
"epoch": 0.006745362563237774,
"grad_norm": 56.31959427274796,
"learning_rate": 1.1235955056179775e-06,
"loss": 11.1692,
"step": 4
},
{
"epoch": 0.008431703204047217,
"grad_norm": 53.79759651259812,
"learning_rate": 1.404494382022472e-06,
"loss": 11.2125,
"step": 5
},
{
"epoch": 0.01011804384485666,
"grad_norm": 57.26270277410124,
"learning_rate": 1.6853932584269663e-06,
"loss": 11.042,
"step": 6
},
{
"epoch": 0.011804384485666104,
"grad_norm": 61.96582900530684,
"learning_rate": 1.966292134831461e-06,
"loss": 10.7828,
"step": 7
},
{
"epoch": 0.013490725126475547,
"grad_norm": 62.57155538626125,
"learning_rate": 2.247191011235955e-06,
"loss": 10.6918,
"step": 8
},
{
"epoch": 0.01517706576728499,
"grad_norm": 82.29395324590898,
"learning_rate": 2.5280898876404495e-06,
"loss": 9.7507,
"step": 9
},
{
"epoch": 0.016863406408094434,
"grad_norm": 90.9461623535896,
"learning_rate": 2.808988764044944e-06,
"loss": 9.4252,
"step": 10
},
{
"epoch": 0.01854974704890388,
"grad_norm": 99.14771668814727,
"learning_rate": 3.089887640449438e-06,
"loss": 9.0263,
"step": 11
},
{
"epoch": 0.02023608768971332,
"grad_norm": 95.60708207776585,
"learning_rate": 3.3707865168539327e-06,
"loss": 4.655,
"step": 12
},
{
"epoch": 0.021922428330522766,
"grad_norm": 62.44385087566126,
"learning_rate": 3.651685393258427e-06,
"loss": 3.5398,
"step": 13
},
{
"epoch": 0.023608768971332208,
"grad_norm": 52.52199287821377,
"learning_rate": 3.932584269662922e-06,
"loss": 3.1373,
"step": 14
},
{
"epoch": 0.025295109612141653,
"grad_norm": 33.78765944113671,
"learning_rate": 4.213483146067416e-06,
"loss": 2.3934,
"step": 15
},
{
"epoch": 0.026981450252951095,
"grad_norm": 22.3441016196329,
"learning_rate": 4.49438202247191e-06,
"loss": 2.0155,
"step": 16
},
{
"epoch": 0.02866779089376054,
"grad_norm": 6.116667827314463,
"learning_rate": 4.7752808988764044e-06,
"loss": 1.3431,
"step": 17
},
{
"epoch": 0.03035413153456998,
"grad_norm": 5.066417296011239,
"learning_rate": 5.056179775280899e-06,
"loss": 1.3379,
"step": 18
},
{
"epoch": 0.03204047217537943,
"grad_norm": 4.09740440774978,
"learning_rate": 5.3370786516853935e-06,
"loss": 1.2552,
"step": 19
},
{
"epoch": 0.03372681281618887,
"grad_norm": 2.8692740755186263,
"learning_rate": 5.617977528089888e-06,
"loss": 1.1335,
"step": 20
},
{
"epoch": 0.03541315345699832,
"grad_norm": 2.3265505323675137,
"learning_rate": 5.8988764044943826e-06,
"loss": 1.1324,
"step": 21
},
{
"epoch": 0.03709949409780776,
"grad_norm": 1.798729034462734,
"learning_rate": 6.179775280898876e-06,
"loss": 1.0814,
"step": 22
},
{
"epoch": 0.0387858347386172,
"grad_norm": 51.17680122626074,
"learning_rate": 6.460674157303372e-06,
"loss": 0.9465,
"step": 23
},
{
"epoch": 0.04047217537942664,
"grad_norm": 9.519207965054298,
"learning_rate": 6.741573033707865e-06,
"loss": 0.9286,
"step": 24
},
{
"epoch": 0.04215851602023609,
"grad_norm": 1.784859029228961,
"learning_rate": 7.022471910112361e-06,
"loss": 0.9436,
"step": 25
},
{
"epoch": 0.04384485666104553,
"grad_norm": 1.183273056167306,
"learning_rate": 7.303370786516854e-06,
"loss": 0.8943,
"step": 26
},
{
"epoch": 0.045531197301854974,
"grad_norm": 1.0093506180093674,
"learning_rate": 7.584269662921349e-06,
"loss": 0.8803,
"step": 27
},
{
"epoch": 0.047217537942664416,
"grad_norm": 0.9102179680507179,
"learning_rate": 7.865168539325843e-06,
"loss": 0.8366,
"step": 28
},
{
"epoch": 0.048903878583473864,
"grad_norm": 0.83692942638752,
"learning_rate": 8.146067415730338e-06,
"loss": 0.8032,
"step": 29
},
{
"epoch": 0.050590219224283306,
"grad_norm": 0.7948758696576288,
"learning_rate": 8.426966292134832e-06,
"loss": 0.8193,
"step": 30
},
{
"epoch": 0.05227655986509275,
"grad_norm": 0.9560450801368573,
"learning_rate": 8.707865168539327e-06,
"loss": 0.7804,
"step": 31
},
{
"epoch": 0.05396290050590219,
"grad_norm": 0.653701463750391,
"learning_rate": 8.98876404494382e-06,
"loss": 0.7531,
"step": 32
},
{
"epoch": 0.05564924114671164,
"grad_norm": 0.70122814315399,
"learning_rate": 9.269662921348316e-06,
"loss": 0.7444,
"step": 33
},
{
"epoch": 0.05733558178752108,
"grad_norm": 0.7091859317644309,
"learning_rate": 9.550561797752809e-06,
"loss": 0.7139,
"step": 34
},
{
"epoch": 0.05902192242833052,
"grad_norm": 0.673989006196913,
"learning_rate": 9.831460674157303e-06,
"loss": 0.6992,
"step": 35
},
{
"epoch": 0.06070826306913996,
"grad_norm": 0.6011374276083163,
"learning_rate": 1.0112359550561798e-05,
"loss": 0.7038,
"step": 36
},
{
"epoch": 0.06239460370994941,
"grad_norm": 0.6205180106169818,
"learning_rate": 1.0393258426966292e-05,
"loss": 0.6854,
"step": 37
},
{
"epoch": 0.06408094435075885,
"grad_norm": 0.5323665662116028,
"learning_rate": 1.0674157303370787e-05,
"loss": 0.6865,
"step": 38
},
{
"epoch": 0.0657672849915683,
"grad_norm": 0.5211714451245653,
"learning_rate": 1.0955056179775282e-05,
"loss": 0.6914,
"step": 39
},
{
"epoch": 0.06745362563237774,
"grad_norm": 0.6271140400004065,
"learning_rate": 1.1235955056179776e-05,
"loss": 0.6996,
"step": 40
},
{
"epoch": 0.06913996627318718,
"grad_norm": 0.5358305966066623,
"learning_rate": 1.151685393258427e-05,
"loss": 0.6421,
"step": 41
},
{
"epoch": 0.07082630691399663,
"grad_norm": 0.4160609364118775,
"learning_rate": 1.1797752808988765e-05,
"loss": 0.6366,
"step": 42
},
{
"epoch": 0.07251264755480608,
"grad_norm": 0.5973714197954098,
"learning_rate": 1.207865168539326e-05,
"loss": 0.673,
"step": 43
},
{
"epoch": 0.07419898819561552,
"grad_norm": 0.49953757804840393,
"learning_rate": 1.2359550561797752e-05,
"loss": 0.6259,
"step": 44
},
{
"epoch": 0.07588532883642496,
"grad_norm": 0.47704181367251264,
"learning_rate": 1.2640449438202249e-05,
"loss": 0.6219,
"step": 45
},
{
"epoch": 0.0775716694772344,
"grad_norm": 0.372415612114133,
"learning_rate": 1.2921348314606743e-05,
"loss": 0.6202,
"step": 46
},
{
"epoch": 0.07925801011804384,
"grad_norm": 0.3538579138166064,
"learning_rate": 1.3202247191011236e-05,
"loss": 0.6284,
"step": 47
},
{
"epoch": 0.08094435075885328,
"grad_norm": 0.4529401926284446,
"learning_rate": 1.348314606741573e-05,
"loss": 0.621,
"step": 48
},
{
"epoch": 0.08263069139966273,
"grad_norm": 0.6008956943318995,
"learning_rate": 1.3764044943820225e-05,
"loss": 0.6222,
"step": 49
},
{
"epoch": 0.08431703204047218,
"grad_norm": 0.40843012641704357,
"learning_rate": 1.4044943820224721e-05,
"loss": 0.6483,
"step": 50
},
{
"epoch": 0.08600337268128162,
"grad_norm": 0.33729728212338617,
"learning_rate": 1.4325842696629212e-05,
"loss": 0.6233,
"step": 51
},
{
"epoch": 0.08768971332209106,
"grad_norm": 0.3370599653432775,
"learning_rate": 1.4606741573033709e-05,
"loss": 0.6106,
"step": 52
},
{
"epoch": 0.0893760539629005,
"grad_norm": 0.3191164676272232,
"learning_rate": 1.4887640449438203e-05,
"loss": 0.5751,
"step": 53
},
{
"epoch": 0.09106239460370995,
"grad_norm": 0.36054754880278495,
"learning_rate": 1.5168539325842698e-05,
"loss": 0.6046,
"step": 54
},
{
"epoch": 0.09274873524451939,
"grad_norm": 0.30445133959986886,
"learning_rate": 1.544943820224719e-05,
"loss": 0.5989,
"step": 55
},
{
"epoch": 0.09443507588532883,
"grad_norm": 0.3035359738566352,
"learning_rate": 1.5730337078651687e-05,
"loss": 0.5789,
"step": 56
},
{
"epoch": 0.09612141652613827,
"grad_norm": 0.35195109228569355,
"learning_rate": 1.601123595505618e-05,
"loss": 0.5895,
"step": 57
},
{
"epoch": 0.09780775716694773,
"grad_norm": 0.31940901935141724,
"learning_rate": 1.6292134831460676e-05,
"loss": 0.5936,
"step": 58
},
{
"epoch": 0.09949409780775717,
"grad_norm": 0.30162419503220067,
"learning_rate": 1.657303370786517e-05,
"loss": 0.6055,
"step": 59
},
{
"epoch": 0.10118043844856661,
"grad_norm": 0.2963597866280323,
"learning_rate": 1.6853932584269665e-05,
"loss": 0.605,
"step": 60
},
{
"epoch": 0.10286677908937605,
"grad_norm": 0.30817805390280884,
"learning_rate": 1.7134831460674158e-05,
"loss": 0.5693,
"step": 61
},
{
"epoch": 0.1045531197301855,
"grad_norm": 0.2911807010165559,
"learning_rate": 1.7415730337078654e-05,
"loss": 0.5651,
"step": 62
},
{
"epoch": 0.10623946037099494,
"grad_norm": 0.27943368825325704,
"learning_rate": 1.7696629213483147e-05,
"loss": 0.5748,
"step": 63
},
{
"epoch": 0.10792580101180438,
"grad_norm": 0.29822593015129506,
"learning_rate": 1.797752808988764e-05,
"loss": 0.596,
"step": 64
},
{
"epoch": 0.10961214165261383,
"grad_norm": 0.27141944390865275,
"learning_rate": 1.8258426966292136e-05,
"loss": 0.533,
"step": 65
},
{
"epoch": 0.11129848229342328,
"grad_norm": 0.2862392624126836,
"learning_rate": 1.8539325842696632e-05,
"loss": 0.5566,
"step": 66
},
{
"epoch": 0.11298482293423272,
"grad_norm": 0.3180895996678809,
"learning_rate": 1.8820224719101125e-05,
"loss": 0.5508,
"step": 67
},
{
"epoch": 0.11467116357504216,
"grad_norm": 0.33616423964611203,
"learning_rate": 1.9101123595505618e-05,
"loss": 0.5564,
"step": 68
},
{
"epoch": 0.1163575042158516,
"grad_norm": 0.25782088441170614,
"learning_rate": 1.9382022471910114e-05,
"loss": 0.5538,
"step": 69
},
{
"epoch": 0.11804384485666104,
"grad_norm": 0.3443028051586477,
"learning_rate": 1.9662921348314607e-05,
"loss": 0.5599,
"step": 70
},
{
"epoch": 0.11973018549747048,
"grad_norm": 0.3663521866601868,
"learning_rate": 1.99438202247191e-05,
"loss": 0.5958,
"step": 71
},
{
"epoch": 0.12141652613827993,
"grad_norm": 0.2430774348391263,
"learning_rate": 2.0224719101123596e-05,
"loss": 0.5405,
"step": 72
},
{
"epoch": 0.12310286677908938,
"grad_norm": 0.3645167427473072,
"learning_rate": 2.0505617977528092e-05,
"loss": 0.5357,
"step": 73
},
{
"epoch": 0.12478920741989882,
"grad_norm": 0.5495193914586647,
"learning_rate": 2.0786516853932585e-05,
"loss": 0.5809,
"step": 74
},
{
"epoch": 0.12647554806070826,
"grad_norm": 0.2886314502296464,
"learning_rate": 2.1067415730337078e-05,
"loss": 0.5434,
"step": 75
},
{
"epoch": 0.1281618887015177,
"grad_norm": 0.40721543977266106,
"learning_rate": 2.1348314606741574e-05,
"loss": 0.5578,
"step": 76
},
{
"epoch": 0.12984822934232715,
"grad_norm": 0.3893677045739628,
"learning_rate": 2.1629213483146067e-05,
"loss": 0.5632,
"step": 77
},
{
"epoch": 0.1315345699831366,
"grad_norm": 0.3009475566119484,
"learning_rate": 2.1910112359550563e-05,
"loss": 0.5509,
"step": 78
},
{
"epoch": 0.13322091062394603,
"grad_norm": 0.3919989486891414,
"learning_rate": 2.2191011235955056e-05,
"loss": 0.5489,
"step": 79
},
{
"epoch": 0.13490725126475547,
"grad_norm": 0.3420814355138489,
"learning_rate": 2.2471910112359552e-05,
"loss": 0.5556,
"step": 80
},
{
"epoch": 0.13659359190556492,
"grad_norm": 0.3491052919845026,
"learning_rate": 2.2752808988764045e-05,
"loss": 0.5549,
"step": 81
},
{
"epoch": 0.13827993254637436,
"grad_norm": 0.35387991760875304,
"learning_rate": 2.303370786516854e-05,
"loss": 0.5486,
"step": 82
},
{
"epoch": 0.1399662731871838,
"grad_norm": 0.3864441022271678,
"learning_rate": 2.3314606741573034e-05,
"loss": 0.5384,
"step": 83
},
{
"epoch": 0.14165261382799327,
"grad_norm": 0.3427958684700061,
"learning_rate": 2.359550561797753e-05,
"loss": 0.5399,
"step": 84
},
{
"epoch": 0.1433389544688027,
"grad_norm": 0.3452334549252852,
"learning_rate": 2.3876404494382023e-05,
"loss": 0.5378,
"step": 85
},
{
"epoch": 0.14502529510961215,
"grad_norm": 0.42336780350074216,
"learning_rate": 2.415730337078652e-05,
"loss": 0.5416,
"step": 86
},
{
"epoch": 0.1467116357504216,
"grad_norm": 0.341238202299115,
"learning_rate": 2.4438202247191012e-05,
"loss": 0.5587,
"step": 87
},
{
"epoch": 0.14839797639123103,
"grad_norm": 0.357192951048682,
"learning_rate": 2.4719101123595505e-05,
"loss": 0.5153,
"step": 88
},
{
"epoch": 0.15008431703204048,
"grad_norm": 0.429528094646145,
"learning_rate": 2.5e-05,
"loss": 0.5427,
"step": 89
},
{
"epoch": 0.15177065767284992,
"grad_norm": 0.33591123944757306,
"learning_rate": 2.5280898876404497e-05,
"loss": 0.5435,
"step": 90
},
{
"epoch": 0.15345699831365936,
"grad_norm": 0.430601786111773,
"learning_rate": 2.556179775280899e-05,
"loss": 0.5322,
"step": 91
},
{
"epoch": 0.1551433389544688,
"grad_norm": 0.3358306851062295,
"learning_rate": 2.5842696629213486e-05,
"loss": 0.5364,
"step": 92
},
{
"epoch": 0.15682967959527824,
"grad_norm": 0.3870041027885968,
"learning_rate": 2.6123595505617983e-05,
"loss": 0.5162,
"step": 93
},
{
"epoch": 0.15851602023608768,
"grad_norm": 0.3310317022988692,
"learning_rate": 2.6404494382022472e-05,
"loss": 0.5343,
"step": 94
},
{
"epoch": 0.16020236087689713,
"grad_norm": 0.31435605043103565,
"learning_rate": 2.6685393258426965e-05,
"loss": 0.5458,
"step": 95
},
{
"epoch": 0.16188870151770657,
"grad_norm": 0.31895462371866395,
"learning_rate": 2.696629213483146e-05,
"loss": 0.5387,
"step": 96
},
{
"epoch": 0.163575042158516,
"grad_norm": 0.33243010294837577,
"learning_rate": 2.7247191011235957e-05,
"loss": 0.5153,
"step": 97
},
{
"epoch": 0.16526138279932545,
"grad_norm": 0.32802364042835597,
"learning_rate": 2.752808988764045e-05,
"loss": 0.5429,
"step": 98
},
{
"epoch": 0.16694772344013492,
"grad_norm": 0.3413628299001048,
"learning_rate": 2.7808988764044946e-05,
"loss": 0.5398,
"step": 99
},
{
"epoch": 0.16863406408094436,
"grad_norm": 0.3403836902658499,
"learning_rate": 2.8089887640449443e-05,
"loss": 0.5197,
"step": 100
},
{
"epoch": 0.1703204047217538,
"grad_norm": 0.33690189921055563,
"learning_rate": 2.8370786516853936e-05,
"loss": 0.5266,
"step": 101
},
{
"epoch": 0.17200674536256325,
"grad_norm": 0.34265834509255605,
"learning_rate": 2.8651685393258425e-05,
"loss": 0.5123,
"step": 102
},
{
"epoch": 0.1736930860033727,
"grad_norm": 0.30290512367105066,
"learning_rate": 2.893258426966292e-05,
"loss": 0.512,
"step": 103
},
{
"epoch": 0.17537942664418213,
"grad_norm": 0.3152679845470168,
"learning_rate": 2.9213483146067417e-05,
"loss": 0.5315,
"step": 104
},
{
"epoch": 0.17706576728499157,
"grad_norm": 0.36153831566772826,
"learning_rate": 2.949438202247191e-05,
"loss": 0.5195,
"step": 105
},
{
"epoch": 0.178752107925801,
"grad_norm": 0.3201065064193262,
"learning_rate": 2.9775280898876406e-05,
"loss": 0.5415,
"step": 106
},
{
"epoch": 0.18043844856661045,
"grad_norm": 0.35736910108528097,
"learning_rate": 3.0056179775280903e-05,
"loss": 0.5083,
"step": 107
},
{
"epoch": 0.1821247892074199,
"grad_norm": 0.3692448119416969,
"learning_rate": 3.0337078651685396e-05,
"loss": 0.5213,
"step": 108
},
{
"epoch": 0.18381112984822934,
"grad_norm": 0.3284827088565517,
"learning_rate": 3.061797752808989e-05,
"loss": 0.512,
"step": 109
},
{
"epoch": 0.18549747048903878,
"grad_norm": 0.34971818895710677,
"learning_rate": 3.089887640449438e-05,
"loss": 0.5216,
"step": 110
},
{
"epoch": 0.18718381112984822,
"grad_norm": 0.359706671456223,
"learning_rate": 3.1179775280898874e-05,
"loss": 0.5097,
"step": 111
},
{
"epoch": 0.18887015177065766,
"grad_norm": 0.31627697174210256,
"learning_rate": 3.1460674157303374e-05,
"loss": 0.5163,
"step": 112
},
{
"epoch": 0.1905564924114671,
"grad_norm": 0.29216175832212843,
"learning_rate": 3.1741573033707866e-05,
"loss": 0.5391,
"step": 113
},
{
"epoch": 0.19224283305227655,
"grad_norm": 0.32764391376282487,
"learning_rate": 3.202247191011236e-05,
"loss": 0.5012,
"step": 114
},
{
"epoch": 0.19392917369308602,
"grad_norm": 0.42826805653790595,
"learning_rate": 3.230337078651686e-05,
"loss": 0.5168,
"step": 115
},
{
"epoch": 0.19561551433389546,
"grad_norm": 0.3433775054706283,
"learning_rate": 3.258426966292135e-05,
"loss": 0.5172,
"step": 116
},
{
"epoch": 0.1973018549747049,
"grad_norm": 0.38194080024414423,
"learning_rate": 3.2865168539325845e-05,
"loss": 0.5346,
"step": 117
},
{
"epoch": 0.19898819561551434,
"grad_norm": 0.4384292462018469,
"learning_rate": 3.314606741573034e-05,
"loss": 0.5063,
"step": 118
},
{
"epoch": 0.20067453625632378,
"grad_norm": 0.4221030612516262,
"learning_rate": 3.342696629213483e-05,
"loss": 0.4992,
"step": 119
},
{
"epoch": 0.20236087689713322,
"grad_norm": 0.5169282239706593,
"learning_rate": 3.370786516853933e-05,
"loss": 0.5027,
"step": 120
},
{
"epoch": 0.20404721753794267,
"grad_norm": 0.44828094060244733,
"learning_rate": 3.398876404494382e-05,
"loss": 0.5181,
"step": 121
},
{
"epoch": 0.2057335581787521,
"grad_norm": 0.4110052533087039,
"learning_rate": 3.4269662921348316e-05,
"loss": 0.5261,
"step": 122
},
{
"epoch": 0.20741989881956155,
"grad_norm": 0.3782790860010657,
"learning_rate": 3.455056179775281e-05,
"loss": 0.4979,
"step": 123
},
{
"epoch": 0.209106239460371,
"grad_norm": 0.35048604392267096,
"learning_rate": 3.483146067415731e-05,
"loss": 0.5252,
"step": 124
},
{
"epoch": 0.21079258010118043,
"grad_norm": 0.3668221403195122,
"learning_rate": 3.51123595505618e-05,
"loss": 0.5149,
"step": 125
},
{
"epoch": 0.21247892074198987,
"grad_norm": 0.40260454650546834,
"learning_rate": 3.5393258426966294e-05,
"loss": 0.5106,
"step": 126
},
{
"epoch": 0.21416526138279932,
"grad_norm": 0.46604542272081867,
"learning_rate": 3.5674157303370787e-05,
"loss": 0.5155,
"step": 127
},
{
"epoch": 0.21585160202360876,
"grad_norm": 0.3899275125570308,
"learning_rate": 3.595505617977528e-05,
"loss": 0.5012,
"step": 128
},
{
"epoch": 0.2175379426644182,
"grad_norm": 0.49335563725473797,
"learning_rate": 3.623595505617978e-05,
"loss": 0.5228,
"step": 129
},
{
"epoch": 0.21922428330522767,
"grad_norm": 0.3269454707802011,
"learning_rate": 3.651685393258427e-05,
"loss": 0.5024,
"step": 130
},
{
"epoch": 0.2209106239460371,
"grad_norm": 0.4867838720776066,
"learning_rate": 3.6797752808988765e-05,
"loss": 0.4735,
"step": 131
},
{
"epoch": 0.22259696458684655,
"grad_norm": 0.4634246367958874,
"learning_rate": 3.7078651685393264e-05,
"loss": 0.4915,
"step": 132
},
{
"epoch": 0.224283305227656,
"grad_norm": 0.4892729954570698,
"learning_rate": 3.735955056179776e-05,
"loss": 0.5027,
"step": 133
},
{
"epoch": 0.22596964586846544,
"grad_norm": 0.5748514707897312,
"learning_rate": 3.764044943820225e-05,
"loss": 0.4915,
"step": 134
},
{
"epoch": 0.22765598650927488,
"grad_norm": 0.479626533442717,
"learning_rate": 3.792134831460674e-05,
"loss": 0.4874,
"step": 135
},
{
"epoch": 0.22934232715008432,
"grad_norm": 0.5473804829632216,
"learning_rate": 3.8202247191011236e-05,
"loss": 0.5041,
"step": 136
},
{
"epoch": 0.23102866779089376,
"grad_norm": 0.49996519487113167,
"learning_rate": 3.8483146067415735e-05,
"loss": 0.4808,
"step": 137
},
{
"epoch": 0.2327150084317032,
"grad_norm": 0.5420765917435771,
"learning_rate": 3.876404494382023e-05,
"loss": 0.5,
"step": 138
},
{
"epoch": 0.23440134907251264,
"grad_norm": 0.5229436999215559,
"learning_rate": 3.904494382022472e-05,
"loss": 0.4934,
"step": 139
},
{
"epoch": 0.23608768971332209,
"grad_norm": 0.5275954099593283,
"learning_rate": 3.9325842696629214e-05,
"loss": 0.4989,
"step": 140
},
{
"epoch": 0.23777403035413153,
"grad_norm": 0.42305450719450605,
"learning_rate": 3.960674157303371e-05,
"loss": 0.4933,
"step": 141
},
{
"epoch": 0.23946037099494097,
"grad_norm": 0.40063348620691686,
"learning_rate": 3.98876404494382e-05,
"loss": 0.4981,
"step": 142
},
{
"epoch": 0.2411467116357504,
"grad_norm": 0.4653164060812678,
"learning_rate": 4.01685393258427e-05,
"loss": 0.5054,
"step": 143
},
{
"epoch": 0.24283305227655985,
"grad_norm": 0.4718439452539601,
"learning_rate": 4.044943820224719e-05,
"loss": 0.5181,
"step": 144
},
{
"epoch": 0.24451939291736932,
"grad_norm": 0.45311660232496087,
"learning_rate": 4.0730337078651685e-05,
"loss": 0.4944,
"step": 145
},
{
"epoch": 0.24620573355817876,
"grad_norm": 0.3421077711921803,
"learning_rate": 4.1011235955056184e-05,
"loss": 0.4946,
"step": 146
},
{
"epoch": 0.2478920741989882,
"grad_norm": 0.4741973695340924,
"learning_rate": 4.129213483146068e-05,
"loss": 0.5048,
"step": 147
},
{
"epoch": 0.24957841483979765,
"grad_norm": 0.40653718235510355,
"learning_rate": 4.157303370786517e-05,
"loss": 0.4849,
"step": 148
},
{
"epoch": 0.25126475548060706,
"grad_norm": 0.5158190551546549,
"learning_rate": 4.185393258426967e-05,
"loss": 0.5089,
"step": 149
},
{
"epoch": 0.25295109612141653,
"grad_norm": 0.6037801873244963,
"learning_rate": 4.2134831460674156e-05,
"loss": 0.4955,
"step": 150
},
{
"epoch": 0.25463743676222594,
"grad_norm": 0.5513324822622154,
"learning_rate": 4.2415730337078655e-05,
"loss": 0.492,
"step": 151
},
{
"epoch": 0.2563237774030354,
"grad_norm": 0.32755920076131817,
"learning_rate": 4.269662921348315e-05,
"loss": 0.4823,
"step": 152
},
{
"epoch": 0.2580101180438449,
"grad_norm": 0.3987104376121614,
"learning_rate": 4.297752808988764e-05,
"loss": 0.4861,
"step": 153
},
{
"epoch": 0.2596964586846543,
"grad_norm": 0.43570907360445954,
"learning_rate": 4.3258426966292134e-05,
"loss": 0.482,
"step": 154
},
{
"epoch": 0.26138279932546377,
"grad_norm": 0.49586172159275893,
"learning_rate": 4.353932584269663e-05,
"loss": 0.5087,
"step": 155
},
{
"epoch": 0.2630691399662732,
"grad_norm": 0.43200956217260056,
"learning_rate": 4.3820224719101126e-05,
"loss": 0.5028,
"step": 156
},
{
"epoch": 0.26475548060708265,
"grad_norm": 0.461283086175957,
"learning_rate": 4.410112359550562e-05,
"loss": 0.4883,
"step": 157
},
{
"epoch": 0.26644182124789206,
"grad_norm": 0.44146587137145016,
"learning_rate": 4.438202247191011e-05,
"loss": 0.5036,
"step": 158
},
{
"epoch": 0.26812816188870153,
"grad_norm": 0.46992072065252666,
"learning_rate": 4.4662921348314605e-05,
"loss": 0.5013,
"step": 159
},
{
"epoch": 0.26981450252951095,
"grad_norm": 0.4655499464280253,
"learning_rate": 4.4943820224719104e-05,
"loss": 0.484,
"step": 160
},
{
"epoch": 0.2715008431703204,
"grad_norm": 0.3716614860490022,
"learning_rate": 4.52247191011236e-05,
"loss": 0.479,
"step": 161
},
{
"epoch": 0.27318718381112983,
"grad_norm": 0.4854406076007183,
"learning_rate": 4.550561797752809e-05,
"loss": 0.4832,
"step": 162
},
{
"epoch": 0.2748735244519393,
"grad_norm": 0.518732815595233,
"learning_rate": 4.578651685393259e-05,
"loss": 0.5046,
"step": 163
},
{
"epoch": 0.2765598650927487,
"grad_norm": 0.6286368875144291,
"learning_rate": 4.606741573033708e-05,
"loss": 0.5081,
"step": 164
},
{
"epoch": 0.2782462057335582,
"grad_norm": 0.4499305788696525,
"learning_rate": 4.6348314606741575e-05,
"loss": 0.4655,
"step": 165
},
{
"epoch": 0.2799325463743676,
"grad_norm": 0.5216481086847019,
"learning_rate": 4.662921348314607e-05,
"loss": 0.469,
"step": 166
},
{
"epoch": 0.28161888701517707,
"grad_norm": 0.4372638183735836,
"learning_rate": 4.691011235955056e-05,
"loss": 0.4874,
"step": 167
},
{
"epoch": 0.28330522765598654,
"grad_norm": 0.6373461982457821,
"learning_rate": 4.719101123595506e-05,
"loss": 0.5105,
"step": 168
},
{
"epoch": 0.28499156829679595,
"grad_norm": 0.6523204371028702,
"learning_rate": 4.747191011235955e-05,
"loss": 0.5034,
"step": 169
},
{
"epoch": 0.2866779089376054,
"grad_norm": 0.5846990743822145,
"learning_rate": 4.7752808988764046e-05,
"loss": 0.4764,
"step": 170
},
{
"epoch": 0.28836424957841483,
"grad_norm": 0.42113540601536525,
"learning_rate": 4.803370786516854e-05,
"loss": 0.478,
"step": 171
},
{
"epoch": 0.2900505902192243,
"grad_norm": 0.540824682868501,
"learning_rate": 4.831460674157304e-05,
"loss": 0.4703,
"step": 172
},
{
"epoch": 0.2917369308600337,
"grad_norm": 0.6232981512866863,
"learning_rate": 4.859550561797753e-05,
"loss": 0.4752,
"step": 173
},
{
"epoch": 0.2934232715008432,
"grad_norm": 0.4306794411707756,
"learning_rate": 4.8876404494382024e-05,
"loss": 0.4906,
"step": 174
},
{
"epoch": 0.2951096121416526,
"grad_norm": 0.5163479053494551,
"learning_rate": 4.915730337078652e-05,
"loss": 0.4697,
"step": 175
},
{
"epoch": 0.29679595278246207,
"grad_norm": 0.5847728472881939,
"learning_rate": 4.943820224719101e-05,
"loss": 0.4947,
"step": 176
},
{
"epoch": 0.2984822934232715,
"grad_norm": 0.4312351326528099,
"learning_rate": 4.971910112359551e-05,
"loss": 0.4968,
"step": 177
},
{
"epoch": 0.30016863406408095,
"grad_norm": 0.6653668186872613,
"learning_rate": 5e-05,
"loss": 0.5176,
"step": 178
},
{
"epoch": 0.30185497470489037,
"grad_norm": 0.8558829245058476,
"learning_rate": 4.996876951905059e-05,
"loss": 0.4898,
"step": 179
},
{
"epoch": 0.30354131534569984,
"grad_norm": 0.4911849753770153,
"learning_rate": 4.993753903810119e-05,
"loss": 0.4849,
"step": 180
},
{
"epoch": 0.30522765598650925,
"grad_norm": 0.6143613254381958,
"learning_rate": 4.990630855715178e-05,
"loss": 0.4593,
"step": 181
},
{
"epoch": 0.3069139966273187,
"grad_norm": 0.6563084412266731,
"learning_rate": 4.9875078076202377e-05,
"loss": 0.4801,
"step": 182
},
{
"epoch": 0.3086003372681282,
"grad_norm": 0.4457353642314339,
"learning_rate": 4.984384759525297e-05,
"loss": 0.4749,
"step": 183
},
{
"epoch": 0.3102866779089376,
"grad_norm": 0.4828788641392981,
"learning_rate": 4.9812617114303564e-05,
"loss": 0.4797,
"step": 184
},
{
"epoch": 0.31197301854974707,
"grad_norm": 0.5791211620153529,
"learning_rate": 4.9781386633354154e-05,
"loss": 0.4789,
"step": 185
},
{
"epoch": 0.3136593591905565,
"grad_norm": 0.43012457307157376,
"learning_rate": 4.975015615240475e-05,
"loss": 0.4832,
"step": 186
},
{
"epoch": 0.31534569983136596,
"grad_norm": 0.6278035890937275,
"learning_rate": 4.971892567145534e-05,
"loss": 0.4661,
"step": 187
},
{
"epoch": 0.31703204047217537,
"grad_norm": 0.4457342787463174,
"learning_rate": 4.968769519050593e-05,
"loss": 0.4695,
"step": 188
},
{
"epoch": 0.31871838111298484,
"grad_norm": 0.4543503799114186,
"learning_rate": 4.965646470955653e-05,
"loss": 0.4769,
"step": 189
},
{
"epoch": 0.32040472175379425,
"grad_norm": 0.4948122735470389,
"learning_rate": 4.962523422860712e-05,
"loss": 0.4682,
"step": 190
},
{
"epoch": 0.3220910623946037,
"grad_norm": 0.48362708618764566,
"learning_rate": 4.959400374765772e-05,
"loss": 0.4601,
"step": 191
},
{
"epoch": 0.32377740303541314,
"grad_norm": 0.4130937815682131,
"learning_rate": 4.956277326670831e-05,
"loss": 0.4719,
"step": 192
},
{
"epoch": 0.3254637436762226,
"grad_norm": 0.5198231324227804,
"learning_rate": 4.95315427857589e-05,
"loss": 0.4722,
"step": 193
},
{
"epoch": 0.327150084317032,
"grad_norm": 0.4651378767011854,
"learning_rate": 4.95003123048095e-05,
"loss": 0.4535,
"step": 194
},
{
"epoch": 0.3288364249578415,
"grad_norm": 0.39247629815392066,
"learning_rate": 4.946908182386009e-05,
"loss": 0.4832,
"step": 195
},
{
"epoch": 0.3305227655986509,
"grad_norm": 0.5667870862895168,
"learning_rate": 4.9437851342910686e-05,
"loss": 0.4903,
"step": 196
},
{
"epoch": 0.33220910623946037,
"grad_norm": 0.5189515974550599,
"learning_rate": 4.9406620861961276e-05,
"loss": 0.4922,
"step": 197
},
{
"epoch": 0.33389544688026984,
"grad_norm": 0.41445820324605953,
"learning_rate": 4.937539038101187e-05,
"loss": 0.4877,
"step": 198
},
{
"epoch": 0.33558178752107926,
"grad_norm": 0.5690605331975456,
"learning_rate": 4.9344159900062464e-05,
"loss": 0.4861,
"step": 199
},
{
"epoch": 0.3372681281618887,
"grad_norm": 0.43127216298664417,
"learning_rate": 4.931292941911306e-05,
"loss": 0.4736,
"step": 200
},
{
"epoch": 0.33895446880269814,
"grad_norm": 0.5088487302873465,
"learning_rate": 4.928169893816365e-05,
"loss": 0.4643,
"step": 201
},
{
"epoch": 0.3406408094435076,
"grad_norm": 0.4022315371323236,
"learning_rate": 4.925046845721424e-05,
"loss": 0.4712,
"step": 202
},
{
"epoch": 0.342327150084317,
"grad_norm": 0.5435438730616811,
"learning_rate": 4.921923797626484e-05,
"loss": 0.4883,
"step": 203
},
{
"epoch": 0.3440134907251265,
"grad_norm": 0.5647135768046861,
"learning_rate": 4.918800749531543e-05,
"loss": 0.4874,
"step": 204
},
{
"epoch": 0.3456998313659359,
"grad_norm": 0.3975530293289323,
"learning_rate": 4.9156777014366025e-05,
"loss": 0.4824,
"step": 205
},
{
"epoch": 0.3473861720067454,
"grad_norm": 0.6242457799816916,
"learning_rate": 4.9125546533416615e-05,
"loss": 0.47,
"step": 206
},
{
"epoch": 0.3490725126475548,
"grad_norm": 0.5011673819192746,
"learning_rate": 4.909431605246721e-05,
"loss": 0.4678,
"step": 207
},
{
"epoch": 0.35075885328836426,
"grad_norm": 0.41505977699214663,
"learning_rate": 4.90630855715178e-05,
"loss": 0.4623,
"step": 208
},
{
"epoch": 0.3524451939291737,
"grad_norm": 0.6222052645158483,
"learning_rate": 4.90318550905684e-05,
"loss": 0.4939,
"step": 209
},
{
"epoch": 0.35413153456998314,
"grad_norm": 0.4163461583203549,
"learning_rate": 4.900062460961899e-05,
"loss": 0.4708,
"step": 210
},
{
"epoch": 0.35581787521079256,
"grad_norm": 0.6235836351033357,
"learning_rate": 4.896939412866958e-05,
"loss": 0.462,
"step": 211
},
{
"epoch": 0.357504215851602,
"grad_norm": 0.3903487157898828,
"learning_rate": 4.8938163647720176e-05,
"loss": 0.4799,
"step": 212
},
{
"epoch": 0.3591905564924115,
"grad_norm": 0.5806378172072378,
"learning_rate": 4.8906933166770766e-05,
"loss": 0.4917,
"step": 213
},
{
"epoch": 0.3608768971332209,
"grad_norm": 0.3985626907622682,
"learning_rate": 4.887570268582136e-05,
"loss": 0.4815,
"step": 214
},
{
"epoch": 0.3625632377740304,
"grad_norm": 0.536198940838733,
"learning_rate": 4.8844472204871954e-05,
"loss": 0.4729,
"step": 215
},
{
"epoch": 0.3642495784148398,
"grad_norm": 0.4329558877769293,
"learning_rate": 4.881324172392255e-05,
"loss": 0.4825,
"step": 216
},
{
"epoch": 0.36593591905564926,
"grad_norm": 0.4094546659846234,
"learning_rate": 4.878201124297315e-05,
"loss": 0.4709,
"step": 217
},
{
"epoch": 0.3676222596964587,
"grad_norm": 0.4900231997961044,
"learning_rate": 4.875078076202374e-05,
"loss": 0.4806,
"step": 218
},
{
"epoch": 0.36930860033726814,
"grad_norm": 1.3182340985233345,
"learning_rate": 4.8719550281074335e-05,
"loss": 0.4983,
"step": 219
},
{
"epoch": 0.37099494097807756,
"grad_norm": 0.5623706911628698,
"learning_rate": 4.8688319800124925e-05,
"loss": 0.4746,
"step": 220
},
{
"epoch": 0.37268128161888703,
"grad_norm": 0.7551184358602725,
"learning_rate": 4.865708931917552e-05,
"loss": 0.4876,
"step": 221
},
{
"epoch": 0.37436762225969644,
"grad_norm": 0.5124638039352849,
"learning_rate": 4.862585883822611e-05,
"loss": 0.4817,
"step": 222
},
{
"epoch": 0.3760539629005059,
"grad_norm": 0.5748662022626354,
"learning_rate": 4.859462835727671e-05,
"loss": 0.4873,
"step": 223
},
{
"epoch": 0.3777403035413153,
"grad_norm": 0.4995856424465425,
"learning_rate": 4.85633978763273e-05,
"loss": 0.4841,
"step": 224
},
{
"epoch": 0.3794266441821248,
"grad_norm": 0.5492555776305382,
"learning_rate": 4.853216739537789e-05,
"loss": 0.4572,
"step": 225
},
{
"epoch": 0.3811129848229342,
"grad_norm": 0.6217180425514838,
"learning_rate": 4.8500936914428486e-05,
"loss": 0.4873,
"step": 226
},
{
"epoch": 0.3827993254637437,
"grad_norm": 0.753543701418373,
"learning_rate": 4.8469706433479076e-05,
"loss": 0.4677,
"step": 227
},
{
"epoch": 0.3844856661045531,
"grad_norm": 0.39884775752702056,
"learning_rate": 4.843847595252967e-05,
"loss": 0.4853,
"step": 228
},
{
"epoch": 0.38617200674536256,
"grad_norm": 0.8356414158467076,
"learning_rate": 4.840724547158026e-05,
"loss": 0.4842,
"step": 229
},
{
"epoch": 0.38785834738617203,
"grad_norm": 0.4690693833719156,
"learning_rate": 4.837601499063086e-05,
"loss": 0.461,
"step": 230
},
{
"epoch": 0.38954468802698144,
"grad_norm": 0.5808095607024085,
"learning_rate": 4.834478450968145e-05,
"loss": 0.4627,
"step": 231
},
{
"epoch": 0.3912310286677909,
"grad_norm": 0.9887565739758644,
"learning_rate": 4.831355402873205e-05,
"loss": 0.5001,
"step": 232
},
{
"epoch": 0.39291736930860033,
"grad_norm": 0.3624233562603427,
"learning_rate": 4.828232354778264e-05,
"loss": 0.4391,
"step": 233
},
{
"epoch": 0.3946037099494098,
"grad_norm": 0.5710016977364857,
"learning_rate": 4.825109306683323e-05,
"loss": 0.4783,
"step": 234
},
{
"epoch": 0.3962900505902192,
"grad_norm": 0.40833038130567695,
"learning_rate": 4.8219862585883825e-05,
"loss": 0.4642,
"step": 235
},
{
"epoch": 0.3979763912310287,
"grad_norm": 0.47027421121390883,
"learning_rate": 4.8188632104934415e-05,
"loss": 0.4698,
"step": 236
},
{
"epoch": 0.3996627318718381,
"grad_norm": 0.4210977303820093,
"learning_rate": 4.815740162398501e-05,
"loss": 0.4651,
"step": 237
},
{
"epoch": 0.40134907251264756,
"grad_norm": 0.43664310696121306,
"learning_rate": 4.81261711430356e-05,
"loss": 0.4669,
"step": 238
},
{
"epoch": 0.403035413153457,
"grad_norm": 0.5887659356197822,
"learning_rate": 4.80949406620862e-05,
"loss": 0.4751,
"step": 239
},
{
"epoch": 0.40472175379426645,
"grad_norm": 0.4374894796331959,
"learning_rate": 4.806371018113679e-05,
"loss": 0.4532,
"step": 240
},
{
"epoch": 0.40640809443507586,
"grad_norm": 0.5253351044852949,
"learning_rate": 4.8032479700187386e-05,
"loss": 0.471,
"step": 241
},
{
"epoch": 0.40809443507588533,
"grad_norm": 0.45194720426123525,
"learning_rate": 4.8001249219237976e-05,
"loss": 0.4981,
"step": 242
},
{
"epoch": 0.40978077571669475,
"grad_norm": 0.4873740274970084,
"learning_rate": 4.797001873828857e-05,
"loss": 0.4761,
"step": 243
},
{
"epoch": 0.4114671163575042,
"grad_norm": 0.464630536061398,
"learning_rate": 4.793878825733917e-05,
"loss": 0.4561,
"step": 244
},
{
"epoch": 0.4131534569983137,
"grad_norm": 0.386296619575784,
"learning_rate": 4.790755777638976e-05,
"loss": 0.4658,
"step": 245
},
{
"epoch": 0.4148397976391231,
"grad_norm": 0.5513350789766529,
"learning_rate": 4.787632729544036e-05,
"loss": 0.4735,
"step": 246
},
{
"epoch": 0.41652613827993257,
"grad_norm": 0.39229874724094077,
"learning_rate": 4.784509681449095e-05,
"loss": 0.4645,
"step": 247
},
{
"epoch": 0.418212478920742,
"grad_norm": 0.6182543773392607,
"learning_rate": 4.781386633354154e-05,
"loss": 0.4798,
"step": 248
},
{
"epoch": 0.41989881956155145,
"grad_norm": 0.5517604855595091,
"learning_rate": 4.7782635852592134e-05,
"loss": 0.4535,
"step": 249
},
{
"epoch": 0.42158516020236086,
"grad_norm": 0.44676355561808895,
"learning_rate": 4.7751405371642724e-05,
"loss": 0.4565,
"step": 250
},
{
"epoch": 0.42327150084317033,
"grad_norm": 0.545302526098748,
"learning_rate": 4.772017489069332e-05,
"loss": 0.4518,
"step": 251
},
{
"epoch": 0.42495784148397975,
"grad_norm": 0.38455720972294166,
"learning_rate": 4.768894440974391e-05,
"loss": 0.4812,
"step": 252
},
{
"epoch": 0.4266441821247892,
"grad_norm": 0.6308884761349483,
"learning_rate": 4.765771392879451e-05,
"loss": 0.4763,
"step": 253
},
{
"epoch": 0.42833052276559863,
"grad_norm": 0.4252859223489624,
"learning_rate": 4.76264834478451e-05,
"loss": 0.4708,
"step": 254
},
{
"epoch": 0.4300168634064081,
"grad_norm": 0.4588786362077238,
"learning_rate": 4.7595252966895696e-05,
"loss": 0.4524,
"step": 255
},
{
"epoch": 0.4317032040472175,
"grad_norm": 0.5056357623998745,
"learning_rate": 4.7564022485946286e-05,
"loss": 0.4857,
"step": 256
},
{
"epoch": 0.433389544688027,
"grad_norm": 0.36885042655172334,
"learning_rate": 4.7532792004996876e-05,
"loss": 0.4532,
"step": 257
},
{
"epoch": 0.4350758853288364,
"grad_norm": 0.47099160176857435,
"learning_rate": 4.750156152404747e-05,
"loss": 0.4621,
"step": 258
},
{
"epoch": 0.43676222596964587,
"grad_norm": 0.4003081993445989,
"learning_rate": 4.747033104309806e-05,
"loss": 0.4538,
"step": 259
},
{
"epoch": 0.43844856661045534,
"grad_norm": 0.34772689545249436,
"learning_rate": 4.743910056214866e-05,
"loss": 0.4617,
"step": 260
},
{
"epoch": 0.44013490725126475,
"grad_norm": 0.5217549125371695,
"learning_rate": 4.740787008119925e-05,
"loss": 0.4809,
"step": 261
},
{
"epoch": 0.4418212478920742,
"grad_norm": 0.3865859039529573,
"learning_rate": 4.737663960024985e-05,
"loss": 0.4723,
"step": 262
},
{
"epoch": 0.44350758853288363,
"grad_norm": 0.41060711895278124,
"learning_rate": 4.734540911930044e-05,
"loss": 0.4633,
"step": 263
},
{
"epoch": 0.4451939291736931,
"grad_norm": 0.35590597776520944,
"learning_rate": 4.7314178638351034e-05,
"loss": 0.4743,
"step": 264
},
{
"epoch": 0.4468802698145025,
"grad_norm": 0.44077025461728286,
"learning_rate": 4.7282948157401624e-05,
"loss": 0.4618,
"step": 265
},
{
"epoch": 0.448566610455312,
"grad_norm": 0.3914683036988655,
"learning_rate": 4.7251717676452214e-05,
"loss": 0.4837,
"step": 266
},
{
"epoch": 0.4502529510961214,
"grad_norm": 0.39989639702012575,
"learning_rate": 4.722048719550281e-05,
"loss": 0.473,
"step": 267
},
{
"epoch": 0.45193929173693087,
"grad_norm": 0.35245741880842596,
"learning_rate": 4.71892567145534e-05,
"loss": 0.4687,
"step": 268
},
{
"epoch": 0.4536256323777403,
"grad_norm": 0.4526741581262521,
"learning_rate": 4.7158026233604005e-05,
"loss": 0.4515,
"step": 269
},
{
"epoch": 0.45531197301854975,
"grad_norm": 0.365110364415945,
"learning_rate": 4.7126795752654595e-05,
"loss": 0.47,
"step": 270
},
{
"epoch": 0.45699831365935917,
"grad_norm": 0.46637818011624854,
"learning_rate": 4.7095565271705186e-05,
"loss": 0.4474,
"step": 271
},
{
"epoch": 0.45868465430016864,
"grad_norm": 0.44190547233976,
"learning_rate": 4.706433479075578e-05,
"loss": 0.4803,
"step": 272
},
{
"epoch": 0.46037099494097805,
"grad_norm": 0.4875433864770596,
"learning_rate": 4.703310430980637e-05,
"loss": 0.4593,
"step": 273
},
{
"epoch": 0.4620573355817875,
"grad_norm": 0.39643388270510754,
"learning_rate": 4.700187382885697e-05,
"loss": 0.4669,
"step": 274
},
{
"epoch": 0.463743676222597,
"grad_norm": 0.5625953425266915,
"learning_rate": 4.697064334790756e-05,
"loss": 0.444,
"step": 275
},
{
"epoch": 0.4654300168634064,
"grad_norm": 0.3682394173749922,
"learning_rate": 4.693941286695816e-05,
"loss": 0.4582,
"step": 276
},
{
"epoch": 0.4671163575042159,
"grad_norm": 0.5666864728568041,
"learning_rate": 4.690818238600875e-05,
"loss": 0.4627,
"step": 277
},
{
"epoch": 0.4688026981450253,
"grad_norm": 0.4064111657157596,
"learning_rate": 4.6876951905059344e-05,
"loss": 0.4824,
"step": 278
},
{
"epoch": 0.47048903878583476,
"grad_norm": 0.45845078406278744,
"learning_rate": 4.6845721424109934e-05,
"loss": 0.4478,
"step": 279
},
{
"epoch": 0.47217537942664417,
"grad_norm": 0.548590898565881,
"learning_rate": 4.6814490943160524e-05,
"loss": 0.4594,
"step": 280
},
{
"epoch": 0.47386172006745364,
"grad_norm": 0.3886498640345789,
"learning_rate": 4.678326046221112e-05,
"loss": 0.4453,
"step": 281
},
{
"epoch": 0.47554806070826305,
"grad_norm": 0.5103744392911466,
"learning_rate": 4.675202998126171e-05,
"loss": 0.4448,
"step": 282
},
{
"epoch": 0.4772344013490725,
"grad_norm": 0.4518474291293266,
"learning_rate": 4.672079950031231e-05,
"loss": 0.46,
"step": 283
},
{
"epoch": 0.47892074198988194,
"grad_norm": 0.5456343340353047,
"learning_rate": 4.66895690193629e-05,
"loss": 0.4566,
"step": 284
},
{
"epoch": 0.4806070826306914,
"grad_norm": 0.4566447835972022,
"learning_rate": 4.6658338538413495e-05,
"loss": 0.4603,
"step": 285
},
{
"epoch": 0.4822934232715008,
"grad_norm": 0.4746694572858959,
"learning_rate": 4.6627108057464085e-05,
"loss": 0.4512,
"step": 286
},
{
"epoch": 0.4839797639123103,
"grad_norm": 0.5188723656216718,
"learning_rate": 4.659587757651468e-05,
"loss": 0.4825,
"step": 287
},
{
"epoch": 0.4856661045531197,
"grad_norm": 0.4402398441258149,
"learning_rate": 4.656464709556527e-05,
"loss": 0.4304,
"step": 288
},
{
"epoch": 0.4873524451939292,
"grad_norm": 0.5613408976197366,
"learning_rate": 4.653341661461586e-05,
"loss": 0.4701,
"step": 289
},
{
"epoch": 0.48903878583473864,
"grad_norm": 0.5401648721182448,
"learning_rate": 4.650218613366646e-05,
"loss": 0.4551,
"step": 290
},
{
"epoch": 0.49072512647554806,
"grad_norm": 0.5791278786888271,
"learning_rate": 4.647095565271705e-05,
"loss": 0.4511,
"step": 291
},
{
"epoch": 0.4924114671163575,
"grad_norm": 0.5246744706582873,
"learning_rate": 4.643972517176765e-05,
"loss": 0.4549,
"step": 292
},
{
"epoch": 0.49409780775716694,
"grad_norm": 0.7164081142152631,
"learning_rate": 4.640849469081824e-05,
"loss": 0.4674,
"step": 293
},
{
"epoch": 0.4957841483979764,
"grad_norm": 0.40548449726070296,
"learning_rate": 4.6377264209868834e-05,
"loss": 0.4571,
"step": 294
},
{
"epoch": 0.4974704890387858,
"grad_norm": 0.6593403514913315,
"learning_rate": 4.634603372891943e-05,
"loss": 0.462,
"step": 295
},
{
"epoch": 0.4991568296795953,
"grad_norm": 0.39387605629333194,
"learning_rate": 4.631480324797002e-05,
"loss": 0.4535,
"step": 296
},
{
"epoch": 0.5008431703204047,
"grad_norm": 0.6989949061990584,
"learning_rate": 4.628357276702062e-05,
"loss": 0.4548,
"step": 297
},
{
"epoch": 0.5025295109612141,
"grad_norm": 0.5728394753919458,
"learning_rate": 4.625234228607121e-05,
"loss": 0.4644,
"step": 298
},
{
"epoch": 0.5042158516020236,
"grad_norm": 0.5792715131221312,
"learning_rate": 4.6221111805121805e-05,
"loss": 0.4451,
"step": 299
},
{
"epoch": 0.5059021922428331,
"grad_norm": 0.4586042579159975,
"learning_rate": 4.6189881324172395e-05,
"loss": 0.4397,
"step": 300
},
{
"epoch": 0.5075885328836425,
"grad_norm": 0.5912133606348081,
"learning_rate": 4.615865084322299e-05,
"loss": 0.4581,
"step": 301
},
{
"epoch": 0.5092748735244519,
"grad_norm": 0.4840692893259534,
"learning_rate": 4.612742036227358e-05,
"loss": 0.47,
"step": 302
},
{
"epoch": 0.5109612141652614,
"grad_norm": 0.628029656964694,
"learning_rate": 4.609618988132417e-05,
"loss": 0.4657,
"step": 303
},
{
"epoch": 0.5126475548060708,
"grad_norm": 0.5119987303703423,
"learning_rate": 4.606495940037477e-05,
"loss": 0.45,
"step": 304
},
{
"epoch": 0.5143338954468802,
"grad_norm": 0.5538879969713244,
"learning_rate": 4.603372891942536e-05,
"loss": 0.4494,
"step": 305
},
{
"epoch": 0.5160202360876898,
"grad_norm": 0.6059271546930384,
"learning_rate": 4.6002498438475956e-05,
"loss": 0.451,
"step": 306
},
{
"epoch": 0.5177065767284992,
"grad_norm": 0.40827887502082827,
"learning_rate": 4.5971267957526547e-05,
"loss": 0.4539,
"step": 307
},
{
"epoch": 0.5193929173693086,
"grad_norm": 0.5394798456701918,
"learning_rate": 4.5940037476577143e-05,
"loss": 0.4657,
"step": 308
},
{
"epoch": 0.521079258010118,
"grad_norm": 0.37887469267167845,
"learning_rate": 4.5908806995627734e-05,
"loss": 0.4708,
"step": 309
},
{
"epoch": 0.5227655986509275,
"grad_norm": 0.47009884438847327,
"learning_rate": 4.587757651467833e-05,
"loss": 0.4557,
"step": 310
},
{
"epoch": 0.524451939291737,
"grad_norm": 0.4408946597680707,
"learning_rate": 4.584634603372892e-05,
"loss": 0.454,
"step": 311
},
{
"epoch": 0.5261382799325464,
"grad_norm": 0.43048975043678045,
"learning_rate": 4.581511555277951e-05,
"loss": 0.4565,
"step": 312
},
{
"epoch": 0.5278246205733558,
"grad_norm": 0.5519067163880373,
"learning_rate": 4.578388507183011e-05,
"loss": 0.4584,
"step": 313
},
{
"epoch": 0.5295109612141653,
"grad_norm": 0.4933302641718265,
"learning_rate": 4.57526545908807e-05,
"loss": 0.4631,
"step": 314
},
{
"epoch": 0.5311973018549747,
"grad_norm": 0.4571212964029398,
"learning_rate": 4.5721424109931295e-05,
"loss": 0.4403,
"step": 315
},
{
"epoch": 0.5328836424957841,
"grad_norm": 0.457978654022723,
"learning_rate": 4.5690193628981885e-05,
"loss": 0.4567,
"step": 316
},
{
"epoch": 0.5345699831365935,
"grad_norm": 0.4789628002820523,
"learning_rate": 4.565896314803248e-05,
"loss": 0.4647,
"step": 317
},
{
"epoch": 0.5362563237774031,
"grad_norm": 0.5030611987631772,
"learning_rate": 4.562773266708307e-05,
"loss": 0.4472,
"step": 318
},
{
"epoch": 0.5379426644182125,
"grad_norm": 0.537594026859627,
"learning_rate": 4.559650218613367e-05,
"loss": 0.4537,
"step": 319
},
{
"epoch": 0.5396290050590219,
"grad_norm": 0.545365577796258,
"learning_rate": 4.556527170518426e-05,
"loss": 0.452,
"step": 320
},
{
"epoch": 0.5413153456998314,
"grad_norm": 0.5272307911586873,
"learning_rate": 4.5534041224234856e-05,
"loss": 0.4507,
"step": 321
},
{
"epoch": 0.5430016863406408,
"grad_norm": 0.43739398109220595,
"learning_rate": 4.550281074328545e-05,
"loss": 0.4607,
"step": 322
},
{
"epoch": 0.5446880269814502,
"grad_norm": 0.5883556732443432,
"learning_rate": 4.547158026233604e-05,
"loss": 0.4674,
"step": 323
},
{
"epoch": 0.5463743676222597,
"grad_norm": 0.4259537371040564,
"learning_rate": 4.544034978138664e-05,
"loss": 0.4711,
"step": 324
},
{
"epoch": 0.5480607082630692,
"grad_norm": 0.47670365038826346,
"learning_rate": 4.540911930043723e-05,
"loss": 0.453,
"step": 325
},
{
"epoch": 0.5497470489038786,
"grad_norm": 0.3898528313656299,
"learning_rate": 4.537788881948782e-05,
"loss": 0.4404,
"step": 326
},
{
"epoch": 0.551433389544688,
"grad_norm": 0.39330434537215003,
"learning_rate": 4.534665833853842e-05,
"loss": 0.4659,
"step": 327
},
{
"epoch": 0.5531197301854974,
"grad_norm": 0.5271047700192718,
"learning_rate": 4.531542785758901e-05,
"loss": 0.4679,
"step": 328
},
{
"epoch": 0.554806070826307,
"grad_norm": 0.43702194412823087,
"learning_rate": 4.5284197376639605e-05,
"loss": 0.4458,
"step": 329
},
{
"epoch": 0.5564924114671164,
"grad_norm": 0.4942049663231375,
"learning_rate": 4.5252966895690195e-05,
"loss": 0.4782,
"step": 330
},
{
"epoch": 0.5581787521079258,
"grad_norm": 0.47280783766806017,
"learning_rate": 4.522173641474079e-05,
"loss": 0.469,
"step": 331
},
{
"epoch": 0.5598650927487352,
"grad_norm": 0.49600299133397724,
"learning_rate": 4.519050593379138e-05,
"loss": 0.4442,
"step": 332
},
{
"epoch": 0.5615514333895447,
"grad_norm": 0.3778585896882259,
"learning_rate": 4.515927545284198e-05,
"loss": 0.4349,
"step": 333
},
{
"epoch": 0.5632377740303541,
"grad_norm": 0.6634881870711851,
"learning_rate": 4.512804497189257e-05,
"loss": 0.4493,
"step": 334
},
{
"epoch": 0.5649241146711635,
"grad_norm": 0.3529106630733103,
"learning_rate": 4.509681449094316e-05,
"loss": 0.4597,
"step": 335
},
{
"epoch": 0.5666104553119731,
"grad_norm": 0.5092975219112303,
"learning_rate": 4.5065584009993756e-05,
"loss": 0.4467,
"step": 336
},
{
"epoch": 0.5682967959527825,
"grad_norm": 0.4391198480506805,
"learning_rate": 4.5034353529044346e-05,
"loss": 0.4532,
"step": 337
},
{
"epoch": 0.5699831365935919,
"grad_norm": 0.5198814283532447,
"learning_rate": 4.500312304809494e-05,
"loss": 0.4421,
"step": 338
},
{
"epoch": 0.5716694772344013,
"grad_norm": 0.46667997995117966,
"learning_rate": 4.497189256714553e-05,
"loss": 0.4549,
"step": 339
},
{
"epoch": 0.5733558178752108,
"grad_norm": 0.4975478516344748,
"learning_rate": 4.494066208619613e-05,
"loss": 0.4443,
"step": 340
},
{
"epoch": 0.5750421585160203,
"grad_norm": 0.3894373802203641,
"learning_rate": 4.490943160524672e-05,
"loss": 0.4493,
"step": 341
},
{
"epoch": 0.5767284991568297,
"grad_norm": 0.4992453837229512,
"learning_rate": 4.487820112429732e-05,
"loss": 0.4429,
"step": 342
},
{
"epoch": 0.5784148397976391,
"grad_norm": 0.3647456589580748,
"learning_rate": 4.484697064334791e-05,
"loss": 0.4382,
"step": 343
},
{
"epoch": 0.5801011804384486,
"grad_norm": 0.4516326357024824,
"learning_rate": 4.48157401623985e-05,
"loss": 0.4705,
"step": 344
},
{
"epoch": 0.581787521079258,
"grad_norm": 0.35544882261647254,
"learning_rate": 4.4784509681449095e-05,
"loss": 0.4627,
"step": 345
},
{
"epoch": 0.5834738617200674,
"grad_norm": 0.37857935514451707,
"learning_rate": 4.475327920049969e-05,
"loss": 0.4502,
"step": 346
},
{
"epoch": 0.5851602023608768,
"grad_norm": 0.3829292936307298,
"learning_rate": 4.472204871955029e-05,
"loss": 0.4521,
"step": 347
},
{
"epoch": 0.5868465430016864,
"grad_norm": 0.3960422907404094,
"learning_rate": 4.469081823860088e-05,
"loss": 0.4592,
"step": 348
},
{
"epoch": 0.5885328836424958,
"grad_norm": 0.5180788477319964,
"learning_rate": 4.465958775765147e-05,
"loss": 0.4622,
"step": 349
},
{
"epoch": 0.5902192242833052,
"grad_norm": 0.383759399659539,
"learning_rate": 4.4628357276702066e-05,
"loss": 0.4565,
"step": 350
},
{
"epoch": 0.5919055649241147,
"grad_norm": 0.4755348626514461,
"learning_rate": 4.4597126795752656e-05,
"loss": 0.461,
"step": 351
},
{
"epoch": 0.5935919055649241,
"grad_norm": 0.40158049133043633,
"learning_rate": 4.456589631480325e-05,
"loss": 0.4537,
"step": 352
},
{
"epoch": 0.5952782462057336,
"grad_norm": 0.4375462386612478,
"learning_rate": 4.453466583385384e-05,
"loss": 0.452,
"step": 353
},
{
"epoch": 0.596964586846543,
"grad_norm": 0.4109536436881464,
"learning_rate": 4.450343535290444e-05,
"loss": 0.4544,
"step": 354
},
{
"epoch": 0.5986509274873525,
"grad_norm": 0.4129498203971123,
"learning_rate": 4.447220487195503e-05,
"loss": 0.4541,
"step": 355
},
{
"epoch": 0.6003372681281619,
"grad_norm": 0.3642496853825518,
"learning_rate": 4.444097439100563e-05,
"loss": 0.4276,
"step": 356
},
{
"epoch": 0.6020236087689713,
"grad_norm": 0.3885225374210196,
"learning_rate": 4.440974391005622e-05,
"loss": 0.4401,
"step": 357
},
{
"epoch": 0.6037099494097807,
"grad_norm": 0.4330664385703508,
"learning_rate": 4.437851342910681e-05,
"loss": 0.4631,
"step": 358
},
{
"epoch": 0.6053962900505903,
"grad_norm": 0.41560861442665614,
"learning_rate": 4.4347282948157404e-05,
"loss": 0.4576,
"step": 359
},
{
"epoch": 0.6070826306913997,
"grad_norm": 0.36879647476749433,
"learning_rate": 4.4316052467207994e-05,
"loss": 0.4561,
"step": 360
},
{
"epoch": 0.6087689713322091,
"grad_norm": 0.43808306623569127,
"learning_rate": 4.428482198625859e-05,
"loss": 0.4575,
"step": 361
},
{
"epoch": 0.6104553119730185,
"grad_norm": 0.3525734460941292,
"learning_rate": 4.425359150530918e-05,
"loss": 0.4326,
"step": 362
},
{
"epoch": 0.612141652613828,
"grad_norm": 0.45284917798855845,
"learning_rate": 4.422236102435978e-05,
"loss": 0.4624,
"step": 363
},
{
"epoch": 0.6138279932546374,
"grad_norm": 0.42565642217457994,
"learning_rate": 4.419113054341037e-05,
"loss": 0.4602,
"step": 364
},
{
"epoch": 0.6155143338954469,
"grad_norm": 0.40695683541052846,
"learning_rate": 4.4159900062460966e-05,
"loss": 0.4602,
"step": 365
},
{
"epoch": 0.6172006745362564,
"grad_norm": 0.39479177428957435,
"learning_rate": 4.4128669581511556e-05,
"loss": 0.4303,
"step": 366
},
{
"epoch": 0.6188870151770658,
"grad_norm": 0.4103584712960603,
"learning_rate": 4.4097439100562146e-05,
"loss": 0.4641,
"step": 367
},
{
"epoch": 0.6205733558178752,
"grad_norm": 0.40114012742268623,
"learning_rate": 4.406620861961274e-05,
"loss": 0.4543,
"step": 368
},
{
"epoch": 0.6222596964586846,
"grad_norm": 0.4671847767961667,
"learning_rate": 4.403497813866333e-05,
"loss": 0.4576,
"step": 369
},
{
"epoch": 0.6239460370994941,
"grad_norm": 0.4214940061838059,
"learning_rate": 4.400374765771393e-05,
"loss": 0.4655,
"step": 370
},
{
"epoch": 0.6256323777403036,
"grad_norm": 0.4036379096633672,
"learning_rate": 4.397251717676452e-05,
"loss": 0.4621,
"step": 371
},
{
"epoch": 0.627318718381113,
"grad_norm": 0.43480686992150475,
"learning_rate": 4.394128669581512e-05,
"loss": 0.4472,
"step": 372
},
{
"epoch": 0.6290050590219224,
"grad_norm": 0.34172767802821746,
"learning_rate": 4.3910056214865714e-05,
"loss": 0.44,
"step": 373
},
{
"epoch": 0.6306913996627319,
"grad_norm": 0.4519665422947456,
"learning_rate": 4.3878825733916304e-05,
"loss": 0.4242,
"step": 374
},
{
"epoch": 0.6323777403035413,
"grad_norm": 0.33989642815100785,
"learning_rate": 4.38475952529669e-05,
"loss": 0.4621,
"step": 375
},
{
"epoch": 0.6340640809443507,
"grad_norm": 0.5051501147469363,
"learning_rate": 4.381636477201749e-05,
"loss": 0.447,
"step": 376
},
{
"epoch": 0.6357504215851602,
"grad_norm": 0.3114342404571123,
"learning_rate": 4.378513429106809e-05,
"loss": 0.4605,
"step": 377
},
{
"epoch": 0.6374367622259697,
"grad_norm": 0.46355812533549084,
"learning_rate": 4.375390381011868e-05,
"loss": 0.4493,
"step": 378
},
{
"epoch": 0.6391231028667791,
"grad_norm": 0.34338903987416625,
"learning_rate": 4.3722673329169275e-05,
"loss": 0.4284,
"step": 379
},
{
"epoch": 0.6408094435075885,
"grad_norm": 0.3546403564873265,
"learning_rate": 4.3691442848219865e-05,
"loss": 0.4304,
"step": 380
},
{
"epoch": 0.642495784148398,
"grad_norm": 0.3342737589074633,
"learning_rate": 4.3660212367270456e-05,
"loss": 0.4511,
"step": 381
},
{
"epoch": 0.6441821247892074,
"grad_norm": 0.38340072496867783,
"learning_rate": 4.362898188632105e-05,
"loss": 0.4739,
"step": 382
},
{
"epoch": 0.6458684654300169,
"grad_norm": 0.3188919441604891,
"learning_rate": 4.359775140537164e-05,
"loss": 0.4413,
"step": 383
},
{
"epoch": 0.6475548060708263,
"grad_norm": 0.40349853629853805,
"learning_rate": 4.356652092442224e-05,
"loss": 0.4294,
"step": 384
},
{
"epoch": 0.6492411467116358,
"grad_norm": 0.3418001692662577,
"learning_rate": 4.353529044347283e-05,
"loss": 0.4579,
"step": 385
},
{
"epoch": 0.6509274873524452,
"grad_norm": 0.4179039862512628,
"learning_rate": 4.350405996252343e-05,
"loss": 0.4718,
"step": 386
},
{
"epoch": 0.6526138279932546,
"grad_norm": 0.3656172176030232,
"learning_rate": 4.347282948157402e-05,
"loss": 0.4473,
"step": 387
},
{
"epoch": 0.654300168634064,
"grad_norm": 0.3569964633246004,
"learning_rate": 4.3441599000624614e-05,
"loss": 0.4447,
"step": 388
},
{
"epoch": 0.6559865092748736,
"grad_norm": 0.43024795155564693,
"learning_rate": 4.3410368519675204e-05,
"loss": 0.4364,
"step": 389
},
{
"epoch": 0.657672849915683,
"grad_norm": 0.3957910007344399,
"learning_rate": 4.3379138038725794e-05,
"loss": 0.4549,
"step": 390
},
{
"epoch": 0.6593591905564924,
"grad_norm": 0.4205619540503897,
"learning_rate": 4.334790755777639e-05,
"loss": 0.4265,
"step": 391
},
{
"epoch": 0.6610455311973018,
"grad_norm": 0.3131668807477272,
"learning_rate": 4.331667707682698e-05,
"loss": 0.439,
"step": 392
},
{
"epoch": 0.6627318718381113,
"grad_norm": 0.4493871641528794,
"learning_rate": 4.328544659587758e-05,
"loss": 0.4548,
"step": 393
},
{
"epoch": 0.6644182124789207,
"grad_norm": 0.33758428285308034,
"learning_rate": 4.325421611492817e-05,
"loss": 0.4636,
"step": 394
},
{
"epoch": 0.6661045531197302,
"grad_norm": 0.42168546778015187,
"learning_rate": 4.3222985633978765e-05,
"loss": 0.4327,
"step": 395
},
{
"epoch": 0.6677908937605397,
"grad_norm": 0.3204127281970423,
"learning_rate": 4.3191755153029355e-05,
"loss": 0.4508,
"step": 396
},
{
"epoch": 0.6694772344013491,
"grad_norm": 0.35319093847245864,
"learning_rate": 4.316052467207995e-05,
"loss": 0.4475,
"step": 397
},
{
"epoch": 0.6711635750421585,
"grad_norm": 0.3376608430123609,
"learning_rate": 4.312929419113055e-05,
"loss": 0.459,
"step": 398
},
{
"epoch": 0.6728499156829679,
"grad_norm": 0.41689185294932124,
"learning_rate": 4.309806371018114e-05,
"loss": 0.4469,
"step": 399
},
{
"epoch": 0.6745362563237775,
"grad_norm": 0.30877204324968244,
"learning_rate": 4.3066833229231736e-05,
"loss": 0.4435,
"step": 400
},
{
"epoch": 0.6762225969645869,
"grad_norm": 0.37792173543475427,
"learning_rate": 4.3035602748282327e-05,
"loss": 0.4452,
"step": 401
},
{
"epoch": 0.6779089376053963,
"grad_norm": 0.3780637092799853,
"learning_rate": 4.3004372267332924e-05,
"loss": 0.4491,
"step": 402
},
{
"epoch": 0.6795952782462057,
"grad_norm": 0.3753855323716615,
"learning_rate": 4.2973141786383514e-05,
"loss": 0.4597,
"step": 403
},
{
"epoch": 0.6812816188870152,
"grad_norm": 0.3547964801934976,
"learning_rate": 4.2941911305434104e-05,
"loss": 0.4471,
"step": 404
},
{
"epoch": 0.6829679595278246,
"grad_norm": 0.37366732354500176,
"learning_rate": 4.29106808244847e-05,
"loss": 0.4285,
"step": 405
},
{
"epoch": 0.684654300168634,
"grad_norm": 0.3527414732450224,
"learning_rate": 4.287945034353529e-05,
"loss": 0.4419,
"step": 406
},
{
"epoch": 0.6863406408094435,
"grad_norm": 0.36437242290240707,
"learning_rate": 4.284821986258589e-05,
"loss": 0.4323,
"step": 407
},
{
"epoch": 0.688026981450253,
"grad_norm": 0.3674294362180389,
"learning_rate": 4.281698938163648e-05,
"loss": 0.4364,
"step": 408
},
{
"epoch": 0.6897133220910624,
"grad_norm": 0.31810632269177597,
"learning_rate": 4.2785758900687075e-05,
"loss": 0.4389,
"step": 409
},
{
"epoch": 0.6913996627318718,
"grad_norm": 0.38692488885344895,
"learning_rate": 4.2754528419737665e-05,
"loss": 0.4563,
"step": 410
},
{
"epoch": 0.6930860033726813,
"grad_norm": 0.3583081987194819,
"learning_rate": 4.272329793878826e-05,
"loss": 0.4497,
"step": 411
},
{
"epoch": 0.6947723440134908,
"grad_norm": 0.39986326119698185,
"learning_rate": 4.269206745783885e-05,
"loss": 0.4478,
"step": 412
},
{
"epoch": 0.6964586846543002,
"grad_norm": 0.36531474130869634,
"learning_rate": 4.266083697688944e-05,
"loss": 0.4441,
"step": 413
},
{
"epoch": 0.6981450252951096,
"grad_norm": 0.36687118909418154,
"learning_rate": 4.262960649594004e-05,
"loss": 0.4318,
"step": 414
},
{
"epoch": 0.6998313659359191,
"grad_norm": 0.4182108090774202,
"learning_rate": 4.259837601499063e-05,
"loss": 0.4635,
"step": 415
},
{
"epoch": 0.7015177065767285,
"grad_norm": 0.3153963333187026,
"learning_rate": 4.2567145534041226e-05,
"loss": 0.438,
"step": 416
},
{
"epoch": 0.7032040472175379,
"grad_norm": 0.44907938165023165,
"learning_rate": 4.2535915053091817e-05,
"loss": 0.4354,
"step": 417
},
{
"epoch": 0.7048903878583473,
"grad_norm": 0.32544898293977376,
"learning_rate": 4.2504684572142414e-05,
"loss": 0.4595,
"step": 418
},
{
"epoch": 0.7065767284991569,
"grad_norm": 0.3887718081828085,
"learning_rate": 4.2473454091193004e-05,
"loss": 0.4324,
"step": 419
},
{
"epoch": 0.7082630691399663,
"grad_norm": 0.3801064013683576,
"learning_rate": 4.24422236102436e-05,
"loss": 0.4547,
"step": 420
},
{
"epoch": 0.7099494097807757,
"grad_norm": 0.36710077046278583,
"learning_rate": 4.241099312929419e-05,
"loss": 0.43,
"step": 421
},
{
"epoch": 0.7116357504215851,
"grad_norm": 0.3030182320062759,
"learning_rate": 4.237976264834478e-05,
"loss": 0.4428,
"step": 422
},
{
"epoch": 0.7133220910623946,
"grad_norm": 0.3905227942500751,
"learning_rate": 4.234853216739538e-05,
"loss": 0.4433,
"step": 423
},
{
"epoch": 0.715008431703204,
"grad_norm": 0.3360133794872185,
"learning_rate": 4.2317301686445975e-05,
"loss": 0.4406,
"step": 424
},
{
"epoch": 0.7166947723440135,
"grad_norm": 0.4107242343136471,
"learning_rate": 4.228607120549657e-05,
"loss": 0.4355,
"step": 425
},
{
"epoch": 0.718381112984823,
"grad_norm": 0.35341266657551246,
"learning_rate": 4.225484072454716e-05,
"loss": 0.4407,
"step": 426
},
{
"epoch": 0.7200674536256324,
"grad_norm": 0.32748306193146526,
"learning_rate": 4.222361024359775e-05,
"loss": 0.4526,
"step": 427
},
{
"epoch": 0.7217537942664418,
"grad_norm": 0.3476633561960724,
"learning_rate": 4.219237976264835e-05,
"loss": 0.4326,
"step": 428
},
{
"epoch": 0.7234401349072512,
"grad_norm": 0.39968844352495325,
"learning_rate": 4.216114928169894e-05,
"loss": 0.4386,
"step": 429
},
{
"epoch": 0.7251264755480608,
"grad_norm": 0.3422492650637317,
"learning_rate": 4.2129918800749536e-05,
"loss": 0.4408,
"step": 430
},
{
"epoch": 0.7268128161888702,
"grad_norm": 0.38617427295803425,
"learning_rate": 4.2098688319800126e-05,
"loss": 0.4405,
"step": 431
},
{
"epoch": 0.7284991568296796,
"grad_norm": 0.382691434468715,
"learning_rate": 4.206745783885072e-05,
"loss": 0.4352,
"step": 432
},
{
"epoch": 0.730185497470489,
"grad_norm": 0.34931977303922557,
"learning_rate": 4.203622735790131e-05,
"loss": 0.4515,
"step": 433
},
{
"epoch": 0.7318718381112985,
"grad_norm": 0.36781584253563737,
"learning_rate": 4.200499687695191e-05,
"loss": 0.4678,
"step": 434
},
{
"epoch": 0.7335581787521079,
"grad_norm": 0.3439821047053887,
"learning_rate": 4.19737663960025e-05,
"loss": 0.4346,
"step": 435
},
{
"epoch": 0.7352445193929174,
"grad_norm": 0.33784258943847506,
"learning_rate": 4.194253591505309e-05,
"loss": 0.4571,
"step": 436
},
{
"epoch": 0.7369308600337268,
"grad_norm": 0.3458788419301591,
"learning_rate": 4.191130543410369e-05,
"loss": 0.4317,
"step": 437
},
{
"epoch": 0.7386172006745363,
"grad_norm": 0.3511715747554807,
"learning_rate": 4.188007495315428e-05,
"loss": 0.4355,
"step": 438
},
{
"epoch": 0.7403035413153457,
"grad_norm": 0.4087128357184536,
"learning_rate": 4.1848844472204875e-05,
"loss": 0.446,
"step": 439
},
{
"epoch": 0.7419898819561551,
"grad_norm": 0.29749880486910146,
"learning_rate": 4.1817613991255465e-05,
"loss": 0.4369,
"step": 440
},
{
"epoch": 0.7436762225969646,
"grad_norm": 0.4058139525114164,
"learning_rate": 4.178638351030606e-05,
"loss": 0.4312,
"step": 441
},
{
"epoch": 0.7453625632377741,
"grad_norm": 0.4113540180328209,
"learning_rate": 4.175515302935665e-05,
"loss": 0.4565,
"step": 442
},
{
"epoch": 0.7470489038785835,
"grad_norm": 0.2859777308910414,
"learning_rate": 4.172392254840725e-05,
"loss": 0.4476,
"step": 443
},
{
"epoch": 0.7487352445193929,
"grad_norm": 0.4494426475589675,
"learning_rate": 4.169269206745784e-05,
"loss": 0.4293,
"step": 444
},
{
"epoch": 0.7504215851602024,
"grad_norm": 0.3191112415766653,
"learning_rate": 4.166146158650843e-05,
"loss": 0.4543,
"step": 445
},
{
"epoch": 0.7521079258010118,
"grad_norm": 0.4899358892440741,
"learning_rate": 4.1630231105559026e-05,
"loss": 0.4316,
"step": 446
},
{
"epoch": 0.7537942664418212,
"grad_norm": 0.4045866323132377,
"learning_rate": 4.1599000624609616e-05,
"loss": 0.4572,
"step": 447
},
{
"epoch": 0.7554806070826307,
"grad_norm": 0.40606674939278087,
"learning_rate": 4.156777014366021e-05,
"loss": 0.4305,
"step": 448
},
{
"epoch": 0.7571669477234402,
"grad_norm": 0.41131034817801554,
"learning_rate": 4.15365396627108e-05,
"loss": 0.448,
"step": 449
},
{
"epoch": 0.7588532883642496,
"grad_norm": 0.36374019960621246,
"learning_rate": 4.150530918176141e-05,
"loss": 0.4471,
"step": 450
},
{
"epoch": 0.760539629005059,
"grad_norm": 0.3407701272183516,
"learning_rate": 4.1474078700812e-05,
"loss": 0.4416,
"step": 451
},
{
"epoch": 0.7622259696458684,
"grad_norm": 0.33033061668230757,
"learning_rate": 4.144284821986259e-05,
"loss": 0.4383,
"step": 452
},
{
"epoch": 0.7639123102866779,
"grad_norm": 0.32073839968069323,
"learning_rate": 4.1411617738913184e-05,
"loss": 0.4311,
"step": 453
},
{
"epoch": 0.7655986509274874,
"grad_norm": 0.40654395677206645,
"learning_rate": 4.1380387257963775e-05,
"loss": 0.4507,
"step": 454
},
{
"epoch": 0.7672849915682968,
"grad_norm": 0.3331641705190293,
"learning_rate": 4.134915677701437e-05,
"loss": 0.4588,
"step": 455
},
{
"epoch": 0.7689713322091062,
"grad_norm": 0.36035655188344745,
"learning_rate": 4.131792629606496e-05,
"loss": 0.4316,
"step": 456
},
{
"epoch": 0.7706576728499157,
"grad_norm": 0.3529072751955698,
"learning_rate": 4.128669581511556e-05,
"loss": 0.4505,
"step": 457
},
{
"epoch": 0.7723440134907251,
"grad_norm": 0.3717386077872241,
"learning_rate": 4.125546533416615e-05,
"loss": 0.4252,
"step": 458
},
{
"epoch": 0.7740303541315345,
"grad_norm": 0.35367881506426163,
"learning_rate": 4.122423485321674e-05,
"loss": 0.4401,
"step": 459
},
{
"epoch": 0.7757166947723441,
"grad_norm": 0.4049795855490162,
"learning_rate": 4.1193004372267336e-05,
"loss": 0.4431,
"step": 460
},
{
"epoch": 0.7774030354131535,
"grad_norm": 0.3916967673517559,
"learning_rate": 4.1161773891317926e-05,
"loss": 0.4367,
"step": 461
},
{
"epoch": 0.7790893760539629,
"grad_norm": 0.46051501058246047,
"learning_rate": 4.113054341036852e-05,
"loss": 0.4305,
"step": 462
},
{
"epoch": 0.7807757166947723,
"grad_norm": 0.3470589434545159,
"learning_rate": 4.109931292941911e-05,
"loss": 0.4402,
"step": 463
},
{
"epoch": 0.7824620573355818,
"grad_norm": 0.4352877775027757,
"learning_rate": 4.106808244846971e-05,
"loss": 0.4353,
"step": 464
},
{
"epoch": 0.7841483979763912,
"grad_norm": 0.3193187418301974,
"learning_rate": 4.10368519675203e-05,
"loss": 0.4417,
"step": 465
},
{
"epoch": 0.7858347386172007,
"grad_norm": 0.3143650168188273,
"learning_rate": 4.10056214865709e-05,
"loss": 0.4571,
"step": 466
},
{
"epoch": 0.7875210792580101,
"grad_norm": 0.3219340880551001,
"learning_rate": 4.097439100562149e-05,
"loss": 0.447,
"step": 467
},
{
"epoch": 0.7892074198988196,
"grad_norm": 0.3012606759811735,
"learning_rate": 4.094316052467208e-05,
"loss": 0.4066,
"step": 468
},
{
"epoch": 0.790893760539629,
"grad_norm": 0.3483395029680997,
"learning_rate": 4.0911930043722674e-05,
"loss": 0.4227,
"step": 469
},
{
"epoch": 0.7925801011804384,
"grad_norm": 0.35722805142424147,
"learning_rate": 4.0880699562773265e-05,
"loss": 0.4163,
"step": 470
},
{
"epoch": 0.7942664418212478,
"grad_norm": 0.38762688601542045,
"learning_rate": 4.084946908182386e-05,
"loss": 0.4428,
"step": 471
},
{
"epoch": 0.7959527824620574,
"grad_norm": 0.44221716942779493,
"learning_rate": 4.081823860087445e-05,
"loss": 0.4475,
"step": 472
},
{
"epoch": 0.7976391231028668,
"grad_norm": 0.42521687012311943,
"learning_rate": 4.078700811992505e-05,
"loss": 0.4435,
"step": 473
},
{
"epoch": 0.7993254637436762,
"grad_norm": 0.4614612479724292,
"learning_rate": 4.075577763897564e-05,
"loss": 0.4342,
"step": 474
},
{
"epoch": 0.8010118043844857,
"grad_norm": 0.3880206058427269,
"learning_rate": 4.0724547158026236e-05,
"loss": 0.4525,
"step": 475
},
{
"epoch": 0.8026981450252951,
"grad_norm": 0.38043035899138344,
"learning_rate": 4.069331667707683e-05,
"loss": 0.4402,
"step": 476
},
{
"epoch": 0.8043844856661045,
"grad_norm": 0.41207333481490105,
"learning_rate": 4.066208619612742e-05,
"loss": 0.4696,
"step": 477
},
{
"epoch": 0.806070826306914,
"grad_norm": 0.4060511255941783,
"learning_rate": 4.063085571517802e-05,
"loss": 0.4323,
"step": 478
},
{
"epoch": 0.8077571669477235,
"grad_norm": 0.40363614657308583,
"learning_rate": 4.059962523422861e-05,
"loss": 0.4557,
"step": 479
},
{
"epoch": 0.8094435075885329,
"grad_norm": 0.4188029298749793,
"learning_rate": 4.056839475327921e-05,
"loss": 0.4169,
"step": 480
},
{
"epoch": 0.8111298482293423,
"grad_norm": 0.3254549567851391,
"learning_rate": 4.05371642723298e-05,
"loss": 0.4193,
"step": 481
},
{
"epoch": 0.8128161888701517,
"grad_norm": 0.4962719043903534,
"learning_rate": 4.050593379138039e-05,
"loss": 0.4464,
"step": 482
},
{
"epoch": 0.8145025295109612,
"grad_norm": 0.3295931350843861,
"learning_rate": 4.0474703310430984e-05,
"loss": 0.43,
"step": 483
},
{
"epoch": 0.8161888701517707,
"grad_norm": 0.47883553020245095,
"learning_rate": 4.0443472829481574e-05,
"loss": 0.4522,
"step": 484
},
{
"epoch": 0.8178752107925801,
"grad_norm": 0.33906309279359814,
"learning_rate": 4.041224234853217e-05,
"loss": 0.4482,
"step": 485
},
{
"epoch": 0.8195615514333895,
"grad_norm": 0.40081929926357074,
"learning_rate": 4.038101186758276e-05,
"loss": 0.4477,
"step": 486
},
{
"epoch": 0.821247892074199,
"grad_norm": 0.33573980380519236,
"learning_rate": 4.034978138663336e-05,
"loss": 0.4308,
"step": 487
},
{
"epoch": 0.8229342327150084,
"grad_norm": 0.38799000304628345,
"learning_rate": 4.031855090568395e-05,
"loss": 0.4383,
"step": 488
},
{
"epoch": 0.8246205733558178,
"grad_norm": 0.3096129062326779,
"learning_rate": 4.0287320424734545e-05,
"loss": 0.4463,
"step": 489
},
{
"epoch": 0.8263069139966274,
"grad_norm": 0.38212217849587243,
"learning_rate": 4.0256089943785135e-05,
"loss": 0.4401,
"step": 490
},
{
"epoch": 0.8279932546374368,
"grad_norm": 0.3248274009219074,
"learning_rate": 4.0224859462835726e-05,
"loss": 0.4352,
"step": 491
},
{
"epoch": 0.8296795952782462,
"grad_norm": 0.3643266483390527,
"learning_rate": 4.019362898188632e-05,
"loss": 0.4729,
"step": 492
},
{
"epoch": 0.8313659359190556,
"grad_norm": 0.3487030333559748,
"learning_rate": 4.016239850093691e-05,
"loss": 0.4266,
"step": 493
},
{
"epoch": 0.8330522765598651,
"grad_norm": 0.35571596803070454,
"learning_rate": 4.013116801998751e-05,
"loss": 0.4438,
"step": 494
},
{
"epoch": 0.8347386172006745,
"grad_norm": 0.41833496260050856,
"learning_rate": 4.00999375390381e-05,
"loss": 0.4503,
"step": 495
},
{
"epoch": 0.836424957841484,
"grad_norm": 0.3986814710468665,
"learning_rate": 4.00687070580887e-05,
"loss": 0.4315,
"step": 496
},
{
"epoch": 0.8381112984822934,
"grad_norm": 0.3290599779478399,
"learning_rate": 4.003747657713929e-05,
"loss": 0.4333,
"step": 497
},
{
"epoch": 0.8397976391231029,
"grad_norm": 0.38239443500458137,
"learning_rate": 4.0006246096189884e-05,
"loss": 0.448,
"step": 498
},
{
"epoch": 0.8414839797639123,
"grad_norm": 0.321771585575904,
"learning_rate": 3.9975015615240474e-05,
"loss": 0.4208,
"step": 499
},
{
"epoch": 0.8431703204047217,
"grad_norm": 0.40619718423970064,
"learning_rate": 3.9943785134291064e-05,
"loss": 0.4494,
"step": 500
},
{
"epoch": 0.8448566610455311,
"grad_norm": 0.38935911453692046,
"learning_rate": 3.991255465334167e-05,
"loss": 0.4354,
"step": 501
},
{
"epoch": 0.8465430016863407,
"grad_norm": 0.34815254820703556,
"learning_rate": 3.988132417239226e-05,
"loss": 0.4414,
"step": 502
},
{
"epoch": 0.8482293423271501,
"grad_norm": 0.4065925055362203,
"learning_rate": 3.9850093691442855e-05,
"loss": 0.4396,
"step": 503
},
{
"epoch": 0.8499156829679595,
"grad_norm": 0.32855881491070554,
"learning_rate": 3.9818863210493445e-05,
"loss": 0.4371,
"step": 504
},
{
"epoch": 0.851602023608769,
"grad_norm": 0.41451234973837914,
"learning_rate": 3.978763272954404e-05,
"loss": 0.4501,
"step": 505
},
{
"epoch": 0.8532883642495784,
"grad_norm": 0.3464873489983337,
"learning_rate": 3.975640224859463e-05,
"loss": 0.4379,
"step": 506
},
{
"epoch": 0.8549747048903878,
"grad_norm": 0.34786674244235233,
"learning_rate": 3.972517176764522e-05,
"loss": 0.429,
"step": 507
},
{
"epoch": 0.8566610455311973,
"grad_norm": 0.3451218776597317,
"learning_rate": 3.969394128669582e-05,
"loss": 0.4462,
"step": 508
},
{
"epoch": 0.8583473861720068,
"grad_norm": 0.3575771672004591,
"learning_rate": 3.966271080574641e-05,
"loss": 0.4266,
"step": 509
},
{
"epoch": 0.8600337268128162,
"grad_norm": 0.2989153179892053,
"learning_rate": 3.9631480324797006e-05,
"loss": 0.4288,
"step": 510
},
{
"epoch": 0.8617200674536256,
"grad_norm": 0.3436388897224447,
"learning_rate": 3.96002498438476e-05,
"loss": 0.4095,
"step": 511
},
{
"epoch": 0.863406408094435,
"grad_norm": 0.35763903249570084,
"learning_rate": 3.9569019362898194e-05,
"loss": 0.4562,
"step": 512
},
{
"epoch": 0.8650927487352446,
"grad_norm": 0.41390070859437555,
"learning_rate": 3.9537788881948784e-05,
"loss": 0.4455,
"step": 513
},
{
"epoch": 0.866779089376054,
"grad_norm": 0.34010398726999513,
"learning_rate": 3.9506558400999374e-05,
"loss": 0.4577,
"step": 514
},
{
"epoch": 0.8684654300168634,
"grad_norm": 0.37866647645676466,
"learning_rate": 3.947532792004997e-05,
"loss": 0.4386,
"step": 515
},
{
"epoch": 0.8701517706576728,
"grad_norm": 0.30028155172407867,
"learning_rate": 3.944409743910056e-05,
"loss": 0.434,
"step": 516
},
{
"epoch": 0.8718381112984823,
"grad_norm": 0.3669560450150648,
"learning_rate": 3.941286695815116e-05,
"loss": 0.4438,
"step": 517
},
{
"epoch": 0.8735244519392917,
"grad_norm": 0.3457542078983038,
"learning_rate": 3.938163647720175e-05,
"loss": 0.453,
"step": 518
},
{
"epoch": 0.8752107925801011,
"grad_norm": 0.33965599759527376,
"learning_rate": 3.9350405996252345e-05,
"loss": 0.4178,
"step": 519
},
{
"epoch": 0.8768971332209107,
"grad_norm": 0.3721436758139496,
"learning_rate": 3.9319175515302935e-05,
"loss": 0.4468,
"step": 520
},
{
"epoch": 0.8785834738617201,
"grad_norm": 0.4061525669717563,
"learning_rate": 3.928794503435353e-05,
"loss": 0.4259,
"step": 521
},
{
"epoch": 0.8802698145025295,
"grad_norm": 0.30659043214103315,
"learning_rate": 3.925671455340412e-05,
"loss": 0.4272,
"step": 522
},
{
"epoch": 0.8819561551433389,
"grad_norm": 0.3409351347239614,
"learning_rate": 3.922548407245471e-05,
"loss": 0.4421,
"step": 523
},
{
"epoch": 0.8836424957841484,
"grad_norm": 0.35220982612512924,
"learning_rate": 3.919425359150531e-05,
"loss": 0.4412,
"step": 524
},
{
"epoch": 0.8853288364249579,
"grad_norm": 0.30552652380518674,
"learning_rate": 3.91630231105559e-05,
"loss": 0.4214,
"step": 525
},
{
"epoch": 0.8870151770657673,
"grad_norm": 0.4137227791099141,
"learning_rate": 3.9131792629606496e-05,
"loss": 0.4269,
"step": 526
},
{
"epoch": 0.8887015177065767,
"grad_norm": 0.3150228249974658,
"learning_rate": 3.9100562148657093e-05,
"loss": 0.455,
"step": 527
},
{
"epoch": 0.8903878583473862,
"grad_norm": 0.3686613874530323,
"learning_rate": 3.906933166770769e-05,
"loss": 0.4476,
"step": 528
},
{
"epoch": 0.8920741989881956,
"grad_norm": 0.3430472314617903,
"learning_rate": 3.903810118675828e-05,
"loss": 0.4482,
"step": 529
},
{
"epoch": 0.893760539629005,
"grad_norm": 0.35676005653730164,
"learning_rate": 3.900687070580887e-05,
"loss": 0.4377,
"step": 530
},
{
"epoch": 0.8954468802698144,
"grad_norm": 0.3208027933402526,
"learning_rate": 3.897564022485947e-05,
"loss": 0.4211,
"step": 531
},
{
"epoch": 0.897133220910624,
"grad_norm": 0.3672019529503482,
"learning_rate": 3.894440974391006e-05,
"loss": 0.4376,
"step": 532
},
{
"epoch": 0.8988195615514334,
"grad_norm": 0.3253386882071814,
"learning_rate": 3.8913179262960655e-05,
"loss": 0.4445,
"step": 533
},
{
"epoch": 0.9005059021922428,
"grad_norm": 0.39486961722325015,
"learning_rate": 3.8881948782011245e-05,
"loss": 0.4481,
"step": 534
},
{
"epoch": 0.9021922428330523,
"grad_norm": 0.40525246697045486,
"learning_rate": 3.885071830106184e-05,
"loss": 0.4437,
"step": 535
},
{
"epoch": 0.9038785834738617,
"grad_norm": 0.3448446619749969,
"learning_rate": 3.881948782011243e-05,
"loss": 0.44,
"step": 536
},
{
"epoch": 0.9055649241146712,
"grad_norm": 0.3823092376610685,
"learning_rate": 3.878825733916302e-05,
"loss": 0.4289,
"step": 537
},
{
"epoch": 0.9072512647554806,
"grad_norm": 0.35549627695079783,
"learning_rate": 3.875702685821362e-05,
"loss": 0.4365,
"step": 538
},
{
"epoch": 0.9089376053962901,
"grad_norm": 0.361600452020028,
"learning_rate": 3.872579637726421e-05,
"loss": 0.4333,
"step": 539
},
{
"epoch": 0.9106239460370995,
"grad_norm": 0.39238177458414175,
"learning_rate": 3.8694565896314806e-05,
"loss": 0.4361,
"step": 540
},
{
"epoch": 0.9123102866779089,
"grad_norm": 0.30858347531378094,
"learning_rate": 3.8663335415365396e-05,
"loss": 0.4439,
"step": 541
},
{
"epoch": 0.9139966273187183,
"grad_norm": 0.40433613406568636,
"learning_rate": 3.863210493441599e-05,
"loss": 0.4343,
"step": 542
},
{
"epoch": 0.9156829679595279,
"grad_norm": 0.32314244386563457,
"learning_rate": 3.8600874453466583e-05,
"loss": 0.4402,
"step": 543
},
{
"epoch": 0.9173693086003373,
"grad_norm": 0.35329137291725576,
"learning_rate": 3.856964397251718e-05,
"loss": 0.4291,
"step": 544
},
{
"epoch": 0.9190556492411467,
"grad_norm": 0.4094160268434705,
"learning_rate": 3.853841349156777e-05,
"loss": 0.4553,
"step": 545
},
{
"epoch": 0.9207419898819561,
"grad_norm": 0.35244877974951694,
"learning_rate": 3.850718301061836e-05,
"loss": 0.4378,
"step": 546
},
{
"epoch": 0.9224283305227656,
"grad_norm": 0.34568213081212973,
"learning_rate": 3.847595252966896e-05,
"loss": 0.4117,
"step": 547
},
{
"epoch": 0.924114671163575,
"grad_norm": 0.3709568818749086,
"learning_rate": 3.844472204871955e-05,
"loss": 0.4306,
"step": 548
},
{
"epoch": 0.9258010118043845,
"grad_norm": 0.3775761038435258,
"learning_rate": 3.8413491567770145e-05,
"loss": 0.441,
"step": 549
},
{
"epoch": 0.927487352445194,
"grad_norm": 0.3698983531500424,
"learning_rate": 3.8382261086820735e-05,
"loss": 0.4314,
"step": 550
},
{
"epoch": 0.9291736930860034,
"grad_norm": 0.4074391201937174,
"learning_rate": 3.835103060587133e-05,
"loss": 0.4317,
"step": 551
},
{
"epoch": 0.9308600337268128,
"grad_norm": 0.426029092659015,
"learning_rate": 3.831980012492192e-05,
"loss": 0.4368,
"step": 552
},
{
"epoch": 0.9325463743676222,
"grad_norm": 0.34454783098281766,
"learning_rate": 3.828856964397252e-05,
"loss": 0.4458,
"step": 553
},
{
"epoch": 0.9342327150084317,
"grad_norm": 0.3817185847692516,
"learning_rate": 3.8257339163023116e-05,
"loss": 0.4576,
"step": 554
},
{
"epoch": 0.9359190556492412,
"grad_norm": 0.3140371120993948,
"learning_rate": 3.8226108682073706e-05,
"loss": 0.4267,
"step": 555
},
{
"epoch": 0.9376053962900506,
"grad_norm": 0.37134960441734055,
"learning_rate": 3.81948782011243e-05,
"loss": 0.4371,
"step": 556
},
{
"epoch": 0.93929173693086,
"grad_norm": 0.3261734677460667,
"learning_rate": 3.816364772017489e-05,
"loss": 0.4312,
"step": 557
},
{
"epoch": 0.9409780775716695,
"grad_norm": 0.3517792771913715,
"learning_rate": 3.813241723922549e-05,
"loss": 0.4286,
"step": 558
},
{
"epoch": 0.9426644182124789,
"grad_norm": 0.32412252967719923,
"learning_rate": 3.810118675827608e-05,
"loss": 0.4381,
"step": 559
},
{
"epoch": 0.9443507588532883,
"grad_norm": 0.3200973688390884,
"learning_rate": 3.806995627732668e-05,
"loss": 0.4334,
"step": 560
},
{
"epoch": 0.9460370994940978,
"grad_norm": 0.32686280606429596,
"learning_rate": 3.803872579637727e-05,
"loss": 0.4329,
"step": 561
},
{
"epoch": 0.9477234401349073,
"grad_norm": 0.3243359164811927,
"learning_rate": 3.800749531542786e-05,
"loss": 0.4338,
"step": 562
},
{
"epoch": 0.9494097807757167,
"grad_norm": 0.3487166516003897,
"learning_rate": 3.7976264834478454e-05,
"loss": 0.4319,
"step": 563
},
{
"epoch": 0.9510961214165261,
"grad_norm": 0.3272475408871655,
"learning_rate": 3.7945034353529045e-05,
"loss": 0.435,
"step": 564
},
{
"epoch": 0.9527824620573356,
"grad_norm": 0.3669600629409637,
"learning_rate": 3.791380387257964e-05,
"loss": 0.4466,
"step": 565
},
{
"epoch": 0.954468802698145,
"grad_norm": 0.2963475755698808,
"learning_rate": 3.788257339163023e-05,
"loss": 0.4324,
"step": 566
},
{
"epoch": 0.9561551433389545,
"grad_norm": 0.2997001225929376,
"learning_rate": 3.785134291068083e-05,
"loss": 0.4227,
"step": 567
},
{
"epoch": 0.9578414839797639,
"grad_norm": 0.4024928697863006,
"learning_rate": 3.782011242973142e-05,
"loss": 0.4229,
"step": 568
},
{
"epoch": 0.9595278246205734,
"grad_norm": 0.32574334101219726,
"learning_rate": 3.778888194878201e-05,
"loss": 0.4333,
"step": 569
},
{
"epoch": 0.9612141652613828,
"grad_norm": 0.4233326029987618,
"learning_rate": 3.7757651467832606e-05,
"loss": 0.4258,
"step": 570
},
{
"epoch": 0.9629005059021922,
"grad_norm": 0.31441995646070814,
"learning_rate": 3.7726420986883196e-05,
"loss": 0.4366,
"step": 571
},
{
"epoch": 0.9645868465430016,
"grad_norm": 0.3978301275223556,
"learning_rate": 3.769519050593379e-05,
"loss": 0.4386,
"step": 572
},
{
"epoch": 0.9662731871838112,
"grad_norm": 0.38255265169765723,
"learning_rate": 3.766396002498438e-05,
"loss": 0.4465,
"step": 573
},
{
"epoch": 0.9679595278246206,
"grad_norm": 0.41128548742947124,
"learning_rate": 3.763272954403498e-05,
"loss": 0.4217,
"step": 574
},
{
"epoch": 0.96964586846543,
"grad_norm": 0.3718617149053186,
"learning_rate": 3.760149906308557e-05,
"loss": 0.4252,
"step": 575
},
{
"epoch": 0.9713322091062394,
"grad_norm": 0.3887832792374972,
"learning_rate": 3.757026858213617e-05,
"loss": 0.4529,
"step": 576
},
{
"epoch": 0.9730185497470489,
"grad_norm": 0.4360800602788443,
"learning_rate": 3.753903810118676e-05,
"loss": 0.4319,
"step": 577
},
{
"epoch": 0.9747048903878583,
"grad_norm": 0.33415211151494867,
"learning_rate": 3.750780762023735e-05,
"loss": 0.4246,
"step": 578
},
{
"epoch": 0.9763912310286678,
"grad_norm": 0.3189675808202832,
"learning_rate": 3.747657713928795e-05,
"loss": 0.4159,
"step": 579
},
{
"epoch": 0.9780775716694773,
"grad_norm": 0.44576016512674754,
"learning_rate": 3.744534665833854e-05,
"loss": 0.4537,
"step": 580
},
{
"epoch": 0.9797639123102867,
"grad_norm": 0.36235218251396417,
"learning_rate": 3.741411617738914e-05,
"loss": 0.4511,
"step": 581
},
{
"epoch": 0.9814502529510961,
"grad_norm": 0.40599216443718106,
"learning_rate": 3.738288569643973e-05,
"loss": 0.4388,
"step": 582
},
{
"epoch": 0.9831365935919055,
"grad_norm": 0.393707746186096,
"learning_rate": 3.7351655215490325e-05,
"loss": 0.437,
"step": 583
},
{
"epoch": 0.984822934232715,
"grad_norm": 0.35873213762005124,
"learning_rate": 3.7320424734540916e-05,
"loss": 0.4317,
"step": 584
},
{
"epoch": 0.9865092748735245,
"grad_norm": 0.3735478259156147,
"learning_rate": 3.7289194253591506e-05,
"loss": 0.435,
"step": 585
},
{
"epoch": 0.9881956155143339,
"grad_norm": 0.3145636810256735,
"learning_rate": 3.72579637726421e-05,
"loss": 0.4301,
"step": 586
},
{
"epoch": 0.9898819561551433,
"grad_norm": 0.29373293301899733,
"learning_rate": 3.722673329169269e-05,
"loss": 0.4088,
"step": 587
},
{
"epoch": 0.9915682967959528,
"grad_norm": 0.39253716389128174,
"learning_rate": 3.719550281074329e-05,
"loss": 0.4104,
"step": 588
},
{
"epoch": 0.9932546374367622,
"grad_norm": 0.28317911466413853,
"learning_rate": 3.716427232979388e-05,
"loss": 0.4204,
"step": 589
},
{
"epoch": 0.9949409780775716,
"grad_norm": 0.31230546154127087,
"learning_rate": 3.713304184884448e-05,
"loss": 0.4217,
"step": 590
},
{
"epoch": 0.9966273187183811,
"grad_norm": 0.3334436143958086,
"learning_rate": 3.710181136789507e-05,
"loss": 0.4302,
"step": 591
},
{
"epoch": 0.9983136593591906,
"grad_norm": 0.2968157232197624,
"learning_rate": 3.7070580886945664e-05,
"loss": 0.4253,
"step": 592
},
{
"epoch": 1.0,
"grad_norm": 0.3146042029883502,
"learning_rate": 3.7039350405996254e-05,
"loss": 0.397,
"step": 593
},
{
"epoch": 1.0016863406408094,
"grad_norm": 0.3370073663531201,
"learning_rate": 3.7008119925046844e-05,
"loss": 0.375,
"step": 594
},
{
"epoch": 1.0033726812816188,
"grad_norm": 0.32116215055132985,
"learning_rate": 3.697688944409744e-05,
"loss": 0.3708,
"step": 595
},
{
"epoch": 1.0050590219224282,
"grad_norm": 0.29860857891458864,
"learning_rate": 3.694565896314803e-05,
"loss": 0.3865,
"step": 596
},
{
"epoch": 1.0067453625632379,
"grad_norm": 0.3275096400221224,
"learning_rate": 3.691442848219863e-05,
"loss": 0.3718,
"step": 597
},
{
"epoch": 1.0084317032040473,
"grad_norm": 0.31486317653134394,
"learning_rate": 3.688319800124922e-05,
"loss": 0.3706,
"step": 598
},
{
"epoch": 1.0101180438448567,
"grad_norm": 0.39271594795296993,
"learning_rate": 3.6851967520299815e-05,
"loss": 0.3522,
"step": 599
},
{
"epoch": 1.0118043844856661,
"grad_norm": 0.26586173702738847,
"learning_rate": 3.6820737039350406e-05,
"loss": 0.3659,
"step": 600
},
{
"epoch": 1.0134907251264755,
"grad_norm": 0.3550935166184591,
"learning_rate": 3.6789506558400996e-05,
"loss": 0.372,
"step": 601
},
{
"epoch": 1.015177065767285,
"grad_norm": 0.3757468734375804,
"learning_rate": 3.675827607745159e-05,
"loss": 0.3755,
"step": 602
},
{
"epoch": 1.0168634064080944,
"grad_norm": 0.32345317238449983,
"learning_rate": 3.672704559650218e-05,
"loss": 0.3712,
"step": 603
},
{
"epoch": 1.0185497470489038,
"grad_norm": 0.389457960719761,
"learning_rate": 3.669581511555278e-05,
"loss": 0.38,
"step": 604
},
{
"epoch": 1.0202360876897134,
"grad_norm": 0.31136627461944766,
"learning_rate": 3.666458463460338e-05,
"loss": 0.3754,
"step": 605
},
{
"epoch": 1.0219224283305228,
"grad_norm": 0.3728830518915666,
"learning_rate": 3.6633354153653974e-05,
"loss": 0.3768,
"step": 606
},
{
"epoch": 1.0236087689713322,
"grad_norm": 0.325914751533291,
"learning_rate": 3.6602123672704564e-05,
"loss": 0.361,
"step": 607
},
{
"epoch": 1.0252951096121417,
"grad_norm": 0.32251147450644635,
"learning_rate": 3.6570893191755154e-05,
"loss": 0.3632,
"step": 608
},
{
"epoch": 1.026981450252951,
"grad_norm": 0.38601247039180847,
"learning_rate": 3.653966271080575e-05,
"loss": 0.3374,
"step": 609
},
{
"epoch": 1.0286677908937605,
"grad_norm": 0.3113652782897978,
"learning_rate": 3.650843222985634e-05,
"loss": 0.3492,
"step": 610
},
{
"epoch": 1.03035413153457,
"grad_norm": 0.40738258586752285,
"learning_rate": 3.647720174890694e-05,
"loss": 0.3826,
"step": 611
},
{
"epoch": 1.0320404721753795,
"grad_norm": 0.40482778144228854,
"learning_rate": 3.644597126795753e-05,
"loss": 0.3618,
"step": 612
},
{
"epoch": 1.033726812816189,
"grad_norm": 0.3100389171136031,
"learning_rate": 3.6414740787008125e-05,
"loss": 0.3617,
"step": 613
},
{
"epoch": 1.0354131534569984,
"grad_norm": 0.4345026237676389,
"learning_rate": 3.6383510306058715e-05,
"loss": 0.3818,
"step": 614
},
{
"epoch": 1.0370994940978078,
"grad_norm": 0.3302182083765599,
"learning_rate": 3.635227982510931e-05,
"loss": 0.3802,
"step": 615
},
{
"epoch": 1.0387858347386172,
"grad_norm": 0.3478874768962978,
"learning_rate": 3.63210493441599e-05,
"loss": 0.3722,
"step": 616
},
{
"epoch": 1.0404721753794266,
"grad_norm": 0.3818087599342268,
"learning_rate": 3.628981886321049e-05,
"loss": 0.3785,
"step": 617
},
{
"epoch": 1.042158516020236,
"grad_norm": 0.3103310310694465,
"learning_rate": 3.625858838226109e-05,
"loss": 0.3554,
"step": 618
},
{
"epoch": 1.0438448566610454,
"grad_norm": 0.3195864556944807,
"learning_rate": 3.622735790131168e-05,
"loss": 0.3624,
"step": 619
},
{
"epoch": 1.045531197301855,
"grad_norm": 0.3715118708841681,
"learning_rate": 3.6196127420362277e-05,
"loss": 0.3638,
"step": 620
},
{
"epoch": 1.0472175379426645,
"grad_norm": 0.2947796032379757,
"learning_rate": 3.616489693941287e-05,
"loss": 0.3772,
"step": 621
},
{
"epoch": 1.048903878583474,
"grad_norm": 0.43267966177276695,
"learning_rate": 3.6133666458463464e-05,
"loss": 0.3674,
"step": 622
},
{
"epoch": 1.0505902192242833,
"grad_norm": 0.27812298939709335,
"learning_rate": 3.6102435977514054e-05,
"loss": 0.3465,
"step": 623
},
{
"epoch": 1.0522765598650927,
"grad_norm": 0.4438329515641008,
"learning_rate": 3.6071205496564644e-05,
"loss": 0.3667,
"step": 624
},
{
"epoch": 1.0539629005059021,
"grad_norm": 0.29768610317679384,
"learning_rate": 3.603997501561524e-05,
"loss": 0.3739,
"step": 625
},
{
"epoch": 1.0556492411467115,
"grad_norm": 0.3100212861752048,
"learning_rate": 3.600874453466583e-05,
"loss": 0.358,
"step": 626
},
{
"epoch": 1.0573355817875212,
"grad_norm": 0.3445036926061418,
"learning_rate": 3.597751405371643e-05,
"loss": 0.3684,
"step": 627
},
{
"epoch": 1.0590219224283306,
"grad_norm": 0.2718866041449956,
"learning_rate": 3.594628357276702e-05,
"loss": 0.3493,
"step": 628
},
{
"epoch": 1.06070826306914,
"grad_norm": 0.3392352864903805,
"learning_rate": 3.5915053091817615e-05,
"loss": 0.3745,
"step": 629
},
{
"epoch": 1.0623946037099494,
"grad_norm": 0.3328919083232664,
"learning_rate": 3.588382261086821e-05,
"loss": 0.3759,
"step": 630
},
{
"epoch": 1.0640809443507588,
"grad_norm": 0.30982338431159007,
"learning_rate": 3.58525921299188e-05,
"loss": 0.3736,
"step": 631
},
{
"epoch": 1.0657672849915683,
"grad_norm": 0.351636253410426,
"learning_rate": 3.58213616489694e-05,
"loss": 0.3779,
"step": 632
},
{
"epoch": 1.0674536256323777,
"grad_norm": 0.3019957709061264,
"learning_rate": 3.579013116801999e-05,
"loss": 0.386,
"step": 633
},
{
"epoch": 1.069139966273187,
"grad_norm": 0.35166269106240106,
"learning_rate": 3.5758900687070586e-05,
"loss": 0.3698,
"step": 634
},
{
"epoch": 1.0708263069139967,
"grad_norm": 0.2893719435618796,
"learning_rate": 3.5727670206121176e-05,
"loss": 0.3605,
"step": 635
},
{
"epoch": 1.0725126475548061,
"grad_norm": 2.1683991651466523,
"learning_rate": 3.569643972517177e-05,
"loss": 0.4151,
"step": 636
},
{
"epoch": 1.0741989881956155,
"grad_norm": 0.34939031231545353,
"learning_rate": 3.5665209244222363e-05,
"loss": 0.357,
"step": 637
},
{
"epoch": 1.075885328836425,
"grad_norm": 0.33365230698899556,
"learning_rate": 3.563397876327296e-05,
"loss": 0.3569,
"step": 638
},
{
"epoch": 1.0775716694772344,
"grad_norm": 0.38333312282223403,
"learning_rate": 3.560274828232355e-05,
"loss": 0.3725,
"step": 639
},
{
"epoch": 1.0792580101180438,
"grad_norm": 0.2989943238359894,
"learning_rate": 3.557151780137414e-05,
"loss": 0.3659,
"step": 640
},
{
"epoch": 1.0809443507588532,
"grad_norm": 0.35190364679048197,
"learning_rate": 3.554028732042474e-05,
"loss": 0.3456,
"step": 641
},
{
"epoch": 1.0826306913996628,
"grad_norm": 0.3102309344220619,
"learning_rate": 3.550905683947533e-05,
"loss": 0.3513,
"step": 642
},
{
"epoch": 1.0843170320404723,
"grad_norm": 0.34766344126692783,
"learning_rate": 3.5477826358525925e-05,
"loss": 0.3508,
"step": 643
},
{
"epoch": 1.0860033726812817,
"grad_norm": 0.30303909121585015,
"learning_rate": 3.5446595877576515e-05,
"loss": 0.3789,
"step": 644
},
{
"epoch": 1.087689713322091,
"grad_norm": 0.3531070193198295,
"learning_rate": 3.541536539662711e-05,
"loss": 0.3697,
"step": 645
},
{
"epoch": 1.0893760539629005,
"grad_norm": 0.3442402875788552,
"learning_rate": 3.53841349156777e-05,
"loss": 0.3748,
"step": 646
},
{
"epoch": 1.09106239460371,
"grad_norm": 0.31913063134238107,
"learning_rate": 3.53529044347283e-05,
"loss": 0.3527,
"step": 647
},
{
"epoch": 1.0927487352445193,
"grad_norm": 0.3150943101074222,
"learning_rate": 3.532167395377889e-05,
"loss": 0.3736,
"step": 648
},
{
"epoch": 1.0944350758853287,
"grad_norm": 0.3292315737364209,
"learning_rate": 3.529044347282948e-05,
"loss": 0.3775,
"step": 649
},
{
"epoch": 1.0961214165261384,
"grad_norm": 0.29052932382842833,
"learning_rate": 3.5259212991880076e-05,
"loss": 0.3731,
"step": 650
},
{
"epoch": 1.0978077571669478,
"grad_norm": 0.375090842212462,
"learning_rate": 3.5227982510930666e-05,
"loss": 0.356,
"step": 651
},
{
"epoch": 1.0994940978077572,
"grad_norm": 0.32161697933374434,
"learning_rate": 3.519675202998126e-05,
"loss": 0.3548,
"step": 652
},
{
"epoch": 1.1011804384485666,
"grad_norm": 0.2879157921021396,
"learning_rate": 3.5165521549031853e-05,
"loss": 0.358,
"step": 653
},
{
"epoch": 1.102866779089376,
"grad_norm": 0.41727435113409483,
"learning_rate": 3.513429106808245e-05,
"loss": 0.381,
"step": 654
},
{
"epoch": 1.1045531197301854,
"grad_norm": 0.2837299613816854,
"learning_rate": 3.510306058713304e-05,
"loss": 0.3533,
"step": 655
},
{
"epoch": 1.1062394603709949,
"grad_norm": 0.33391080950397317,
"learning_rate": 3.507183010618364e-05,
"loss": 0.3561,
"step": 656
},
{
"epoch": 1.1079258010118043,
"grad_norm": 0.29547239404552594,
"learning_rate": 3.5040599625234234e-05,
"loss": 0.3644,
"step": 657
},
{
"epoch": 1.109612141652614,
"grad_norm": 0.32594239439972955,
"learning_rate": 3.5009369144284825e-05,
"loss": 0.3673,
"step": 658
},
{
"epoch": 1.1112984822934233,
"grad_norm": 0.32452664682226484,
"learning_rate": 3.497813866333542e-05,
"loss": 0.3678,
"step": 659
},
{
"epoch": 1.1129848229342327,
"grad_norm": 0.31349408760230113,
"learning_rate": 3.494690818238601e-05,
"loss": 0.3551,
"step": 660
},
{
"epoch": 1.1146711635750421,
"grad_norm": 0.48435598730621804,
"learning_rate": 3.491567770143661e-05,
"loss": 0.3797,
"step": 661
},
{
"epoch": 1.1163575042158516,
"grad_norm": 0.3427185023275982,
"learning_rate": 3.48844472204872e-05,
"loss": 0.3729,
"step": 662
},
{
"epoch": 1.118043844856661,
"grad_norm": 0.37943453765393537,
"learning_rate": 3.485321673953779e-05,
"loss": 0.3711,
"step": 663
},
{
"epoch": 1.1197301854974704,
"grad_norm": 0.32502964540546436,
"learning_rate": 3.4821986258588386e-05,
"loss": 0.389,
"step": 664
},
{
"epoch": 1.12141652613828,
"grad_norm": 0.3264086956433378,
"learning_rate": 3.4790755777638976e-05,
"loss": 0.3593,
"step": 665
},
{
"epoch": 1.1231028667790894,
"grad_norm": 0.3953500314813085,
"learning_rate": 3.475952529668957e-05,
"loss": 0.392,
"step": 666
},
{
"epoch": 1.1247892074198989,
"grad_norm": 0.29474500867892905,
"learning_rate": 3.472829481574016e-05,
"loss": 0.3597,
"step": 667
},
{
"epoch": 1.1264755480607083,
"grad_norm": 0.3488785070722052,
"learning_rate": 3.469706433479076e-05,
"loss": 0.3749,
"step": 668
},
{
"epoch": 1.1281618887015177,
"grad_norm": 0.3248792393815756,
"learning_rate": 3.466583385384135e-05,
"loss": 0.3598,
"step": 669
},
{
"epoch": 1.129848229342327,
"grad_norm": 0.3224420541767695,
"learning_rate": 3.463460337289195e-05,
"loss": 0.3719,
"step": 670
},
{
"epoch": 1.1315345699831365,
"grad_norm": 0.3417491355704424,
"learning_rate": 3.460337289194254e-05,
"loss": 0.362,
"step": 671
},
{
"epoch": 1.1332209106239461,
"grad_norm": 0.3427747866197647,
"learning_rate": 3.457214241099313e-05,
"loss": 0.367,
"step": 672
},
{
"epoch": 1.1349072512647556,
"grad_norm": 0.3106830438240518,
"learning_rate": 3.4540911930043724e-05,
"loss": 0.367,
"step": 673
},
{
"epoch": 1.136593591905565,
"grad_norm": 0.3540348746643401,
"learning_rate": 3.4509681449094315e-05,
"loss": 0.3555,
"step": 674
},
{
"epoch": 1.1382799325463744,
"grad_norm": 0.34466886965401156,
"learning_rate": 3.447845096814491e-05,
"loss": 0.3628,
"step": 675
},
{
"epoch": 1.1399662731871838,
"grad_norm": 0.3164738524402864,
"learning_rate": 3.44472204871955e-05,
"loss": 0.3602,
"step": 676
},
{
"epoch": 1.1416526138279932,
"grad_norm": 0.37994324932360596,
"learning_rate": 3.44159900062461e-05,
"loss": 0.3625,
"step": 677
},
{
"epoch": 1.1433389544688026,
"grad_norm": 0.356114149548685,
"learning_rate": 3.438475952529669e-05,
"loss": 0.3738,
"step": 678
},
{
"epoch": 1.1450252951096123,
"grad_norm": 0.3225747215773845,
"learning_rate": 3.435352904434728e-05,
"loss": 0.3604,
"step": 679
},
{
"epoch": 1.1467116357504217,
"grad_norm": 0.3773129300465698,
"learning_rate": 3.4322298563397876e-05,
"loss": 0.3788,
"step": 680
},
{
"epoch": 1.148397976391231,
"grad_norm": 0.304117819143786,
"learning_rate": 3.4291068082448466e-05,
"loss": 0.3732,
"step": 681
},
{
"epoch": 1.1500843170320405,
"grad_norm": 0.3001832144319778,
"learning_rate": 3.425983760149907e-05,
"loss": 0.3608,
"step": 682
},
{
"epoch": 1.15177065767285,
"grad_norm": 0.2976369748044028,
"learning_rate": 3.422860712054966e-05,
"loss": 0.38,
"step": 683
},
{
"epoch": 1.1534569983136593,
"grad_norm": 0.2993556584838581,
"learning_rate": 3.419737663960026e-05,
"loss": 0.3469,
"step": 684
},
{
"epoch": 1.1551433389544687,
"grad_norm": 0.2952476863488473,
"learning_rate": 3.416614615865085e-05,
"loss": 0.376,
"step": 685
},
{
"epoch": 1.1568296795952782,
"grad_norm": 0.30611597016381326,
"learning_rate": 3.413491567770144e-05,
"loss": 0.3447,
"step": 686
},
{
"epoch": 1.1585160202360876,
"grad_norm": 0.33324196316017346,
"learning_rate": 3.4103685196752034e-05,
"loss": 0.3685,
"step": 687
},
{
"epoch": 1.1602023608768972,
"grad_norm": 0.3616915720368816,
"learning_rate": 3.4072454715802624e-05,
"loss": 0.3688,
"step": 688
},
{
"epoch": 1.1618887015177066,
"grad_norm": 0.3742976223101849,
"learning_rate": 3.404122423485322e-05,
"loss": 0.3686,
"step": 689
},
{
"epoch": 1.163575042158516,
"grad_norm": 0.30457069197757114,
"learning_rate": 3.400999375390381e-05,
"loss": 0.3475,
"step": 690
},
{
"epoch": 1.1652613827993255,
"grad_norm": 0.40089797869146526,
"learning_rate": 3.397876327295441e-05,
"loss": 0.4003,
"step": 691
},
{
"epoch": 1.1669477234401349,
"grad_norm": 0.31965659321223927,
"learning_rate": 3.3947532792005e-05,
"loss": 0.3769,
"step": 692
},
{
"epoch": 1.1686340640809443,
"grad_norm": 0.3439706518766779,
"learning_rate": 3.3916302311055595e-05,
"loss": 0.3659,
"step": 693
},
{
"epoch": 1.1703204047217537,
"grad_norm": 0.3429902418282865,
"learning_rate": 3.3885071830106186e-05,
"loss": 0.3604,
"step": 694
},
{
"epoch": 1.1720067453625633,
"grad_norm": 0.35074735517188066,
"learning_rate": 3.3853841349156776e-05,
"loss": 0.367,
"step": 695
},
{
"epoch": 1.1736930860033727,
"grad_norm": 0.3223682518850536,
"learning_rate": 3.382261086820737e-05,
"loss": 0.359,
"step": 696
},
{
"epoch": 1.1753794266441822,
"grad_norm": 0.36290253264152483,
"learning_rate": 3.379138038725796e-05,
"loss": 0.3579,
"step": 697
},
{
"epoch": 1.1770657672849916,
"grad_norm": 0.40337855296136965,
"learning_rate": 3.376014990630856e-05,
"loss": 0.3625,
"step": 698
},
{
"epoch": 1.178752107925801,
"grad_norm": 0.28866061205127996,
"learning_rate": 3.372891942535915e-05,
"loss": 0.376,
"step": 699
},
{
"epoch": 1.1804384485666104,
"grad_norm": 0.4563999065836459,
"learning_rate": 3.369768894440975e-05,
"loss": 0.3477,
"step": 700
},
{
"epoch": 1.1821247892074198,
"grad_norm": 0.28381683936533614,
"learning_rate": 3.366645846346034e-05,
"loss": 0.358,
"step": 701
},
{
"epoch": 1.1838111298482294,
"grad_norm": 0.3681545709767134,
"learning_rate": 3.3635227982510934e-05,
"loss": 0.3488,
"step": 702
},
{
"epoch": 1.1854974704890389,
"grad_norm": 0.3499215938015231,
"learning_rate": 3.3603997501561524e-05,
"loss": 0.3699,
"step": 703
},
{
"epoch": 1.1871838111298483,
"grad_norm": 0.36319169174953636,
"learning_rate": 3.3572767020612114e-05,
"loss": 0.3696,
"step": 704
},
{
"epoch": 1.1888701517706577,
"grad_norm": 0.3141413251617719,
"learning_rate": 3.354153653966271e-05,
"loss": 0.3386,
"step": 705
},
{
"epoch": 1.190556492411467,
"grad_norm": 0.383269478456824,
"learning_rate": 3.35103060587133e-05,
"loss": 0.3668,
"step": 706
},
{
"epoch": 1.1922428330522765,
"grad_norm": 0.33350904257579106,
"learning_rate": 3.34790755777639e-05,
"loss": 0.3762,
"step": 707
},
{
"epoch": 1.193929173693086,
"grad_norm": 0.3789829657202058,
"learning_rate": 3.3447845096814495e-05,
"loss": 0.3689,
"step": 708
},
{
"epoch": 1.1956155143338956,
"grad_norm": 0.31056826795813275,
"learning_rate": 3.3416614615865085e-05,
"loss": 0.3377,
"step": 709
},
{
"epoch": 1.197301854974705,
"grad_norm": 0.35526213637912024,
"learning_rate": 3.338538413491568e-05,
"loss": 0.362,
"step": 710
},
{
"epoch": 1.1989881956155144,
"grad_norm": 0.2867614135968695,
"learning_rate": 3.335415365396627e-05,
"loss": 0.3549,
"step": 711
},
{
"epoch": 1.2006745362563238,
"grad_norm": 0.3149147995866731,
"learning_rate": 3.332292317301687e-05,
"loss": 0.3625,
"step": 712
},
{
"epoch": 1.2023608768971332,
"grad_norm": 0.32790661583501146,
"learning_rate": 3.329169269206746e-05,
"loss": 0.354,
"step": 713
},
{
"epoch": 1.2040472175379426,
"grad_norm": 0.29102264869880434,
"learning_rate": 3.3260462211118057e-05,
"loss": 0.3761,
"step": 714
},
{
"epoch": 1.205733558178752,
"grad_norm": 0.3187511092694005,
"learning_rate": 3.322923173016865e-05,
"loss": 0.3617,
"step": 715
},
{
"epoch": 1.2074198988195615,
"grad_norm": 0.327560945523737,
"learning_rate": 3.3198001249219244e-05,
"loss": 0.3585,
"step": 716
},
{
"epoch": 1.2091062394603709,
"grad_norm": 0.344406280985839,
"learning_rate": 3.3166770768269834e-05,
"loss": 0.363,
"step": 717
},
{
"epoch": 1.2107925801011805,
"grad_norm": 0.2960616516303917,
"learning_rate": 3.3135540287320424e-05,
"loss": 0.3697,
"step": 718
},
{
"epoch": 1.21247892074199,
"grad_norm": 0.34919128274833694,
"learning_rate": 3.310430980637102e-05,
"loss": 0.3417,
"step": 719
},
{
"epoch": 1.2141652613827993,
"grad_norm": 0.33289746835413475,
"learning_rate": 3.307307932542161e-05,
"loss": 0.3824,
"step": 720
},
{
"epoch": 1.2158516020236088,
"grad_norm": 0.3131245593982676,
"learning_rate": 3.304184884447221e-05,
"loss": 0.3627,
"step": 721
},
{
"epoch": 1.2175379426644182,
"grad_norm": 0.34440988286437363,
"learning_rate": 3.30106183635228e-05,
"loss": 0.384,
"step": 722
},
{
"epoch": 1.2192242833052276,
"grad_norm": 0.30769578929370067,
"learning_rate": 3.2979387882573395e-05,
"loss": 0.3662,
"step": 723
},
{
"epoch": 1.220910623946037,
"grad_norm": 0.36799706179743263,
"learning_rate": 3.2948157401623985e-05,
"loss": 0.3387,
"step": 724
},
{
"epoch": 1.2225969645868466,
"grad_norm": 0.29634378440964937,
"learning_rate": 3.291692692067458e-05,
"loss": 0.3463,
"step": 725
},
{
"epoch": 1.224283305227656,
"grad_norm": 0.3136700584153526,
"learning_rate": 3.288569643972517e-05,
"loss": 0.3784,
"step": 726
},
{
"epoch": 1.2259696458684655,
"grad_norm": 0.35752470110645485,
"learning_rate": 3.285446595877576e-05,
"loss": 0.3788,
"step": 727
},
{
"epoch": 1.2276559865092749,
"grad_norm": 0.4833836006190696,
"learning_rate": 3.282323547782636e-05,
"loss": 0.3852,
"step": 728
},
{
"epoch": 1.2293423271500843,
"grad_norm": 0.3228921026231067,
"learning_rate": 3.279200499687695e-05,
"loss": 0.35,
"step": 729
},
{
"epoch": 1.2310286677908937,
"grad_norm": 0.3601983472885956,
"learning_rate": 3.2760774515927547e-05,
"loss": 0.3714,
"step": 730
},
{
"epoch": 1.2327150084317031,
"grad_norm": 0.32216952225076373,
"learning_rate": 3.272954403497814e-05,
"loss": 0.3463,
"step": 731
},
{
"epoch": 1.2344013490725128,
"grad_norm": 0.34789998458821336,
"learning_rate": 3.2698313554028734e-05,
"loss": 0.3606,
"step": 732
},
{
"epoch": 1.2360876897133222,
"grad_norm": 0.3220946963248954,
"learning_rate": 3.2667083073079324e-05,
"loss": 0.3908,
"step": 733
},
{
"epoch": 1.2377740303541316,
"grad_norm": 0.34525067270794385,
"learning_rate": 3.263585259212992e-05,
"loss": 0.3737,
"step": 734
},
{
"epoch": 1.239460370994941,
"grad_norm": 0.363141368473777,
"learning_rate": 3.260462211118052e-05,
"loss": 0.3613,
"step": 735
},
{
"epoch": 1.2411467116357504,
"grad_norm": 0.30845400363059916,
"learning_rate": 3.257339163023111e-05,
"loss": 0.3545,
"step": 736
},
{
"epoch": 1.2428330522765598,
"grad_norm": 0.3292649941730577,
"learning_rate": 3.2542161149281705e-05,
"loss": 0.3637,
"step": 737
},
{
"epoch": 1.2445193929173692,
"grad_norm": 0.3444162426809801,
"learning_rate": 3.2510930668332295e-05,
"loss": 0.3591,
"step": 738
},
{
"epoch": 1.2462057335581789,
"grad_norm": 0.2780241415209115,
"learning_rate": 3.247970018738289e-05,
"loss": 0.3476,
"step": 739
},
{
"epoch": 1.2478920741989883,
"grad_norm": 0.3310679629273186,
"learning_rate": 3.244846970643348e-05,
"loss": 0.3753,
"step": 740
},
{
"epoch": 1.2495784148397977,
"grad_norm": 0.3228089052508038,
"learning_rate": 3.241723922548407e-05,
"loss": 0.3705,
"step": 741
},
{
"epoch": 1.2512647554806071,
"grad_norm": 0.30123416686246896,
"learning_rate": 3.238600874453467e-05,
"loss": 0.3506,
"step": 742
},
{
"epoch": 1.2529510961214165,
"grad_norm": 0.3047360398946294,
"learning_rate": 3.235477826358526e-05,
"loss": 0.3679,
"step": 743
},
{
"epoch": 1.254637436762226,
"grad_norm": 0.28405410976343115,
"learning_rate": 3.2323547782635856e-05,
"loss": 0.3593,
"step": 744
},
{
"epoch": 1.2563237774030354,
"grad_norm": 0.3220882220276252,
"learning_rate": 3.2292317301686446e-05,
"loss": 0.3575,
"step": 745
},
{
"epoch": 1.258010118043845,
"grad_norm": 0.35944627744808794,
"learning_rate": 3.2261086820737043e-05,
"loss": 0.373,
"step": 746
},
{
"epoch": 1.2596964586846542,
"grad_norm": 0.30281824632780585,
"learning_rate": 3.2229856339787634e-05,
"loss": 0.3681,
"step": 747
},
{
"epoch": 1.2613827993254638,
"grad_norm": 0.353431345480095,
"learning_rate": 3.219862585883823e-05,
"loss": 0.3619,
"step": 748
},
{
"epoch": 1.2630691399662732,
"grad_norm": 0.3055957478380962,
"learning_rate": 3.216739537788882e-05,
"loss": 0.3713,
"step": 749
},
{
"epoch": 1.2647554806070826,
"grad_norm": 0.3103571679944118,
"learning_rate": 3.213616489693941e-05,
"loss": 0.3805,
"step": 750
},
{
"epoch": 1.266441821247892,
"grad_norm": 0.3364232773283497,
"learning_rate": 3.210493441599001e-05,
"loss": 0.3804,
"step": 751
},
{
"epoch": 1.2681281618887015,
"grad_norm": 0.27554897357070157,
"learning_rate": 3.20737039350406e-05,
"loss": 0.3524,
"step": 752
},
{
"epoch": 1.269814502529511,
"grad_norm": 0.30387516090534045,
"learning_rate": 3.2042473454091195e-05,
"loss": 0.3543,
"step": 753
},
{
"epoch": 1.2715008431703203,
"grad_norm": 0.3059496855372739,
"learning_rate": 3.2011242973141785e-05,
"loss": 0.3623,
"step": 754
},
{
"epoch": 1.27318718381113,
"grad_norm": 0.30728724779696115,
"learning_rate": 3.198001249219238e-05,
"loss": 0.3668,
"step": 755
},
{
"epoch": 1.2748735244519394,
"grad_norm": 0.3202085526337014,
"learning_rate": 3.194878201124297e-05,
"loss": 0.3671,
"step": 756
},
{
"epoch": 1.2765598650927488,
"grad_norm": 0.32689712504751167,
"learning_rate": 3.191755153029357e-05,
"loss": 0.3695,
"step": 757
},
{
"epoch": 1.2782462057335582,
"grad_norm": 0.30233880248863065,
"learning_rate": 3.188632104934416e-05,
"loss": 0.3736,
"step": 758
},
{
"epoch": 1.2799325463743676,
"grad_norm": 0.32961492613846366,
"learning_rate": 3.185509056839475e-05,
"loss": 0.3684,
"step": 759
},
{
"epoch": 1.281618887015177,
"grad_norm": 0.32713985472717844,
"learning_rate": 3.182386008744535e-05,
"loss": 0.3639,
"step": 760
},
{
"epoch": 1.2833052276559864,
"grad_norm": 0.2686422618776032,
"learning_rate": 3.179262960649594e-05,
"loss": 0.3651,
"step": 761
},
{
"epoch": 1.284991568296796,
"grad_norm": 0.33665097327652294,
"learning_rate": 3.176139912554654e-05,
"loss": 0.3858,
"step": 762
},
{
"epoch": 1.2866779089376055,
"grad_norm": 0.29534168258311044,
"learning_rate": 3.173016864459713e-05,
"loss": 0.3507,
"step": 763
},
{
"epoch": 1.2883642495784149,
"grad_norm": 0.35374323259618806,
"learning_rate": 3.169893816364772e-05,
"loss": 0.3906,
"step": 764
},
{
"epoch": 1.2900505902192243,
"grad_norm": 0.3264610064516771,
"learning_rate": 3.166770768269832e-05,
"loss": 0.3904,
"step": 765
},
{
"epoch": 1.2917369308600337,
"grad_norm": 0.2606629036110616,
"learning_rate": 3.163647720174891e-05,
"loss": 0.3625,
"step": 766
},
{
"epoch": 1.2934232715008431,
"grad_norm": 0.31584649760135897,
"learning_rate": 3.1605246720799505e-05,
"loss": 0.3686,
"step": 767
},
{
"epoch": 1.2951096121416525,
"grad_norm": 0.3223638084892562,
"learning_rate": 3.1574016239850095e-05,
"loss": 0.3461,
"step": 768
},
{
"epoch": 1.2967959527824622,
"grad_norm": 0.28755849067927153,
"learning_rate": 3.154278575890069e-05,
"loss": 0.355,
"step": 769
},
{
"epoch": 1.2984822934232714,
"grad_norm": 0.3088311800226945,
"learning_rate": 3.151155527795128e-05,
"loss": 0.3561,
"step": 770
},
{
"epoch": 1.300168634064081,
"grad_norm": 0.3184212229471459,
"learning_rate": 3.148032479700188e-05,
"loss": 0.3685,
"step": 771
},
{
"epoch": 1.3018549747048904,
"grad_norm": 0.3463641101847728,
"learning_rate": 3.144909431605247e-05,
"loss": 0.3785,
"step": 772
},
{
"epoch": 1.3035413153456998,
"grad_norm": 0.3525193112827311,
"learning_rate": 3.141786383510306e-05,
"loss": 0.3558,
"step": 773
},
{
"epoch": 1.3052276559865092,
"grad_norm": 0.4102862145029259,
"learning_rate": 3.1386633354153656e-05,
"loss": 0.3581,
"step": 774
},
{
"epoch": 1.3069139966273187,
"grad_norm": 0.29347725599207186,
"learning_rate": 3.1355402873204246e-05,
"loss": 0.3696,
"step": 775
},
{
"epoch": 1.3086003372681283,
"grad_norm": 0.3201690267050578,
"learning_rate": 3.132417239225484e-05,
"loss": 0.3699,
"step": 776
},
{
"epoch": 1.3102866779089375,
"grad_norm": 0.42217077326951885,
"learning_rate": 3.129294191130543e-05,
"loss": 0.378,
"step": 777
},
{
"epoch": 1.3119730185497471,
"grad_norm": 0.3204268838545862,
"learning_rate": 3.126171143035603e-05,
"loss": 0.3864,
"step": 778
},
{
"epoch": 1.3136593591905565,
"grad_norm": 0.36927699806450415,
"learning_rate": 3.123048094940662e-05,
"loss": 0.3725,
"step": 779
},
{
"epoch": 1.315345699831366,
"grad_norm": 0.36777202281156074,
"learning_rate": 3.119925046845722e-05,
"loss": 0.3478,
"step": 780
},
{
"epoch": 1.3170320404721754,
"grad_norm": 0.332383376452764,
"learning_rate": 3.116801998750781e-05,
"loss": 0.3684,
"step": 781
},
{
"epoch": 1.3187183811129848,
"grad_norm": 0.26968150427190296,
"learning_rate": 3.11367895065584e-05,
"loss": 0.3661,
"step": 782
},
{
"epoch": 1.3204047217537942,
"grad_norm": 0.2757213475684918,
"learning_rate": 3.1105559025608995e-05,
"loss": 0.3619,
"step": 783
},
{
"epoch": 1.3220910623946036,
"grad_norm": 0.291925824986569,
"learning_rate": 3.1074328544659585e-05,
"loss": 0.365,
"step": 784
},
{
"epoch": 1.3237774030354132,
"grad_norm": 0.28827293963232314,
"learning_rate": 3.104309806371019e-05,
"loss": 0.3567,
"step": 785
},
{
"epoch": 1.3254637436762227,
"grad_norm": 0.3094407833363561,
"learning_rate": 3.101186758276078e-05,
"loss": 0.377,
"step": 786
},
{
"epoch": 1.327150084317032,
"grad_norm": 0.2805255608547809,
"learning_rate": 3.098063710181137e-05,
"loss": 0.345,
"step": 787
},
{
"epoch": 1.3288364249578415,
"grad_norm": 0.27048687320912446,
"learning_rate": 3.0949406620861966e-05,
"loss": 0.3667,
"step": 788
},
{
"epoch": 1.330522765598651,
"grad_norm": 0.27434964698292863,
"learning_rate": 3.0918176139912556e-05,
"loss": 0.3475,
"step": 789
},
{
"epoch": 1.3322091062394603,
"grad_norm": 0.30584385870013375,
"learning_rate": 3.088694565896315e-05,
"loss": 0.379,
"step": 790
},
{
"epoch": 1.3338954468802697,
"grad_norm": 0.2877913618660639,
"learning_rate": 3.085571517801374e-05,
"loss": 0.3764,
"step": 791
},
{
"epoch": 1.3355817875210794,
"grad_norm": 0.26134414301494957,
"learning_rate": 3.082448469706434e-05,
"loss": 0.3654,
"step": 792
},
{
"epoch": 1.3372681281618888,
"grad_norm": 0.3192558959887312,
"learning_rate": 3.079325421611493e-05,
"loss": 0.3627,
"step": 793
},
{
"epoch": 1.3389544688026982,
"grad_norm": 0.2682907090974044,
"learning_rate": 3.076202373516553e-05,
"loss": 0.3631,
"step": 794
},
{
"epoch": 1.3406408094435076,
"grad_norm": 0.312326995625596,
"learning_rate": 3.073079325421612e-05,
"loss": 0.3731,
"step": 795
},
{
"epoch": 1.342327150084317,
"grad_norm": 0.36559281139328087,
"learning_rate": 3.069956277326671e-05,
"loss": 0.3483,
"step": 796
},
{
"epoch": 1.3440134907251264,
"grad_norm": 0.2538475060279743,
"learning_rate": 3.0668332292317304e-05,
"loss": 0.3667,
"step": 797
},
{
"epoch": 1.3456998313659359,
"grad_norm": 0.28040358277074,
"learning_rate": 3.0637101811367894e-05,
"loss": 0.3507,
"step": 798
},
{
"epoch": 1.3473861720067455,
"grad_norm": 0.33938165368820494,
"learning_rate": 3.060587133041849e-05,
"loss": 0.3789,
"step": 799
},
{
"epoch": 1.3490725126475547,
"grad_norm": 0.27797829986374323,
"learning_rate": 3.057464084946908e-05,
"loss": 0.3697,
"step": 800
},
{
"epoch": 1.3507588532883643,
"grad_norm": 0.317311724784089,
"learning_rate": 3.054341036851968e-05,
"loss": 0.3712,
"step": 801
},
{
"epoch": 1.3524451939291737,
"grad_norm": 0.27067284761984195,
"learning_rate": 3.051217988757027e-05,
"loss": 0.3642,
"step": 802
},
{
"epoch": 1.3541315345699831,
"grad_norm": 0.27640495068779947,
"learning_rate": 3.0480949406620862e-05,
"loss": 0.3665,
"step": 803
},
{
"epoch": 1.3558178752107926,
"grad_norm": 0.29925492356322403,
"learning_rate": 3.0449718925671456e-05,
"loss": 0.3606,
"step": 804
},
{
"epoch": 1.357504215851602,
"grad_norm": 0.28679446055488445,
"learning_rate": 3.041848844472205e-05,
"loss": 0.3653,
"step": 805
},
{
"epoch": 1.3591905564924116,
"grad_norm": 0.31685499931502714,
"learning_rate": 3.0387257963772643e-05,
"loss": 0.3578,
"step": 806
},
{
"epoch": 1.3608768971332208,
"grad_norm": 0.31709583316510476,
"learning_rate": 3.0356027482823236e-05,
"loss": 0.3536,
"step": 807
},
{
"epoch": 1.3625632377740304,
"grad_norm": 0.2972307323584074,
"learning_rate": 3.0324797001873826e-05,
"loss": 0.3453,
"step": 808
},
{
"epoch": 1.3642495784148398,
"grad_norm": 0.2768215969932845,
"learning_rate": 3.029356652092442e-05,
"loss": 0.3731,
"step": 809
},
{
"epoch": 1.3659359190556493,
"grad_norm": 0.3366256274766151,
"learning_rate": 3.0262336039975014e-05,
"loss": 0.3598,
"step": 810
},
{
"epoch": 1.3676222596964587,
"grad_norm": 0.35895024828608924,
"learning_rate": 3.0231105559025614e-05,
"loss": 0.3728,
"step": 811
},
{
"epoch": 1.369308600337268,
"grad_norm": 0.27047287184924756,
"learning_rate": 3.0199875078076207e-05,
"loss": 0.3557,
"step": 812
},
{
"epoch": 1.3709949409780775,
"grad_norm": 0.3045253938750555,
"learning_rate": 3.01686445971268e-05,
"loss": 0.3763,
"step": 813
},
{
"epoch": 1.372681281618887,
"grad_norm": 0.3269021401249622,
"learning_rate": 3.013741411617739e-05,
"loss": 0.3786,
"step": 814
},
{
"epoch": 1.3743676222596966,
"grad_norm": 0.2989257893228668,
"learning_rate": 3.0106183635227985e-05,
"loss": 0.3549,
"step": 815
},
{
"epoch": 1.376053962900506,
"grad_norm": 0.2902243430545122,
"learning_rate": 3.0074953154278578e-05,
"loss": 0.3605,
"step": 816
},
{
"epoch": 1.3777403035413154,
"grad_norm": 0.3268375592923771,
"learning_rate": 3.0043722673329172e-05,
"loss": 0.3554,
"step": 817
},
{
"epoch": 1.3794266441821248,
"grad_norm": 0.28706953208266983,
"learning_rate": 3.0012492192379765e-05,
"loss": 0.3671,
"step": 818
},
{
"epoch": 1.3811129848229342,
"grad_norm": 0.3288664801504078,
"learning_rate": 2.998126171143036e-05,
"loss": 0.3681,
"step": 819
},
{
"epoch": 1.3827993254637436,
"grad_norm": 0.2928303438562581,
"learning_rate": 2.9950031230480952e-05,
"loss": 0.3555,
"step": 820
},
{
"epoch": 1.384485666104553,
"grad_norm": 0.2903552935720318,
"learning_rate": 2.9918800749531546e-05,
"loss": 0.3532,
"step": 821
},
{
"epoch": 1.3861720067453627,
"grad_norm": 0.3321445685269924,
"learning_rate": 2.988757026858214e-05,
"loss": 0.3676,
"step": 822
},
{
"epoch": 1.387858347386172,
"grad_norm": 0.2884554947709554,
"learning_rate": 2.985633978763273e-05,
"loss": 0.3747,
"step": 823
},
{
"epoch": 1.3895446880269815,
"grad_norm": 0.276899181299804,
"learning_rate": 2.9825109306683323e-05,
"loss": 0.3731,
"step": 824
},
{
"epoch": 1.391231028667791,
"grad_norm": 0.28516809217475486,
"learning_rate": 2.9793878825733917e-05,
"loss": 0.3644,
"step": 825
},
{
"epoch": 1.3929173693086003,
"grad_norm": 0.3034709425045361,
"learning_rate": 2.976264834478451e-05,
"loss": 0.3656,
"step": 826
},
{
"epoch": 1.3946037099494097,
"grad_norm": 0.274788776371271,
"learning_rate": 2.9731417863835104e-05,
"loss": 0.3477,
"step": 827
},
{
"epoch": 1.3962900505902192,
"grad_norm": 0.3180401263617833,
"learning_rate": 2.9700187382885697e-05,
"loss": 0.3595,
"step": 828
},
{
"epoch": 1.3979763912310288,
"grad_norm": 0.29836311693348744,
"learning_rate": 2.966895690193629e-05,
"loss": 0.3809,
"step": 829
},
{
"epoch": 1.399662731871838,
"grad_norm": 0.3525823167547023,
"learning_rate": 2.9637726420986885e-05,
"loss": 0.3619,
"step": 830
},
{
"epoch": 1.4013490725126476,
"grad_norm": 0.33175669157893434,
"learning_rate": 2.9606495940037475e-05,
"loss": 0.3588,
"step": 831
},
{
"epoch": 1.403035413153457,
"grad_norm": 0.3245193475126911,
"learning_rate": 2.9575265459088068e-05,
"loss": 0.359,
"step": 832
},
{
"epoch": 1.4047217537942664,
"grad_norm": 0.29657466500656987,
"learning_rate": 2.9544034978138662e-05,
"loss": 0.3465,
"step": 833
},
{
"epoch": 1.4064080944350759,
"grad_norm": 0.3412473670858005,
"learning_rate": 2.9512804497189255e-05,
"loss": 0.3769,
"step": 834
},
{
"epoch": 1.4080944350758853,
"grad_norm": 0.3216702109422503,
"learning_rate": 2.948157401623985e-05,
"loss": 0.3908,
"step": 835
},
{
"epoch": 1.4097807757166947,
"grad_norm": 0.3250423656449163,
"learning_rate": 2.9450343535290442e-05,
"loss": 0.3485,
"step": 836
},
{
"epoch": 1.411467116357504,
"grad_norm": 0.34791591826158985,
"learning_rate": 2.941911305434104e-05,
"loss": 0.3749,
"step": 837
},
{
"epoch": 1.4131534569983137,
"grad_norm": 0.29886846786700416,
"learning_rate": 2.9387882573391633e-05,
"loss": 0.3707,
"step": 838
},
{
"epoch": 1.4148397976391232,
"grad_norm": 0.43800507347744627,
"learning_rate": 2.9356652092442227e-05,
"loss": 0.3585,
"step": 839
},
{
"epoch": 1.4165261382799326,
"grad_norm": 0.3370202282793368,
"learning_rate": 2.932542161149282e-05,
"loss": 0.3977,
"step": 840
},
{
"epoch": 1.418212478920742,
"grad_norm": 0.3464248680319261,
"learning_rate": 2.9294191130543414e-05,
"loss": 0.3672,
"step": 841
},
{
"epoch": 1.4198988195615514,
"grad_norm": 0.3329391344410722,
"learning_rate": 2.9262960649594007e-05,
"loss": 0.3797,
"step": 842
},
{
"epoch": 1.4215851602023608,
"grad_norm": 0.3304159135003874,
"learning_rate": 2.92317301686446e-05,
"loss": 0.3667,
"step": 843
},
{
"epoch": 1.4232715008431702,
"grad_norm": 0.33726375951261683,
"learning_rate": 2.9200499687695194e-05,
"loss": 0.3686,
"step": 844
},
{
"epoch": 1.4249578414839799,
"grad_norm": 0.31457486052096356,
"learning_rate": 2.9169269206745788e-05,
"loss": 0.3549,
"step": 845
},
{
"epoch": 1.4266441821247893,
"grad_norm": 0.313929594764878,
"learning_rate": 2.9138038725796378e-05,
"loss": 0.3455,
"step": 846
},
{
"epoch": 1.4283305227655987,
"grad_norm": 0.35655405274540297,
"learning_rate": 2.910680824484697e-05,
"loss": 0.361,
"step": 847
},
{
"epoch": 1.430016863406408,
"grad_norm": 0.3203018948600648,
"learning_rate": 2.9075577763897565e-05,
"loss": 0.356,
"step": 848
},
{
"epoch": 1.4317032040472175,
"grad_norm": 0.31890031263924845,
"learning_rate": 2.904434728294816e-05,
"loss": 0.3861,
"step": 849
},
{
"epoch": 1.433389544688027,
"grad_norm": 0.3439661347106031,
"learning_rate": 2.9013116801998752e-05,
"loss": 0.3691,
"step": 850
},
{
"epoch": 1.4350758853288363,
"grad_norm": 0.3287855811474556,
"learning_rate": 2.8981886321049346e-05,
"loss": 0.3658,
"step": 851
},
{
"epoch": 1.436762225969646,
"grad_norm": 0.3352337738607975,
"learning_rate": 2.895065584009994e-05,
"loss": 0.3526,
"step": 852
},
{
"epoch": 1.4384485666104554,
"grad_norm": 0.29066523249557324,
"learning_rate": 2.8919425359150533e-05,
"loss": 0.3665,
"step": 853
},
{
"epoch": 1.4401349072512648,
"grad_norm": 0.6443887765113684,
"learning_rate": 2.8888194878201126e-05,
"loss": 0.3695,
"step": 854
},
{
"epoch": 1.4418212478920742,
"grad_norm": 0.32997585072997554,
"learning_rate": 2.8856964397251717e-05,
"loss": 0.354,
"step": 855
},
{
"epoch": 1.4435075885328836,
"grad_norm": 0.2783806614720242,
"learning_rate": 2.882573391630231e-05,
"loss": 0.3849,
"step": 856
},
{
"epoch": 1.445193929173693,
"grad_norm": 0.33474735936565325,
"learning_rate": 2.8794503435352904e-05,
"loss": 0.3781,
"step": 857
},
{
"epoch": 1.4468802698145025,
"grad_norm": 0.3456639852483754,
"learning_rate": 2.8763272954403497e-05,
"loss": 0.3666,
"step": 858
},
{
"epoch": 1.448566610455312,
"grad_norm": 0.30135371080737516,
"learning_rate": 2.873204247345409e-05,
"loss": 0.3629,
"step": 859
},
{
"epoch": 1.4502529510961213,
"grad_norm": 0.3635410618935195,
"learning_rate": 2.8700811992504684e-05,
"loss": 0.3704,
"step": 860
},
{
"epoch": 1.451939291736931,
"grad_norm": 0.30991001067763485,
"learning_rate": 2.8669581511555278e-05,
"loss": 0.3727,
"step": 861
},
{
"epoch": 1.4536256323777403,
"grad_norm": 0.4643174111387868,
"learning_rate": 2.863835103060587e-05,
"loss": 0.3787,
"step": 862
},
{
"epoch": 1.4553119730185498,
"grad_norm": 0.3180200407283694,
"learning_rate": 2.8607120549656468e-05,
"loss": 0.3602,
"step": 863
},
{
"epoch": 1.4569983136593592,
"grad_norm": 0.35198422859049877,
"learning_rate": 2.8575890068707062e-05,
"loss": 0.3659,
"step": 864
},
{
"epoch": 1.4586846543001686,
"grad_norm": 0.3959760640216589,
"learning_rate": 2.8544659587757655e-05,
"loss": 0.3642,
"step": 865
},
{
"epoch": 1.460370994940978,
"grad_norm": 0.27646042574830654,
"learning_rate": 2.851342910680825e-05,
"loss": 0.3504,
"step": 866
},
{
"epoch": 1.4620573355817874,
"grad_norm": 0.3229912287222973,
"learning_rate": 2.8482198625858843e-05,
"loss": 0.3537,
"step": 867
},
{
"epoch": 1.463743676222597,
"grad_norm": 0.3308036371563188,
"learning_rate": 2.8450968144909436e-05,
"loss": 0.3672,
"step": 868
},
{
"epoch": 1.4654300168634065,
"grad_norm": 0.3049717462902101,
"learning_rate": 2.8419737663960026e-05,
"loss": 0.3501,
"step": 869
},
{
"epoch": 1.4671163575042159,
"grad_norm": 0.276903365881423,
"learning_rate": 2.838850718301062e-05,
"loss": 0.376,
"step": 870
},
{
"epoch": 1.4688026981450253,
"grad_norm": 0.282087202469682,
"learning_rate": 2.8357276702061213e-05,
"loss": 0.3458,
"step": 871
},
{
"epoch": 1.4704890387858347,
"grad_norm": 0.2889326442756206,
"learning_rate": 2.8326046221111807e-05,
"loss": 0.3635,
"step": 872
},
{
"epoch": 1.4721753794266441,
"grad_norm": 0.263485140543098,
"learning_rate": 2.82948157401624e-05,
"loss": 0.3525,
"step": 873
},
{
"epoch": 1.4738617200674535,
"grad_norm": 0.3017524454565209,
"learning_rate": 2.8263585259212994e-05,
"loss": 0.3569,
"step": 874
},
{
"epoch": 1.4755480607082632,
"grad_norm": 0.2765623305014126,
"learning_rate": 2.8232354778263588e-05,
"loss": 0.3593,
"step": 875
},
{
"epoch": 1.4772344013490726,
"grad_norm": 0.3025401132551647,
"learning_rate": 2.820112429731418e-05,
"loss": 0.3496,
"step": 876
},
{
"epoch": 1.478920741989882,
"grad_norm": 0.30350708989703407,
"learning_rate": 2.8169893816364775e-05,
"loss": 0.3566,
"step": 877
},
{
"epoch": 1.4806070826306914,
"grad_norm": 0.30107950706745196,
"learning_rate": 2.8138663335415365e-05,
"loss": 0.3884,
"step": 878
},
{
"epoch": 1.4822934232715008,
"grad_norm": 0.3965538879616542,
"learning_rate": 2.810743285446596e-05,
"loss": 0.3762,
"step": 879
},
{
"epoch": 1.4839797639123102,
"grad_norm": 0.26770539723536463,
"learning_rate": 2.8076202373516552e-05,
"loss": 0.3537,
"step": 880
},
{
"epoch": 1.4856661045531196,
"grad_norm": 0.33976305682196745,
"learning_rate": 2.8044971892567145e-05,
"loss": 0.3651,
"step": 881
},
{
"epoch": 1.4873524451939293,
"grad_norm": 0.2762542589057808,
"learning_rate": 2.801374141161774e-05,
"loss": 0.3553,
"step": 882
},
{
"epoch": 1.4890387858347387,
"grad_norm": 0.29491397560474647,
"learning_rate": 2.7982510930668333e-05,
"loss": 0.3741,
"step": 883
},
{
"epoch": 1.4907251264755481,
"grad_norm": 0.3313263751607482,
"learning_rate": 2.7951280449718926e-05,
"loss": 0.3624,
"step": 884
},
{
"epoch": 1.4924114671163575,
"grad_norm": 0.40279897069448306,
"learning_rate": 2.792004996876952e-05,
"loss": 0.3854,
"step": 885
},
{
"epoch": 1.494097807757167,
"grad_norm": 0.2995735918065522,
"learning_rate": 2.788881948782011e-05,
"loss": 0.3852,
"step": 886
},
{
"epoch": 1.4957841483979764,
"grad_norm": 0.4071606342151084,
"learning_rate": 2.7857589006870703e-05,
"loss": 0.3486,
"step": 887
},
{
"epoch": 1.4974704890387858,
"grad_norm": 0.3891142766885291,
"learning_rate": 2.7826358525921297e-05,
"loss": 0.3618,
"step": 888
},
{
"epoch": 1.4991568296795954,
"grad_norm": 0.41252059927905615,
"learning_rate": 2.7795128044971897e-05,
"loss": 0.3861,
"step": 889
},
{
"epoch": 1.5008431703204046,
"grad_norm": 0.37901924320659897,
"learning_rate": 2.776389756402249e-05,
"loss": 0.3659,
"step": 890
},
{
"epoch": 1.5025295109612142,
"grad_norm": 0.36103653161805105,
"learning_rate": 2.7732667083073084e-05,
"loss": 0.3828,
"step": 891
},
{
"epoch": 1.5042158516020236,
"grad_norm": 0.2973855077723278,
"learning_rate": 2.7701436602123674e-05,
"loss": 0.3786,
"step": 892
},
{
"epoch": 1.505902192242833,
"grad_norm": 0.308201431273671,
"learning_rate": 2.7670206121174268e-05,
"loss": 0.3757,
"step": 893
},
{
"epoch": 1.5075885328836425,
"grad_norm": 0.3157987216748116,
"learning_rate": 2.763897564022486e-05,
"loss": 0.3704,
"step": 894
},
{
"epoch": 1.5092748735244519,
"grad_norm": 0.30359379634490863,
"learning_rate": 2.7607745159275455e-05,
"loss": 0.3746,
"step": 895
},
{
"epoch": 1.5109612141652615,
"grad_norm": 0.31541367039281626,
"learning_rate": 2.757651467832605e-05,
"loss": 0.3537,
"step": 896
},
{
"epoch": 1.5126475548060707,
"grad_norm": 0.28664704951464065,
"learning_rate": 2.7545284197376642e-05,
"loss": 0.3643,
"step": 897
},
{
"epoch": 1.5143338954468804,
"grad_norm": 0.33654100602922427,
"learning_rate": 2.7514053716427236e-05,
"loss": 0.3874,
"step": 898
},
{
"epoch": 1.5160202360876898,
"grad_norm": 0.3460798665887633,
"learning_rate": 2.748282323547783e-05,
"loss": 0.3696,
"step": 899
},
{
"epoch": 1.5177065767284992,
"grad_norm": 0.29319829577254086,
"learning_rate": 2.7451592754528423e-05,
"loss": 0.3535,
"step": 900
},
{
"epoch": 1.5193929173693086,
"grad_norm": 0.30053976752466616,
"learning_rate": 2.7420362273579013e-05,
"loss": 0.3768,
"step": 901
},
{
"epoch": 1.521079258010118,
"grad_norm": 0.38472344036114325,
"learning_rate": 2.7389131792629607e-05,
"loss": 0.3668,
"step": 902
},
{
"epoch": 1.5227655986509276,
"grad_norm": 0.2711714617087213,
"learning_rate": 2.73579013116802e-05,
"loss": 0.3408,
"step": 903
},
{
"epoch": 1.5244519392917368,
"grad_norm": 0.2873701839245859,
"learning_rate": 2.7326670830730794e-05,
"loss": 0.3658,
"step": 904
},
{
"epoch": 1.5261382799325465,
"grad_norm": 0.29970672448116314,
"learning_rate": 2.7295440349781387e-05,
"loss": 0.3649,
"step": 905
},
{
"epoch": 1.5278246205733557,
"grad_norm": 0.30526313911571973,
"learning_rate": 2.726420986883198e-05,
"loss": 0.378,
"step": 906
},
{
"epoch": 1.5295109612141653,
"grad_norm": 0.2928928752135304,
"learning_rate": 2.7232979387882574e-05,
"loss": 0.3757,
"step": 907
},
{
"epoch": 1.5311973018549747,
"grad_norm": 0.2858263949745689,
"learning_rate": 2.7201748906933168e-05,
"loss": 0.3659,
"step": 908
},
{
"epoch": 1.5328836424957841,
"grad_norm": 0.3087797513117411,
"learning_rate": 2.717051842598376e-05,
"loss": 0.3743,
"step": 909
},
{
"epoch": 1.5345699831365935,
"grad_norm": 0.2856603026190315,
"learning_rate": 2.713928794503435e-05,
"loss": 0.357,
"step": 910
},
{
"epoch": 1.536256323777403,
"grad_norm": 0.3058290048021126,
"learning_rate": 2.7108057464084945e-05,
"loss": 0.3734,
"step": 911
},
{
"epoch": 1.5379426644182126,
"grad_norm": 0.3104946067619198,
"learning_rate": 2.707682698313554e-05,
"loss": 0.3568,
"step": 912
},
{
"epoch": 1.5396290050590218,
"grad_norm": 0.26979396016908347,
"learning_rate": 2.7045596502186132e-05,
"loss": 0.3715,
"step": 913
},
{
"epoch": 1.5413153456998314,
"grad_norm": 0.34743672943385967,
"learning_rate": 2.7014366021236733e-05,
"loss": 0.3527,
"step": 914
},
{
"epoch": 1.5430016863406408,
"grad_norm": 0.30343903155975266,
"learning_rate": 2.6983135540287323e-05,
"loss": 0.3559,
"step": 915
},
{
"epoch": 1.5446880269814502,
"grad_norm": 0.32417710032970565,
"learning_rate": 2.6951905059337916e-05,
"loss": 0.344,
"step": 916
},
{
"epoch": 1.5463743676222597,
"grad_norm": 0.32794367697794385,
"learning_rate": 2.692067457838851e-05,
"loss": 0.3659,
"step": 917
},
{
"epoch": 1.548060708263069,
"grad_norm": 0.3035455097790254,
"learning_rate": 2.6889444097439103e-05,
"loss": 0.3656,
"step": 918
},
{
"epoch": 1.5497470489038787,
"grad_norm": 0.29476341967594083,
"learning_rate": 2.6858213616489697e-05,
"loss": 0.3605,
"step": 919
},
{
"epoch": 1.551433389544688,
"grad_norm": 0.2507995893378595,
"learning_rate": 2.682698313554029e-05,
"loss": 0.3208,
"step": 920
},
{
"epoch": 1.5531197301854975,
"grad_norm": 0.333329654380094,
"learning_rate": 2.6795752654590884e-05,
"loss": 0.3865,
"step": 921
},
{
"epoch": 1.554806070826307,
"grad_norm": 0.28035363313722794,
"learning_rate": 2.6764522173641478e-05,
"loss": 0.3586,
"step": 922
},
{
"epoch": 1.5564924114671164,
"grad_norm": 0.31443387396206257,
"learning_rate": 2.673329169269207e-05,
"loss": 0.3737,
"step": 923
},
{
"epoch": 1.5581787521079258,
"grad_norm": 0.32875013176555307,
"learning_rate": 2.670206121174266e-05,
"loss": 0.3399,
"step": 924
},
{
"epoch": 1.5598650927487352,
"grad_norm": 0.31629044557639807,
"learning_rate": 2.6670830730793255e-05,
"loss": 0.353,
"step": 925
},
{
"epoch": 1.5615514333895448,
"grad_norm": 0.32486698411487475,
"learning_rate": 2.663960024984385e-05,
"loss": 0.3726,
"step": 926
},
{
"epoch": 1.563237774030354,
"grad_norm": 0.25686723863440886,
"learning_rate": 2.6608369768894442e-05,
"loss": 0.3714,
"step": 927
},
{
"epoch": 1.5649241146711637,
"grad_norm": 0.2994757555235764,
"learning_rate": 2.6577139287945035e-05,
"loss": 0.391,
"step": 928
},
{
"epoch": 1.566610455311973,
"grad_norm": 0.26274982624954607,
"learning_rate": 2.654590880699563e-05,
"loss": 0.368,
"step": 929
},
{
"epoch": 1.5682967959527825,
"grad_norm": 0.2766309841130026,
"learning_rate": 2.6514678326046223e-05,
"loss": 0.369,
"step": 930
},
{
"epoch": 1.569983136593592,
"grad_norm": 0.24174006935555933,
"learning_rate": 2.6483447845096816e-05,
"loss": 0.346,
"step": 931
},
{
"epoch": 1.5716694772344013,
"grad_norm": 0.2973914368046073,
"learning_rate": 2.645221736414741e-05,
"loss": 0.3509,
"step": 932
},
{
"epoch": 1.573355817875211,
"grad_norm": 0.27142951583047953,
"learning_rate": 2.6420986883198e-05,
"loss": 0.371,
"step": 933
},
{
"epoch": 1.5750421585160201,
"grad_norm": 0.28939375979112236,
"learning_rate": 2.6389756402248593e-05,
"loss": 0.3606,
"step": 934
},
{
"epoch": 1.5767284991568298,
"grad_norm": 0.3204660725413403,
"learning_rate": 2.6358525921299187e-05,
"loss": 0.3719,
"step": 935
},
{
"epoch": 1.578414839797639,
"grad_norm": 0.28518308791901786,
"learning_rate": 2.632729544034978e-05,
"loss": 0.3732,
"step": 936
},
{
"epoch": 1.5801011804384486,
"grad_norm": 0.2840336033242113,
"learning_rate": 2.6296064959400374e-05,
"loss": 0.3523,
"step": 937
},
{
"epoch": 1.581787521079258,
"grad_norm": 0.2848436033024478,
"learning_rate": 2.6264834478450968e-05,
"loss": 0.3613,
"step": 938
},
{
"epoch": 1.5834738617200674,
"grad_norm": 0.30573983740828986,
"learning_rate": 2.623360399750156e-05,
"loss": 0.3639,
"step": 939
},
{
"epoch": 1.5851602023608768,
"grad_norm": 0.3074017715880178,
"learning_rate": 2.6202373516552158e-05,
"loss": 0.3593,
"step": 940
},
{
"epoch": 1.5868465430016863,
"grad_norm": 0.3670939060165031,
"learning_rate": 2.617114303560275e-05,
"loss": 0.3732,
"step": 941
},
{
"epoch": 1.588532883642496,
"grad_norm": 0.2815186616157421,
"learning_rate": 2.6139912554653345e-05,
"loss": 0.376,
"step": 942
},
{
"epoch": 1.590219224283305,
"grad_norm": 0.3288959283244572,
"learning_rate": 2.610868207370394e-05,
"loss": 0.3859,
"step": 943
},
{
"epoch": 1.5919055649241147,
"grad_norm": 0.3333526511910872,
"learning_rate": 2.6077451592754532e-05,
"loss": 0.3476,
"step": 944
},
{
"epoch": 1.5935919055649241,
"grad_norm": 0.26153190652604347,
"learning_rate": 2.6046221111805126e-05,
"loss": 0.3527,
"step": 945
},
{
"epoch": 1.5952782462057336,
"grad_norm": 0.3352855943625819,
"learning_rate": 2.601499063085572e-05,
"loss": 0.3491,
"step": 946
},
{
"epoch": 1.596964586846543,
"grad_norm": 0.34129225800110075,
"learning_rate": 2.598376014990631e-05,
"loss": 0.3815,
"step": 947
},
{
"epoch": 1.5986509274873524,
"grad_norm": 0.27613299041658507,
"learning_rate": 2.5952529668956903e-05,
"loss": 0.371,
"step": 948
},
{
"epoch": 1.600337268128162,
"grad_norm": 0.3466164543787536,
"learning_rate": 2.5921299188007497e-05,
"loss": 0.3599,
"step": 949
},
{
"epoch": 1.6020236087689712,
"grad_norm": 0.2887317810997904,
"learning_rate": 2.589006870705809e-05,
"loss": 0.3693,
"step": 950
},
{
"epoch": 1.6037099494097808,
"grad_norm": 0.27325392270711873,
"learning_rate": 2.5858838226108684e-05,
"loss": 0.3566,
"step": 951
},
{
"epoch": 1.6053962900505903,
"grad_norm": 0.29154218812604826,
"learning_rate": 2.5827607745159277e-05,
"loss": 0.3656,
"step": 952
},
{
"epoch": 1.6070826306913997,
"grad_norm": 0.27216759043144917,
"learning_rate": 2.579637726420987e-05,
"loss": 0.3663,
"step": 953
},
{
"epoch": 1.608768971332209,
"grad_norm": 0.24444581005960203,
"learning_rate": 2.5765146783260464e-05,
"loss": 0.3463,
"step": 954
},
{
"epoch": 1.6104553119730185,
"grad_norm": 0.291335248047702,
"learning_rate": 2.5733916302311058e-05,
"loss": 0.3697,
"step": 955
},
{
"epoch": 1.6121416526138281,
"grad_norm": 0.2675512533134804,
"learning_rate": 2.5702685821361648e-05,
"loss": 0.3609,
"step": 956
},
{
"epoch": 1.6138279932546373,
"grad_norm": 0.34548920343457473,
"learning_rate": 2.567145534041224e-05,
"loss": 0.3699,
"step": 957
},
{
"epoch": 1.615514333895447,
"grad_norm": 0.2742740366615913,
"learning_rate": 2.5640224859462835e-05,
"loss": 0.3702,
"step": 958
},
{
"epoch": 1.6172006745362564,
"grad_norm": 0.37519505367106626,
"learning_rate": 2.560899437851343e-05,
"loss": 0.3585,
"step": 959
},
{
"epoch": 1.6188870151770658,
"grad_norm": 0.2752354411311124,
"learning_rate": 2.5577763897564022e-05,
"loss": 0.3678,
"step": 960
},
{
"epoch": 1.6205733558178752,
"grad_norm": 0.26159230336481026,
"learning_rate": 2.5546533416614616e-05,
"loss": 0.3481,
"step": 961
},
{
"epoch": 1.6222596964586846,
"grad_norm": 0.29194704056578963,
"learning_rate": 2.551530293566521e-05,
"loss": 0.3677,
"step": 962
},
{
"epoch": 1.6239460370994943,
"grad_norm": 0.2824926837708255,
"learning_rate": 2.5484072454715803e-05,
"loss": 0.3802,
"step": 963
},
{
"epoch": 1.6256323777403034,
"grad_norm": 0.26074933605007267,
"learning_rate": 2.5452841973766396e-05,
"loss": 0.3593,
"step": 964
},
{
"epoch": 1.627318718381113,
"grad_norm": 0.27358390071273747,
"learning_rate": 2.5421611492816987e-05,
"loss": 0.3634,
"step": 965
},
{
"epoch": 1.6290050590219223,
"grad_norm": 0.26051652985808404,
"learning_rate": 2.5390381011867587e-05,
"loss": 0.3786,
"step": 966
},
{
"epoch": 1.630691399662732,
"grad_norm": 0.27351362935955126,
"learning_rate": 2.535915053091818e-05,
"loss": 0.3709,
"step": 967
},
{
"epoch": 1.6323777403035413,
"grad_norm": 0.2730193534490263,
"learning_rate": 2.5327920049968774e-05,
"loss": 0.3523,
"step": 968
},
{
"epoch": 1.6340640809443507,
"grad_norm": 0.2713613178933639,
"learning_rate": 2.5296689569019368e-05,
"loss": 0.3754,
"step": 969
},
{
"epoch": 1.6357504215851602,
"grad_norm": 0.2913211147093276,
"learning_rate": 2.5265459088069958e-05,
"loss": 0.3641,
"step": 970
},
{
"epoch": 1.6374367622259696,
"grad_norm": 0.3176807011629316,
"learning_rate": 2.523422860712055e-05,
"loss": 0.3644,
"step": 971
},
{
"epoch": 1.6391231028667792,
"grad_norm": 0.2719022706726167,
"learning_rate": 2.5202998126171145e-05,
"loss": 0.3456,
"step": 972
},
{
"epoch": 1.6408094435075884,
"grad_norm": 0.28072788443759994,
"learning_rate": 2.517176764522174e-05,
"loss": 0.3569,
"step": 973
},
{
"epoch": 1.642495784148398,
"grad_norm": 0.3402154003990933,
"learning_rate": 2.5140537164272332e-05,
"loss": 0.3595,
"step": 974
},
{
"epoch": 1.6441821247892074,
"grad_norm": 0.29326577548976807,
"learning_rate": 2.5109306683322925e-05,
"loss": 0.3692,
"step": 975
},
{
"epoch": 1.6458684654300169,
"grad_norm": 0.2851064269963646,
"learning_rate": 2.507807620237352e-05,
"loss": 0.3476,
"step": 976
},
{
"epoch": 1.6475548060708263,
"grad_norm": 0.30308588717737617,
"learning_rate": 2.5046845721424113e-05,
"loss": 0.3528,
"step": 977
},
{
"epoch": 1.6492411467116357,
"grad_norm": 0.30446422773851584,
"learning_rate": 2.5015615240474706e-05,
"loss": 0.3791,
"step": 978
},
{
"epoch": 1.6509274873524453,
"grad_norm": 0.31209376681317047,
"learning_rate": 2.4984384759525296e-05,
"loss": 0.3726,
"step": 979
},
{
"epoch": 1.6526138279932545,
"grad_norm": 0.2957213372732873,
"learning_rate": 2.495315427857589e-05,
"loss": 0.3634,
"step": 980
},
{
"epoch": 1.6543001686340641,
"grad_norm": 0.2506278259092563,
"learning_rate": 2.4921923797626483e-05,
"loss": 0.3562,
"step": 981
},
{
"epoch": 1.6559865092748736,
"grad_norm": 0.26991031572885404,
"learning_rate": 2.4890693316677077e-05,
"loss": 0.3684,
"step": 982
},
{
"epoch": 1.657672849915683,
"grad_norm": 0.26827169125638395,
"learning_rate": 2.485946283572767e-05,
"loss": 0.382,
"step": 983
},
{
"epoch": 1.6593591905564924,
"grad_norm": 0.25236793433775045,
"learning_rate": 2.4828232354778264e-05,
"loss": 0.3781,
"step": 984
},
{
"epoch": 1.6610455311973018,
"grad_norm": 0.2597056655240759,
"learning_rate": 2.479700187382886e-05,
"loss": 0.358,
"step": 985
},
{
"epoch": 1.6627318718381114,
"grad_norm": 0.26812083288360683,
"learning_rate": 2.476577139287945e-05,
"loss": 0.3714,
"step": 986
},
{
"epoch": 1.6644182124789206,
"grad_norm": 0.25998888449376045,
"learning_rate": 2.4734540911930045e-05,
"loss": 0.3558,
"step": 987
},
{
"epoch": 1.6661045531197303,
"grad_norm": 0.33214867440974927,
"learning_rate": 2.4703310430980638e-05,
"loss": 0.3663,
"step": 988
},
{
"epoch": 1.6677908937605397,
"grad_norm": 0.2787721464162817,
"learning_rate": 2.4672079950031232e-05,
"loss": 0.3661,
"step": 989
},
{
"epoch": 1.669477234401349,
"grad_norm": 0.29617420303436354,
"learning_rate": 2.4640849469081825e-05,
"loss": 0.3665,
"step": 990
},
{
"epoch": 1.6711635750421585,
"grad_norm": 0.32312615723880084,
"learning_rate": 2.460961898813242e-05,
"loss": 0.3981,
"step": 991
},
{
"epoch": 1.672849915682968,
"grad_norm": 0.24323896852960228,
"learning_rate": 2.4578388507183012e-05,
"loss": 0.3579,
"step": 992
},
{
"epoch": 1.6745362563237776,
"grad_norm": 0.3218908678873722,
"learning_rate": 2.4547158026233606e-05,
"loss": 0.3884,
"step": 993
},
{
"epoch": 1.6762225969645868,
"grad_norm": 0.27355257653083903,
"learning_rate": 2.45159275452842e-05,
"loss": 0.3678,
"step": 994
},
{
"epoch": 1.6779089376053964,
"grad_norm": 0.2548595173625908,
"learning_rate": 2.448469706433479e-05,
"loss": 0.3621,
"step": 995
},
{
"epoch": 1.6795952782462056,
"grad_norm": 0.2671433617750058,
"learning_rate": 2.4453466583385383e-05,
"loss": 0.3793,
"step": 996
},
{
"epoch": 1.6812816188870152,
"grad_norm": 0.26834322594291415,
"learning_rate": 2.4422236102435977e-05,
"loss": 0.3736,
"step": 997
},
{
"epoch": 1.6829679595278246,
"grad_norm": 0.2603248087469291,
"learning_rate": 2.4391005621486574e-05,
"loss": 0.3705,
"step": 998
},
{
"epoch": 1.684654300168634,
"grad_norm": 0.2775103849331025,
"learning_rate": 2.4359775140537167e-05,
"loss": 0.3632,
"step": 999
},
{
"epoch": 1.6863406408094435,
"grad_norm": 0.2758959222329701,
"learning_rate": 2.432854465958776e-05,
"loss": 0.3818,
"step": 1000
},
{
"epoch": 1.6880269814502529,
"grad_norm": 0.273604462240355,
"learning_rate": 2.4297314178638354e-05,
"loss": 0.3631,
"step": 1001
},
{
"epoch": 1.6897133220910625,
"grad_norm": 0.33729218246704507,
"learning_rate": 2.4266083697688945e-05,
"loss": 0.3783,
"step": 1002
},
{
"epoch": 1.6913996627318717,
"grad_norm": 0.2663973993344779,
"learning_rate": 2.4234853216739538e-05,
"loss": 0.3487,
"step": 1003
},
{
"epoch": 1.6930860033726813,
"grad_norm": 0.33574629676201084,
"learning_rate": 2.420362273579013e-05,
"loss": 0.3686,
"step": 1004
},
{
"epoch": 1.6947723440134908,
"grad_norm": 0.2965945796608682,
"learning_rate": 2.4172392254840725e-05,
"loss": 0.3659,
"step": 1005
},
{
"epoch": 1.6964586846543002,
"grad_norm": 0.2832725819779564,
"learning_rate": 2.414116177389132e-05,
"loss": 0.3719,
"step": 1006
},
{
"epoch": 1.6981450252951096,
"grad_norm": 0.2660921300880089,
"learning_rate": 2.4109931292941912e-05,
"loss": 0.3627,
"step": 1007
},
{
"epoch": 1.699831365935919,
"grad_norm": 0.2328821109494059,
"learning_rate": 2.4078700811992506e-05,
"loss": 0.363,
"step": 1008
},
{
"epoch": 1.7015177065767286,
"grad_norm": 0.26673695583540513,
"learning_rate": 2.40474703310431e-05,
"loss": 0.3613,
"step": 1009
},
{
"epoch": 1.7032040472175378,
"grad_norm": 0.2593262439807181,
"learning_rate": 2.4016239850093693e-05,
"loss": 0.3667,
"step": 1010
},
{
"epoch": 1.7048903878583475,
"grad_norm": 0.27590667527210233,
"learning_rate": 2.3985009369144286e-05,
"loss": 0.3699,
"step": 1011
},
{
"epoch": 1.7065767284991569,
"grad_norm": 0.2763140503235593,
"learning_rate": 2.395377888819488e-05,
"loss": 0.371,
"step": 1012
},
{
"epoch": 1.7082630691399663,
"grad_norm": 0.2578085278220634,
"learning_rate": 2.3922548407245474e-05,
"loss": 0.3474,
"step": 1013
},
{
"epoch": 1.7099494097807757,
"grad_norm": 0.2761385598378717,
"learning_rate": 2.3891317926296067e-05,
"loss": 0.3625,
"step": 1014
},
{
"epoch": 1.7116357504215851,
"grad_norm": 0.27597852351892443,
"learning_rate": 2.386008744534666e-05,
"loss": 0.3747,
"step": 1015
},
{
"epoch": 1.7133220910623947,
"grad_norm": 0.2982888554929235,
"learning_rate": 2.3828856964397254e-05,
"loss": 0.3596,
"step": 1016
},
{
"epoch": 1.715008431703204,
"grad_norm": 0.2614376301845927,
"learning_rate": 2.3797626483447848e-05,
"loss": 0.3537,
"step": 1017
},
{
"epoch": 1.7166947723440136,
"grad_norm": 1.3930221257075153,
"learning_rate": 2.3766396002498438e-05,
"loss": 0.4185,
"step": 1018
},
{
"epoch": 1.718381112984823,
"grad_norm": 0.3795613329435081,
"learning_rate": 2.373516552154903e-05,
"loss": 0.3562,
"step": 1019
},
{
"epoch": 1.7200674536256324,
"grad_norm": 0.3056080974956922,
"learning_rate": 2.3703935040599625e-05,
"loss": 0.3649,
"step": 1020
},
{
"epoch": 1.7217537942664418,
"grad_norm": 0.4000237187674837,
"learning_rate": 2.367270455965022e-05,
"loss": 0.3667,
"step": 1021
},
{
"epoch": 1.7234401349072512,
"grad_norm": 0.33463335739266226,
"learning_rate": 2.3641474078700812e-05,
"loss": 0.3564,
"step": 1022
},
{
"epoch": 1.7251264755480609,
"grad_norm": 0.3485535996212191,
"learning_rate": 2.3610243597751406e-05,
"loss": 0.3786,
"step": 1023
},
{
"epoch": 1.72681281618887,
"grad_norm": 0.3987241043836055,
"learning_rate": 2.3579013116802003e-05,
"loss": 0.3687,
"step": 1024
},
{
"epoch": 1.7284991568296797,
"grad_norm": 0.3754341036793299,
"learning_rate": 2.3547782635852593e-05,
"loss": 0.3746,
"step": 1025
},
{
"epoch": 1.7301854974704889,
"grad_norm": 0.29602199276671504,
"learning_rate": 2.3516552154903186e-05,
"loss": 0.376,
"step": 1026
},
{
"epoch": 1.7318718381112985,
"grad_norm": 0.2863523824664361,
"learning_rate": 2.348532167395378e-05,
"loss": 0.366,
"step": 1027
},
{
"epoch": 1.733558178752108,
"grad_norm": 0.3706051157797791,
"learning_rate": 2.3454091193004373e-05,
"loss": 0.3653,
"step": 1028
},
{
"epoch": 1.7352445193929174,
"grad_norm": 0.29526649266114907,
"learning_rate": 2.3422860712054967e-05,
"loss": 0.3693,
"step": 1029
},
{
"epoch": 1.7369308600337268,
"grad_norm": 0.3305499627842152,
"learning_rate": 2.339163023110556e-05,
"loss": 0.3703,
"step": 1030
},
{
"epoch": 1.7386172006745362,
"grad_norm": 0.3222780067528204,
"learning_rate": 2.3360399750156154e-05,
"loss": 0.3448,
"step": 1031
},
{
"epoch": 1.7403035413153458,
"grad_norm": 0.311694614643851,
"learning_rate": 2.3329169269206748e-05,
"loss": 0.3601,
"step": 1032
},
{
"epoch": 1.741989881956155,
"grad_norm": 0.27323544168500447,
"learning_rate": 2.329793878825734e-05,
"loss": 0.3603,
"step": 1033
},
{
"epoch": 1.7436762225969646,
"grad_norm": 0.3555613719712465,
"learning_rate": 2.326670830730793e-05,
"loss": 0.3685,
"step": 1034
},
{
"epoch": 1.745362563237774,
"grad_norm": 0.33155498613569256,
"learning_rate": 2.3235477826358525e-05,
"loss": 0.3545,
"step": 1035
},
{
"epoch": 1.7470489038785835,
"grad_norm": 0.2791464988529287,
"learning_rate": 2.320424734540912e-05,
"loss": 0.3726,
"step": 1036
},
{
"epoch": 1.7487352445193929,
"grad_norm": 0.3422453820402333,
"learning_rate": 2.3173016864459715e-05,
"loss": 0.3531,
"step": 1037
},
{
"epoch": 1.7504215851602023,
"grad_norm": 0.29329487796063386,
"learning_rate": 2.314178638351031e-05,
"loss": 0.3534,
"step": 1038
},
{
"epoch": 1.752107925801012,
"grad_norm": 0.29710324626541224,
"learning_rate": 2.3110555902560902e-05,
"loss": 0.3697,
"step": 1039
},
{
"epoch": 1.7537942664418211,
"grad_norm": 0.2939041175358189,
"learning_rate": 2.3079325421611496e-05,
"loss": 0.3869,
"step": 1040
},
{
"epoch": 1.7554806070826308,
"grad_norm": 0.34204566971638944,
"learning_rate": 2.3048094940662086e-05,
"loss": 0.3587,
"step": 1041
},
{
"epoch": 1.7571669477234402,
"grad_norm": 0.31682768420537266,
"learning_rate": 2.301686445971268e-05,
"loss": 0.353,
"step": 1042
},
{
"epoch": 1.7588532883642496,
"grad_norm": 0.29485949755465213,
"learning_rate": 2.2985633978763273e-05,
"loss": 0.3617,
"step": 1043
},
{
"epoch": 1.760539629005059,
"grad_norm": 0.3475504060072616,
"learning_rate": 2.2954403497813867e-05,
"loss": 0.37,
"step": 1044
},
{
"epoch": 1.7622259696458684,
"grad_norm": 0.2922791776067849,
"learning_rate": 2.292317301686446e-05,
"loss": 0.3627,
"step": 1045
},
{
"epoch": 1.763912310286678,
"grad_norm": 1.1149661206969306,
"learning_rate": 2.2891942535915054e-05,
"loss": 0.39,
"step": 1046
},
{
"epoch": 1.7655986509274872,
"grad_norm": 0.29850221752863587,
"learning_rate": 2.2860712054965647e-05,
"loss": 0.3648,
"step": 1047
},
{
"epoch": 1.7672849915682969,
"grad_norm": 0.28451789499029356,
"learning_rate": 2.282948157401624e-05,
"loss": 0.3665,
"step": 1048
},
{
"epoch": 1.768971332209106,
"grad_norm": 0.279381167685199,
"learning_rate": 2.2798251093066835e-05,
"loss": 0.353,
"step": 1049
},
{
"epoch": 1.7706576728499157,
"grad_norm": 0.3111591163555094,
"learning_rate": 2.2767020612117428e-05,
"loss": 0.3564,
"step": 1050
},
{
"epoch": 1.7723440134907251,
"grad_norm": 0.3282013878697267,
"learning_rate": 2.273579013116802e-05,
"loss": 0.3592,
"step": 1051
},
{
"epoch": 1.7740303541315345,
"grad_norm": 0.2743211155501384,
"learning_rate": 2.2704559650218615e-05,
"loss": 0.3772,
"step": 1052
},
{
"epoch": 1.7757166947723442,
"grad_norm": 0.3068658384779893,
"learning_rate": 2.267332916926921e-05,
"loss": 0.3525,
"step": 1053
},
{
"epoch": 1.7774030354131534,
"grad_norm": 0.3821620228832803,
"learning_rate": 2.2642098688319802e-05,
"loss": 0.3868,
"step": 1054
},
{
"epoch": 1.779089376053963,
"grad_norm": 0.2984635934516873,
"learning_rate": 2.2610868207370396e-05,
"loss": 0.3601,
"step": 1055
},
{
"epoch": 1.7807757166947722,
"grad_norm": 0.30801597653766255,
"learning_rate": 2.257963772642099e-05,
"loss": 0.3714,
"step": 1056
},
{
"epoch": 1.7824620573355818,
"grad_norm": 0.3596227923255594,
"learning_rate": 2.254840724547158e-05,
"loss": 0.3727,
"step": 1057
},
{
"epoch": 1.7841483979763912,
"grad_norm": 0.32033504944868557,
"learning_rate": 2.2517176764522173e-05,
"loss": 0.3731,
"step": 1058
},
{
"epoch": 1.7858347386172007,
"grad_norm": 0.2953834413434869,
"learning_rate": 2.2485946283572767e-05,
"loss": 0.3565,
"step": 1059
},
{
"epoch": 1.78752107925801,
"grad_norm": 0.33100873485129334,
"learning_rate": 2.245471580262336e-05,
"loss": 0.3672,
"step": 1060
},
{
"epoch": 1.7892074198988195,
"grad_norm": 0.28300168443811613,
"learning_rate": 2.2423485321673954e-05,
"loss": 0.3453,
"step": 1061
},
{
"epoch": 1.7908937605396291,
"grad_norm": 0.26796245492445664,
"learning_rate": 2.2392254840724547e-05,
"loss": 0.3421,
"step": 1062
},
{
"epoch": 1.7925801011804383,
"grad_norm": 0.28436317611406275,
"learning_rate": 2.2361024359775144e-05,
"loss": 0.3685,
"step": 1063
},
{
"epoch": 1.794266441821248,
"grad_norm": 0.3044813972327036,
"learning_rate": 2.2329793878825734e-05,
"loss": 0.3761,
"step": 1064
},
{
"epoch": 1.7959527824620574,
"grad_norm": 0.2921388581676714,
"learning_rate": 2.2298563397876328e-05,
"loss": 0.3645,
"step": 1065
},
{
"epoch": 1.7976391231028668,
"grad_norm": 0.29626567597084796,
"learning_rate": 2.226733291692692e-05,
"loss": 0.3482,
"step": 1066
},
{
"epoch": 1.7993254637436762,
"grad_norm": 0.27624975007783753,
"learning_rate": 2.2236102435977515e-05,
"loss": 0.3495,
"step": 1067
},
{
"epoch": 1.8010118043844856,
"grad_norm": 0.28624573915638696,
"learning_rate": 2.220487195502811e-05,
"loss": 0.3599,
"step": 1068
},
{
"epoch": 1.8026981450252952,
"grad_norm": 0.2841114493990201,
"learning_rate": 2.2173641474078702e-05,
"loss": 0.3539,
"step": 1069
},
{
"epoch": 1.8043844856661044,
"grad_norm": 0.31601341314333714,
"learning_rate": 2.2142410993129296e-05,
"loss": 0.3411,
"step": 1070
},
{
"epoch": 1.806070826306914,
"grad_norm": 0.25856361379287507,
"learning_rate": 2.211118051217989e-05,
"loss": 0.3566,
"step": 1071
},
{
"epoch": 1.8077571669477235,
"grad_norm": 0.3116983391184263,
"learning_rate": 2.2079950031230483e-05,
"loss": 0.3679,
"step": 1072
},
{
"epoch": 1.809443507588533,
"grad_norm": 0.32306943398684707,
"learning_rate": 2.2048719550281073e-05,
"loss": 0.3678,
"step": 1073
},
{
"epoch": 1.8111298482293423,
"grad_norm": 0.2925451620866593,
"learning_rate": 2.2017489069331666e-05,
"loss": 0.3672,
"step": 1074
},
{
"epoch": 1.8128161888701517,
"grad_norm": 0.3335180580412719,
"learning_rate": 2.198625858838226e-05,
"loss": 0.3593,
"step": 1075
},
{
"epoch": 1.8145025295109614,
"grad_norm": 0.2895810667226106,
"learning_rate": 2.1955028107432857e-05,
"loss": 0.3521,
"step": 1076
},
{
"epoch": 1.8161888701517706,
"grad_norm": 0.2815963912814418,
"learning_rate": 2.192379762648345e-05,
"loss": 0.3632,
"step": 1077
},
{
"epoch": 1.8178752107925802,
"grad_norm": 0.2868864309832322,
"learning_rate": 2.1892567145534044e-05,
"loss": 0.3489,
"step": 1078
},
{
"epoch": 1.8195615514333894,
"grad_norm": 0.29441658382664976,
"learning_rate": 2.1861336664584638e-05,
"loss": 0.3657,
"step": 1079
},
{
"epoch": 1.821247892074199,
"grad_norm": 0.31297489563548325,
"learning_rate": 2.1830106183635228e-05,
"loss": 0.3504,
"step": 1080
},
{
"epoch": 1.8229342327150084,
"grad_norm": 0.27799837753213885,
"learning_rate": 2.179887570268582e-05,
"loss": 0.3689,
"step": 1081
},
{
"epoch": 1.8246205733558178,
"grad_norm": 0.28878031053711806,
"learning_rate": 2.1767645221736415e-05,
"loss": 0.3571,
"step": 1082
},
{
"epoch": 1.8263069139966275,
"grad_norm": 0.2932352697387466,
"learning_rate": 2.173641474078701e-05,
"loss": 0.3587,
"step": 1083
},
{
"epoch": 1.8279932546374367,
"grad_norm": 0.2685486954287236,
"learning_rate": 2.1705184259837602e-05,
"loss": 0.3835,
"step": 1084
},
{
"epoch": 1.8296795952782463,
"grad_norm": 0.2909854441326301,
"learning_rate": 2.1673953778888196e-05,
"loss": 0.3485,
"step": 1085
},
{
"epoch": 1.8313659359190555,
"grad_norm": 0.29465094304956874,
"learning_rate": 2.164272329793879e-05,
"loss": 0.3682,
"step": 1086
},
{
"epoch": 1.8330522765598651,
"grad_norm": 0.2707187489413735,
"learning_rate": 2.1611492816989383e-05,
"loss": 0.3736,
"step": 1087
},
{
"epoch": 1.8347386172006745,
"grad_norm": 0.30564282864344905,
"learning_rate": 2.1580262336039976e-05,
"loss": 0.3636,
"step": 1088
},
{
"epoch": 1.836424957841484,
"grad_norm": 0.8763246852809086,
"learning_rate": 2.154903185509057e-05,
"loss": 0.4001,
"step": 1089
},
{
"epoch": 1.8381112984822934,
"grad_norm": 0.2671171226002439,
"learning_rate": 2.1517801374141163e-05,
"loss": 0.3456,
"step": 1090
},
{
"epoch": 1.8397976391231028,
"grad_norm": 0.2820484363666512,
"learning_rate": 2.1486570893191757e-05,
"loss": 0.3737,
"step": 1091
},
{
"epoch": 1.8414839797639124,
"grad_norm": 0.2829569572495032,
"learning_rate": 2.145534041224235e-05,
"loss": 0.373,
"step": 1092
},
{
"epoch": 1.8431703204047216,
"grad_norm": 0.2655606395124247,
"learning_rate": 2.1424109931292944e-05,
"loss": 0.3513,
"step": 1093
},
{
"epoch": 1.8448566610455313,
"grad_norm": 0.30954298285785775,
"learning_rate": 2.1392879450343537e-05,
"loss": 0.3697,
"step": 1094
},
{
"epoch": 1.8465430016863407,
"grad_norm": 0.2735055996514486,
"learning_rate": 2.136164896939413e-05,
"loss": 0.343,
"step": 1095
},
{
"epoch": 1.84822934232715,
"grad_norm": 0.2743003208816105,
"learning_rate": 2.133041848844472e-05,
"loss": 0.3511,
"step": 1096
},
{
"epoch": 1.8499156829679595,
"grad_norm": 0.29353957623248444,
"learning_rate": 2.1299188007495315e-05,
"loss": 0.3721,
"step": 1097
},
{
"epoch": 1.851602023608769,
"grad_norm": 0.2908904768338592,
"learning_rate": 2.1267957526545908e-05,
"loss": 0.3653,
"step": 1098
},
{
"epoch": 1.8532883642495785,
"grad_norm": 0.26714340843376727,
"learning_rate": 2.1236727045596502e-05,
"loss": 0.3584,
"step": 1099
},
{
"epoch": 1.8549747048903877,
"grad_norm": 0.33059642792892746,
"learning_rate": 2.1205496564647095e-05,
"loss": 0.382,
"step": 1100
},
{
"epoch": 1.8566610455311974,
"grad_norm": 0.2894330220537914,
"learning_rate": 2.117426608369769e-05,
"loss": 0.3672,
"step": 1101
},
{
"epoch": 1.8583473861720068,
"grad_norm": 0.3197462143084355,
"learning_rate": 2.1143035602748286e-05,
"loss": 0.3898,
"step": 1102
},
{
"epoch": 1.8600337268128162,
"grad_norm": 0.27355447071343325,
"learning_rate": 2.1111805121798876e-05,
"loss": 0.3719,
"step": 1103
},
{
"epoch": 1.8617200674536256,
"grad_norm": 0.30736784785079163,
"learning_rate": 2.108057464084947e-05,
"loss": 0.3761,
"step": 1104
},
{
"epoch": 1.863406408094435,
"grad_norm": 0.260778455212016,
"learning_rate": 2.1049344159900063e-05,
"loss": 0.3547,
"step": 1105
},
{
"epoch": 1.8650927487352447,
"grad_norm": 0.27879425279471226,
"learning_rate": 2.1018113678950657e-05,
"loss": 0.3586,
"step": 1106
},
{
"epoch": 1.8667790893760539,
"grad_norm": 0.25377663369915016,
"learning_rate": 2.098688319800125e-05,
"loss": 0.3592,
"step": 1107
},
{
"epoch": 1.8684654300168635,
"grad_norm": 0.3065184311266967,
"learning_rate": 2.0955652717051844e-05,
"loss": 0.3531,
"step": 1108
},
{
"epoch": 1.8701517706576727,
"grad_norm": 0.27223437514527493,
"learning_rate": 2.0924422236102437e-05,
"loss": 0.3662,
"step": 1109
},
{
"epoch": 1.8718381112984823,
"grad_norm": 0.29624318973193403,
"learning_rate": 2.089319175515303e-05,
"loss": 0.3584,
"step": 1110
},
{
"epoch": 1.8735244519392917,
"grad_norm": 0.25284920706159414,
"learning_rate": 2.0861961274203624e-05,
"loss": 0.3542,
"step": 1111
},
{
"epoch": 1.8752107925801011,
"grad_norm": 0.2740076078757696,
"learning_rate": 2.0830730793254215e-05,
"loss": 0.3511,
"step": 1112
},
{
"epoch": 1.8768971332209108,
"grad_norm": 0.296603469029254,
"learning_rate": 2.0799500312304808e-05,
"loss": 0.3593,
"step": 1113
},
{
"epoch": 1.87858347386172,
"grad_norm": 0.3101924174752339,
"learning_rate": 2.07682698313554e-05,
"loss": 0.3629,
"step": 1114
},
{
"epoch": 1.8802698145025296,
"grad_norm": 0.2698704731333751,
"learning_rate": 2.0737039350406e-05,
"loss": 0.3511,
"step": 1115
},
{
"epoch": 1.8819561551433388,
"grad_norm": 0.2495696479572268,
"learning_rate": 2.0705808869456592e-05,
"loss": 0.3623,
"step": 1116
},
{
"epoch": 1.8836424957841484,
"grad_norm": 0.3025959336361255,
"learning_rate": 2.0674578388507186e-05,
"loss": 0.3632,
"step": 1117
},
{
"epoch": 1.8853288364249579,
"grad_norm": 0.3420981543369168,
"learning_rate": 2.064334790755778e-05,
"loss": 0.3561,
"step": 1118
},
{
"epoch": 1.8870151770657673,
"grad_norm": 0.273069642295159,
"learning_rate": 2.061211742660837e-05,
"loss": 0.3574,
"step": 1119
},
{
"epoch": 1.8887015177065767,
"grad_norm": 0.30195405718159296,
"learning_rate": 2.0580886945658963e-05,
"loss": 0.3565,
"step": 1120
},
{
"epoch": 1.890387858347386,
"grad_norm": 0.3207160807438318,
"learning_rate": 2.0549656464709557e-05,
"loss": 0.3706,
"step": 1121
},
{
"epoch": 1.8920741989881957,
"grad_norm": 0.27432257864150217,
"learning_rate": 2.051842598376015e-05,
"loss": 0.3692,
"step": 1122
},
{
"epoch": 1.893760539629005,
"grad_norm": 0.2803399274639525,
"learning_rate": 2.0487195502810744e-05,
"loss": 0.3774,
"step": 1123
},
{
"epoch": 1.8954468802698146,
"grad_norm": 0.274785479790295,
"learning_rate": 2.0455965021861337e-05,
"loss": 0.3572,
"step": 1124
},
{
"epoch": 1.897133220910624,
"grad_norm": 0.2919292690535732,
"learning_rate": 2.042473454091193e-05,
"loss": 0.3589,
"step": 1125
},
{
"epoch": 1.8988195615514334,
"grad_norm": 0.2493810185630252,
"learning_rate": 2.0393504059962524e-05,
"loss": 0.3666,
"step": 1126
},
{
"epoch": 1.9005059021922428,
"grad_norm": 0.3122422734266926,
"learning_rate": 2.0362273579013118e-05,
"loss": 0.3687,
"step": 1127
},
{
"epoch": 1.9021922428330522,
"grad_norm": 0.2833644627593553,
"learning_rate": 2.033104309806371e-05,
"loss": 0.3721,
"step": 1128
},
{
"epoch": 1.9038785834738619,
"grad_norm": 0.24682430613475206,
"learning_rate": 2.0299812617114305e-05,
"loss": 0.3481,
"step": 1129
},
{
"epoch": 1.905564924114671,
"grad_norm": 0.29373193111963974,
"learning_rate": 2.02685821361649e-05,
"loss": 0.3652,
"step": 1130
},
{
"epoch": 1.9072512647554807,
"grad_norm": 0.2488116465798772,
"learning_rate": 2.0237351655215492e-05,
"loss": 0.348,
"step": 1131
},
{
"epoch": 1.90893760539629,
"grad_norm": 0.29044014614137137,
"learning_rate": 2.0206121174266086e-05,
"loss": 0.3393,
"step": 1132
},
{
"epoch": 1.9106239460370995,
"grad_norm": 0.3110171656934698,
"learning_rate": 2.017489069331668e-05,
"loss": 0.3862,
"step": 1133
},
{
"epoch": 1.912310286677909,
"grad_norm": 0.2580764691582828,
"learning_rate": 2.0143660212367273e-05,
"loss": 0.3638,
"step": 1134
},
{
"epoch": 1.9139966273187183,
"grad_norm": 0.2792732755590291,
"learning_rate": 2.0112429731417863e-05,
"loss": 0.337,
"step": 1135
},
{
"epoch": 1.915682967959528,
"grad_norm": 0.29912970237260444,
"learning_rate": 2.0081199250468456e-05,
"loss": 0.3644,
"step": 1136
},
{
"epoch": 1.9173693086003372,
"grad_norm": 0.25447150765707793,
"learning_rate": 2.004996876951905e-05,
"loss": 0.3513,
"step": 1137
},
{
"epoch": 1.9190556492411468,
"grad_norm": 0.2747588787195848,
"learning_rate": 2.0018738288569643e-05,
"loss": 0.3719,
"step": 1138
},
{
"epoch": 1.920741989881956,
"grad_norm": 0.2540705730030083,
"learning_rate": 1.9987507807620237e-05,
"loss": 0.3551,
"step": 1139
},
{
"epoch": 1.9224283305227656,
"grad_norm": 0.2417593824707377,
"learning_rate": 1.9956277326670834e-05,
"loss": 0.3738,
"step": 1140
},
{
"epoch": 1.924114671163575,
"grad_norm": 0.23181361516056662,
"learning_rate": 1.9925046845721428e-05,
"loss": 0.3472,
"step": 1141
},
{
"epoch": 1.9258010118043845,
"grad_norm": 0.30125286349549063,
"learning_rate": 1.989381636477202e-05,
"loss": 0.3662,
"step": 1142
},
{
"epoch": 1.927487352445194,
"grad_norm": 0.2504368430085182,
"learning_rate": 1.986258588382261e-05,
"loss": 0.3375,
"step": 1143
},
{
"epoch": 1.9291736930860033,
"grad_norm": 0.24925738519535062,
"learning_rate": 1.9831355402873205e-05,
"loss": 0.3752,
"step": 1144
},
{
"epoch": 1.930860033726813,
"grad_norm": 0.2573957501588105,
"learning_rate": 1.98001249219238e-05,
"loss": 0.3623,
"step": 1145
},
{
"epoch": 1.932546374367622,
"grad_norm": 0.25691872404772453,
"learning_rate": 1.9768894440974392e-05,
"loss": 0.3694,
"step": 1146
},
{
"epoch": 1.9342327150084317,
"grad_norm": 0.28696277976052026,
"learning_rate": 1.9737663960024985e-05,
"loss": 0.3537,
"step": 1147
},
{
"epoch": 1.9359190556492412,
"grad_norm": 0.25620992572730106,
"learning_rate": 1.970643347907558e-05,
"loss": 0.3666,
"step": 1148
},
{
"epoch": 1.9376053962900506,
"grad_norm": 0.25427258057854996,
"learning_rate": 1.9675202998126173e-05,
"loss": 0.3456,
"step": 1149
},
{
"epoch": 1.93929173693086,
"grad_norm": 0.2359437378592525,
"learning_rate": 1.9643972517176766e-05,
"loss": 0.3527,
"step": 1150
},
{
"epoch": 1.9409780775716694,
"grad_norm": 0.24853620134111262,
"learning_rate": 1.9612742036227356e-05,
"loss": 0.3539,
"step": 1151
},
{
"epoch": 1.942664418212479,
"grad_norm": 0.24474109975464906,
"learning_rate": 1.958151155527795e-05,
"loss": 0.357,
"step": 1152
},
{
"epoch": 1.9443507588532882,
"grad_norm": 0.25479373707248376,
"learning_rate": 1.9550281074328547e-05,
"loss": 0.3741,
"step": 1153
},
{
"epoch": 1.9460370994940979,
"grad_norm": 0.22999459116070575,
"learning_rate": 1.951905059337914e-05,
"loss": 0.3458,
"step": 1154
},
{
"epoch": 1.9477234401349073,
"grad_norm": 0.2818946765325796,
"learning_rate": 1.9487820112429734e-05,
"loss": 0.3597,
"step": 1155
},
{
"epoch": 1.9494097807757167,
"grad_norm": 0.2520481240185905,
"learning_rate": 1.9456589631480327e-05,
"loss": 0.3608,
"step": 1156
},
{
"epoch": 1.951096121416526,
"grad_norm": 0.2732606635337268,
"learning_rate": 1.942535915053092e-05,
"loss": 0.3732,
"step": 1157
},
{
"epoch": 1.9527824620573355,
"grad_norm": 0.3059595096865597,
"learning_rate": 1.939412866958151e-05,
"loss": 0.3598,
"step": 1158
},
{
"epoch": 1.9544688026981452,
"grad_norm": 0.26107842993336405,
"learning_rate": 1.9362898188632105e-05,
"loss": 0.3595,
"step": 1159
},
{
"epoch": 1.9561551433389543,
"grad_norm": 0.26997037532654433,
"learning_rate": 1.9331667707682698e-05,
"loss": 0.3556,
"step": 1160
},
{
"epoch": 1.957841483979764,
"grad_norm": 0.29676884514372204,
"learning_rate": 1.9300437226733292e-05,
"loss": 0.3637,
"step": 1161
},
{
"epoch": 1.9595278246205734,
"grad_norm": 0.2891548777760464,
"learning_rate": 1.9269206745783885e-05,
"loss": 0.3643,
"step": 1162
},
{
"epoch": 1.9612141652613828,
"grad_norm": 0.3111146178069649,
"learning_rate": 1.923797626483448e-05,
"loss": 0.3469,
"step": 1163
},
{
"epoch": 1.9629005059021922,
"grad_norm": 0.290593840910139,
"learning_rate": 1.9206745783885072e-05,
"loss": 0.3646,
"step": 1164
},
{
"epoch": 1.9645868465430016,
"grad_norm": 0.30031395432300506,
"learning_rate": 1.9175515302935666e-05,
"loss": 0.3709,
"step": 1165
},
{
"epoch": 1.9662731871838113,
"grad_norm": 0.28471179249901485,
"learning_rate": 1.914428482198626e-05,
"loss": 0.382,
"step": 1166
},
{
"epoch": 1.9679595278246205,
"grad_norm": 0.2725996893631143,
"learning_rate": 1.9113054341036853e-05,
"loss": 0.3798,
"step": 1167
},
{
"epoch": 1.96964586846543,
"grad_norm": 0.33070917660443727,
"learning_rate": 1.9081823860087447e-05,
"loss": 0.3612,
"step": 1168
},
{
"epoch": 1.9713322091062393,
"grad_norm": 0.27273598100922003,
"learning_rate": 1.905059337913804e-05,
"loss": 0.3516,
"step": 1169
},
{
"epoch": 1.973018549747049,
"grad_norm": 0.250304283272736,
"learning_rate": 1.9019362898188634e-05,
"loss": 0.3628,
"step": 1170
},
{
"epoch": 1.9747048903878583,
"grad_norm": 0.25447789264283865,
"learning_rate": 1.8988132417239227e-05,
"loss": 0.3427,
"step": 1171
},
{
"epoch": 1.9763912310286678,
"grad_norm": 0.26147916698816104,
"learning_rate": 1.895690193628982e-05,
"loss": 0.3609,
"step": 1172
},
{
"epoch": 1.9780775716694774,
"grad_norm": 0.84319269153612,
"learning_rate": 1.8925671455340414e-05,
"loss": 0.3658,
"step": 1173
},
{
"epoch": 1.9797639123102866,
"grad_norm": 0.24524151384565965,
"learning_rate": 1.8894440974391004e-05,
"loss": 0.3469,
"step": 1174
},
{
"epoch": 1.9814502529510962,
"grad_norm": 0.27630607979162247,
"learning_rate": 1.8863210493441598e-05,
"loss": 0.3799,
"step": 1175
},
{
"epoch": 1.9831365935919054,
"grad_norm": 0.3062796051194247,
"learning_rate": 1.883198001249219e-05,
"loss": 0.3804,
"step": 1176
},
{
"epoch": 1.984822934232715,
"grad_norm": 0.23663568250522882,
"learning_rate": 1.8800749531542785e-05,
"loss": 0.3579,
"step": 1177
},
{
"epoch": 1.9865092748735245,
"grad_norm": 0.2839785085773881,
"learning_rate": 1.876951905059338e-05,
"loss": 0.3728,
"step": 1178
},
{
"epoch": 1.9881956155143339,
"grad_norm": 0.24299912642691168,
"learning_rate": 1.8738288569643976e-05,
"loss": 0.3531,
"step": 1179
},
{
"epoch": 1.9898819561551433,
"grad_norm": 0.2848817170382811,
"learning_rate": 1.870705808869457e-05,
"loss": 0.3639,
"step": 1180
},
{
"epoch": 1.9915682967959527,
"grad_norm": 0.2735374219071693,
"learning_rate": 1.8675827607745163e-05,
"loss": 0.3725,
"step": 1181
},
{
"epoch": 1.9932546374367623,
"grad_norm": 0.26362036054219906,
"learning_rate": 1.8644597126795753e-05,
"loss": 0.3523,
"step": 1182
},
{
"epoch": 1.9949409780775715,
"grad_norm": 0.2781904905926228,
"learning_rate": 1.8613366645846346e-05,
"loss": 0.3632,
"step": 1183
},
{
"epoch": 1.9966273187183812,
"grad_norm": 0.28555430611153043,
"learning_rate": 1.858213616489694e-05,
"loss": 0.3423,
"step": 1184
},
{
"epoch": 1.9983136593591906,
"grad_norm": 0.2925206020361668,
"learning_rate": 1.8550905683947534e-05,
"loss": 0.3602,
"step": 1185
},
{
"epoch": 2.0,
"grad_norm": 0.2906446874370902,
"learning_rate": 1.8519675202998127e-05,
"loss": 0.3512,
"step": 1186
},
{
"epoch": 2.0016863406408096,
"grad_norm": 0.3307751271667645,
"learning_rate": 1.848844472204872e-05,
"loss": 0.2956,
"step": 1187
},
{
"epoch": 2.003372681281619,
"grad_norm": 0.29575692997643666,
"learning_rate": 1.8457214241099314e-05,
"loss": 0.2939,
"step": 1188
},
{
"epoch": 2.0050590219224285,
"grad_norm": 0.369446717074265,
"learning_rate": 1.8425983760149908e-05,
"loss": 0.2965,
"step": 1189
},
{
"epoch": 2.0067453625632377,
"grad_norm": 0.30301462667295276,
"learning_rate": 1.8394753279200498e-05,
"loss": 0.291,
"step": 1190
},
{
"epoch": 2.0084317032040473,
"grad_norm": 0.3316839260298742,
"learning_rate": 1.836352279825109e-05,
"loss": 0.2778,
"step": 1191
},
{
"epoch": 2.0101180438448565,
"grad_norm": 0.3335466364498099,
"learning_rate": 1.833229231730169e-05,
"loss": 0.2955,
"step": 1192
},
{
"epoch": 2.011804384485666,
"grad_norm": 0.28377162554836055,
"learning_rate": 1.8301061836352282e-05,
"loss": 0.281,
"step": 1193
},
{
"epoch": 2.0134907251264758,
"grad_norm": 0.3022140584533395,
"learning_rate": 1.8269831355402875e-05,
"loss": 0.2903,
"step": 1194
},
{
"epoch": 2.015177065767285,
"grad_norm": 0.29880763730980237,
"learning_rate": 1.823860087445347e-05,
"loss": 0.2836,
"step": 1195
},
{
"epoch": 2.0168634064080946,
"grad_norm": 0.2793211888777449,
"learning_rate": 1.8207370393504063e-05,
"loss": 0.2804,
"step": 1196
},
{
"epoch": 2.0185497470489038,
"grad_norm": 0.2791483101159237,
"learning_rate": 1.8176139912554656e-05,
"loss": 0.2851,
"step": 1197
},
{
"epoch": 2.0202360876897134,
"grad_norm": 0.28203865877543277,
"learning_rate": 1.8144909431605246e-05,
"loss": 0.2897,
"step": 1198
},
{
"epoch": 2.0219224283305226,
"grad_norm": 0.30036531151809387,
"learning_rate": 1.811367895065584e-05,
"loss": 0.2816,
"step": 1199
},
{
"epoch": 2.0236087689713322,
"grad_norm": 0.26783802520337824,
"learning_rate": 1.8082448469706433e-05,
"loss": 0.2896,
"step": 1200
},
{
"epoch": 2.0252951096121414,
"grad_norm": 0.2727196129087922,
"learning_rate": 1.8051217988757027e-05,
"loss": 0.2914,
"step": 1201
},
{
"epoch": 2.026981450252951,
"grad_norm": 0.27693857289448,
"learning_rate": 1.801998750780762e-05,
"loss": 0.2905,
"step": 1202
},
{
"epoch": 2.0286677908937607,
"grad_norm": 0.2439937354147468,
"learning_rate": 1.7988757026858214e-05,
"loss": 0.2808,
"step": 1203
},
{
"epoch": 2.03035413153457,
"grad_norm": 0.25874712164329317,
"learning_rate": 1.7957526545908808e-05,
"loss": 0.2942,
"step": 1204
},
{
"epoch": 2.0320404721753795,
"grad_norm": 0.2853597585585357,
"learning_rate": 1.79262960649594e-05,
"loss": 0.2937,
"step": 1205
},
{
"epoch": 2.0337268128161887,
"grad_norm": 0.27253686049864834,
"learning_rate": 1.7895065584009995e-05,
"loss": 0.2873,
"step": 1206
},
{
"epoch": 2.0354131534569984,
"grad_norm": 0.2590500678500842,
"learning_rate": 1.7863835103060588e-05,
"loss": 0.2908,
"step": 1207
},
{
"epoch": 2.0370994940978076,
"grad_norm": 0.23581659556462323,
"learning_rate": 1.7832604622111182e-05,
"loss": 0.2871,
"step": 1208
},
{
"epoch": 2.038785834738617,
"grad_norm": 0.26532251878749463,
"learning_rate": 1.7801374141161775e-05,
"loss": 0.288,
"step": 1209
},
{
"epoch": 2.040472175379427,
"grad_norm": 0.2284343621608884,
"learning_rate": 1.777014366021237e-05,
"loss": 0.284,
"step": 1210
},
{
"epoch": 2.042158516020236,
"grad_norm": 0.27859804430246,
"learning_rate": 1.7738913179262962e-05,
"loss": 0.2824,
"step": 1211
},
{
"epoch": 2.0438448566610457,
"grad_norm": 0.25913110830168806,
"learning_rate": 1.7707682698313556e-05,
"loss": 0.3001,
"step": 1212
},
{
"epoch": 2.045531197301855,
"grad_norm": 0.26958477440693623,
"learning_rate": 1.767645221736415e-05,
"loss": 0.2966,
"step": 1213
},
{
"epoch": 2.0472175379426645,
"grad_norm": 0.2658355243895097,
"learning_rate": 1.764522173641474e-05,
"loss": 0.2803,
"step": 1214
},
{
"epoch": 2.0489038785834737,
"grad_norm": 0.23455281497306474,
"learning_rate": 1.7613991255465333e-05,
"loss": 0.2955,
"step": 1215
},
{
"epoch": 2.0505902192242833,
"grad_norm": 0.23598863217854707,
"learning_rate": 1.7582760774515927e-05,
"loss": 0.2798,
"step": 1216
},
{
"epoch": 2.052276559865093,
"grad_norm": 0.23974716737829616,
"learning_rate": 1.755153029356652e-05,
"loss": 0.291,
"step": 1217
},
{
"epoch": 2.053962900505902,
"grad_norm": 0.25591362950142704,
"learning_rate": 1.7520299812617117e-05,
"loss": 0.2941,
"step": 1218
},
{
"epoch": 2.0556492411467118,
"grad_norm": 0.24126270893781737,
"learning_rate": 1.748906933166771e-05,
"loss": 0.296,
"step": 1219
},
{
"epoch": 2.057335581787521,
"grad_norm": 0.24592681706462488,
"learning_rate": 1.7457838850718304e-05,
"loss": 0.2922,
"step": 1220
},
{
"epoch": 2.0590219224283306,
"grad_norm": 0.2493972413820514,
"learning_rate": 1.7426608369768894e-05,
"loss": 0.2934,
"step": 1221
},
{
"epoch": 2.06070826306914,
"grad_norm": 0.2240178832747126,
"learning_rate": 1.7395377888819488e-05,
"loss": 0.2772,
"step": 1222
},
{
"epoch": 2.0623946037099494,
"grad_norm": 0.24194577851657278,
"learning_rate": 1.736414740787008e-05,
"loss": 0.2917,
"step": 1223
},
{
"epoch": 2.064080944350759,
"grad_norm": 0.250309298786314,
"learning_rate": 1.7332916926920675e-05,
"loss": 0.2945,
"step": 1224
},
{
"epoch": 2.0657672849915683,
"grad_norm": 0.21954443072464697,
"learning_rate": 1.730168644597127e-05,
"loss": 0.2683,
"step": 1225
},
{
"epoch": 2.067453625632378,
"grad_norm": 0.24070761070270455,
"learning_rate": 1.7270455965021862e-05,
"loss": 0.2898,
"step": 1226
},
{
"epoch": 2.069139966273187,
"grad_norm": 0.23195736444889098,
"learning_rate": 1.7239225484072456e-05,
"loss": 0.2743,
"step": 1227
},
{
"epoch": 2.0708263069139967,
"grad_norm": 0.2401183451917892,
"learning_rate": 1.720799500312305e-05,
"loss": 0.2742,
"step": 1228
},
{
"epoch": 2.072512647554806,
"grad_norm": 0.22886720556268084,
"learning_rate": 1.717676452217364e-05,
"loss": 0.2714,
"step": 1229
},
{
"epoch": 2.0741989881956155,
"grad_norm": 0.22005987910735308,
"learning_rate": 1.7145534041224233e-05,
"loss": 0.2799,
"step": 1230
},
{
"epoch": 2.075885328836425,
"grad_norm": 0.23405270800317268,
"learning_rate": 1.711430356027483e-05,
"loss": 0.2763,
"step": 1231
},
{
"epoch": 2.0775716694772344,
"grad_norm": 0.22390802443877075,
"learning_rate": 1.7083073079325424e-05,
"loss": 0.2782,
"step": 1232
},
{
"epoch": 2.079258010118044,
"grad_norm": 0.2233723580929941,
"learning_rate": 1.7051842598376017e-05,
"loss": 0.294,
"step": 1233
},
{
"epoch": 2.080944350758853,
"grad_norm": 0.24116001791573663,
"learning_rate": 1.702061211742661e-05,
"loss": 0.277,
"step": 1234
},
{
"epoch": 2.082630691399663,
"grad_norm": 0.23973427379224568,
"learning_rate": 1.6989381636477204e-05,
"loss": 0.2799,
"step": 1235
},
{
"epoch": 2.084317032040472,
"grad_norm": 0.25867200752208974,
"learning_rate": 1.6958151155527798e-05,
"loss": 0.2887,
"step": 1236
},
{
"epoch": 2.0860033726812817,
"grad_norm": 0.21859934229938208,
"learning_rate": 1.6926920674578388e-05,
"loss": 0.2839,
"step": 1237
},
{
"epoch": 2.087689713322091,
"grad_norm": 0.23071892456929038,
"learning_rate": 1.689569019362898e-05,
"loss": 0.281,
"step": 1238
},
{
"epoch": 2.0893760539629005,
"grad_norm": 0.24681673388720834,
"learning_rate": 1.6864459712679575e-05,
"loss": 0.2891,
"step": 1239
},
{
"epoch": 2.09106239460371,
"grad_norm": 0.22203763739533036,
"learning_rate": 1.683322923173017e-05,
"loss": 0.2835,
"step": 1240
},
{
"epoch": 2.0927487352445193,
"grad_norm": 0.23625461690323366,
"learning_rate": 1.6801998750780762e-05,
"loss": 0.2882,
"step": 1241
},
{
"epoch": 2.094435075885329,
"grad_norm": 0.2495043900664838,
"learning_rate": 1.6770768269831356e-05,
"loss": 0.2866,
"step": 1242
},
{
"epoch": 2.096121416526138,
"grad_norm": 0.24755254920805522,
"learning_rate": 1.673953778888195e-05,
"loss": 0.3068,
"step": 1243
},
{
"epoch": 2.097807757166948,
"grad_norm": 0.2321369024402958,
"learning_rate": 1.6708307307932543e-05,
"loss": 0.2851,
"step": 1244
},
{
"epoch": 2.099494097807757,
"grad_norm": 0.23341761835203848,
"learning_rate": 1.6677076826983136e-05,
"loss": 0.2774,
"step": 1245
},
{
"epoch": 2.1011804384485666,
"grad_norm": 0.228643496609932,
"learning_rate": 1.664584634603373e-05,
"loss": 0.2871,
"step": 1246
},
{
"epoch": 2.1028667790893762,
"grad_norm": 0.24199355513800766,
"learning_rate": 1.6614615865084323e-05,
"loss": 0.2923,
"step": 1247
},
{
"epoch": 2.1045531197301854,
"grad_norm": 0.23947463740759362,
"learning_rate": 1.6583385384134917e-05,
"loss": 0.2841,
"step": 1248
},
{
"epoch": 2.106239460370995,
"grad_norm": 0.2443776609037523,
"learning_rate": 1.655215490318551e-05,
"loss": 0.2863,
"step": 1249
},
{
"epoch": 2.1079258010118043,
"grad_norm": 0.23349457936994056,
"learning_rate": 1.6520924422236104e-05,
"loss": 0.2767,
"step": 1250
},
{
"epoch": 2.109612141652614,
"grad_norm": 0.2204903592956088,
"learning_rate": 1.6489693941286698e-05,
"loss": 0.2741,
"step": 1251
},
{
"epoch": 2.111298482293423,
"grad_norm": 0.24727353017816828,
"learning_rate": 1.645846346033729e-05,
"loss": 0.2812,
"step": 1252
},
{
"epoch": 2.1129848229342327,
"grad_norm": 0.2311319506145901,
"learning_rate": 1.642723297938788e-05,
"loss": 0.2799,
"step": 1253
},
{
"epoch": 2.1146711635750424,
"grad_norm": 0.2368796716322885,
"learning_rate": 1.6396002498438475e-05,
"loss": 0.2872,
"step": 1254
},
{
"epoch": 2.1163575042158516,
"grad_norm": 0.22458704529716217,
"learning_rate": 1.636477201748907e-05,
"loss": 0.2833,
"step": 1255
},
{
"epoch": 2.118043844856661,
"grad_norm": 0.2257545808978752,
"learning_rate": 1.6333541536539662e-05,
"loss": 0.2781,
"step": 1256
},
{
"epoch": 2.1197301854974704,
"grad_norm": 0.217874801326409,
"learning_rate": 1.630231105559026e-05,
"loss": 0.2805,
"step": 1257
},
{
"epoch": 2.12141652613828,
"grad_norm": 0.23071381152166529,
"learning_rate": 1.6271080574640852e-05,
"loss": 0.2798,
"step": 1258
},
{
"epoch": 2.123102866779089,
"grad_norm": 0.21589917060864472,
"learning_rate": 1.6239850093691446e-05,
"loss": 0.287,
"step": 1259
},
{
"epoch": 2.124789207419899,
"grad_norm": 0.22233422998925415,
"learning_rate": 1.6208619612742036e-05,
"loss": 0.2918,
"step": 1260
},
{
"epoch": 2.126475548060708,
"grad_norm": 0.2253005585037101,
"learning_rate": 1.617738913179263e-05,
"loss": 0.2855,
"step": 1261
},
{
"epoch": 2.1281618887015177,
"grad_norm": 0.24525887817635011,
"learning_rate": 1.6146158650843223e-05,
"loss": 0.3026,
"step": 1262
},
{
"epoch": 2.1298482293423273,
"grad_norm": 0.2311288754571125,
"learning_rate": 1.6114928169893817e-05,
"loss": 0.2919,
"step": 1263
},
{
"epoch": 2.1315345699831365,
"grad_norm": 0.25357016877593264,
"learning_rate": 1.608369768894441e-05,
"loss": 0.2793,
"step": 1264
},
{
"epoch": 2.133220910623946,
"grad_norm": 0.24402417920686248,
"learning_rate": 1.6052467207995004e-05,
"loss": 0.2729,
"step": 1265
},
{
"epoch": 2.1349072512647553,
"grad_norm": 0.24043061619632167,
"learning_rate": 1.6021236727045597e-05,
"loss": 0.2812,
"step": 1266
},
{
"epoch": 2.136593591905565,
"grad_norm": 0.2256502846211635,
"learning_rate": 1.599000624609619e-05,
"loss": 0.2785,
"step": 1267
},
{
"epoch": 2.138279932546374,
"grad_norm": 0.20616813865116917,
"learning_rate": 1.5958775765146785e-05,
"loss": 0.27,
"step": 1268
},
{
"epoch": 2.139966273187184,
"grad_norm": 0.2578343792815402,
"learning_rate": 1.5927545284197375e-05,
"loss": 0.2919,
"step": 1269
},
{
"epoch": 2.1416526138279934,
"grad_norm": 0.24824972542043958,
"learning_rate": 1.589631480324797e-05,
"loss": 0.2861,
"step": 1270
},
{
"epoch": 2.1433389544688026,
"grad_norm": 0.24683021472584324,
"learning_rate": 1.5865084322298565e-05,
"loss": 0.2855,
"step": 1271
},
{
"epoch": 2.1450252951096123,
"grad_norm": 0.23061775902190973,
"learning_rate": 1.583385384134916e-05,
"loss": 0.2821,
"step": 1272
},
{
"epoch": 2.1467116357504215,
"grad_norm": 0.25513089040919573,
"learning_rate": 1.5802623360399752e-05,
"loss": 0.277,
"step": 1273
},
{
"epoch": 2.148397976391231,
"grad_norm": 0.24772256510306015,
"learning_rate": 1.5771392879450346e-05,
"loss": 0.2743,
"step": 1274
},
{
"epoch": 2.1500843170320403,
"grad_norm": 0.2377207233659539,
"learning_rate": 1.574016239850094e-05,
"loss": 0.2768,
"step": 1275
},
{
"epoch": 2.15177065767285,
"grad_norm": 0.2130979567502923,
"learning_rate": 1.570893191755153e-05,
"loss": 0.2754,
"step": 1276
},
{
"epoch": 2.1534569983136596,
"grad_norm": 0.262105795530253,
"learning_rate": 1.5677701436602123e-05,
"loss": 0.2814,
"step": 1277
},
{
"epoch": 2.1551433389544687,
"grad_norm": 0.25109311323666383,
"learning_rate": 1.5646470955652717e-05,
"loss": 0.2765,
"step": 1278
},
{
"epoch": 2.1568296795952784,
"grad_norm": 0.22712313240141768,
"learning_rate": 1.561524047470331e-05,
"loss": 0.2848,
"step": 1279
},
{
"epoch": 2.1585160202360876,
"grad_norm": 0.28309648201217064,
"learning_rate": 1.5584009993753904e-05,
"loss": 0.296,
"step": 1280
},
{
"epoch": 2.160202360876897,
"grad_norm": 0.2259925274682846,
"learning_rate": 1.5552779512804497e-05,
"loss": 0.274,
"step": 1281
},
{
"epoch": 2.1618887015177064,
"grad_norm": 0.2576654720712196,
"learning_rate": 1.5521549031855094e-05,
"loss": 0.2802,
"step": 1282
},
{
"epoch": 2.163575042158516,
"grad_norm": 0.24876972465327207,
"learning_rate": 1.5490318550905684e-05,
"loss": 0.2752,
"step": 1283
},
{
"epoch": 2.1652613827993257,
"grad_norm": 0.27479181196325325,
"learning_rate": 1.5459088069956278e-05,
"loss": 0.2995,
"step": 1284
},
{
"epoch": 2.166947723440135,
"grad_norm": 0.2777973590363955,
"learning_rate": 1.542785758900687e-05,
"loss": 0.2905,
"step": 1285
},
{
"epoch": 2.1686340640809445,
"grad_norm": 0.21470741412226269,
"learning_rate": 1.5396627108057465e-05,
"loss": 0.2806,
"step": 1286
},
{
"epoch": 2.1703204047217537,
"grad_norm": 0.25995545738255027,
"learning_rate": 1.536539662710806e-05,
"loss": 0.2783,
"step": 1287
},
{
"epoch": 2.1720067453625633,
"grad_norm": 0.26866950347821744,
"learning_rate": 1.5334166146158652e-05,
"loss": 0.2958,
"step": 1288
},
{
"epoch": 2.1736930860033725,
"grad_norm": 0.22667179523843922,
"learning_rate": 1.5302935665209246e-05,
"loss": 0.2925,
"step": 1289
},
{
"epoch": 2.175379426644182,
"grad_norm": 0.2524094258477164,
"learning_rate": 1.527170518425984e-05,
"loss": 0.2899,
"step": 1290
},
{
"epoch": 2.177065767284992,
"grad_norm": 0.25430523574145514,
"learning_rate": 1.5240474703310431e-05,
"loss": 0.2978,
"step": 1291
},
{
"epoch": 2.178752107925801,
"grad_norm": 0.23658111810584043,
"learning_rate": 1.5209244222361025e-05,
"loss": 0.2861,
"step": 1292
},
{
"epoch": 2.1804384485666106,
"grad_norm": 0.23263458485765506,
"learning_rate": 1.5178013741411618e-05,
"loss": 0.2943,
"step": 1293
},
{
"epoch": 2.18212478920742,
"grad_norm": 0.23038244450450082,
"learning_rate": 1.514678326046221e-05,
"loss": 0.2753,
"step": 1294
},
{
"epoch": 2.1838111298482294,
"grad_norm": 0.26875956119847794,
"learning_rate": 1.5115552779512807e-05,
"loss": 0.2835,
"step": 1295
},
{
"epoch": 2.1854974704890386,
"grad_norm": 0.23121532536402334,
"learning_rate": 1.50843222985634e-05,
"loss": 0.2736,
"step": 1296
},
{
"epoch": 2.1871838111298483,
"grad_norm": 0.21410903606747092,
"learning_rate": 1.5053091817613992e-05,
"loss": 0.2865,
"step": 1297
},
{
"epoch": 2.1888701517706575,
"grad_norm": 0.2332520113659498,
"learning_rate": 1.5021861336664586e-05,
"loss": 0.2737,
"step": 1298
},
{
"epoch": 2.190556492411467,
"grad_norm": 0.24890045008623024,
"learning_rate": 1.499063085571518e-05,
"loss": 0.2747,
"step": 1299
},
{
"epoch": 2.1922428330522767,
"grad_norm": 0.24074106694356046,
"learning_rate": 1.4959400374765773e-05,
"loss": 0.3033,
"step": 1300
},
{
"epoch": 2.193929173693086,
"grad_norm": 0.22496522452819792,
"learning_rate": 1.4928169893816365e-05,
"loss": 0.2698,
"step": 1301
},
{
"epoch": 2.1956155143338956,
"grad_norm": 0.24173115918945443,
"learning_rate": 1.4896939412866958e-05,
"loss": 0.2647,
"step": 1302
},
{
"epoch": 2.1973018549747048,
"grad_norm": 0.24597057276067655,
"learning_rate": 1.4865708931917552e-05,
"loss": 0.2863,
"step": 1303
},
{
"epoch": 2.1989881956155144,
"grad_norm": 0.2170673774515546,
"learning_rate": 1.4834478450968146e-05,
"loss": 0.2671,
"step": 1304
},
{
"epoch": 2.2006745362563236,
"grad_norm": 0.22410705790302243,
"learning_rate": 1.4803247970018737e-05,
"loss": 0.2897,
"step": 1305
},
{
"epoch": 2.2023608768971332,
"grad_norm": 0.2723128596611122,
"learning_rate": 1.4772017489069331e-05,
"loss": 0.2856,
"step": 1306
},
{
"epoch": 2.204047217537943,
"grad_norm": 0.23229362442508056,
"learning_rate": 1.4740787008119924e-05,
"loss": 0.2793,
"step": 1307
},
{
"epoch": 2.205733558178752,
"grad_norm": 0.21690579052268813,
"learning_rate": 1.470955652717052e-05,
"loss": 0.2755,
"step": 1308
},
{
"epoch": 2.2074198988195617,
"grad_norm": 0.24232075794132246,
"learning_rate": 1.4678326046221113e-05,
"loss": 0.2866,
"step": 1309
},
{
"epoch": 2.209106239460371,
"grad_norm": 0.24128736334554524,
"learning_rate": 1.4647095565271707e-05,
"loss": 0.2909,
"step": 1310
},
{
"epoch": 2.2107925801011805,
"grad_norm": 0.22279945820892214,
"learning_rate": 1.46158650843223e-05,
"loss": 0.274,
"step": 1311
},
{
"epoch": 2.2124789207419897,
"grad_norm": 0.22506715307821687,
"learning_rate": 1.4584634603372894e-05,
"loss": 0.2733,
"step": 1312
},
{
"epoch": 2.2141652613827993,
"grad_norm": 0.2671414216519164,
"learning_rate": 1.4553404122423486e-05,
"loss": 0.2773,
"step": 1313
},
{
"epoch": 2.2158516020236085,
"grad_norm": 0.24159986202299671,
"learning_rate": 1.452217364147408e-05,
"loss": 0.2854,
"step": 1314
},
{
"epoch": 2.217537942664418,
"grad_norm": 0.21206993766665885,
"learning_rate": 1.4490943160524673e-05,
"loss": 0.2802,
"step": 1315
},
{
"epoch": 2.219224283305228,
"grad_norm": 0.2609251300600537,
"learning_rate": 1.4459712679575266e-05,
"loss": 0.302,
"step": 1316
},
{
"epoch": 2.220910623946037,
"grad_norm": 0.2613884146443943,
"learning_rate": 1.4428482198625858e-05,
"loss": 0.2858,
"step": 1317
},
{
"epoch": 2.2225969645868466,
"grad_norm": 0.23026770972951804,
"learning_rate": 1.4397251717676452e-05,
"loss": 0.2901,
"step": 1318
},
{
"epoch": 2.224283305227656,
"grad_norm": 0.2644082966922968,
"learning_rate": 1.4366021236727045e-05,
"loss": 0.2912,
"step": 1319
},
{
"epoch": 2.2259696458684655,
"grad_norm": 0.28137455460025673,
"learning_rate": 1.4334790755777639e-05,
"loss": 0.2789,
"step": 1320
},
{
"epoch": 2.2276559865092747,
"grad_norm": 0.2265306145043108,
"learning_rate": 1.4303560274828234e-05,
"loss": 0.29,
"step": 1321
},
{
"epoch": 2.2293423271500843,
"grad_norm": 0.22638647931653802,
"learning_rate": 1.4272329793878828e-05,
"loss": 0.2836,
"step": 1322
},
{
"epoch": 2.231028667790894,
"grad_norm": 0.2584632170534727,
"learning_rate": 1.4241099312929421e-05,
"loss": 0.2882,
"step": 1323
},
{
"epoch": 2.232715008431703,
"grad_norm": 0.24957538711555913,
"learning_rate": 1.4209868831980013e-05,
"loss": 0.2726,
"step": 1324
},
{
"epoch": 2.2344013490725128,
"grad_norm": 0.23246788729694776,
"learning_rate": 1.4178638351030607e-05,
"loss": 0.292,
"step": 1325
},
{
"epoch": 2.236087689713322,
"grad_norm": 0.23914882917315505,
"learning_rate": 1.41474078700812e-05,
"loss": 0.2918,
"step": 1326
},
{
"epoch": 2.2377740303541316,
"grad_norm": 0.2314608760403086,
"learning_rate": 1.4116177389131794e-05,
"loss": 0.2774,
"step": 1327
},
{
"epoch": 2.2394603709949408,
"grad_norm": 0.2329062705306257,
"learning_rate": 1.4084946908182387e-05,
"loss": 0.2709,
"step": 1328
},
{
"epoch": 2.2411467116357504,
"grad_norm": 0.22203184430747988,
"learning_rate": 1.405371642723298e-05,
"loss": 0.278,
"step": 1329
},
{
"epoch": 2.24283305227656,
"grad_norm": 0.2447086816439649,
"learning_rate": 1.4022485946283573e-05,
"loss": 0.2764,
"step": 1330
},
{
"epoch": 2.2445193929173692,
"grad_norm": 0.2591556094088965,
"learning_rate": 1.3991255465334166e-05,
"loss": 0.2801,
"step": 1331
},
{
"epoch": 2.246205733558179,
"grad_norm": 0.23957435468644062,
"learning_rate": 1.396002498438476e-05,
"loss": 0.2714,
"step": 1332
},
{
"epoch": 2.247892074198988,
"grad_norm": 0.2606681194052524,
"learning_rate": 1.3928794503435352e-05,
"loss": 0.2915,
"step": 1333
},
{
"epoch": 2.2495784148397977,
"grad_norm": 0.23947024763981367,
"learning_rate": 1.3897564022485949e-05,
"loss": 0.278,
"step": 1334
},
{
"epoch": 2.251264755480607,
"grad_norm": 0.25887410797765165,
"learning_rate": 1.3866333541536542e-05,
"loss": 0.2883,
"step": 1335
},
{
"epoch": 2.2529510961214165,
"grad_norm": 0.24241816096972651,
"learning_rate": 1.3835103060587134e-05,
"loss": 0.2702,
"step": 1336
},
{
"epoch": 2.254637436762226,
"grad_norm": 0.23752929575738543,
"learning_rate": 1.3803872579637728e-05,
"loss": 0.3026,
"step": 1337
},
{
"epoch": 2.2563237774030354,
"grad_norm": 0.22243933683755185,
"learning_rate": 1.3772642098688321e-05,
"loss": 0.287,
"step": 1338
},
{
"epoch": 2.258010118043845,
"grad_norm": 0.22715493640118753,
"learning_rate": 1.3741411617738915e-05,
"loss": 0.2833,
"step": 1339
},
{
"epoch": 2.259696458684654,
"grad_norm": 0.2301390684777108,
"learning_rate": 1.3710181136789507e-05,
"loss": 0.2735,
"step": 1340
},
{
"epoch": 2.261382799325464,
"grad_norm": 0.20966361714580345,
"learning_rate": 1.36789506558401e-05,
"loss": 0.2751,
"step": 1341
},
{
"epoch": 2.263069139966273,
"grad_norm": 0.24196298157506335,
"learning_rate": 1.3647720174890694e-05,
"loss": 0.2802,
"step": 1342
},
{
"epoch": 2.2647554806070826,
"grad_norm": 0.22273345922078996,
"learning_rate": 1.3616489693941287e-05,
"loss": 0.2718,
"step": 1343
},
{
"epoch": 2.2664418212478923,
"grad_norm": 0.23365684879187829,
"learning_rate": 1.358525921299188e-05,
"loss": 0.2858,
"step": 1344
},
{
"epoch": 2.2681281618887015,
"grad_norm": 0.23424780996142444,
"learning_rate": 1.3554028732042473e-05,
"loss": 0.2658,
"step": 1345
},
{
"epoch": 2.269814502529511,
"grad_norm": 0.25740201871247437,
"learning_rate": 1.3522798251093066e-05,
"loss": 0.2872,
"step": 1346
},
{
"epoch": 2.2715008431703203,
"grad_norm": 0.2397190411229239,
"learning_rate": 1.3491567770143661e-05,
"loss": 0.2882,
"step": 1347
},
{
"epoch": 2.27318718381113,
"grad_norm": 0.2683555480791064,
"learning_rate": 1.3460337289194255e-05,
"loss": 0.2776,
"step": 1348
},
{
"epoch": 2.274873524451939,
"grad_norm": 0.2860216494530701,
"learning_rate": 1.3429106808244848e-05,
"loss": 0.2994,
"step": 1349
},
{
"epoch": 2.2765598650927488,
"grad_norm": 0.23203762775591186,
"learning_rate": 1.3397876327295442e-05,
"loss": 0.2922,
"step": 1350
},
{
"epoch": 2.2782462057335584,
"grad_norm": 0.25415952020742033,
"learning_rate": 1.3366645846346036e-05,
"loss": 0.2916,
"step": 1351
},
{
"epoch": 2.2799325463743676,
"grad_norm": 0.2155270459840728,
"learning_rate": 1.3335415365396627e-05,
"loss": 0.2738,
"step": 1352
},
{
"epoch": 2.2816188870151772,
"grad_norm": 0.21663505940255026,
"learning_rate": 1.3304184884447221e-05,
"loss": 0.2794,
"step": 1353
},
{
"epoch": 2.2833052276559864,
"grad_norm": 0.23570468928328556,
"learning_rate": 1.3272954403497814e-05,
"loss": 0.3047,
"step": 1354
},
{
"epoch": 2.284991568296796,
"grad_norm": 0.22355120229741432,
"learning_rate": 1.3241723922548408e-05,
"loss": 0.2942,
"step": 1355
},
{
"epoch": 2.2866779089376053,
"grad_norm": 0.22909057483377146,
"learning_rate": 1.3210493441599e-05,
"loss": 0.291,
"step": 1356
},
{
"epoch": 2.288364249578415,
"grad_norm": 0.2397792113373173,
"learning_rate": 1.3179262960649593e-05,
"loss": 0.2953,
"step": 1357
},
{
"epoch": 2.2900505902192245,
"grad_norm": 0.22734437317882458,
"learning_rate": 1.3148032479700187e-05,
"loss": 0.2887,
"step": 1358
},
{
"epoch": 2.2917369308600337,
"grad_norm": 0.22738598098791915,
"learning_rate": 1.311680199875078e-05,
"loss": 0.296,
"step": 1359
},
{
"epoch": 2.2934232715008434,
"grad_norm": 0.21592690968927647,
"learning_rate": 1.3085571517801376e-05,
"loss": 0.2842,
"step": 1360
},
{
"epoch": 2.2951096121416525,
"grad_norm": 0.23330931583642653,
"learning_rate": 1.305434103685197e-05,
"loss": 0.2861,
"step": 1361
},
{
"epoch": 2.296795952782462,
"grad_norm": 0.24244296287404996,
"learning_rate": 1.3023110555902563e-05,
"loss": 0.2807,
"step": 1362
},
{
"epoch": 2.2984822934232714,
"grad_norm": 0.2241173805399546,
"learning_rate": 1.2991880074953155e-05,
"loss": 0.2833,
"step": 1363
},
{
"epoch": 2.300168634064081,
"grad_norm": 0.23286405254784961,
"learning_rate": 1.2960649594003748e-05,
"loss": 0.2693,
"step": 1364
},
{
"epoch": 2.30185497470489,
"grad_norm": 0.24770154492130161,
"learning_rate": 1.2929419113054342e-05,
"loss": 0.2718,
"step": 1365
},
{
"epoch": 2.3035413153457,
"grad_norm": 0.2086407055272546,
"learning_rate": 1.2898188632104935e-05,
"loss": 0.2707,
"step": 1366
},
{
"epoch": 2.305227655986509,
"grad_norm": 0.24433979227348487,
"learning_rate": 1.2866958151155529e-05,
"loss": 0.2855,
"step": 1367
},
{
"epoch": 2.3069139966273187,
"grad_norm": 0.25872917443082816,
"learning_rate": 1.283572767020612e-05,
"loss": 0.2847,
"step": 1368
},
{
"epoch": 2.3086003372681283,
"grad_norm": 0.22534225398839433,
"learning_rate": 1.2804497189256714e-05,
"loss": 0.28,
"step": 1369
},
{
"epoch": 2.3102866779089375,
"grad_norm": 0.24181635268820018,
"learning_rate": 1.2773266708307308e-05,
"loss": 0.3136,
"step": 1370
},
{
"epoch": 2.311973018549747,
"grad_norm": 0.2349690814589048,
"learning_rate": 1.2742036227357901e-05,
"loss": 0.2788,
"step": 1371
},
{
"epoch": 2.3136593591905563,
"grad_norm": 0.23437618515026656,
"learning_rate": 1.2710805746408493e-05,
"loss": 0.289,
"step": 1372
},
{
"epoch": 2.315345699831366,
"grad_norm": 0.22857198837900544,
"learning_rate": 1.267957526545909e-05,
"loss": 0.2933,
"step": 1373
},
{
"epoch": 2.317032040472175,
"grad_norm": 0.23374151297523482,
"learning_rate": 1.2648344784509684e-05,
"loss": 0.2921,
"step": 1374
},
{
"epoch": 2.318718381112985,
"grad_norm": 0.2272494901135597,
"learning_rate": 1.2617114303560276e-05,
"loss": 0.2843,
"step": 1375
},
{
"epoch": 2.3204047217537944,
"grad_norm": 0.22957991022868246,
"learning_rate": 1.258588382261087e-05,
"loss": 0.2816,
"step": 1376
},
{
"epoch": 2.3220910623946036,
"grad_norm": 0.22712652918413673,
"learning_rate": 1.2554653341661463e-05,
"loss": 0.2946,
"step": 1377
},
{
"epoch": 2.3237774030354132,
"grad_norm": 0.20391612529685307,
"learning_rate": 1.2523422860712056e-05,
"loss": 0.277,
"step": 1378
},
{
"epoch": 2.3254637436762224,
"grad_norm": 0.2507935893570099,
"learning_rate": 1.2492192379762648e-05,
"loss": 0.2916,
"step": 1379
},
{
"epoch": 2.327150084317032,
"grad_norm": 0.2170341726977326,
"learning_rate": 1.2460961898813242e-05,
"loss": 0.2791,
"step": 1380
},
{
"epoch": 2.3288364249578413,
"grad_norm": 0.21290499300263288,
"learning_rate": 1.2429731417863835e-05,
"loss": 0.3004,
"step": 1381
},
{
"epoch": 2.330522765598651,
"grad_norm": 0.2386289179226274,
"learning_rate": 1.239850093691443e-05,
"loss": 0.2883,
"step": 1382
},
{
"epoch": 2.3322091062394605,
"grad_norm": 0.24106374267540046,
"learning_rate": 1.2367270455965022e-05,
"loss": 0.2976,
"step": 1383
},
{
"epoch": 2.3338954468802697,
"grad_norm": 0.24279979431421234,
"learning_rate": 1.2336039975015616e-05,
"loss": 0.2956,
"step": 1384
},
{
"epoch": 2.3355817875210794,
"grad_norm": 0.2415298045824607,
"learning_rate": 1.230480949406621e-05,
"loss": 0.2971,
"step": 1385
},
{
"epoch": 2.3372681281618886,
"grad_norm": 0.22791130385805178,
"learning_rate": 1.2273579013116803e-05,
"loss": 0.283,
"step": 1386
},
{
"epoch": 2.338954468802698,
"grad_norm": 0.22723222459345518,
"learning_rate": 1.2242348532167395e-05,
"loss": 0.2795,
"step": 1387
},
{
"epoch": 2.3406408094435074,
"grad_norm": 0.24264089151335377,
"learning_rate": 1.2211118051217988e-05,
"loss": 0.2937,
"step": 1388
},
{
"epoch": 2.342327150084317,
"grad_norm": 0.24686234338788215,
"learning_rate": 1.2179887570268584e-05,
"loss": 0.289,
"step": 1389
},
{
"epoch": 2.3440134907251267,
"grad_norm": 0.21992049757462978,
"learning_rate": 1.2148657089319177e-05,
"loss": 0.2986,
"step": 1390
},
{
"epoch": 2.345699831365936,
"grad_norm": 0.19803695791646844,
"learning_rate": 1.2117426608369769e-05,
"loss": 0.2857,
"step": 1391
},
{
"epoch": 2.3473861720067455,
"grad_norm": 0.23900469154795304,
"learning_rate": 1.2086196127420363e-05,
"loss": 0.2819,
"step": 1392
},
{
"epoch": 2.3490725126475547,
"grad_norm": 0.21287079879552834,
"learning_rate": 1.2054965646470956e-05,
"loss": 0.2797,
"step": 1393
},
{
"epoch": 2.3507588532883643,
"grad_norm": 0.2210244546656982,
"learning_rate": 1.202373516552155e-05,
"loss": 0.2932,
"step": 1394
},
{
"epoch": 2.3524451939291735,
"grad_norm": 0.2048304713681441,
"learning_rate": 1.1992504684572143e-05,
"loss": 0.2724,
"step": 1395
},
{
"epoch": 2.354131534569983,
"grad_norm": 0.25839090758710703,
"learning_rate": 1.1961274203622737e-05,
"loss": 0.2905,
"step": 1396
},
{
"epoch": 2.3558178752107928,
"grad_norm": 0.2281410884233869,
"learning_rate": 1.193004372267333e-05,
"loss": 0.2838,
"step": 1397
},
{
"epoch": 2.357504215851602,
"grad_norm": 0.21845257383012132,
"learning_rate": 1.1898813241723924e-05,
"loss": 0.2792,
"step": 1398
},
{
"epoch": 2.3591905564924116,
"grad_norm": 0.2618765844226696,
"learning_rate": 1.1867582760774516e-05,
"loss": 0.2931,
"step": 1399
},
{
"epoch": 2.360876897133221,
"grad_norm": 0.275866954983335,
"learning_rate": 1.183635227982511e-05,
"loss": 0.299,
"step": 1400
},
{
"epoch": 2.3625632377740304,
"grad_norm": 0.2001624858205199,
"learning_rate": 1.1805121798875703e-05,
"loss": 0.2737,
"step": 1401
},
{
"epoch": 2.3642495784148396,
"grad_norm": 0.23084935393173628,
"learning_rate": 1.1773891317926296e-05,
"loss": 0.3037,
"step": 1402
},
{
"epoch": 2.3659359190556493,
"grad_norm": 0.2396914920117963,
"learning_rate": 1.174266083697689e-05,
"loss": 0.304,
"step": 1403
},
{
"epoch": 2.367622259696459,
"grad_norm": 0.22867487444328807,
"learning_rate": 1.1711430356027483e-05,
"loss": 0.2652,
"step": 1404
},
{
"epoch": 2.369308600337268,
"grad_norm": 0.2594652883181782,
"learning_rate": 1.1680199875078077e-05,
"loss": 0.2914,
"step": 1405
},
{
"epoch": 2.3709949409780777,
"grad_norm": 0.22604681948110886,
"learning_rate": 1.164896939412867e-05,
"loss": 0.2762,
"step": 1406
},
{
"epoch": 2.372681281618887,
"grad_norm": 0.24016465263641243,
"learning_rate": 1.1617738913179262e-05,
"loss": 0.2726,
"step": 1407
},
{
"epoch": 2.3743676222596966,
"grad_norm": 0.2815087199536877,
"learning_rate": 1.1586508432229858e-05,
"loss": 0.304,
"step": 1408
},
{
"epoch": 2.3760539629005057,
"grad_norm": 0.22537579065518193,
"learning_rate": 1.1555277951280451e-05,
"loss": 0.2986,
"step": 1409
},
{
"epoch": 2.3777403035413154,
"grad_norm": 0.22030692295872925,
"learning_rate": 1.1524047470331043e-05,
"loss": 0.2846,
"step": 1410
},
{
"epoch": 2.379426644182125,
"grad_norm": 0.2614924581245704,
"learning_rate": 1.1492816989381637e-05,
"loss": 0.2804,
"step": 1411
},
{
"epoch": 2.381112984822934,
"grad_norm": 0.2525035198240204,
"learning_rate": 1.146158650843223e-05,
"loss": 0.2931,
"step": 1412
},
{
"epoch": 2.382799325463744,
"grad_norm": 0.25978609967948285,
"learning_rate": 1.1430356027482824e-05,
"loss": 0.2794,
"step": 1413
},
{
"epoch": 2.384485666104553,
"grad_norm": 0.24317234044800626,
"learning_rate": 1.1399125546533417e-05,
"loss": 0.2933,
"step": 1414
},
{
"epoch": 2.3861720067453627,
"grad_norm": 0.2724691738795029,
"learning_rate": 1.136789506558401e-05,
"loss": 0.2927,
"step": 1415
},
{
"epoch": 2.387858347386172,
"grad_norm": 0.23730128931622077,
"learning_rate": 1.1336664584634604e-05,
"loss": 0.2829,
"step": 1416
},
{
"epoch": 2.3895446880269815,
"grad_norm": 0.25629553877972355,
"learning_rate": 1.1305434103685198e-05,
"loss": 0.3061,
"step": 1417
},
{
"epoch": 2.391231028667791,
"grad_norm": 0.22474868934736805,
"learning_rate": 1.127420362273579e-05,
"loss": 0.2797,
"step": 1418
},
{
"epoch": 2.3929173693086003,
"grad_norm": 0.23721846546754655,
"learning_rate": 1.1242973141786383e-05,
"loss": 0.2796,
"step": 1419
},
{
"epoch": 2.39460370994941,
"grad_norm": 0.2699939342316147,
"learning_rate": 1.1211742660836977e-05,
"loss": 0.2802,
"step": 1420
},
{
"epoch": 2.396290050590219,
"grad_norm": 0.2509867604822763,
"learning_rate": 1.1180512179887572e-05,
"loss": 0.2798,
"step": 1421
},
{
"epoch": 2.397976391231029,
"grad_norm": 0.22384337619313563,
"learning_rate": 1.1149281698938164e-05,
"loss": 0.2812,
"step": 1422
},
{
"epoch": 2.399662731871838,
"grad_norm": 0.25255278624393845,
"learning_rate": 1.1118051217988758e-05,
"loss": 0.289,
"step": 1423
},
{
"epoch": 2.4013490725126476,
"grad_norm": 0.23643509840246096,
"learning_rate": 1.1086820737039351e-05,
"loss": 0.2838,
"step": 1424
},
{
"epoch": 2.403035413153457,
"grad_norm": 0.23374305341737317,
"learning_rate": 1.1055590256089945e-05,
"loss": 0.2862,
"step": 1425
},
{
"epoch": 2.4047217537942664,
"grad_norm": 0.2319544136431847,
"learning_rate": 1.1024359775140536e-05,
"loss": 0.2896,
"step": 1426
},
{
"epoch": 2.4064080944350756,
"grad_norm": 0.24022760550379593,
"learning_rate": 1.099312929419113e-05,
"loss": 0.2902,
"step": 1427
},
{
"epoch": 2.4080944350758853,
"grad_norm": 0.23074596259534727,
"learning_rate": 1.0961898813241725e-05,
"loss": 0.2773,
"step": 1428
},
{
"epoch": 2.409780775716695,
"grad_norm": 0.2676398026066761,
"learning_rate": 1.0930668332292319e-05,
"loss": 0.2986,
"step": 1429
},
{
"epoch": 2.411467116357504,
"grad_norm": 0.2812523688455711,
"learning_rate": 1.089943785134291e-05,
"loss": 0.2879,
"step": 1430
},
{
"epoch": 2.4131534569983137,
"grad_norm": 0.23002037986702084,
"learning_rate": 1.0868207370393504e-05,
"loss": 0.2998,
"step": 1431
},
{
"epoch": 2.414839797639123,
"grad_norm": 0.21862223377159512,
"learning_rate": 1.0836976889444098e-05,
"loss": 0.2771,
"step": 1432
},
{
"epoch": 2.4165261382799326,
"grad_norm": 0.29976235610573854,
"learning_rate": 1.0805746408494691e-05,
"loss": 0.2829,
"step": 1433
},
{
"epoch": 2.4182124789207418,
"grad_norm": 0.21791369459789725,
"learning_rate": 1.0774515927545285e-05,
"loss": 0.2783,
"step": 1434
},
{
"epoch": 2.4198988195615514,
"grad_norm": 0.23823404787623872,
"learning_rate": 1.0743285446595878e-05,
"loss": 0.2882,
"step": 1435
},
{
"epoch": 2.421585160202361,
"grad_norm": 0.2217777512693731,
"learning_rate": 1.0712054965646472e-05,
"loss": 0.2745,
"step": 1436
},
{
"epoch": 2.4232715008431702,
"grad_norm": 0.23152128478395884,
"learning_rate": 1.0680824484697066e-05,
"loss": 0.2792,
"step": 1437
},
{
"epoch": 2.42495784148398,
"grad_norm": 0.2551072037505355,
"learning_rate": 1.0649594003747657e-05,
"loss": 0.2795,
"step": 1438
},
{
"epoch": 2.426644182124789,
"grad_norm": 0.25243262292499147,
"learning_rate": 1.0618363522798251e-05,
"loss": 0.2864,
"step": 1439
},
{
"epoch": 2.4283305227655987,
"grad_norm": 0.2401318261662841,
"learning_rate": 1.0587133041848844e-05,
"loss": 0.2749,
"step": 1440
},
{
"epoch": 2.430016863406408,
"grad_norm": 0.2482249657282476,
"learning_rate": 1.0555902560899438e-05,
"loss": 0.2844,
"step": 1441
},
{
"epoch": 2.4317032040472175,
"grad_norm": 0.23941825406366568,
"learning_rate": 1.0524672079950032e-05,
"loss": 0.2783,
"step": 1442
},
{
"epoch": 2.433389544688027,
"grad_norm": 0.24014673070086756,
"learning_rate": 1.0493441599000625e-05,
"loss": 0.2876,
"step": 1443
},
{
"epoch": 2.4350758853288363,
"grad_norm": 0.21924519731054826,
"learning_rate": 1.0462211118051219e-05,
"loss": 0.2821,
"step": 1444
},
{
"epoch": 2.436762225969646,
"grad_norm": 0.27891282450208976,
"learning_rate": 1.0430980637101812e-05,
"loss": 0.2887,
"step": 1445
},
{
"epoch": 2.438448566610455,
"grad_norm": 0.2344618379707951,
"learning_rate": 1.0399750156152404e-05,
"loss": 0.2841,
"step": 1446
},
{
"epoch": 2.440134907251265,
"grad_norm": 0.21563837277257256,
"learning_rate": 1.0368519675203e-05,
"loss": 0.2766,
"step": 1447
},
{
"epoch": 2.441821247892074,
"grad_norm": 0.22943894425867237,
"learning_rate": 1.0337289194253593e-05,
"loss": 0.2845,
"step": 1448
},
{
"epoch": 2.4435075885328836,
"grad_norm": 0.20797197636256856,
"learning_rate": 1.0306058713304185e-05,
"loss": 0.2725,
"step": 1449
},
{
"epoch": 2.4451939291736933,
"grad_norm": 0.24047436221214155,
"learning_rate": 1.0274828232354778e-05,
"loss": 0.2794,
"step": 1450
},
{
"epoch": 2.4468802698145025,
"grad_norm": 0.23419204597117435,
"learning_rate": 1.0243597751405372e-05,
"loss": 0.2906,
"step": 1451
},
{
"epoch": 2.448566610455312,
"grad_norm": 0.23568491888175902,
"learning_rate": 1.0212367270455965e-05,
"loss": 0.2642,
"step": 1452
},
{
"epoch": 2.4502529510961213,
"grad_norm": 0.220781160618401,
"learning_rate": 1.0181136789506559e-05,
"loss": 0.2955,
"step": 1453
},
{
"epoch": 2.451939291736931,
"grad_norm": 0.23979617613977305,
"learning_rate": 1.0149906308557152e-05,
"loss": 0.2782,
"step": 1454
},
{
"epoch": 2.45362563237774,
"grad_norm": 0.23607145873020913,
"learning_rate": 1.0118675827607746e-05,
"loss": 0.2664,
"step": 1455
},
{
"epoch": 2.4553119730185498,
"grad_norm": 0.22391814814744884,
"learning_rate": 1.008744534665834e-05,
"loss": 0.2691,
"step": 1456
},
{
"epoch": 2.4569983136593594,
"grad_norm": 0.25227619396729256,
"learning_rate": 1.0056214865708931e-05,
"loss": 0.283,
"step": 1457
},
{
"epoch": 2.4586846543001686,
"grad_norm": 0.23572480074334606,
"learning_rate": 1.0024984384759525e-05,
"loss": 0.2836,
"step": 1458
},
{
"epoch": 2.460370994940978,
"grad_norm": 0.2338220636711118,
"learning_rate": 9.993753903810119e-06,
"loss": 0.2899,
"step": 1459
},
{
"epoch": 2.4620573355817874,
"grad_norm": 0.23295208691141953,
"learning_rate": 9.962523422860714e-06,
"loss": 0.2945,
"step": 1460
},
{
"epoch": 2.463743676222597,
"grad_norm": 0.21976085069131715,
"learning_rate": 9.931292941911306e-06,
"loss": 0.2848,
"step": 1461
},
{
"epoch": 2.4654300168634062,
"grad_norm": 0.2222298647571462,
"learning_rate": 9.9000624609619e-06,
"loss": 0.2769,
"step": 1462
},
{
"epoch": 2.467116357504216,
"grad_norm": 0.2559508939465625,
"learning_rate": 9.868831980012493e-06,
"loss": 0.2937,
"step": 1463
},
{
"epoch": 2.4688026981450255,
"grad_norm": 0.24584859890488658,
"learning_rate": 9.837601499063086e-06,
"loss": 0.279,
"step": 1464
},
{
"epoch": 2.4704890387858347,
"grad_norm": 0.21859759501061693,
"learning_rate": 9.806371018113678e-06,
"loss": 0.2747,
"step": 1465
},
{
"epoch": 2.4721753794266443,
"grad_norm": 0.26054047978030376,
"learning_rate": 9.775140537164273e-06,
"loss": 0.2696,
"step": 1466
},
{
"epoch": 2.4738617200674535,
"grad_norm": 0.22573529466661918,
"learning_rate": 9.743910056214867e-06,
"loss": 0.2815,
"step": 1467
},
{
"epoch": 2.475548060708263,
"grad_norm": 0.22375838955077348,
"learning_rate": 9.71267957526546e-06,
"loss": 0.2776,
"step": 1468
},
{
"epoch": 2.4772344013490724,
"grad_norm": 0.22425707199526068,
"learning_rate": 9.681449094316052e-06,
"loss": 0.3007,
"step": 1469
},
{
"epoch": 2.478920741989882,
"grad_norm": 0.2372375809699635,
"learning_rate": 9.650218613366646e-06,
"loss": 0.2896,
"step": 1470
},
{
"epoch": 2.4806070826306916,
"grad_norm": 0.2209793719324253,
"learning_rate": 9.61898813241724e-06,
"loss": 0.271,
"step": 1471
},
{
"epoch": 2.482293423271501,
"grad_norm": 0.272178984484663,
"learning_rate": 9.587757651467833e-06,
"loss": 0.3081,
"step": 1472
},
{
"epoch": 2.4839797639123105,
"grad_norm": 0.21530565491503215,
"learning_rate": 9.556527170518427e-06,
"loss": 0.2682,
"step": 1473
},
{
"epoch": 2.4856661045531196,
"grad_norm": 0.24950371032482432,
"learning_rate": 9.52529668956902e-06,
"loss": 0.2631,
"step": 1474
},
{
"epoch": 2.4873524451939293,
"grad_norm": 0.2379116998099416,
"learning_rate": 9.494066208619614e-06,
"loss": 0.2853,
"step": 1475
},
{
"epoch": 2.4890387858347385,
"grad_norm": 0.21081103689169245,
"learning_rate": 9.462835727670207e-06,
"loss": 0.2698,
"step": 1476
},
{
"epoch": 2.490725126475548,
"grad_norm": 0.22323696045894834,
"learning_rate": 9.431605246720799e-06,
"loss": 0.2726,
"step": 1477
},
{
"epoch": 2.4924114671163577,
"grad_norm": 0.21955495173287082,
"learning_rate": 9.400374765771393e-06,
"loss": 0.2889,
"step": 1478
},
{
"epoch": 2.494097807757167,
"grad_norm": 0.23023530655803884,
"learning_rate": 9.369144284821988e-06,
"loss": 0.2911,
"step": 1479
},
{
"epoch": 2.4957841483979766,
"grad_norm": 0.25141930925593015,
"learning_rate": 9.337913803872581e-06,
"loss": 0.2859,
"step": 1480
},
{
"epoch": 2.4974704890387858,
"grad_norm": 0.20504591024273977,
"learning_rate": 9.306683322923173e-06,
"loss": 0.2837,
"step": 1481
},
{
"epoch": 2.4991568296795954,
"grad_norm": 0.21175138808057636,
"learning_rate": 9.275452841973767e-06,
"loss": 0.2842,
"step": 1482
},
{
"epoch": 2.5008431703204046,
"grad_norm": 0.25946223776656135,
"learning_rate": 9.24422236102436e-06,
"loss": 0.2818,
"step": 1483
},
{
"epoch": 2.5025295109612142,
"grad_norm": 0.24555783079918103,
"learning_rate": 9.212991880074954e-06,
"loss": 0.2775,
"step": 1484
},
{
"epoch": 2.504215851602024,
"grad_norm": 0.2509789172967366,
"learning_rate": 9.181761399125546e-06,
"loss": 0.2847,
"step": 1485
},
{
"epoch": 2.505902192242833,
"grad_norm": 0.2219211199136465,
"learning_rate": 9.150530918176141e-06,
"loss": 0.275,
"step": 1486
},
{
"epoch": 2.5075885328836423,
"grad_norm": 0.2363255237521708,
"learning_rate": 9.119300437226734e-06,
"loss": 0.2836,
"step": 1487
},
{
"epoch": 2.509274873524452,
"grad_norm": 0.22035522503306615,
"learning_rate": 9.088069956277328e-06,
"loss": 0.2711,
"step": 1488
},
{
"epoch": 2.5109612141652615,
"grad_norm": 0.2363162187055854,
"learning_rate": 9.05683947532792e-06,
"loss": 0.2883,
"step": 1489
},
{
"epoch": 2.5126475548060707,
"grad_norm": 0.22924926722663613,
"learning_rate": 9.025608994378513e-06,
"loss": 0.2746,
"step": 1490
},
{
"epoch": 2.5143338954468804,
"grad_norm": 0.21468666781090634,
"learning_rate": 8.994378513429107e-06,
"loss": 0.2732,
"step": 1491
},
{
"epoch": 2.51602023608769,
"grad_norm": 0.22910744845776243,
"learning_rate": 8.9631480324797e-06,
"loss": 0.2888,
"step": 1492
},
{
"epoch": 2.517706576728499,
"grad_norm": 0.22362122867425577,
"learning_rate": 8.931917551530294e-06,
"loss": 0.2822,
"step": 1493
},
{
"epoch": 2.5193929173693084,
"grad_norm": 0.21554412005349557,
"learning_rate": 8.900687070580888e-06,
"loss": 0.2767,
"step": 1494
},
{
"epoch": 2.521079258010118,
"grad_norm": 0.2393134291816071,
"learning_rate": 8.869456589631481e-06,
"loss": 0.2859,
"step": 1495
},
{
"epoch": 2.5227655986509276,
"grad_norm": 0.2089083058882561,
"learning_rate": 8.838226108682075e-06,
"loss": 0.2737,
"step": 1496
},
{
"epoch": 2.524451939291737,
"grad_norm": 0.2241838198671693,
"learning_rate": 8.806995627732667e-06,
"loss": 0.2812,
"step": 1497
},
{
"epoch": 2.5261382799325465,
"grad_norm": 0.2161714545573455,
"learning_rate": 8.77576514678326e-06,
"loss": 0.299,
"step": 1498
},
{
"epoch": 2.5278246205733557,
"grad_norm": 0.22817866005045112,
"learning_rate": 8.744534665833855e-06,
"loss": 0.2816,
"step": 1499
},
{
"epoch": 2.5295109612141653,
"grad_norm": 0.2286712930717227,
"learning_rate": 8.713304184884447e-06,
"loss": 0.2929,
"step": 1500
},
{
"epoch": 2.5311973018549745,
"grad_norm": 0.19790624556774256,
"learning_rate": 8.68207370393504e-06,
"loss": 0.2768,
"step": 1501
},
{
"epoch": 2.532883642495784,
"grad_norm": 0.20099031176210763,
"learning_rate": 8.650843222985634e-06,
"loss": 0.2865,
"step": 1502
},
{
"epoch": 2.5345699831365938,
"grad_norm": 0.2283654562744821,
"learning_rate": 8.619612742036228e-06,
"loss": 0.2828,
"step": 1503
},
{
"epoch": 2.536256323777403,
"grad_norm": 0.20017848416510967,
"learning_rate": 8.58838226108682e-06,
"loss": 0.2858,
"step": 1504
},
{
"epoch": 2.5379426644182126,
"grad_norm": 0.20782530100569868,
"learning_rate": 8.557151780137415e-06,
"loss": 0.2762,
"step": 1505
},
{
"epoch": 2.539629005059022,
"grad_norm": 0.21768734755599212,
"learning_rate": 8.525921299188009e-06,
"loss": 0.2769,
"step": 1506
},
{
"epoch": 2.5413153456998314,
"grad_norm": 0.20221135790652545,
"learning_rate": 8.494690818238602e-06,
"loss": 0.2764,
"step": 1507
},
{
"epoch": 2.5430016863406406,
"grad_norm": 0.223661558640449,
"learning_rate": 8.463460337289194e-06,
"loss": 0.292,
"step": 1508
},
{
"epoch": 2.5446880269814502,
"grad_norm": 0.21107262757905626,
"learning_rate": 8.432229856339787e-06,
"loss": 0.2904,
"step": 1509
},
{
"epoch": 2.54637436762226,
"grad_norm": 0.21086031494965404,
"learning_rate": 8.400999375390381e-06,
"loss": 0.2843,
"step": 1510
},
{
"epoch": 2.548060708263069,
"grad_norm": 0.20215401518486265,
"learning_rate": 8.369768894440975e-06,
"loss": 0.2814,
"step": 1511
},
{
"epoch": 2.5497470489038787,
"grad_norm": 0.2322780280605853,
"learning_rate": 8.338538413491568e-06,
"loss": 0.3128,
"step": 1512
},
{
"epoch": 2.551433389544688,
"grad_norm": 0.21501337510584598,
"learning_rate": 8.307307932542162e-06,
"loss": 0.2956,
"step": 1513
},
{
"epoch": 2.5531197301854975,
"grad_norm": 0.23922833409027972,
"learning_rate": 8.276077451592755e-06,
"loss": 0.299,
"step": 1514
},
{
"epoch": 2.5548060708263067,
"grad_norm": 0.21327700733730867,
"learning_rate": 8.244846970643349e-06,
"loss": 0.2805,
"step": 1515
},
{
"epoch": 2.5564924114671164,
"grad_norm": 0.2177529625977393,
"learning_rate": 8.21361648969394e-06,
"loss": 0.2954,
"step": 1516
},
{
"epoch": 2.558178752107926,
"grad_norm": 0.2174531141885469,
"learning_rate": 8.182386008744534e-06,
"loss": 0.2812,
"step": 1517
},
{
"epoch": 2.559865092748735,
"grad_norm": 0.24011443306706076,
"learning_rate": 8.15115552779513e-06,
"loss": 0.2847,
"step": 1518
},
{
"epoch": 2.561551433389545,
"grad_norm": 0.2211842789290804,
"learning_rate": 8.119925046845723e-06,
"loss": 0.2968,
"step": 1519
},
{
"epoch": 2.563237774030354,
"grad_norm": 0.2027166936230016,
"learning_rate": 8.088694565896315e-06,
"loss": 0.2901,
"step": 1520
},
{
"epoch": 2.5649241146711637,
"grad_norm": 0.23143090522204798,
"learning_rate": 8.057464084946908e-06,
"loss": 0.3004,
"step": 1521
},
{
"epoch": 2.566610455311973,
"grad_norm": 0.21405158236348512,
"learning_rate": 8.026233603997502e-06,
"loss": 0.2771,
"step": 1522
},
{
"epoch": 2.5682967959527825,
"grad_norm": 0.25973051681141873,
"learning_rate": 7.995003123048095e-06,
"loss": 0.2773,
"step": 1523
},
{
"epoch": 2.569983136593592,
"grad_norm": 0.23015760509575306,
"learning_rate": 7.963772642098687e-06,
"loss": 0.2954,
"step": 1524
},
{
"epoch": 2.5716694772344013,
"grad_norm": 0.20994696566124324,
"learning_rate": 7.932542161149283e-06,
"loss": 0.2817,
"step": 1525
},
{
"epoch": 2.573355817875211,
"grad_norm": 0.254010145129785,
"learning_rate": 7.901311680199876e-06,
"loss": 0.2955,
"step": 1526
},
{
"epoch": 2.57504215851602,
"grad_norm": 0.24545591443052275,
"learning_rate": 7.87008119925047e-06,
"loss": 0.2803,
"step": 1527
},
{
"epoch": 2.5767284991568298,
"grad_norm": 0.21517156771568222,
"learning_rate": 7.838850718301062e-06,
"loss": 0.2876,
"step": 1528
},
{
"epoch": 2.578414839797639,
"grad_norm": 0.22365576198174414,
"learning_rate": 7.807620237351655e-06,
"loss": 0.2877,
"step": 1529
},
{
"epoch": 2.5801011804384486,
"grad_norm": 0.21885168775468924,
"learning_rate": 7.776389756402249e-06,
"loss": 0.2779,
"step": 1530
},
{
"epoch": 2.5817875210792582,
"grad_norm": 0.21740253104686508,
"learning_rate": 7.745159275452842e-06,
"loss": 0.2813,
"step": 1531
},
{
"epoch": 2.5834738617200674,
"grad_norm": 0.20214237553711495,
"learning_rate": 7.713928794503436e-06,
"loss": 0.2882,
"step": 1532
},
{
"epoch": 2.5851602023608766,
"grad_norm": 0.23227209112665623,
"learning_rate": 7.68269831355403e-06,
"loss": 0.309,
"step": 1533
},
{
"epoch": 2.5868465430016863,
"grad_norm": 0.21669129119869118,
"learning_rate": 7.651467832604623e-06,
"loss": 0.2893,
"step": 1534
},
{
"epoch": 2.588532883642496,
"grad_norm": 0.20972553341344566,
"learning_rate": 7.6202373516552155e-06,
"loss": 0.2829,
"step": 1535
},
{
"epoch": 2.590219224283305,
"grad_norm": 0.21727304295862815,
"learning_rate": 7.589006870705809e-06,
"loss": 0.27,
"step": 1536
},
{
"epoch": 2.5919055649241147,
"grad_norm": 0.20374504478680835,
"learning_rate": 7.5577763897564035e-06,
"loss": 0.2908,
"step": 1537
},
{
"epoch": 2.5935919055649244,
"grad_norm": 0.2066348649118724,
"learning_rate": 7.526545908806996e-06,
"loss": 0.2761,
"step": 1538
},
{
"epoch": 2.5952782462057336,
"grad_norm": 0.20726056661289594,
"learning_rate": 7.49531542785759e-06,
"loss": 0.2796,
"step": 1539
},
{
"epoch": 2.5969645868465427,
"grad_norm": 0.24056296732359322,
"learning_rate": 7.4640849469081824e-06,
"loss": 0.2934,
"step": 1540
},
{
"epoch": 2.5986509274873524,
"grad_norm": 0.20786597164807977,
"learning_rate": 7.432854465958776e-06,
"loss": 0.2743,
"step": 1541
},
{
"epoch": 2.600337268128162,
"grad_norm": 0.2264554556337058,
"learning_rate": 7.401623985009369e-06,
"loss": 0.2799,
"step": 1542
},
{
"epoch": 2.602023608768971,
"grad_norm": 0.20964648985122444,
"learning_rate": 7.370393504059962e-06,
"loss": 0.2738,
"step": 1543
},
{
"epoch": 2.603709949409781,
"grad_norm": 0.2288453161556054,
"learning_rate": 7.339163023110557e-06,
"loss": 0.2847,
"step": 1544
},
{
"epoch": 2.6053962900505905,
"grad_norm": 0.21159917871195397,
"learning_rate": 7.30793254216115e-06,
"loss": 0.2812,
"step": 1545
},
{
"epoch": 2.6070826306913997,
"grad_norm": 0.2069094857828839,
"learning_rate": 7.276702061211743e-06,
"loss": 0.2931,
"step": 1546
},
{
"epoch": 2.608768971332209,
"grad_norm": 0.21060482294692173,
"learning_rate": 7.2454715802623364e-06,
"loss": 0.2731,
"step": 1547
},
{
"epoch": 2.6104553119730185,
"grad_norm": 0.25613020925535795,
"learning_rate": 7.214241099312929e-06,
"loss": 0.2943,
"step": 1548
},
{
"epoch": 2.612141652613828,
"grad_norm": 0.24576894427795132,
"learning_rate": 7.183010618363523e-06,
"loss": 0.2913,
"step": 1549
},
{
"epoch": 2.6138279932546373,
"grad_norm": 0.21700719337456373,
"learning_rate": 7.151780137414117e-06,
"loss": 0.2944,
"step": 1550
},
{
"epoch": 2.615514333895447,
"grad_norm": 0.18957741799967104,
"learning_rate": 7.120549656464711e-06,
"loss": 0.2724,
"step": 1551
},
{
"epoch": 2.6172006745362566,
"grad_norm": 0.2110565071188753,
"learning_rate": 7.089319175515303e-06,
"loss": 0.285,
"step": 1552
},
{
"epoch": 2.618887015177066,
"grad_norm": 0.23081971460825607,
"learning_rate": 7.058088694565897e-06,
"loss": 0.2837,
"step": 1553
},
{
"epoch": 2.620573355817875,
"grad_norm": 0.2570191985673241,
"learning_rate": 7.02685821361649e-06,
"loss": 0.2861,
"step": 1554
},
{
"epoch": 2.6222596964586846,
"grad_norm": 0.22950739326313954,
"learning_rate": 6.995627732667083e-06,
"loss": 0.2868,
"step": 1555
},
{
"epoch": 2.6239460370994943,
"grad_norm": 0.21988490556961662,
"learning_rate": 6.964397251717676e-06,
"loss": 0.3013,
"step": 1556
},
{
"epoch": 2.6256323777403034,
"grad_norm": 0.21632940269907694,
"learning_rate": 6.933166770768271e-06,
"loss": 0.2655,
"step": 1557
},
{
"epoch": 2.627318718381113,
"grad_norm": 0.23015336720772667,
"learning_rate": 6.901936289818864e-06,
"loss": 0.2783,
"step": 1558
},
{
"epoch": 2.6290050590219223,
"grad_norm": 0.23068020817280607,
"learning_rate": 6.870705808869457e-06,
"loss": 0.2933,
"step": 1559
},
{
"epoch": 2.630691399662732,
"grad_norm": 0.22117581616378715,
"learning_rate": 6.83947532792005e-06,
"loss": 0.2891,
"step": 1560
},
{
"epoch": 2.632377740303541,
"grad_norm": 0.19747851535133804,
"learning_rate": 6.8082448469706436e-06,
"loss": 0.2675,
"step": 1561
},
{
"epoch": 2.6340640809443507,
"grad_norm": 0.23789835021633737,
"learning_rate": 6.777014366021236e-06,
"loss": 0.2628,
"step": 1562
},
{
"epoch": 2.6357504215851604,
"grad_norm": 0.24583892966645363,
"learning_rate": 6.745783885071831e-06,
"loss": 0.2888,
"step": 1563
},
{
"epoch": 2.6374367622259696,
"grad_norm": 0.23938417280190757,
"learning_rate": 6.714553404122424e-06,
"loss": 0.2979,
"step": 1564
},
{
"epoch": 2.639123102866779,
"grad_norm": 0.21557522746413682,
"learning_rate": 6.683322923173018e-06,
"loss": 0.2879,
"step": 1565
},
{
"epoch": 2.6408094435075884,
"grad_norm": 0.2237697418435905,
"learning_rate": 6.6520924422236105e-06,
"loss": 0.282,
"step": 1566
},
{
"epoch": 2.642495784148398,
"grad_norm": 0.2502379166017792,
"learning_rate": 6.620861961274204e-06,
"loss": 0.2756,
"step": 1567
},
{
"epoch": 2.6441821247892072,
"grad_norm": 0.22915242171625275,
"learning_rate": 6.589631480324797e-06,
"loss": 0.2903,
"step": 1568
},
{
"epoch": 2.645868465430017,
"grad_norm": 0.21731400403245665,
"learning_rate": 6.55840099937539e-06,
"loss": 0.2805,
"step": 1569
},
{
"epoch": 2.6475548060708265,
"grad_norm": 0.21670425873688098,
"learning_rate": 6.527170518425985e-06,
"loss": 0.2753,
"step": 1570
},
{
"epoch": 2.6492411467116357,
"grad_norm": 0.2451135117586469,
"learning_rate": 6.495940037476577e-06,
"loss": 0.2703,
"step": 1571
},
{
"epoch": 2.6509274873524453,
"grad_norm": 0.24833441155424285,
"learning_rate": 6.464709556527171e-06,
"loss": 0.2837,
"step": 1572
},
{
"epoch": 2.6526138279932545,
"grad_norm": 0.21463588266906852,
"learning_rate": 6.4334790755777645e-06,
"loss": 0.2874,
"step": 1573
},
{
"epoch": 2.654300168634064,
"grad_norm": 0.214244480177014,
"learning_rate": 6.402248594628357e-06,
"loss": 0.2833,
"step": 1574
},
{
"epoch": 2.6559865092748733,
"grad_norm": 0.20655998209421197,
"learning_rate": 6.371018113678951e-06,
"loss": 0.2763,
"step": 1575
},
{
"epoch": 2.657672849915683,
"grad_norm": 0.2115837356367195,
"learning_rate": 6.339787632729545e-06,
"loss": 0.2984,
"step": 1576
},
{
"epoch": 2.6593591905564926,
"grad_norm": 0.21949993697455955,
"learning_rate": 6.308557151780138e-06,
"loss": 0.31,
"step": 1577
},
{
"epoch": 2.661045531197302,
"grad_norm": 0.21648433072948026,
"learning_rate": 6.277326670830731e-06,
"loss": 0.2699,
"step": 1578
},
{
"epoch": 2.6627318718381114,
"grad_norm": 0.2086925321556406,
"learning_rate": 6.246096189881324e-06,
"loss": 0.2871,
"step": 1579
},
{
"epoch": 2.6644182124789206,
"grad_norm": 0.21713294641125344,
"learning_rate": 6.214865708931918e-06,
"loss": 0.2755,
"step": 1580
},
{
"epoch": 2.6661045531197303,
"grad_norm": 0.2122874249893408,
"learning_rate": 6.183635227982511e-06,
"loss": 0.2784,
"step": 1581
},
{
"epoch": 2.6677908937605395,
"grad_norm": 0.21600483492813047,
"learning_rate": 6.152404747033105e-06,
"loss": 0.2928,
"step": 1582
},
{
"epoch": 2.669477234401349,
"grad_norm": 0.23268700923125246,
"learning_rate": 6.121174266083697e-06,
"loss": 0.3,
"step": 1583
},
{
"epoch": 2.6711635750421587,
"grad_norm": 0.217511442703724,
"learning_rate": 6.089943785134292e-06,
"loss": 0.2661,
"step": 1584
},
{
"epoch": 2.672849915682968,
"grad_norm": 0.22021101235547547,
"learning_rate": 6.0587133041848845e-06,
"loss": 0.3002,
"step": 1585
},
{
"epoch": 2.6745362563237776,
"grad_norm": 0.2227726635451435,
"learning_rate": 6.027482823235478e-06,
"loss": 0.2667,
"step": 1586
},
{
"epoch": 2.6762225969645868,
"grad_norm": 0.19704649629300044,
"learning_rate": 5.996252342286072e-06,
"loss": 0.2726,
"step": 1587
},
{
"epoch": 2.6779089376053964,
"grad_norm": 0.22161664573034123,
"learning_rate": 5.965021861336665e-06,
"loss": 0.2889,
"step": 1588
},
{
"epoch": 2.6795952782462056,
"grad_norm": 0.2123347602164092,
"learning_rate": 5.933791380387258e-06,
"loss": 0.2887,
"step": 1589
},
{
"epoch": 2.681281618887015,
"grad_norm": 0.21222045114230656,
"learning_rate": 5.902560899437851e-06,
"loss": 0.2812,
"step": 1590
},
{
"epoch": 2.682967959527825,
"grad_norm": 0.205428838759869,
"learning_rate": 5.871330418488445e-06,
"loss": 0.2819,
"step": 1591
},
{
"epoch": 2.684654300168634,
"grad_norm": 0.22794111641253004,
"learning_rate": 5.8400999375390385e-06,
"loss": 0.294,
"step": 1592
},
{
"epoch": 2.6863406408094432,
"grad_norm": 0.2188799104236607,
"learning_rate": 5.808869456589631e-06,
"loss": 0.2908,
"step": 1593
},
{
"epoch": 2.688026981450253,
"grad_norm": 0.22322959770358233,
"learning_rate": 5.777638975640226e-06,
"loss": 0.2761,
"step": 1594
},
{
"epoch": 2.6897133220910625,
"grad_norm": 0.21789366885242073,
"learning_rate": 5.746408494690818e-06,
"loss": 0.2927,
"step": 1595
},
{
"epoch": 2.6913996627318717,
"grad_norm": 0.2083701451315594,
"learning_rate": 5.715178013741412e-06,
"loss": 0.278,
"step": 1596
},
{
"epoch": 2.6930860033726813,
"grad_norm": 0.22030127660422125,
"learning_rate": 5.683947532792005e-06,
"loss": 0.2869,
"step": 1597
},
{
"epoch": 2.694772344013491,
"grad_norm": 0.21709081444096415,
"learning_rate": 5.652717051842599e-06,
"loss": 0.2861,
"step": 1598
},
{
"epoch": 2.6964586846543,
"grad_norm": 0.3235206989069253,
"learning_rate": 5.621486570893192e-06,
"loss": 0.3022,
"step": 1599
},
{
"epoch": 2.6981450252951094,
"grad_norm": 0.2187203609037221,
"learning_rate": 5.590256089943786e-06,
"loss": 0.2777,
"step": 1600
},
{
"epoch": 2.699831365935919,
"grad_norm": 0.21029848278998103,
"learning_rate": 5.559025608994379e-06,
"loss": 0.2814,
"step": 1601
},
{
"epoch": 2.7015177065767286,
"grad_norm": 0.23673420516168278,
"learning_rate": 5.527795128044972e-06,
"loss": 0.2869,
"step": 1602
},
{
"epoch": 2.703204047217538,
"grad_norm": 0.20215563439209264,
"learning_rate": 5.496564647095565e-06,
"loss": 0.2758,
"step": 1603
},
{
"epoch": 2.7048903878583475,
"grad_norm": 0.2026247743653848,
"learning_rate": 5.465334166146159e-06,
"loss": 0.2708,
"step": 1604
},
{
"epoch": 2.706576728499157,
"grad_norm": 0.19435807754322892,
"learning_rate": 5.434103685196752e-06,
"loss": 0.2853,
"step": 1605
},
{
"epoch": 2.7082630691399663,
"grad_norm": 0.2053478996815802,
"learning_rate": 5.402873204247346e-06,
"loss": 0.3004,
"step": 1606
},
{
"epoch": 2.7099494097807755,
"grad_norm": 0.2202873897430332,
"learning_rate": 5.371642723297939e-06,
"loss": 0.2855,
"step": 1607
},
{
"epoch": 2.711635750421585,
"grad_norm": 0.20676180805282937,
"learning_rate": 5.340412242348533e-06,
"loss": 0.2808,
"step": 1608
},
{
"epoch": 2.7133220910623947,
"grad_norm": 0.1941446750107508,
"learning_rate": 5.3091817613991255e-06,
"loss": 0.2749,
"step": 1609
},
{
"epoch": 2.715008431703204,
"grad_norm": 0.19889620088958468,
"learning_rate": 5.277951280449719e-06,
"loss": 0.2782,
"step": 1610
},
{
"epoch": 2.7166947723440136,
"grad_norm": 0.2205070199105159,
"learning_rate": 5.2467207995003126e-06,
"loss": 0.2835,
"step": 1611
},
{
"epoch": 2.718381112984823,
"grad_norm": 0.20007123638846067,
"learning_rate": 5.215490318550906e-06,
"loss": 0.2969,
"step": 1612
},
{
"epoch": 2.7200674536256324,
"grad_norm": 0.2089199986979393,
"learning_rate": 5.1842598376015e-06,
"loss": 0.297,
"step": 1613
},
{
"epoch": 2.7217537942664416,
"grad_norm": 0.20191998141386186,
"learning_rate": 5.153029356652092e-06,
"loss": 0.2935,
"step": 1614
},
{
"epoch": 2.7234401349072512,
"grad_norm": 0.19688379446144275,
"learning_rate": 5.121798875702686e-06,
"loss": 0.2839,
"step": 1615
},
{
"epoch": 2.725126475548061,
"grad_norm": 0.21330583947097048,
"learning_rate": 5.0905683947532795e-06,
"loss": 0.2713,
"step": 1616
},
{
"epoch": 2.72681281618887,
"grad_norm": 0.21573383699863047,
"learning_rate": 5.059337913803873e-06,
"loss": 0.2945,
"step": 1617
},
{
"epoch": 2.7284991568296797,
"grad_norm": 0.20264899837278788,
"learning_rate": 5.028107432854466e-06,
"loss": 0.2818,
"step": 1618
},
{
"epoch": 2.730185497470489,
"grad_norm": 0.21608446927756567,
"learning_rate": 4.996876951905059e-06,
"loss": 0.2864,
"step": 1619
},
{
"epoch": 2.7318718381112985,
"grad_norm": 0.20370501968918966,
"learning_rate": 4.965646470955653e-06,
"loss": 0.2703,
"step": 1620
},
{
"epoch": 2.7335581787521077,
"grad_norm": 0.2080714326636247,
"learning_rate": 4.934415990006246e-06,
"loss": 0.292,
"step": 1621
},
{
"epoch": 2.7352445193929174,
"grad_norm": 0.2032840750089152,
"learning_rate": 4.903185509056839e-06,
"loss": 0.2804,
"step": 1622
},
{
"epoch": 2.736930860033727,
"grad_norm": 0.19331776690610536,
"learning_rate": 4.8719550281074335e-06,
"loss": 0.2755,
"step": 1623
},
{
"epoch": 2.738617200674536,
"grad_norm": 0.21693868552887213,
"learning_rate": 4.840724547158026e-06,
"loss": 0.2834,
"step": 1624
},
{
"epoch": 2.740303541315346,
"grad_norm": 0.20201398659734215,
"learning_rate": 4.80949406620862e-06,
"loss": 0.281,
"step": 1625
},
{
"epoch": 2.741989881956155,
"grad_norm": 0.20817935225884962,
"learning_rate": 4.778263585259213e-06,
"loss": 0.276,
"step": 1626
},
{
"epoch": 2.7436762225969646,
"grad_norm": 0.22331742883655747,
"learning_rate": 4.747033104309807e-06,
"loss": 0.2868,
"step": 1627
},
{
"epoch": 2.745362563237774,
"grad_norm": 0.20820921113077337,
"learning_rate": 4.7158026233603995e-06,
"loss": 0.2806,
"step": 1628
},
{
"epoch": 2.7470489038785835,
"grad_norm": 0.20234625534827635,
"learning_rate": 4.684572142410994e-06,
"loss": 0.2799,
"step": 1629
},
{
"epoch": 2.748735244519393,
"grad_norm": 0.2117057816531713,
"learning_rate": 4.653341661461587e-06,
"loss": 0.2959,
"step": 1630
},
{
"epoch": 2.7504215851602023,
"grad_norm": 0.21119392054935449,
"learning_rate": 4.62211118051218e-06,
"loss": 0.2968,
"step": 1631
},
{
"epoch": 2.752107925801012,
"grad_norm": 0.20784201095611002,
"learning_rate": 4.590880699562773e-06,
"loss": 0.2849,
"step": 1632
},
{
"epoch": 2.753794266441821,
"grad_norm": 0.2069161064249693,
"learning_rate": 4.559650218613367e-06,
"loss": 0.2809,
"step": 1633
},
{
"epoch": 2.7554806070826308,
"grad_norm": 0.2216730996786303,
"learning_rate": 4.52841973766396e-06,
"loss": 0.2831,
"step": 1634
},
{
"epoch": 2.75716694772344,
"grad_norm": 0.21586354572013416,
"learning_rate": 4.4971892567145535e-06,
"loss": 0.2856,
"step": 1635
},
{
"epoch": 2.7588532883642496,
"grad_norm": 0.20826728778824785,
"learning_rate": 4.465958775765147e-06,
"loss": 0.2884,
"step": 1636
},
{
"epoch": 2.7605396290050592,
"grad_norm": 0.1934541402293032,
"learning_rate": 4.434728294815741e-06,
"loss": 0.2877,
"step": 1637
},
{
"epoch": 2.7622259696458684,
"grad_norm": 0.19440484337210331,
"learning_rate": 4.403497813866333e-06,
"loss": 0.2865,
"step": 1638
},
{
"epoch": 2.763912310286678,
"grad_norm": 0.20329351934904272,
"learning_rate": 4.372267332916928e-06,
"loss": 0.2805,
"step": 1639
},
{
"epoch": 2.7655986509274872,
"grad_norm": 0.20682642131681114,
"learning_rate": 4.34103685196752e-06,
"loss": 0.2803,
"step": 1640
},
{
"epoch": 2.767284991568297,
"grad_norm": 0.1965628164276994,
"learning_rate": 4.309806371018114e-06,
"loss": 0.277,
"step": 1641
},
{
"epoch": 2.768971332209106,
"grad_norm": 0.20506125185258076,
"learning_rate": 4.2785758900687075e-06,
"loss": 0.2916,
"step": 1642
},
{
"epoch": 2.7706576728499157,
"grad_norm": 0.21162620440334876,
"learning_rate": 4.247345409119301e-06,
"loss": 0.2822,
"step": 1643
},
{
"epoch": 2.7723440134907253,
"grad_norm": 0.20567288650070736,
"learning_rate": 4.216114928169894e-06,
"loss": 0.2851,
"step": 1644
},
{
"epoch": 2.7740303541315345,
"grad_norm": 0.20508283384581297,
"learning_rate": 4.184884447220487e-06,
"loss": 0.2836,
"step": 1645
},
{
"epoch": 2.775716694772344,
"grad_norm": 0.20928732222500257,
"learning_rate": 4.153653966271081e-06,
"loss": 0.2838,
"step": 1646
},
{
"epoch": 2.7774030354131534,
"grad_norm": 0.2109337211436152,
"learning_rate": 4.122423485321674e-06,
"loss": 0.2804,
"step": 1647
},
{
"epoch": 2.779089376053963,
"grad_norm": 0.2077393204067855,
"learning_rate": 4.091193004372267e-06,
"loss": 0.2808,
"step": 1648
},
{
"epoch": 2.780775716694772,
"grad_norm": 0.21017828523019585,
"learning_rate": 4.0599625234228615e-06,
"loss": 0.2808,
"step": 1649
},
{
"epoch": 2.782462057335582,
"grad_norm": 0.2031729639742959,
"learning_rate": 4.028732042473454e-06,
"loss": 0.2732,
"step": 1650
},
{
"epoch": 2.7841483979763915,
"grad_norm": 0.21512407032855863,
"learning_rate": 3.997501561524048e-06,
"loss": 0.2984,
"step": 1651
},
{
"epoch": 2.7858347386172007,
"grad_norm": 0.196729154608354,
"learning_rate": 3.966271080574641e-06,
"loss": 0.2627,
"step": 1652
},
{
"epoch": 2.78752107925801,
"grad_norm": 0.2172289085545206,
"learning_rate": 3.935040599625235e-06,
"loss": 0.2868,
"step": 1653
},
{
"epoch": 2.7892074198988195,
"grad_norm": 0.1941943020185286,
"learning_rate": 3.9038101186758275e-06,
"loss": 0.2834,
"step": 1654
},
{
"epoch": 2.790893760539629,
"grad_norm": 0.2057212021649519,
"learning_rate": 3.872579637726421e-06,
"loss": 0.3031,
"step": 1655
},
{
"epoch": 2.7925801011804383,
"grad_norm": 0.19922608550472234,
"learning_rate": 3.841349156777015e-06,
"loss": 0.2669,
"step": 1656
},
{
"epoch": 2.794266441821248,
"grad_norm": 0.2022375243308089,
"learning_rate": 3.8101186758276078e-06,
"loss": 0.2904,
"step": 1657
},
{
"epoch": 2.7959527824620576,
"grad_norm": 0.22406966976385426,
"learning_rate": 3.7788881948782017e-06,
"loss": 0.3053,
"step": 1658
},
{
"epoch": 2.7976391231028668,
"grad_norm": 0.19858480327231595,
"learning_rate": 3.747657713928795e-06,
"loss": 0.2818,
"step": 1659
},
{
"epoch": 2.799325463743676,
"grad_norm": 0.20163790196870526,
"learning_rate": 3.716427232979388e-06,
"loss": 0.2957,
"step": 1660
},
{
"epoch": 2.8010118043844856,
"grad_norm": 0.2228194086576066,
"learning_rate": 3.685196752029981e-06,
"loss": 0.2902,
"step": 1661
},
{
"epoch": 2.8026981450252952,
"grad_norm": 0.20667251938872913,
"learning_rate": 3.653966271080575e-06,
"loss": 0.2669,
"step": 1662
},
{
"epoch": 2.8043844856661044,
"grad_norm": 0.20317918546557187,
"learning_rate": 3.6227357901311682e-06,
"loss": 0.2842,
"step": 1663
},
{
"epoch": 2.806070826306914,
"grad_norm": 0.2013232009325853,
"learning_rate": 3.5915053091817613e-06,
"loss": 0.2836,
"step": 1664
},
{
"epoch": 2.8077571669477237,
"grad_norm": 0.2124212170531947,
"learning_rate": 3.5602748282323553e-06,
"loss": 0.283,
"step": 1665
},
{
"epoch": 2.809443507588533,
"grad_norm": 0.20253081915800264,
"learning_rate": 3.5290443472829484e-06,
"loss": 0.2853,
"step": 1666
},
{
"epoch": 2.811129848229342,
"grad_norm": 0.20809367640615845,
"learning_rate": 3.4978138663335416e-06,
"loss": 0.2754,
"step": 1667
},
{
"epoch": 2.8128161888701517,
"grad_norm": 0.2889121452301856,
"learning_rate": 3.4665833853841355e-06,
"loss": 0.2884,
"step": 1668
},
{
"epoch": 2.8145025295109614,
"grad_norm": 0.20087128908636068,
"learning_rate": 3.4353529044347287e-06,
"loss": 0.281,
"step": 1669
},
{
"epoch": 2.8161888701517706,
"grad_norm": 0.2074070673281996,
"learning_rate": 3.4041224234853218e-06,
"loss": 0.2847,
"step": 1670
},
{
"epoch": 2.81787521079258,
"grad_norm": 0.19690785352695042,
"learning_rate": 3.3728919425359153e-06,
"loss": 0.2793,
"step": 1671
},
{
"epoch": 2.8195615514333894,
"grad_norm": 0.19009121705370408,
"learning_rate": 3.341661461586509e-06,
"loss": 0.2669,
"step": 1672
},
{
"epoch": 2.821247892074199,
"grad_norm": 0.19098650836641,
"learning_rate": 3.310430980637102e-06,
"loss": 0.2743,
"step": 1673
},
{
"epoch": 2.822934232715008,
"grad_norm": 0.2028744829991129,
"learning_rate": 3.279200499687695e-06,
"loss": 0.2761,
"step": 1674
},
{
"epoch": 2.824620573355818,
"grad_norm": 0.2044257995581774,
"learning_rate": 3.2479700187382887e-06,
"loss": 0.2946,
"step": 1675
},
{
"epoch": 2.8263069139966275,
"grad_norm": 0.20051532112544362,
"learning_rate": 3.2167395377888822e-06,
"loss": 0.2884,
"step": 1676
},
{
"epoch": 2.8279932546374367,
"grad_norm": 0.20553754280420816,
"learning_rate": 3.1855090568394754e-06,
"loss": 0.2886,
"step": 1677
},
{
"epoch": 2.8296795952782463,
"grad_norm": 0.200052112821026,
"learning_rate": 3.154278575890069e-06,
"loss": 0.283,
"step": 1678
},
{
"epoch": 2.8313659359190555,
"grad_norm": 0.21047714158137912,
"learning_rate": 3.123048094940662e-06,
"loss": 0.2981,
"step": 1679
},
{
"epoch": 2.833052276559865,
"grad_norm": 0.21403069862712779,
"learning_rate": 3.0918176139912556e-06,
"loss": 0.2998,
"step": 1680
},
{
"epoch": 2.8347386172006743,
"grad_norm": 0.21533734091940915,
"learning_rate": 3.0605871330418487e-06,
"loss": 0.2925,
"step": 1681
},
{
"epoch": 2.836424957841484,
"grad_norm": 0.19825568331336288,
"learning_rate": 3.0293566520924423e-06,
"loss": 0.2739,
"step": 1682
},
{
"epoch": 2.8381112984822936,
"grad_norm": 0.19901702391809034,
"learning_rate": 2.998126171143036e-06,
"loss": 0.2813,
"step": 1683
},
{
"epoch": 2.839797639123103,
"grad_norm": 0.20302512033337275,
"learning_rate": 2.966895690193629e-06,
"loss": 0.2903,
"step": 1684
},
{
"epoch": 2.8414839797639124,
"grad_norm": 0.20344695311378921,
"learning_rate": 2.9356652092442225e-06,
"loss": 0.2932,
"step": 1685
},
{
"epoch": 2.8431703204047216,
"grad_norm": 0.19426605288957086,
"learning_rate": 2.9044347282948156e-06,
"loss": 0.2922,
"step": 1686
},
{
"epoch": 2.8448566610455313,
"grad_norm": 0.19876694012610552,
"learning_rate": 2.873204247345409e-06,
"loss": 0.2938,
"step": 1687
},
{
"epoch": 2.8465430016863404,
"grad_norm": 0.2138763936205647,
"learning_rate": 2.8419737663960027e-06,
"loss": 0.2792,
"step": 1688
},
{
"epoch": 2.84822934232715,
"grad_norm": 0.21020119549110686,
"learning_rate": 2.810743285446596e-06,
"loss": 0.2854,
"step": 1689
},
{
"epoch": 2.8499156829679597,
"grad_norm": 0.20917019637007714,
"learning_rate": 2.7795128044971894e-06,
"loss": 0.2742,
"step": 1690
},
{
"epoch": 2.851602023608769,
"grad_norm": 0.20234378390911384,
"learning_rate": 2.7482823235477825e-06,
"loss": 0.286,
"step": 1691
},
{
"epoch": 2.8532883642495785,
"grad_norm": 0.20046123812374947,
"learning_rate": 2.717051842598376e-06,
"loss": 0.2772,
"step": 1692
},
{
"epoch": 2.8549747048903877,
"grad_norm": 0.2172720515949868,
"learning_rate": 2.6858213616489696e-06,
"loss": 0.2839,
"step": 1693
},
{
"epoch": 2.8566610455311974,
"grad_norm": 0.19873193377249784,
"learning_rate": 2.6545908806995627e-06,
"loss": 0.2884,
"step": 1694
},
{
"epoch": 2.8583473861720066,
"grad_norm": 0.20156031983168393,
"learning_rate": 2.6233603997501563e-06,
"loss": 0.2889,
"step": 1695
},
{
"epoch": 2.860033726812816,
"grad_norm": 0.2046721172478191,
"learning_rate": 2.59212991880075e-06,
"loss": 0.2886,
"step": 1696
},
{
"epoch": 2.861720067453626,
"grad_norm": 0.196703772022903,
"learning_rate": 2.560899437851343e-06,
"loss": 0.2701,
"step": 1697
},
{
"epoch": 2.863406408094435,
"grad_norm": 0.19845494425958038,
"learning_rate": 2.5296689569019365e-06,
"loss": 0.2846,
"step": 1698
},
{
"epoch": 2.8650927487352447,
"grad_norm": 0.19204603536346423,
"learning_rate": 2.4984384759525296e-06,
"loss": 0.2989,
"step": 1699
},
{
"epoch": 2.866779089376054,
"grad_norm": 0.19912137719570142,
"learning_rate": 2.467207995003123e-06,
"loss": 0.2735,
"step": 1700
},
{
"epoch": 2.8684654300168635,
"grad_norm": 0.1975812173384509,
"learning_rate": 2.4359775140537167e-06,
"loss": 0.2865,
"step": 1701
},
{
"epoch": 2.8701517706576727,
"grad_norm": 0.21832969792375734,
"learning_rate": 2.40474703310431e-06,
"loss": 0.2912,
"step": 1702
},
{
"epoch": 2.8718381112984823,
"grad_norm": 0.19774318843932107,
"learning_rate": 2.3735165521549034e-06,
"loss": 0.288,
"step": 1703
},
{
"epoch": 2.873524451939292,
"grad_norm": 0.20387387288215883,
"learning_rate": 2.342286071205497e-06,
"loss": 0.2705,
"step": 1704
},
{
"epoch": 2.875210792580101,
"grad_norm": 0.19751525186138755,
"learning_rate": 2.31105559025609e-06,
"loss": 0.2948,
"step": 1705
},
{
"epoch": 2.876897133220911,
"grad_norm": 0.18478364522654045,
"learning_rate": 2.2798251093066836e-06,
"loss": 0.2923,
"step": 1706
},
{
"epoch": 2.87858347386172,
"grad_norm": 0.21012748920962054,
"learning_rate": 2.2485946283572767e-06,
"loss": 0.2694,
"step": 1707
},
{
"epoch": 2.8802698145025296,
"grad_norm": 0.19216064779198222,
"learning_rate": 2.2173641474078703e-06,
"loss": 0.2814,
"step": 1708
},
{
"epoch": 2.881956155143339,
"grad_norm": 0.2192228501690963,
"learning_rate": 2.186133666458464e-06,
"loss": 0.2883,
"step": 1709
},
{
"epoch": 2.8836424957841484,
"grad_norm": 0.19038955794138343,
"learning_rate": 2.154903185509057e-06,
"loss": 0.2774,
"step": 1710
},
{
"epoch": 2.885328836424958,
"grad_norm": 0.21003488814268553,
"learning_rate": 2.1236727045596505e-06,
"loss": 0.2795,
"step": 1711
},
{
"epoch": 2.8870151770657673,
"grad_norm": 0.19468953854662951,
"learning_rate": 2.0924422236102436e-06,
"loss": 0.2716,
"step": 1712
},
{
"epoch": 2.8887015177065765,
"grad_norm": 0.38264106870328357,
"learning_rate": 2.061211742660837e-06,
"loss": 0.2843,
"step": 1713
},
{
"epoch": 2.890387858347386,
"grad_norm": 0.2055405553127052,
"learning_rate": 2.0299812617114307e-06,
"loss": 0.2868,
"step": 1714
},
{
"epoch": 2.8920741989881957,
"grad_norm": 0.18670256654302092,
"learning_rate": 1.998750780762024e-06,
"loss": 0.2824,
"step": 1715
},
{
"epoch": 2.893760539629005,
"grad_norm": 0.1935547163870898,
"learning_rate": 1.9675202998126174e-06,
"loss": 0.2808,
"step": 1716
},
{
"epoch": 2.8954468802698146,
"grad_norm": 0.1988377026687388,
"learning_rate": 1.9362898188632105e-06,
"loss": 0.2891,
"step": 1717
},
{
"epoch": 2.897133220910624,
"grad_norm": 0.18682277679930157,
"learning_rate": 1.9050593379138039e-06,
"loss": 0.2839,
"step": 1718
},
{
"epoch": 2.8988195615514334,
"grad_norm": 0.1883528830947179,
"learning_rate": 1.8738288569643974e-06,
"loss": 0.2812,
"step": 1719
},
{
"epoch": 2.9005059021922426,
"grad_norm": 0.18763676290215717,
"learning_rate": 1.8425983760149906e-06,
"loss": 0.2931,
"step": 1720
},
{
"epoch": 2.902192242833052,
"grad_norm": 0.20121785426301733,
"learning_rate": 1.8113678950655841e-06,
"loss": 0.2856,
"step": 1721
},
{
"epoch": 2.903878583473862,
"grad_norm": 0.1869843292530543,
"learning_rate": 1.7801374141161777e-06,
"loss": 0.2791,
"step": 1722
},
{
"epoch": 2.905564924114671,
"grad_norm": 0.1900538398572289,
"learning_rate": 1.7489069331667708e-06,
"loss": 0.2781,
"step": 1723
},
{
"epoch": 2.9072512647554807,
"grad_norm": 0.20594265363597425,
"learning_rate": 1.7176764522173643e-06,
"loss": 0.2796,
"step": 1724
},
{
"epoch": 2.9089376053962903,
"grad_norm": 0.20817463477274487,
"learning_rate": 1.6864459712679577e-06,
"loss": 0.3059,
"step": 1725
},
{
"epoch": 2.9106239460370995,
"grad_norm": 0.18804462500132332,
"learning_rate": 1.655215490318551e-06,
"loss": 0.2764,
"step": 1726
},
{
"epoch": 2.9123102866779087,
"grad_norm": 0.2045067376777732,
"learning_rate": 1.6239850093691443e-06,
"loss": 0.292,
"step": 1727
},
{
"epoch": 2.9139966273187183,
"grad_norm": 0.19175710659912157,
"learning_rate": 1.5927545284197377e-06,
"loss": 0.2855,
"step": 1728
},
{
"epoch": 2.915682967959528,
"grad_norm": 0.1988671701778532,
"learning_rate": 1.561524047470331e-06,
"loss": 0.2624,
"step": 1729
},
{
"epoch": 2.917369308600337,
"grad_norm": 0.20083443279579477,
"learning_rate": 1.5302935665209244e-06,
"loss": 0.2729,
"step": 1730
},
{
"epoch": 2.919055649241147,
"grad_norm": 0.18471159242802662,
"learning_rate": 1.499063085571518e-06,
"loss": 0.2724,
"step": 1731
},
{
"epoch": 2.920741989881956,
"grad_norm": 0.21068581000467754,
"learning_rate": 1.4678326046221112e-06,
"loss": 0.2801,
"step": 1732
},
{
"epoch": 2.9224283305227656,
"grad_norm": 0.1891544535796431,
"learning_rate": 1.4366021236727046e-06,
"loss": 0.2805,
"step": 1733
},
{
"epoch": 2.924114671163575,
"grad_norm": 0.20313222325906466,
"learning_rate": 1.405371642723298e-06,
"loss": 0.2871,
"step": 1734
},
{
"epoch": 2.9258010118043845,
"grad_norm": 0.20104211480825318,
"learning_rate": 1.3741411617738913e-06,
"loss": 0.3053,
"step": 1735
},
{
"epoch": 2.927487352445194,
"grad_norm": 0.19176846135912543,
"learning_rate": 1.3429106808244848e-06,
"loss": 0.2769,
"step": 1736
},
{
"epoch": 2.9291736930860033,
"grad_norm": 0.20721625332896973,
"learning_rate": 1.3116801998750781e-06,
"loss": 0.2887,
"step": 1737
},
{
"epoch": 2.930860033726813,
"grad_norm": 0.1948545750930062,
"learning_rate": 1.2804497189256715e-06,
"loss": 0.2911,
"step": 1738
},
{
"epoch": 2.932546374367622,
"grad_norm": 0.193561948006962,
"learning_rate": 1.2492192379762648e-06,
"loss": 0.2843,
"step": 1739
},
{
"epoch": 2.9342327150084317,
"grad_norm": 0.20976429959094334,
"learning_rate": 1.2179887570268584e-06,
"loss": 0.312,
"step": 1740
},
{
"epoch": 2.935919055649241,
"grad_norm": 0.20371403034140675,
"learning_rate": 1.1867582760774517e-06,
"loss": 0.2723,
"step": 1741
},
{
"epoch": 2.9376053962900506,
"grad_norm": 0.19139763737062507,
"learning_rate": 1.155527795128045e-06,
"loss": 0.3093,
"step": 1742
},
{
"epoch": 2.93929173693086,
"grad_norm": 0.20807080717830986,
"learning_rate": 1.1242973141786384e-06,
"loss": 0.3053,
"step": 1743
},
{
"epoch": 2.9409780775716694,
"grad_norm": 0.1851309824145173,
"learning_rate": 1.093066833229232e-06,
"loss": 0.2622,
"step": 1744
},
{
"epoch": 2.942664418212479,
"grad_norm": 0.18687903030152567,
"learning_rate": 1.0618363522798253e-06,
"loss": 0.2799,
"step": 1745
},
{
"epoch": 2.9443507588532882,
"grad_norm": 0.20483755437028875,
"learning_rate": 1.0306058713304186e-06,
"loss": 0.2949,
"step": 1746
},
{
"epoch": 2.946037099494098,
"grad_norm": 0.1977892003377331,
"learning_rate": 9.99375390381012e-07,
"loss": 0.2655,
"step": 1747
},
{
"epoch": 2.947723440134907,
"grad_norm": 0.18822466128381724,
"learning_rate": 9.681449094316053e-07,
"loss": 0.2745,
"step": 1748
},
{
"epoch": 2.9494097807757167,
"grad_norm": 0.19647124454247075,
"learning_rate": 9.369144284821987e-07,
"loss": 0.2848,
"step": 1749
},
{
"epoch": 2.9510961214165263,
"grad_norm": 0.199156961501018,
"learning_rate": 9.056839475327921e-07,
"loss": 0.3077,
"step": 1750
},
{
"epoch": 2.9527824620573355,
"grad_norm": 0.19076095952356142,
"learning_rate": 8.744534665833854e-07,
"loss": 0.2842,
"step": 1751
},
{
"epoch": 2.954468802698145,
"grad_norm": 0.19620460710416912,
"learning_rate": 8.432229856339788e-07,
"loss": 0.2827,
"step": 1752
},
{
"epoch": 2.9561551433389543,
"grad_norm": 0.18526171113725295,
"learning_rate": 8.119925046845722e-07,
"loss": 0.2806,
"step": 1753
},
{
"epoch": 2.957841483979764,
"grad_norm": 0.19345290483620012,
"learning_rate": 7.807620237351655e-07,
"loss": 0.2806,
"step": 1754
},
{
"epoch": 2.959527824620573,
"grad_norm": 0.20273068156401156,
"learning_rate": 7.49531542785759e-07,
"loss": 0.2929,
"step": 1755
},
{
"epoch": 2.961214165261383,
"grad_norm": 0.18398648217481314,
"learning_rate": 7.183010618363523e-07,
"loss": 0.289,
"step": 1756
},
{
"epoch": 2.9629005059021924,
"grad_norm": 0.18672180586987563,
"learning_rate": 6.870705808869456e-07,
"loss": 0.2694,
"step": 1757
},
{
"epoch": 2.9645868465430016,
"grad_norm": 0.1854297802284763,
"learning_rate": 6.558400999375391e-07,
"loss": 0.2744,
"step": 1758
},
{
"epoch": 2.9662731871838113,
"grad_norm": 0.18542133331185148,
"learning_rate": 6.246096189881324e-07,
"loss": 0.2921,
"step": 1759
},
{
"epoch": 2.9679595278246205,
"grad_norm": 0.19005635454359576,
"learning_rate": 5.933791380387259e-07,
"loss": 0.2886,
"step": 1760
},
{
"epoch": 2.96964586846543,
"grad_norm": 0.20487939341293834,
"learning_rate": 5.621486570893192e-07,
"loss": 0.2869,
"step": 1761
},
{
"epoch": 2.9713322091062393,
"grad_norm": 0.19380856113326692,
"learning_rate": 5.309181761399126e-07,
"loss": 0.2775,
"step": 1762
},
{
"epoch": 2.973018549747049,
"grad_norm": 0.1876885533427002,
"learning_rate": 4.99687695190506e-07,
"loss": 0.2705,
"step": 1763
},
{
"epoch": 2.9747048903878586,
"grad_norm": 0.19523022001958698,
"learning_rate": 4.6845721424109936e-07,
"loss": 0.2952,
"step": 1764
},
{
"epoch": 2.9763912310286678,
"grad_norm": 0.20164553295789006,
"learning_rate": 4.372267332916927e-07,
"loss": 0.2756,
"step": 1765
},
{
"epoch": 2.9780775716694774,
"grad_norm": 0.19436202075185965,
"learning_rate": 4.059962523422861e-07,
"loss": 0.2936,
"step": 1766
},
{
"epoch": 2.9797639123102866,
"grad_norm": 0.19528563314857372,
"learning_rate": 3.747657713928795e-07,
"loss": 0.2853,
"step": 1767
},
{
"epoch": 2.9814502529510962,
"grad_norm": 0.19637726757360177,
"learning_rate": 3.435352904434728e-07,
"loss": 0.2851,
"step": 1768
},
{
"epoch": 2.9831365935919054,
"grad_norm": 0.1926761803451411,
"learning_rate": 3.123048094940662e-07,
"loss": 0.2788,
"step": 1769
},
{
"epoch": 2.984822934232715,
"grad_norm": 0.19138794357950703,
"learning_rate": 2.810743285446596e-07,
"loss": 0.2755,
"step": 1770
},
{
"epoch": 2.9865092748735247,
"grad_norm": 0.19225770047830543,
"learning_rate": 2.49843847595253e-07,
"loss": 0.2868,
"step": 1771
},
{
"epoch": 2.988195615514334,
"grad_norm": 0.595894216120004,
"learning_rate": 2.1861336664584635e-07,
"loss": 0.2958,
"step": 1772
},
{
"epoch": 2.989881956155143,
"grad_norm": 0.18608324192924283,
"learning_rate": 1.8738288569643974e-07,
"loss": 0.277,
"step": 1773
},
{
"epoch": 2.9915682967959527,
"grad_norm": 0.1873496121673982,
"learning_rate": 1.561524047470331e-07,
"loss": 0.2755,
"step": 1774
},
{
"epoch": 2.9932546374367623,
"grad_norm": 0.19767801151433204,
"learning_rate": 1.249219237976265e-07,
"loss": 0.2727,
"step": 1775
},
{
"epoch": 2.9949409780775715,
"grad_norm": 0.19574725490820036,
"learning_rate": 9.369144284821987e-08,
"loss": 0.2865,
"step": 1776
},
{
"epoch": 2.996627318718381,
"grad_norm": 0.21185109566749932,
"learning_rate": 6.246096189881325e-08,
"loss": 0.2858,
"step": 1777
},
{
"epoch": 2.998313659359191,
"grad_norm": 0.1893865756399657,
"learning_rate": 3.123048094940662e-08,
"loss": 0.2848,
"step": 1778
},
{
"epoch": 3.0,
"grad_norm": 0.1750454809313914,
"learning_rate": 0.0,
"loss": 0.2613,
"step": 1779
},
{
"epoch": 3.0,
"step": 1779,
"total_flos": 1.5197438984385987e+18,
"train_loss": 0.44912863300381384,
"train_runtime": 103340.3025,
"train_samples_per_second": 0.275,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 1779,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5197438984385987e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}