{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1152,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017362995116657625,
"grad_norm": 1.9245674193476585,
"learning_rate": 0.0,
"loss": 0.5116,
"step": 1
},
{
"epoch": 0.003472599023331525,
"grad_norm": 1.9643021408198185,
"learning_rate": 8.620689655172414e-08,
"loss": 0.4842,
"step": 2
},
{
"epoch": 0.005208898534997287,
"grad_norm": 1.9648005065921663,
"learning_rate": 1.7241379310344828e-07,
"loss": 0.5066,
"step": 3
},
{
"epoch": 0.00694519804666305,
"grad_norm": 1.9491551990062834,
"learning_rate": 2.5862068965517245e-07,
"loss": 0.5129,
"step": 4
},
{
"epoch": 0.008681497558328812,
"grad_norm": 2.1220950808108245,
"learning_rate": 3.4482758620689656e-07,
"loss": 0.5027,
"step": 5
},
{
"epoch": 0.010417797069994574,
"grad_norm": 2.07218102117925,
"learning_rate": 4.3103448275862073e-07,
"loss": 0.5025,
"step": 6
},
{
"epoch": 0.012154096581660336,
"grad_norm": 2.0995935631323217,
"learning_rate": 5.172413793103449e-07,
"loss": 0.489,
"step": 7
},
{
"epoch": 0.0138903960933261,
"grad_norm": 1.8768353178221435,
"learning_rate": 6.034482758620691e-07,
"loss": 0.4919,
"step": 8
},
{
"epoch": 0.01562669560499186,
"grad_norm": 1.9966804449499742,
"learning_rate": 6.896551724137931e-07,
"loss": 0.4811,
"step": 9
},
{
"epoch": 0.017362995116657624,
"grad_norm": 1.8763897444655444,
"learning_rate": 7.758620689655173e-07,
"loss": 0.5173,
"step": 10
},
{
"epoch": 0.019099294628323386,
"grad_norm": 1.9849388307797002,
"learning_rate": 8.620689655172415e-07,
"loss": 0.5029,
"step": 11
},
{
"epoch": 0.020835594139989148,
"grad_norm": 1.8516809060978145,
"learning_rate": 9.482758620689655e-07,
"loss": 0.4963,
"step": 12
},
{
"epoch": 0.02257189365165491,
"grad_norm": 1.9602745515172928,
"learning_rate": 1.0344827586206898e-06,
"loss": 0.5193,
"step": 13
},
{
"epoch": 0.02430819316332067,
"grad_norm": 1.8697001604531807,
"learning_rate": 1.120689655172414e-06,
"loss": 0.5064,
"step": 14
},
{
"epoch": 0.026044492674986434,
"grad_norm": 1.6394017102026306,
"learning_rate": 1.2068965517241381e-06,
"loss": 0.4735,
"step": 15
},
{
"epoch": 0.0277807921866522,
"grad_norm": 1.6749588224557876,
"learning_rate": 1.2931034482758623e-06,
"loss": 0.4982,
"step": 16
},
{
"epoch": 0.02951709169831796,
"grad_norm": 1.6539213113264153,
"learning_rate": 1.3793103448275862e-06,
"loss": 0.4837,
"step": 17
},
{
"epoch": 0.03125339120998372,
"grad_norm": 1.6831347937004577,
"learning_rate": 1.4655172413793104e-06,
"loss": 0.4926,
"step": 18
},
{
"epoch": 0.032989690721649485,
"grad_norm": 1.6783335621868025,
"learning_rate": 1.5517241379310346e-06,
"loss": 0.4853,
"step": 19
},
{
"epoch": 0.03472599023331525,
"grad_norm": 1.5470450238435818,
"learning_rate": 1.6379310344827587e-06,
"loss": 0.4912,
"step": 20
},
{
"epoch": 0.03646228974498101,
"grad_norm": 1.2016269254575944,
"learning_rate": 1.724137931034483e-06,
"loss": 0.4956,
"step": 21
},
{
"epoch": 0.03819858925664677,
"grad_norm": 1.1241956354180678,
"learning_rate": 1.810344827586207e-06,
"loss": 0.4803,
"step": 22
},
{
"epoch": 0.03993488876831253,
"grad_norm": 1.005571825200285,
"learning_rate": 1.896551724137931e-06,
"loss": 0.4837,
"step": 23
},
{
"epoch": 0.041671188279978295,
"grad_norm": 0.8622973028025744,
"learning_rate": 1.982758620689655e-06,
"loss": 0.4927,
"step": 24
},
{
"epoch": 0.04340748779164406,
"grad_norm": 0.9266904408010402,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.4639,
"step": 25
},
{
"epoch": 0.04514378730330982,
"grad_norm": 0.8473052639588573,
"learning_rate": 2.1551724137931035e-06,
"loss": 0.482,
"step": 26
},
{
"epoch": 0.04688008681497558,
"grad_norm": 0.7307713142390112,
"learning_rate": 2.241379310344828e-06,
"loss": 0.4986,
"step": 27
},
{
"epoch": 0.04861638632664134,
"grad_norm": 0.634004932033382,
"learning_rate": 2.327586206896552e-06,
"loss": 0.465,
"step": 28
},
{
"epoch": 0.050352685838307105,
"grad_norm": 0.6554907050735821,
"learning_rate": 2.4137931034482762e-06,
"loss": 0.4693,
"step": 29
},
{
"epoch": 0.05208898534997287,
"grad_norm": 0.9610831996283411,
"learning_rate": 2.5e-06,
"loss": 0.4788,
"step": 30
},
{
"epoch": 0.05382528486163863,
"grad_norm": 0.9636064060253327,
"learning_rate": 2.5862068965517246e-06,
"loss": 0.4563,
"step": 31
},
{
"epoch": 0.0555615843733044,
"grad_norm": 0.9591103406834719,
"learning_rate": 2.672413793103448e-06,
"loss": 0.4516,
"step": 32
},
{
"epoch": 0.05729788388497016,
"grad_norm": 1.0323468312914004,
"learning_rate": 2.7586206896551725e-06,
"loss": 0.4584,
"step": 33
},
{
"epoch": 0.05903418339663592,
"grad_norm": 0.858020140191911,
"learning_rate": 2.844827586206897e-06,
"loss": 0.438,
"step": 34
},
{
"epoch": 0.060770482908301685,
"grad_norm": 0.8960956717385824,
"learning_rate": 2.931034482758621e-06,
"loss": 0.4744,
"step": 35
},
{
"epoch": 0.06250678241996745,
"grad_norm": 0.7458992924003728,
"learning_rate": 3.017241379310345e-06,
"loss": 0.4567,
"step": 36
},
{
"epoch": 0.06424308193163321,
"grad_norm": 0.8185303607292461,
"learning_rate": 3.103448275862069e-06,
"loss": 0.4872,
"step": 37
},
{
"epoch": 0.06597938144329897,
"grad_norm": 0.5822441053966084,
"learning_rate": 3.1896551724137935e-06,
"loss": 0.459,
"step": 38
},
{
"epoch": 0.06771568095496473,
"grad_norm": 0.5481383087123407,
"learning_rate": 3.2758620689655175e-06,
"loss": 0.4623,
"step": 39
},
{
"epoch": 0.0694519804666305,
"grad_norm": 0.41776886298603355,
"learning_rate": 3.362068965517242e-06,
"loss": 0.4543,
"step": 40
},
{
"epoch": 0.07118827997829626,
"grad_norm": 0.2938251335852544,
"learning_rate": 3.448275862068966e-06,
"loss": 0.4532,
"step": 41
},
{
"epoch": 0.07292457948996202,
"grad_norm": 0.2737664486009171,
"learning_rate": 3.5344827586206898e-06,
"loss": 0.4314,
"step": 42
},
{
"epoch": 0.07466087900162778,
"grad_norm": 0.3109574122181629,
"learning_rate": 3.620689655172414e-06,
"loss": 0.4327,
"step": 43
},
{
"epoch": 0.07639717851329354,
"grad_norm": 0.3666431324176173,
"learning_rate": 3.7068965517241385e-06,
"loss": 0.4549,
"step": 44
},
{
"epoch": 0.0781334780249593,
"grad_norm": 0.4053783490688968,
"learning_rate": 3.793103448275862e-06,
"loss": 0.4563,
"step": 45
},
{
"epoch": 0.07986977753662507,
"grad_norm": 0.4155281249495721,
"learning_rate": 3.8793103448275865e-06,
"loss": 0.4543,
"step": 46
},
{
"epoch": 0.08160607704829083,
"grad_norm": 0.4453663795611955,
"learning_rate": 3.96551724137931e-06,
"loss": 0.4464,
"step": 47
},
{
"epoch": 0.08334237655995659,
"grad_norm": 0.39170643684961,
"learning_rate": 4.051724137931034e-06,
"loss": 0.441,
"step": 48
},
{
"epoch": 0.08507867607162235,
"grad_norm": 0.29498910001689194,
"learning_rate": 4.137931034482759e-06,
"loss": 0.4247,
"step": 49
},
{
"epoch": 0.08681497558328811,
"grad_norm": 0.3353170488362428,
"learning_rate": 4.224137931034483e-06,
"loss": 0.4411,
"step": 50
},
{
"epoch": 0.08855127509495388,
"grad_norm": 0.26783380276886826,
"learning_rate": 4.310344827586207e-06,
"loss": 0.4546,
"step": 51
},
{
"epoch": 0.09028757460661964,
"grad_norm": 0.28388985863752725,
"learning_rate": 4.396551724137931e-06,
"loss": 0.4597,
"step": 52
},
{
"epoch": 0.0920238741182854,
"grad_norm": 0.20735134006190523,
"learning_rate": 4.482758620689656e-06,
"loss": 0.4444,
"step": 53
},
{
"epoch": 0.09376017362995116,
"grad_norm": 0.21901348164598217,
"learning_rate": 4.56896551724138e-06,
"loss": 0.4752,
"step": 54
},
{
"epoch": 0.09549647314161692,
"grad_norm": 0.25595144274452897,
"learning_rate": 4.655172413793104e-06,
"loss": 0.4339,
"step": 55
},
{
"epoch": 0.09723277265328269,
"grad_norm": 0.19446449161885673,
"learning_rate": 4.741379310344828e-06,
"loss": 0.4312,
"step": 56
},
{
"epoch": 0.09896907216494845,
"grad_norm": 0.22806613782923416,
"learning_rate": 4.8275862068965525e-06,
"loss": 0.4544,
"step": 57
},
{
"epoch": 0.10070537167661421,
"grad_norm": 0.18835606666142574,
"learning_rate": 4.9137931034482765e-06,
"loss": 0.4319,
"step": 58
},
{
"epoch": 0.10244167118827997,
"grad_norm": 0.24977581681440297,
"learning_rate": 5e-06,
"loss": 0.4416,
"step": 59
},
{
"epoch": 0.10417797069994574,
"grad_norm": 0.19750282506802236,
"learning_rate": 5.086206896551724e-06,
"loss": 0.4257,
"step": 60
},
{
"epoch": 0.1059142702116115,
"grad_norm": 0.19110900242177456,
"learning_rate": 5.172413793103449e-06,
"loss": 0.4177,
"step": 61
},
{
"epoch": 0.10765056972327726,
"grad_norm": 0.17213185107606405,
"learning_rate": 5.258620689655173e-06,
"loss": 0.4327,
"step": 62
},
{
"epoch": 0.10938686923494302,
"grad_norm": 0.15630107177842684,
"learning_rate": 5.344827586206896e-06,
"loss": 0.415,
"step": 63
},
{
"epoch": 0.1111231687466088,
"grad_norm": 0.17029523143593536,
"learning_rate": 5.431034482758621e-06,
"loss": 0.4453,
"step": 64
},
{
"epoch": 0.11285946825827456,
"grad_norm": 0.19659475939270915,
"learning_rate": 5.517241379310345e-06,
"loss": 0.4584,
"step": 65
},
{
"epoch": 0.11459576776994032,
"grad_norm": 0.16860964327696285,
"learning_rate": 5.603448275862069e-06,
"loss": 0.4493,
"step": 66
},
{
"epoch": 0.11633206728160608,
"grad_norm": 0.22957081775328345,
"learning_rate": 5.689655172413794e-06,
"loss": 0.456,
"step": 67
},
{
"epoch": 0.11806836679327185,
"grad_norm": 0.22307260970140155,
"learning_rate": 5.775862068965518e-06,
"loss": 0.4542,
"step": 68
},
{
"epoch": 0.11980466630493761,
"grad_norm": 0.1761287403232541,
"learning_rate": 5.862068965517242e-06,
"loss": 0.4357,
"step": 69
},
{
"epoch": 0.12154096581660337,
"grad_norm": 0.17806731622716251,
"learning_rate": 5.9482758620689665e-06,
"loss": 0.441,
"step": 70
},
{
"epoch": 0.12327726532826913,
"grad_norm": 0.16595870061584297,
"learning_rate": 6.03448275862069e-06,
"loss": 0.4478,
"step": 71
},
{
"epoch": 0.1250135648399349,
"grad_norm": 0.17683622113941452,
"learning_rate": 6.1206896551724135e-06,
"loss": 0.4439,
"step": 72
},
{
"epoch": 0.12674986435160066,
"grad_norm": 0.1728852455905528,
"learning_rate": 6.206896551724138e-06,
"loss": 0.468,
"step": 73
},
{
"epoch": 0.12848616386326642,
"grad_norm": 0.17067325519254972,
"learning_rate": 6.293103448275862e-06,
"loss": 0.44,
"step": 74
},
{
"epoch": 0.13022246337493218,
"grad_norm": 0.17049371114039943,
"learning_rate": 6.379310344827587e-06,
"loss": 0.4301,
"step": 75
},
{
"epoch": 0.13195876288659794,
"grad_norm": 0.16183277888470998,
"learning_rate": 6.465517241379311e-06,
"loss": 0.4438,
"step": 76
},
{
"epoch": 0.1336950623982637,
"grad_norm": 0.1849741022931392,
"learning_rate": 6.551724137931035e-06,
"loss": 0.436,
"step": 77
},
{
"epoch": 0.13543136190992947,
"grad_norm": 0.15302845702506,
"learning_rate": 6.63793103448276e-06,
"loss": 0.4522,
"step": 78
},
{
"epoch": 0.13716766142159523,
"grad_norm": 0.1779999440937052,
"learning_rate": 6.724137931034484e-06,
"loss": 0.4469,
"step": 79
},
{
"epoch": 0.138903960933261,
"grad_norm": 0.14785801477595045,
"learning_rate": 6.810344827586207e-06,
"loss": 0.4339,
"step": 80
},
{
"epoch": 0.14064026044492675,
"grad_norm": 0.14779342419552097,
"learning_rate": 6.896551724137932e-06,
"loss": 0.445,
"step": 81
},
{
"epoch": 0.1423765599565925,
"grad_norm": 0.17525194430534158,
"learning_rate": 6.982758620689656e-06,
"loss": 0.4426,
"step": 82
},
{
"epoch": 0.14411285946825828,
"grad_norm": 0.16728639548623878,
"learning_rate": 7.0689655172413796e-06,
"loss": 0.4301,
"step": 83
},
{
"epoch": 0.14584915897992404,
"grad_norm": 0.14448921047681842,
"learning_rate": 7.155172413793104e-06,
"loss": 0.4398,
"step": 84
},
{
"epoch": 0.1475854584915898,
"grad_norm": 0.17217160852240143,
"learning_rate": 7.241379310344828e-06,
"loss": 0.4452,
"step": 85
},
{
"epoch": 0.14932175800325556,
"grad_norm": 0.15054846492214635,
"learning_rate": 7.327586206896552e-06,
"loss": 0.4402,
"step": 86
},
{
"epoch": 0.15105805751492132,
"grad_norm": 0.18058844867837248,
"learning_rate": 7.413793103448277e-06,
"loss": 0.4249,
"step": 87
},
{
"epoch": 0.15279435702658709,
"grad_norm": 0.16522103718686243,
"learning_rate": 7.500000000000001e-06,
"loss": 0.439,
"step": 88
},
{
"epoch": 0.15453065653825285,
"grad_norm": 0.17085898069141625,
"learning_rate": 7.586206896551724e-06,
"loss": 0.4615,
"step": 89
},
{
"epoch": 0.1562669560499186,
"grad_norm": 0.1352941909359962,
"learning_rate": 7.672413793103449e-06,
"loss": 0.4366,
"step": 90
},
{
"epoch": 0.15800325556158437,
"grad_norm": 0.14834678829341033,
"learning_rate": 7.758620689655173e-06,
"loss": 0.4212,
"step": 91
},
{
"epoch": 0.15973955507325013,
"grad_norm": 0.1520079978715368,
"learning_rate": 7.844827586206897e-06,
"loss": 0.4365,
"step": 92
},
{
"epoch": 0.1614758545849159,
"grad_norm": 0.1371331686782148,
"learning_rate": 7.93103448275862e-06,
"loss": 0.4216,
"step": 93
},
{
"epoch": 0.16321215409658166,
"grad_norm": 0.13976693670870327,
"learning_rate": 8.017241379310345e-06,
"loss": 0.4492,
"step": 94
},
{
"epoch": 0.16494845360824742,
"grad_norm": 0.15160516329676715,
"learning_rate": 8.103448275862069e-06,
"loss": 0.4418,
"step": 95
},
{
"epoch": 0.16668475311991318,
"grad_norm": 0.17142907121339737,
"learning_rate": 8.189655172413794e-06,
"loss": 0.4383,
"step": 96
},
{
"epoch": 0.16842105263157894,
"grad_norm": 0.16648106630358045,
"learning_rate": 8.275862068965518e-06,
"loss": 0.4374,
"step": 97
},
{
"epoch": 0.1701573521432447,
"grad_norm": 0.16111838676750498,
"learning_rate": 8.362068965517242e-06,
"loss": 0.4568,
"step": 98
},
{
"epoch": 0.17189365165491047,
"grad_norm": 0.14581097572550494,
"learning_rate": 8.448275862068966e-06,
"loss": 0.4255,
"step": 99
},
{
"epoch": 0.17362995116657623,
"grad_norm": 0.1378644644499854,
"learning_rate": 8.53448275862069e-06,
"loss": 0.4235,
"step": 100
},
{
"epoch": 0.175366250678242,
"grad_norm": 0.15888598262640966,
"learning_rate": 8.620689655172414e-06,
"loss": 0.4347,
"step": 101
},
{
"epoch": 0.17710255018990775,
"grad_norm": 0.14832980896853906,
"learning_rate": 8.706896551724138e-06,
"loss": 0.4256,
"step": 102
},
{
"epoch": 0.17883884970157352,
"grad_norm": 0.16331329070890865,
"learning_rate": 8.793103448275862e-06,
"loss": 0.4146,
"step": 103
},
{
"epoch": 0.18057514921323928,
"grad_norm": 0.16254414896072908,
"learning_rate": 8.879310344827588e-06,
"loss": 0.4538,
"step": 104
},
{
"epoch": 0.18231144872490504,
"grad_norm": 0.20628714572870552,
"learning_rate": 8.965517241379312e-06,
"loss": 0.4545,
"step": 105
},
{
"epoch": 0.1840477482365708,
"grad_norm": 0.14672329302866716,
"learning_rate": 9.051724137931036e-06,
"loss": 0.4409,
"step": 106
},
{
"epoch": 0.18578404774823656,
"grad_norm": 0.17704727106288587,
"learning_rate": 9.13793103448276e-06,
"loss": 0.4569,
"step": 107
},
{
"epoch": 0.18752034725990233,
"grad_norm": 0.1671108515367416,
"learning_rate": 9.224137931034484e-06,
"loss": 0.4419,
"step": 108
},
{
"epoch": 0.1892566467715681,
"grad_norm": 0.1401024359628494,
"learning_rate": 9.310344827586207e-06,
"loss": 0.4399,
"step": 109
},
{
"epoch": 0.19099294628323385,
"grad_norm": 0.17514966479813296,
"learning_rate": 9.396551724137931e-06,
"loss": 0.4323,
"step": 110
},
{
"epoch": 0.1927292457948996,
"grad_norm": 0.1813009132122757,
"learning_rate": 9.482758620689655e-06,
"loss": 0.4465,
"step": 111
},
{
"epoch": 0.19446554530656537,
"grad_norm": 0.14595421920726084,
"learning_rate": 9.56896551724138e-06,
"loss": 0.4378,
"step": 112
},
{
"epoch": 0.19620184481823114,
"grad_norm": 0.15465278812299593,
"learning_rate": 9.655172413793105e-06,
"loss": 0.4381,
"step": 113
},
{
"epoch": 0.1979381443298969,
"grad_norm": 0.15186254208439187,
"learning_rate": 9.741379310344829e-06,
"loss": 0.4231,
"step": 114
},
{
"epoch": 0.19967444384156266,
"grad_norm": 0.15058276592497516,
"learning_rate": 9.827586206896553e-06,
"loss": 0.431,
"step": 115
},
{
"epoch": 0.20141074335322842,
"grad_norm": 0.16119079928649604,
"learning_rate": 9.913793103448277e-06,
"loss": 0.452,
"step": 116
},
{
"epoch": 0.20314704286489418,
"grad_norm": 0.14095989647979248,
"learning_rate": 1e-05,
"loss": 0.4182,
"step": 117
},
{
"epoch": 0.20488334237655995,
"grad_norm": 0.1746147114354537,
"learning_rate": 9.999977011008992e-06,
"loss": 0.4404,
"step": 118
},
{
"epoch": 0.2066196418882257,
"grad_norm": 0.17509978960441983,
"learning_rate": 9.999908044247359e-06,
"loss": 0.4376,
"step": 119
},
{
"epoch": 0.20835594139989147,
"grad_norm": 0.15056060709141128,
"learning_rate": 9.999793100349294e-06,
"loss": 0.4058,
"step": 120
},
{
"epoch": 0.21009224091155723,
"grad_norm": 0.16195757110570655,
"learning_rate": 9.999632180371776e-06,
"loss": 0.4461,
"step": 121
},
{
"epoch": 0.211828540423223,
"grad_norm": 0.13882873497505904,
"learning_rate": 9.999425285794557e-06,
"loss": 0.4405,
"step": 122
},
{
"epoch": 0.21356483993488876,
"grad_norm": 0.16965441460666414,
"learning_rate": 9.999172418520159e-06,
"loss": 0.4322,
"step": 123
},
{
"epoch": 0.21530113944655452,
"grad_norm": 0.16599733161228727,
"learning_rate": 9.998873580873848e-06,
"loss": 0.4375,
"step": 124
},
{
"epoch": 0.21703743895822028,
"grad_norm": 0.17776524728753473,
"learning_rate": 9.998528775603612e-06,
"loss": 0.4404,
"step": 125
},
{
"epoch": 0.21877373846988604,
"grad_norm": 0.14928214841596088,
"learning_rate": 9.99813800588014e-06,
"loss": 0.4028,
"step": 126
},
{
"epoch": 0.2205100379815518,
"grad_norm": 0.19711135086768242,
"learning_rate": 9.997701275296796e-06,
"loss": 0.4457,
"step": 127
},
{
"epoch": 0.2222463374932176,
"grad_norm": 0.15782967723076521,
"learning_rate": 9.997218587869577e-06,
"loss": 0.444,
"step": 128
},
{
"epoch": 0.22398263700488336,
"grad_norm": 0.16659222906938462,
"learning_rate": 9.996689948037081e-06,
"loss": 0.4371,
"step": 129
},
{
"epoch": 0.22571893651654912,
"grad_norm": 0.1499268597695575,
"learning_rate": 9.996115360660466e-06,
"loss": 0.4393,
"step": 130
},
{
"epoch": 0.22745523602821488,
"grad_norm": 0.1646819022624862,
"learning_rate": 9.99549483102341e-06,
"loss": 0.4539,
"step": 131
},
{
"epoch": 0.22919153553988064,
"grad_norm": 0.16685025404219458,
"learning_rate": 9.994828364832045e-06,
"loss": 0.4081,
"step": 132
},
{
"epoch": 0.2309278350515464,
"grad_norm": 0.15466930085043173,
"learning_rate": 9.994115968214933e-06,
"loss": 0.4378,
"step": 133
},
{
"epoch": 0.23266413456321217,
"grad_norm": 0.15508073913810053,
"learning_rate": 9.993357647722982e-06,
"loss": 0.439,
"step": 134
},
{
"epoch": 0.23440043407487793,
"grad_norm": 0.1539119346782389,
"learning_rate": 9.9925534103294e-06,
"loss": 0.4265,
"step": 135
},
{
"epoch": 0.2361367335865437,
"grad_norm": 0.14426900874434997,
"learning_rate": 9.991703263429633e-06,
"loss": 0.4244,
"step": 136
},
{
"epoch": 0.23787303309820945,
"grad_norm": 0.16870400577528086,
"learning_rate": 9.990807214841288e-06,
"loss": 0.419,
"step": 137
},
{
"epoch": 0.23960933260987521,
"grad_norm": 0.20462475693369803,
"learning_rate": 9.989865272804064e-06,
"loss": 0.4246,
"step": 138
},
{
"epoch": 0.24134563212154098,
"grad_norm": 0.1451712253771932,
"learning_rate": 9.988877445979681e-06,
"loss": 0.4381,
"step": 139
},
{
"epoch": 0.24308193163320674,
"grad_norm": 0.15891534718210473,
"learning_rate": 9.987843743451796e-06,
"loss": 0.4235,
"step": 140
},
{
"epoch": 0.2448182311448725,
"grad_norm": 0.17815822351007055,
"learning_rate": 9.98676417472592e-06,
"loss": 0.4425,
"step": 141
},
{
"epoch": 0.24655453065653826,
"grad_norm": 0.1654797111641331,
"learning_rate": 9.985638749729331e-06,
"loss": 0.4323,
"step": 142
},
{
"epoch": 0.24829083016820402,
"grad_norm": 0.15859047009407842,
"learning_rate": 9.984467478810985e-06,
"loss": 0.4441,
"step": 143
},
{
"epoch": 0.2500271296798698,
"grad_norm": 0.16268622271549055,
"learning_rate": 9.983250372741412e-06,
"loss": 0.4458,
"step": 144
},
{
"epoch": 0.25176342919153555,
"grad_norm": 0.14671156779493466,
"learning_rate": 9.981987442712634e-06,
"loss": 0.4299,
"step": 145
},
{
"epoch": 0.2534997287032013,
"grad_norm": 0.13279882931945028,
"learning_rate": 9.980678700338043e-06,
"loss": 0.4234,
"step": 146
},
{
"epoch": 0.2552360282148671,
"grad_norm": 0.13969849126911693,
"learning_rate": 9.979324157652303e-06,
"loss": 0.412,
"step": 147
},
{
"epoch": 0.25697232772653283,
"grad_norm": 0.1488892574422794,
"learning_rate": 9.977923827111247e-06,
"loss": 0.4238,
"step": 148
},
{
"epoch": 0.2587086272381986,
"grad_norm": 0.16686316447713223,
"learning_rate": 9.976477721591746e-06,
"loss": 0.4629,
"step": 149
},
{
"epoch": 0.26044492674986436,
"grad_norm": 0.16230014013728034,
"learning_rate": 9.974985854391606e-06,
"loss": 0.4411,
"step": 150
},
{
"epoch": 0.2621812262615301,
"grad_norm": 0.16069337866094202,
"learning_rate": 9.973448239229431e-06,
"loss": 0.4239,
"step": 151
},
{
"epoch": 0.2639175257731959,
"grad_norm": 0.1506322062881885,
"learning_rate": 9.971864890244514e-06,
"loss": 0.4517,
"step": 152
},
{
"epoch": 0.26565382528486164,
"grad_norm": 0.171740710512301,
"learning_rate": 9.97023582199669e-06,
"loss": 0.4374,
"step": 153
},
{
"epoch": 0.2673901247965274,
"grad_norm": 0.1419738588320865,
"learning_rate": 9.968561049466214e-06,
"loss": 0.4199,
"step": 154
},
{
"epoch": 0.26912642430819317,
"grad_norm": 0.1606660762569778,
"learning_rate": 9.966840588053618e-06,
"loss": 0.4372,
"step": 155
},
{
"epoch": 0.27086272381985893,
"grad_norm": 0.14846716499090143,
"learning_rate": 9.965074453579573e-06,
"loss": 0.4454,
"step": 156
},
{
"epoch": 0.2725990233315247,
"grad_norm": 0.1655209351236852,
"learning_rate": 9.963262662284735e-06,
"loss": 0.4467,
"step": 157
},
{
"epoch": 0.27433532284319045,
"grad_norm": 0.17149849857801977,
"learning_rate": 9.96140523082961e-06,
"loss": 0.4325,
"step": 158
},
{
"epoch": 0.2760716223548562,
"grad_norm": 0.1410596039887854,
"learning_rate": 9.959502176294384e-06,
"loss": 0.4337,
"step": 159
},
{
"epoch": 0.277807921866522,
"grad_norm": 0.1512531292193184,
"learning_rate": 9.957553516178782e-06,
"loss": 0.4278,
"step": 160
},
{
"epoch": 0.27954422137818774,
"grad_norm": 0.14446839709527778,
"learning_rate": 9.955559268401893e-06,
"loss": 0.4262,
"step": 161
},
{
"epoch": 0.2812805208898535,
"grad_norm": 0.14669453705746469,
"learning_rate": 9.953519451302016e-06,
"loss": 0.4271,
"step": 162
},
{
"epoch": 0.28301682040151926,
"grad_norm": 0.16500067161261647,
"learning_rate": 9.951434083636484e-06,
"loss": 0.4353,
"step": 163
},
{
"epoch": 0.284753119913185,
"grad_norm": 0.14619596732513318,
"learning_rate": 9.9493031845815e-06,
"loss": 0.4326,
"step": 164
},
{
"epoch": 0.2864894194248508,
"grad_norm": 0.16853726289967258,
"learning_rate": 9.947126773731949e-06,
"loss": 0.4407,
"step": 165
},
{
"epoch": 0.28822571893651655,
"grad_norm": 0.15112267673295302,
"learning_rate": 9.944904871101227e-06,
"loss": 0.4391,
"step": 166
},
{
"epoch": 0.2899620184481823,
"grad_norm": 0.1651599790331637,
"learning_rate": 9.942637497121055e-06,
"loss": 0.4448,
"step": 167
},
{
"epoch": 0.2916983179598481,
"grad_norm": 0.15953370847696652,
"learning_rate": 9.940324672641289e-06,
"loss": 0.4327,
"step": 168
},
{
"epoch": 0.29343461747151384,
"grad_norm": 0.19265474992006965,
"learning_rate": 9.937966418929725e-06,
"loss": 0.4491,
"step": 169
},
{
"epoch": 0.2951709169831796,
"grad_norm": 0.16933292810141926,
"learning_rate": 9.93556275767192e-06,
"loss": 0.4286,
"step": 170
},
{
"epoch": 0.29690721649484536,
"grad_norm": 0.15237594797568174,
"learning_rate": 9.933113710970967e-06,
"loss": 0.4283,
"step": 171
},
{
"epoch": 0.2986435160065111,
"grad_norm": 0.18265650880814457,
"learning_rate": 9.930619301347312e-06,
"loss": 0.4546,
"step": 172
},
{
"epoch": 0.3003798155181769,
"grad_norm": 0.18099070598137104,
"learning_rate": 9.928079551738542e-06,
"loss": 0.4092,
"step": 173
},
{
"epoch": 0.30211611502984265,
"grad_norm": 0.1815346079160872,
"learning_rate": 9.925494485499167e-06,
"loss": 0.4478,
"step": 174
},
{
"epoch": 0.3038524145415084,
"grad_norm": 0.1524482370964297,
"learning_rate": 9.922864126400414e-06,
"loss": 0.4545,
"step": 175
},
{
"epoch": 0.30558871405317417,
"grad_norm": 0.16293570594409254,
"learning_rate": 9.920188498630003e-06,
"loss": 0.4208,
"step": 176
},
{
"epoch": 0.30732501356483993,
"grad_norm": 0.1675570035616688,
"learning_rate": 9.917467626791925e-06,
"loss": 0.4626,
"step": 177
},
{
"epoch": 0.3090613130765057,
"grad_norm": 0.15825453091697014,
"learning_rate": 9.914701535906224e-06,
"loss": 0.4512,
"step": 178
},
{
"epoch": 0.31079761258817146,
"grad_norm": 0.15630202329564072,
"learning_rate": 9.91189025140875e-06,
"loss": 0.4343,
"step": 179
},
{
"epoch": 0.3125339120998372,
"grad_norm": 0.16078903742852474,
"learning_rate": 9.909033799150947e-06,
"loss": 0.4408,
"step": 180
},
{
"epoch": 0.314270211611503,
"grad_norm": 0.16012556685600268,
"learning_rate": 9.90613220539959e-06,
"loss": 0.4139,
"step": 181
},
{
"epoch": 0.31600651112316874,
"grad_norm": 0.14729320163626394,
"learning_rate": 9.90318549683657e-06,
"loss": 0.4365,
"step": 182
},
{
"epoch": 0.3177428106348345,
"grad_norm": 0.1397968636888418,
"learning_rate": 9.900193700558626e-06,
"loss": 0.4247,
"step": 183
},
{
"epoch": 0.31947911014650027,
"grad_norm": 0.16626985308479,
"learning_rate": 9.897156844077111e-06,
"loss": 0.4288,
"step": 184
},
{
"epoch": 0.32121540965816603,
"grad_norm": 0.15854235536164976,
"learning_rate": 9.89407495531773e-06,
"loss": 0.4148,
"step": 185
},
{
"epoch": 0.3229517091698318,
"grad_norm": 0.1622205483358838,
"learning_rate": 9.890948062620289e-06,
"loss": 0.438,
"step": 186
},
{
"epoch": 0.32468800868149755,
"grad_norm": 0.16252071337796425,
"learning_rate": 9.887776194738433e-06,
"loss": 0.4501,
"step": 187
},
{
"epoch": 0.3264243081931633,
"grad_norm": 0.1710876752747644,
"learning_rate": 9.884559380839374e-06,
"loss": 0.4283,
"step": 188
},
{
"epoch": 0.3281606077048291,
"grad_norm": 0.18377407394807166,
"learning_rate": 9.881297650503641e-06,
"loss": 0.4413,
"step": 189
},
{
"epoch": 0.32989690721649484,
"grad_norm": 0.140886519345969,
"learning_rate": 9.877991033724782e-06,
"loss": 0.4323,
"step": 190
},
{
"epoch": 0.3316332067281606,
"grad_norm": 0.14923852434950355,
"learning_rate": 9.874639560909118e-06,
"loss": 0.4422,
"step": 191
},
{
"epoch": 0.33336950623982636,
"grad_norm": 0.14200004213993497,
"learning_rate": 9.871243262875437e-06,
"loss": 0.4258,
"step": 192
},
{
"epoch": 0.3351058057514921,
"grad_norm": 0.1589219977289536,
"learning_rate": 9.867802170854724e-06,
"loss": 0.435,
"step": 193
},
{
"epoch": 0.3368421052631579,
"grad_norm": 0.1590427030427661,
"learning_rate": 9.864316316489873e-06,
"loss": 0.4327,
"step": 194
},
{
"epoch": 0.33857840477482365,
"grad_norm": 0.17338035825567785,
"learning_rate": 9.860785731835397e-06,
"loss": 0.4489,
"step": 195
},
{
"epoch": 0.3403147042864894,
"grad_norm": 0.1499845982176126,
"learning_rate": 9.857210449357121e-06,
"loss": 0.4333,
"step": 196
},
{
"epoch": 0.3420510037981552,
"grad_norm": 0.15231120130052084,
"learning_rate": 9.853590501931905e-06,
"loss": 0.4502,
"step": 197
},
{
"epoch": 0.34378730330982094,
"grad_norm": 0.18719106483657583,
"learning_rate": 9.849925922847323e-06,
"loss": 0.4525,
"step": 198
},
{
"epoch": 0.3455236028214867,
"grad_norm": 0.17712759736915118,
"learning_rate": 9.846216745801366e-06,
"loss": 0.4441,
"step": 199
},
{
"epoch": 0.34725990233315246,
"grad_norm": 0.14227690714934854,
"learning_rate": 9.842463004902127e-06,
"loss": 0.4278,
"step": 200
},
{
"epoch": 0.3489962018448182,
"grad_norm": 0.1420001146482775,
"learning_rate": 9.838664734667496e-06,
"loss": 0.4311,
"step": 201
},
{
"epoch": 0.350732501356484,
"grad_norm": 0.13874177478200764,
"learning_rate": 9.834821970024828e-06,
"loss": 0.4232,
"step": 202
},
{
"epoch": 0.35246880086814975,
"grad_norm": 0.16110035201959844,
"learning_rate": 9.83093474631064e-06,
"loss": 0.4178,
"step": 203
},
{
"epoch": 0.3542051003798155,
"grad_norm": 0.1477258063517038,
"learning_rate": 9.827003099270272e-06,
"loss": 0.4333,
"step": 204
},
{
"epoch": 0.35594139989148127,
"grad_norm": 0.14916525553791554,
"learning_rate": 9.82302706505756e-06,
"loss": 0.4336,
"step": 205
},
{
"epoch": 0.35767769940314703,
"grad_norm": 0.1573789515947177,
"learning_rate": 9.819006680234513e-06,
"loss": 0.44,
"step": 206
},
{
"epoch": 0.3594139989148128,
"grad_norm": 0.1814058728192214,
"learning_rate": 9.814941981770966e-06,
"loss": 0.4364,
"step": 207
},
{
"epoch": 0.36115029842647856,
"grad_norm": 0.14326603797247606,
"learning_rate": 9.810833007044247e-06,
"loss": 0.4206,
"step": 208
},
{
"epoch": 0.3628865979381443,
"grad_norm": 0.14817920662769005,
"learning_rate": 9.806679793838829e-06,
"loss": 0.4407,
"step": 209
},
{
"epoch": 0.3646228974498101,
"grad_norm": 0.1585265768266505,
"learning_rate": 9.802482380345983e-06,
"loss": 0.4752,
"step": 210
},
{
"epoch": 0.36635919696147584,
"grad_norm": 0.16492885340552949,
"learning_rate": 9.79824080516343e-06,
"loss": 0.4321,
"step": 211
},
{
"epoch": 0.3680954964731416,
"grad_norm": 0.15885428732313536,
"learning_rate": 9.793955107294983e-06,
"loss": 0.4182,
"step": 212
},
{
"epoch": 0.36983179598480737,
"grad_norm": 0.1851374561328918,
"learning_rate": 9.78962532615019e-06,
"loss": 0.4359,
"step": 213
},
{
"epoch": 0.3715680954964731,
"grad_norm": 0.16410510241800808,
"learning_rate": 9.785251501543973e-06,
"loss": 0.4525,
"step": 214
},
{
"epoch": 0.3733043950081389,
"grad_norm": 0.15343002176408385,
"learning_rate": 9.780833673696255e-06,
"loss": 0.4064,
"step": 215
},
{
"epoch": 0.37504069451980465,
"grad_norm": 0.1830058368195266,
"learning_rate": 9.7763718832316e-06,
"loss": 0.4068,
"step": 216
},
{
"epoch": 0.3767769940314704,
"grad_norm": 0.15700071691973735,
"learning_rate": 9.771866171178832e-06,
"loss": 0.4218,
"step": 217
},
{
"epoch": 0.3785132935431362,
"grad_norm": 0.12876055954547613,
"learning_rate": 9.767316578970658e-06,
"loss": 0.4211,
"step": 218
},
{
"epoch": 0.38024959305480194,
"grad_norm": 0.15442537490509584,
"learning_rate": 9.762723148443297e-06,
"loss": 0.435,
"step": 219
},
{
"epoch": 0.3819858925664677,
"grad_norm": 0.15580345792713624,
"learning_rate": 9.758085921836076e-06,
"loss": 0.4369,
"step": 220
},
{
"epoch": 0.38372219207813346,
"grad_norm": 0.1361428319295974,
"learning_rate": 9.753404941791063e-06,
"loss": 0.4313,
"step": 221
},
{
"epoch": 0.3854584915897992,
"grad_norm": 0.14653318408406776,
"learning_rate": 9.74868025135266e-06,
"loss": 0.4533,
"step": 222
},
{
"epoch": 0.387194791101465,
"grad_norm": 0.1565754671263572,
"learning_rate": 9.743911893967216e-06,
"loss": 0.4322,
"step": 223
},
{
"epoch": 0.38893109061313075,
"grad_norm": 0.16399056677484686,
"learning_rate": 9.739099913482616e-06,
"loss": 0.4466,
"step": 224
},
{
"epoch": 0.3906673901247965,
"grad_norm": 0.14157252484905397,
"learning_rate": 9.734244354147897e-06,
"loss": 0.4295,
"step": 225
},
{
"epoch": 0.39240368963646227,
"grad_norm": 0.1603171745561666,
"learning_rate": 9.729345260612817e-06,
"loss": 0.4372,
"step": 226
},
{
"epoch": 0.39413998914812803,
"grad_norm": 0.14321084701941422,
"learning_rate": 9.724402677927466e-06,
"loss": 0.4325,
"step": 227
},
{
"epoch": 0.3958762886597938,
"grad_norm": 0.1473692730338607,
"learning_rate": 9.719416651541839e-06,
"loss": 0.4309,
"step": 228
},
{
"epoch": 0.39761258817145956,
"grad_norm": 0.14956009373534798,
"learning_rate": 9.714387227305422e-06,
"loss": 0.4261,
"step": 229
},
{
"epoch": 0.3993488876831253,
"grad_norm": 0.14848206159225216,
"learning_rate": 9.70931445146677e-06,
"loss": 0.4397,
"step": 230
},
{
"epoch": 0.4010851871947911,
"grad_norm": 0.14241520833339297,
"learning_rate": 9.704198370673084e-06,
"loss": 0.4245,
"step": 231
},
{
"epoch": 0.40282148670645684,
"grad_norm": 0.14596741528366003,
"learning_rate": 9.699039031969776e-06,
"loss": 0.4396,
"step": 232
},
{
"epoch": 0.4045577862181226,
"grad_norm": 0.17062314124311329,
"learning_rate": 9.693836482800044e-06,
"loss": 0.4359,
"step": 233
},
{
"epoch": 0.40629408572978837,
"grad_norm": 0.1345681574448004,
"learning_rate": 9.68859077100443e-06,
"loss": 0.4019,
"step": 234
},
{
"epoch": 0.40803038524145413,
"grad_norm": 0.15564851444441055,
"learning_rate": 9.683301944820382e-06,
"loss": 0.432,
"step": 235
},
{
"epoch": 0.4097666847531199,
"grad_norm": 0.15674408774253595,
"learning_rate": 9.677970052881811e-06,
"loss": 0.4431,
"step": 236
},
{
"epoch": 0.41150298426478565,
"grad_norm": 0.147235124546805,
"learning_rate": 9.672595144218646e-06,
"loss": 0.4229,
"step": 237
},
{
"epoch": 0.4132392837764514,
"grad_norm": 0.15359262893739295,
"learning_rate": 9.667177268256373e-06,
"loss": 0.4489,
"step": 238
},
{
"epoch": 0.4149755832881172,
"grad_norm": 0.1936836685893336,
"learning_rate": 9.661716474815597e-06,
"loss": 0.4482,
"step": 239
},
{
"epoch": 0.41671188279978294,
"grad_norm": 0.15707058757983838,
"learning_rate": 9.656212814111567e-06,
"loss": 0.4324,
"step": 240
},
{
"epoch": 0.4184481823114487,
"grad_norm": 0.14905279461924087,
"learning_rate": 9.65066633675373e-06,
"loss": 0.4332,
"step": 241
},
{
"epoch": 0.42018448182311446,
"grad_norm": 0.15002700768694846,
"learning_rate": 9.645077093745248e-06,
"loss": 0.4317,
"step": 242
},
{
"epoch": 0.4219207813347802,
"grad_norm": 0.15705106694199372,
"learning_rate": 9.639445136482549e-06,
"loss": 0.4331,
"step": 243
},
{
"epoch": 0.423657080846446,
"grad_norm": 0.13539219399033092,
"learning_rate": 9.633770516754834e-06,
"loss": 0.4172,
"step": 244
},
{
"epoch": 0.42539338035811175,
"grad_norm": 0.15065870855174054,
"learning_rate": 9.628053286743619e-06,
"loss": 0.4344,
"step": 245
},
{
"epoch": 0.4271296798697775,
"grad_norm": 0.17437059505759828,
"learning_rate": 9.622293499022243e-06,
"loss": 0.4245,
"step": 246
},
{
"epoch": 0.4288659793814433,
"grad_norm": 0.17919029116585017,
"learning_rate": 9.61649120655539e-06,
"loss": 0.4221,
"step": 247
},
{
"epoch": 0.43060227889310904,
"grad_norm": 0.1334976875908364,
"learning_rate": 9.610646462698598e-06,
"loss": 0.403,
"step": 248
},
{
"epoch": 0.4323385784047748,
"grad_norm": 0.16277200115474594,
"learning_rate": 9.604759321197775e-06,
"loss": 0.4424,
"step": 249
},
{
"epoch": 0.43407487791644056,
"grad_norm": 0.13988502194177868,
"learning_rate": 9.598829836188694e-06,
"loss": 0.4476,
"step": 250
},
{
"epoch": 0.4358111774281063,
"grad_norm": 0.15784858998250612,
"learning_rate": 9.59285806219651e-06,
"loss": 0.4209,
"step": 251
},
{
"epoch": 0.4375474769397721,
"grad_norm": 0.1324213031630464,
"learning_rate": 9.586844054135248e-06,
"loss": 0.4133,
"step": 252
},
{
"epoch": 0.43928377645143785,
"grad_norm": 0.15300021033544062,
"learning_rate": 9.580787867307293e-06,
"loss": 0.4384,
"step": 253
},
{
"epoch": 0.4410200759631036,
"grad_norm": 0.143838210306649,
"learning_rate": 9.574689557402899e-06,
"loss": 0.4248,
"step": 254
},
{
"epoch": 0.44275637547476937,
"grad_norm": 0.13871482863771145,
"learning_rate": 9.56854918049966e-06,
"loss": 0.4405,
"step": 255
},
{
"epoch": 0.4444926749864352,
"grad_norm": 0.14460323057850463,
"learning_rate": 9.562366793062007e-06,
"loss": 0.4088,
"step": 256
},
{
"epoch": 0.44622897449810095,
"grad_norm": 0.1516286738769081,
"learning_rate": 9.55614245194068e-06,
"loss": 0.426,
"step": 257
},
{
"epoch": 0.4479652740097667,
"grad_norm": 0.15740523731737546,
"learning_rate": 9.549876214372203e-06,
"loss": 0.4042,
"step": 258
},
{
"epoch": 0.4497015735214325,
"grad_norm": 0.13541372433807597,
"learning_rate": 9.543568137978373e-06,
"loss": 0.4266,
"step": 259
},
{
"epoch": 0.45143787303309824,
"grad_norm": 0.13720170783679506,
"learning_rate": 9.53721828076571e-06,
"loss": 0.4192,
"step": 260
},
{
"epoch": 0.453174172544764,
"grad_norm": 0.15636037632592015,
"learning_rate": 9.53082670112494e-06,
"loss": 0.413,
"step": 261
},
{
"epoch": 0.45491047205642976,
"grad_norm": 0.1545294587942669,
"learning_rate": 9.524393457830452e-06,
"loss": 0.4494,
"step": 262
},
{
"epoch": 0.4566467715680955,
"grad_norm": 0.15627701909179118,
"learning_rate": 9.51791861003975e-06,
"loss": 0.4224,
"step": 263
},
{
"epoch": 0.4583830710797613,
"grad_norm": 0.1492359535781047,
"learning_rate": 9.511402217292927e-06,
"loss": 0.4262,
"step": 264
},
{
"epoch": 0.46011937059142705,
"grad_norm": 0.1353127405505104,
"learning_rate": 9.504844339512096e-06,
"loss": 0.4165,
"step": 265
},
{
"epoch": 0.4618556701030928,
"grad_norm": 0.13373371028131661,
"learning_rate": 9.498245037000857e-06,
"loss": 0.4243,
"step": 266
},
{
"epoch": 0.46359196961475857,
"grad_norm": 0.14814709309461124,
"learning_rate": 9.491604370443732e-06,
"loss": 0.4295,
"step": 267
},
{
"epoch": 0.46532826912642433,
"grad_norm": 0.15693342961674778,
"learning_rate": 9.484922400905608e-06,
"loss": 0.4296,
"step": 268
},
{
"epoch": 0.4670645686380901,
"grad_norm": 0.13500362769857513,
"learning_rate": 9.478199189831184e-06,
"loss": 0.4356,
"step": 269
},
{
"epoch": 0.46880086814975586,
"grad_norm": 0.1479397146934287,
"learning_rate": 9.471434799044392e-06,
"loss": 0.4342,
"step": 270
},
{
"epoch": 0.4705371676614216,
"grad_norm": 0.14658155644327392,
"learning_rate": 9.464629290747844e-06,
"loss": 0.417,
"step": 271
},
{
"epoch": 0.4722734671730874,
"grad_norm": 0.1547841209657364,
"learning_rate": 9.457782727522242e-06,
"loss": 0.4468,
"step": 272
},
{
"epoch": 0.47400976668475314,
"grad_norm": 0.140566594459949,
"learning_rate": 9.450895172325822e-06,
"loss": 0.4144,
"step": 273
},
{
"epoch": 0.4757460661964189,
"grad_norm": 0.14904014949184202,
"learning_rate": 9.443966688493762e-06,
"loss": 0.4267,
"step": 274
},
{
"epoch": 0.47748236570808467,
"grad_norm": 0.1618365518683853,
"learning_rate": 9.4369973397376e-06,
"loss": 0.4278,
"step": 275
},
{
"epoch": 0.47921866521975043,
"grad_norm": 0.15223749430970504,
"learning_rate": 9.429987190144659e-06,
"loss": 0.4321,
"step": 276
},
{
"epoch": 0.4809549647314162,
"grad_norm": 0.1539731787632902,
"learning_rate": 9.422936304177439e-06,
"loss": 0.428,
"step": 277
},
{
"epoch": 0.48269126424308195,
"grad_norm": 0.14879656683456302,
"learning_rate": 9.415844746673047e-06,
"loss": 0.4245,
"step": 278
},
{
"epoch": 0.4844275637547477,
"grad_norm": 0.14532316486603897,
"learning_rate": 9.408712582842583e-06,
"loss": 0.4301,
"step": 279
},
{
"epoch": 0.4861638632664135,
"grad_norm": 0.1707471215126368,
"learning_rate": 9.401539878270545e-06,
"loss": 0.4349,
"step": 280
},
{
"epoch": 0.48790016277807924,
"grad_norm": 0.1482363199845621,
"learning_rate": 9.394326698914229e-06,
"loss": 0.432,
"step": 281
},
{
"epoch": 0.489636462289745,
"grad_norm": 0.16631121800186766,
"learning_rate": 9.387073111103124e-06,
"loss": 0.4522,
"step": 282
},
{
"epoch": 0.49137276180141076,
"grad_norm": 0.15399106664401388,
"learning_rate": 9.379779181538294e-06,
"loss": 0.4177,
"step": 283
},
{
"epoch": 0.4931090613130765,
"grad_norm": 0.15933705725540492,
"learning_rate": 9.372444977291772e-06,
"loss": 0.4331,
"step": 284
},
{
"epoch": 0.4948453608247423,
"grad_norm": 0.1558071947319395,
"learning_rate": 9.365070565805941e-06,
"loss": 0.4413,
"step": 285
},
{
"epoch": 0.49658166033640805,
"grad_norm": 0.15869210462435573,
"learning_rate": 9.357656014892913e-06,
"loss": 0.4501,
"step": 286
},
{
"epoch": 0.4983179598480738,
"grad_norm": 0.14836343303572178,
"learning_rate": 9.350201392733902e-06,
"loss": 0.4398,
"step": 287
},
{
"epoch": 0.5000542593597396,
"grad_norm": 0.1446486130902907,
"learning_rate": 9.342706767878609e-06,
"loss": 0.4134,
"step": 288
},
{
"epoch": 0.5017905588714053,
"grad_norm": 0.16780837684964714,
"learning_rate": 9.335172209244577e-06,
"loss": 0.4354,
"step": 289
},
{
"epoch": 0.5035268583830711,
"grad_norm": 0.19016077034051496,
"learning_rate": 9.327597786116567e-06,
"loss": 0.4439,
"step": 290
},
{
"epoch": 0.5052631578947369,
"grad_norm": 0.12843881816321912,
"learning_rate": 9.319983568145919e-06,
"loss": 0.4044,
"step": 291
},
{
"epoch": 0.5069994574064026,
"grad_norm": 0.18172343997071713,
"learning_rate": 9.312329625349903e-06,
"loss": 0.439,
"step": 292
},
{
"epoch": 0.5087357569180684,
"grad_norm": 0.1533109565783212,
"learning_rate": 9.304636028111093e-06,
"loss": 0.4605,
"step": 293
},
{
"epoch": 0.5104720564297341,
"grad_norm": 0.1500997412539675,
"learning_rate": 9.296902847176703e-06,
"loss": 0.4346,
"step": 294
},
{
"epoch": 0.5122083559413999,
"grad_norm": 0.1434536596131837,
"learning_rate": 9.289130153657944e-06,
"loss": 0.442,
"step": 295
},
{
"epoch": 0.5139446554530657,
"grad_norm": 0.1631022083156658,
"learning_rate": 9.281318019029366e-06,
"loss": 0.4243,
"step": 296
},
{
"epoch": 0.5156809549647314,
"grad_norm": 0.15745451993950932,
"learning_rate": 9.273466515128209e-06,
"loss": 0.435,
"step": 297
},
{
"epoch": 0.5174172544763972,
"grad_norm": 0.15528300554164817,
"learning_rate": 9.265575714153732e-06,
"loss": 0.4335,
"step": 298
},
{
"epoch": 0.519153553988063,
"grad_norm": 0.18075251075323473,
"learning_rate": 9.257645688666557e-06,
"loss": 0.4293,
"step": 299
},
{
"epoch": 0.5208898534997287,
"grad_norm": 0.16706193153429877,
"learning_rate": 9.249676511588e-06,
"loss": 0.425,
"step": 300
},
{
"epoch": 0.5226261530113945,
"grad_norm": 0.15762590559281106,
"learning_rate": 9.241668256199392e-06,
"loss": 0.4572,
"step": 301
},
{
"epoch": 0.5243624525230602,
"grad_norm": 0.16766422353990898,
"learning_rate": 9.233620996141421e-06,
"loss": 0.4421,
"step": 302
},
{
"epoch": 0.526098752034726,
"grad_norm": 0.15326556435477035,
"learning_rate": 9.225534805413443e-06,
"loss": 0.4382,
"step": 303
},
{
"epoch": 0.5278350515463918,
"grad_norm": 0.15594005545981307,
"learning_rate": 9.217409758372805e-06,
"loss": 0.4306,
"step": 304
},
{
"epoch": 0.5295713510580575,
"grad_norm": 0.15658249902105414,
"learning_rate": 9.209245929734156e-06,
"loss": 0.4276,
"step": 305
},
{
"epoch": 0.5313076505697233,
"grad_norm": 0.1688752124377244,
"learning_rate": 9.201043394568773e-06,
"loss": 0.4431,
"step": 306
},
{
"epoch": 0.533043950081389,
"grad_norm": 0.16298343992138783,
"learning_rate": 9.192802228303858e-06,
"loss": 0.432,
"step": 307
},
{
"epoch": 0.5347802495930548,
"grad_norm": 0.14067039648645466,
"learning_rate": 9.184522506721848e-06,
"loss": 0.4268,
"step": 308
},
{
"epoch": 0.5365165491047206,
"grad_norm": 0.13618995591727004,
"learning_rate": 9.176204305959727e-06,
"loss": 0.4267,
"step": 309
},
{
"epoch": 0.5382528486163863,
"grad_norm": 0.13359346371022174,
"learning_rate": 9.167847702508304e-06,
"loss": 0.3988,
"step": 310
},
{
"epoch": 0.5399891481280521,
"grad_norm": 0.1696549417930708,
"learning_rate": 9.159452773211537e-06,
"loss": 0.423,
"step": 311
},
{
"epoch": 0.5417254476397179,
"grad_norm": 0.14323074167209557,
"learning_rate": 9.151019595265805e-06,
"loss": 0.4093,
"step": 312
},
{
"epoch": 0.5434617471513836,
"grad_norm": 0.16685931065308446,
"learning_rate": 9.142548246219212e-06,
"loss": 0.4416,
"step": 313
},
{
"epoch": 0.5451980466630494,
"grad_norm": 0.1487456457447269,
"learning_rate": 9.134038803970861e-06,
"loss": 0.4451,
"step": 314
},
{
"epoch": 0.5469343461747151,
"grad_norm": 0.1566773867653797,
"learning_rate": 9.12549134677015e-06,
"loss": 0.4205,
"step": 315
},
{
"epoch": 0.5486706456863809,
"grad_norm": 0.14167677739491838,
"learning_rate": 9.116905953216048e-06,
"loss": 0.4267,
"step": 316
},
{
"epoch": 0.5504069451980467,
"grad_norm": 0.15472423361672563,
"learning_rate": 9.108282702256366e-06,
"loss": 0.4271,
"step": 317
},
{
"epoch": 0.5521432447097124,
"grad_norm": 0.13110251374929036,
"learning_rate": 9.09962167318704e-06,
"loss": 0.4112,
"step": 318
},
{
"epoch": 0.5538795442213782,
"grad_norm": 0.15032138262922598,
"learning_rate": 9.090922945651399e-06,
"loss": 0.4448,
"step": 319
},
{
"epoch": 0.555615843733044,
"grad_norm": 0.14551624676182287,
"learning_rate": 9.082186599639429e-06,
"loss": 0.4201,
"step": 320
},
{
"epoch": 0.5573521432447097,
"grad_norm": 0.14465301780155224,
"learning_rate": 9.073412715487045e-06,
"loss": 0.4267,
"step": 321
},
{
"epoch": 0.5590884427563755,
"grad_norm": 0.14551487118151804,
"learning_rate": 9.064601373875341e-06,
"loss": 0.421,
"step": 322
},
{
"epoch": 0.5608247422680412,
"grad_norm": 0.1457948287955944,
"learning_rate": 9.05575265582986e-06,
"loss": 0.4445,
"step": 323
},
{
"epoch": 0.562561041779707,
"grad_norm": 0.16067858876663518,
"learning_rate": 9.04686664271984e-06,
"loss": 0.4434,
"step": 324
},
{
"epoch": 0.5642973412913728,
"grad_norm": 0.1541984433913847,
"learning_rate": 9.037943416257475e-06,
"loss": 0.4306,
"step": 325
},
{
"epoch": 0.5660336408030385,
"grad_norm": 0.14316636284034773,
"learning_rate": 9.028983058497152e-06,
"loss": 0.414,
"step": 326
},
{
"epoch": 0.5677699403147043,
"grad_norm": 0.17397044853565383,
"learning_rate": 9.019985651834703e-06,
"loss": 0.4432,
"step": 327
},
{
"epoch": 0.56950623982637,
"grad_norm": 0.1627074833616904,
"learning_rate": 9.010951279006652e-06,
"loss": 0.448,
"step": 328
},
{
"epoch": 0.5712425393380358,
"grad_norm": 0.1940870487142773,
"learning_rate": 9.001880023089442e-06,
"loss": 0.4425,
"step": 329
},
{
"epoch": 0.5729788388497016,
"grad_norm": 0.159598948177848,
"learning_rate": 8.992771967498682e-06,
"loss": 0.4406,
"step": 330
},
{
"epoch": 0.5747151383613673,
"grad_norm": 0.1498699860214419,
"learning_rate": 8.983627195988376e-06,
"loss": 0.4388,
"step": 331
},
{
"epoch": 0.5764514378730331,
"grad_norm": 0.16951652153250185,
"learning_rate": 8.974445792650152e-06,
"loss": 0.4423,
"step": 332
},
{
"epoch": 0.5781877373846989,
"grad_norm": 0.19455142439514098,
"learning_rate": 8.96522784191249e-06,
"loss": 0.4111,
"step": 333
},
{
"epoch": 0.5799240368963646,
"grad_norm": 0.13973563527713942,
"learning_rate": 8.955973428539943e-06,
"loss": 0.4096,
"step": 334
},
{
"epoch": 0.5816603364080304,
"grad_norm": 0.16050181521060788,
"learning_rate": 8.946682637632362e-06,
"loss": 0.4245,
"step": 335
},
{
"epoch": 0.5833966359196961,
"grad_norm": 0.15909571259070338,
"learning_rate": 8.937355554624111e-06,
"loss": 0.4072,
"step": 336
},
{
"epoch": 0.5851329354313619,
"grad_norm": 0.15430883571122384,
"learning_rate": 8.927992265283282e-06,
"loss": 0.4143,
"step": 337
},
{
"epoch": 0.5868692349430277,
"grad_norm": 0.14165670203508623,
"learning_rate": 8.9185928557109e-06,
"loss": 0.4317,
"step": 338
},
{
"epoch": 0.5886055344546934,
"grad_norm": 0.14467120823031193,
"learning_rate": 8.90915741234015e-06,
"loss": 0.4484,
"step": 339
},
{
"epoch": 0.5903418339663592,
"grad_norm": 0.15194753933755567,
"learning_rate": 8.899686021935554e-06,
"loss": 0.409,
"step": 340
},
{
"epoch": 0.592078133478025,
"grad_norm": 0.1909028572126585,
"learning_rate": 8.890178771592198e-06,
"loss": 0.4323,
"step": 341
},
{
"epoch": 0.5938144329896907,
"grad_norm": 0.16563174108323855,
"learning_rate": 8.88063574873492e-06,
"loss": 0.4382,
"step": 342
},
{
"epoch": 0.5955507325013565,
"grad_norm": 0.1535707289828906,
"learning_rate": 8.871057041117505e-06,
"loss": 0.4219,
"step": 343
},
{
"epoch": 0.5972870320130222,
"grad_norm": 0.17733027347423158,
"learning_rate": 8.861442736821883e-06,
"loss": 0.4229,
"step": 344
},
{
"epoch": 0.599023331524688,
"grad_norm": 0.15784389767547816,
"learning_rate": 8.851792924257316e-06,
"loss": 0.4155,
"step": 345
},
{
"epoch": 0.6007596310363538,
"grad_norm": 0.16406056888797899,
"learning_rate": 8.842107692159587e-06,
"loss": 0.4389,
"step": 346
},
{
"epoch": 0.6024959305480195,
"grad_norm": 0.16632382879168614,
"learning_rate": 8.83238712959018e-06,
"loss": 0.432,
"step": 347
},
{
"epoch": 0.6042322300596853,
"grad_norm": 0.16932428537112926,
"learning_rate": 8.822631325935463e-06,
"loss": 0.4179,
"step": 348
},
{
"epoch": 0.6059685295713511,
"grad_norm": 0.20272370572709011,
"learning_rate": 8.812840370905872e-06,
"loss": 0.4289,
"step": 349
},
{
"epoch": 0.6077048290830168,
"grad_norm": 0.16321136969174183,
"learning_rate": 8.80301435453508e-06,
"loss": 0.4346,
"step": 350
},
{
"epoch": 0.6094411285946826,
"grad_norm": 0.1695889443045464,
"learning_rate": 8.793153367179164e-06,
"loss": 0.4087,
"step": 351
},
{
"epoch": 0.6111774281063483,
"grad_norm": 0.1717524474479854,
"learning_rate": 8.783257499515785e-06,
"loss": 0.4175,
"step": 352
},
{
"epoch": 0.6129137276180141,
"grad_norm": 0.19238932735231165,
"learning_rate": 8.773326842543348e-06,
"loss": 0.427,
"step": 353
},
{
"epoch": 0.6146500271296799,
"grad_norm": 0.15756445720282503,
"learning_rate": 8.763361487580167e-06,
"loss": 0.4316,
"step": 354
},
{
"epoch": 0.6163863266413456,
"grad_norm": 0.1863059885455897,
"learning_rate": 8.753361526263622e-06,
"loss": 0.441,
"step": 355
},
{
"epoch": 0.6181226261530114,
"grad_norm": 0.14994819207944002,
"learning_rate": 8.743327050549326e-06,
"loss": 0.4265,
"step": 356
},
{
"epoch": 0.6198589256646772,
"grad_norm": 0.17062218033006715,
"learning_rate": 8.733258152710262e-06,
"loss": 0.454,
"step": 357
},
{
"epoch": 0.6215952251763429,
"grad_norm": 0.16850908560763084,
"learning_rate": 8.723154925335957e-06,
"loss": 0.4344,
"step": 358
},
{
"epoch": 0.6233315246880087,
"grad_norm": 0.16709271250050742,
"learning_rate": 8.713017461331608e-06,
"loss": 0.4362,
"step": 359
},
{
"epoch": 0.6250678241996744,
"grad_norm": 0.17296909553562448,
"learning_rate": 8.702845853917242e-06,
"loss": 0.4391,
"step": 360
},
{
"epoch": 0.6268041237113402,
"grad_norm": 0.1634801663010388,
"learning_rate": 8.692640196626859e-06,
"loss": 0.418,
"step": 361
},
{
"epoch": 0.628540423223006,
"grad_norm": 0.21089231489074947,
"learning_rate": 8.682400583307562e-06,
"loss": 0.4521,
"step": 362
},
{
"epoch": 0.6302767227346717,
"grad_norm": 0.16024795309892262,
"learning_rate": 8.672127108118702e-06,
"loss": 0.4298,
"step": 363
},
{
"epoch": 0.6320130222463375,
"grad_norm": 0.1387299903351098,
"learning_rate": 8.661819865531014e-06,
"loss": 0.4267,
"step": 364
},
{
"epoch": 0.6337493217580032,
"grad_norm": 0.16720213909194867,
"learning_rate": 8.651478950325739e-06,
"loss": 0.427,
"step": 365
},
{
"epoch": 0.635485621269669,
"grad_norm": 0.18886296533854524,
"learning_rate": 8.641104457593756e-06,
"loss": 0.4302,
"step": 366
},
{
"epoch": 0.6372219207813348,
"grad_norm": 0.1435746148298888,
"learning_rate": 8.630696482734718e-06,
"loss": 0.4216,
"step": 367
},
{
"epoch": 0.6389582202930005,
"grad_norm": 0.16965303549071564,
"learning_rate": 8.620255121456157e-06,
"loss": 0.4425,
"step": 368
},
{
"epoch": 0.6406945198046663,
"grad_norm": 0.15830472472009197,
"learning_rate": 8.609780469772623e-06,
"loss": 0.4198,
"step": 369
},
{
"epoch": 0.6424308193163321,
"grad_norm": 0.15576341914865044,
"learning_rate": 8.59927262400478e-06,
"loss": 0.4029,
"step": 370
},
{
"epoch": 0.6441671188279978,
"grad_norm": 0.1502744242798527,
"learning_rate": 8.588731680778541e-06,
"loss": 0.4266,
"step": 371
},
{
"epoch": 0.6459034183396636,
"grad_norm": 0.1613768907477256,
"learning_rate": 8.578157737024161e-06,
"loss": 0.4198,
"step": 372
},
{
"epoch": 0.6476397178513293,
"grad_norm": 0.14851747702098045,
"learning_rate": 8.567550889975362e-06,
"loss": 0.4293,
"step": 373
},
{
"epoch": 0.6493760173629951,
"grad_norm": 0.16651395276661102,
"learning_rate": 8.556911237168428e-06,
"loss": 0.4267,
"step": 374
},
{
"epoch": 0.6511123168746609,
"grad_norm": 0.1503438279696906,
"learning_rate": 8.546238876441313e-06,
"loss": 0.4315,
"step": 375
},
{
"epoch": 0.6528486163863266,
"grad_norm": 0.13894070033687952,
"learning_rate": 8.535533905932739e-06,
"loss": 0.4361,
"step": 376
},
{
"epoch": 0.6545849158979924,
"grad_norm": 0.15374759492249587,
"learning_rate": 8.524796424081291e-06,
"loss": 0.4295,
"step": 377
},
{
"epoch": 0.6563212154096582,
"grad_norm": 0.16379040041737855,
"learning_rate": 8.514026529624523e-06,
"loss": 0.4278,
"step": 378
},
{
"epoch": 0.6580575149213239,
"grad_norm": 0.13727637457553227,
"learning_rate": 8.503224321598035e-06,
"loss": 0.4233,
"step": 379
},
{
"epoch": 0.6597938144329897,
"grad_norm": 0.1681213503893215,
"learning_rate": 8.492389899334572e-06,
"loss": 0.4249,
"step": 380
},
{
"epoch": 0.6615301139446554,
"grad_norm": 0.155321814222258,
"learning_rate": 8.481523362463111e-06,
"loss": 0.4131,
"step": 381
},
{
"epoch": 0.6632664134563212,
"grad_norm": 0.15655768137446305,
"learning_rate": 8.470624810907936e-06,
"loss": 0.4339,
"step": 382
},
{
"epoch": 0.665002712967987,
"grad_norm": 0.13927963207883687,
"learning_rate": 8.459694344887732e-06,
"loss": 0.4342,
"step": 383
},
{
"epoch": 0.6667390124796527,
"grad_norm": 0.16226244072961146,
"learning_rate": 8.44873206491465e-06,
"loss": 0.4273,
"step": 384
},
{
"epoch": 0.6684753119913185,
"grad_norm": 0.1695557602850689,
"learning_rate": 8.437738071793394e-06,
"loss": 0.4202,
"step": 385
},
{
"epoch": 0.6702116115029843,
"grad_norm": 0.15576962380070136,
"learning_rate": 8.426712466620288e-06,
"loss": 0.4288,
"step": 386
},
{
"epoch": 0.67194791101465,
"grad_norm": 0.158261704332491,
"learning_rate": 8.415655350782346e-06,
"loss": 0.4433,
"step": 387
},
{
"epoch": 0.6736842105263158,
"grad_norm": 0.15193331842541377,
"learning_rate": 8.404566825956341e-06,
"loss": 0.4155,
"step": 388
},
{
"epoch": 0.6754205100379815,
"grad_norm": 0.174426066510097,
"learning_rate": 8.393446994107876e-06,
"loss": 0.4404,
"step": 389
},
{
"epoch": 0.6771568095496473,
"grad_norm": 0.1325367145926275,
"learning_rate": 8.382295957490435e-06,
"loss": 0.4224,
"step": 390
},
{
"epoch": 0.6788931090613131,
"grad_norm": 0.13452032528615257,
"learning_rate": 8.371113818644449e-06,
"loss": 0.4185,
"step": 391
},
{
"epoch": 0.6806294085729788,
"grad_norm": 0.16147477287089293,
"learning_rate": 8.359900680396356e-06,
"loss": 0.4424,
"step": 392
},
{
"epoch": 0.6823657080846446,
"grad_norm": 0.15553772122957388,
"learning_rate": 8.348656645857648e-06,
"loss": 0.4252,
"step": 393
},
{
"epoch": 0.6841020075963103,
"grad_norm": 0.14572880321282852,
"learning_rate": 8.33738181842393e-06,
"loss": 0.4155,
"step": 394
},
{
"epoch": 0.6858383071079761,
"grad_norm": 0.14969559128512425,
"learning_rate": 8.326076301773964e-06,
"loss": 0.4358,
"step": 395
},
{
"epoch": 0.6875746066196419,
"grad_norm": 0.1520543652203451,
"learning_rate": 8.314740199868716e-06,
"loss": 0.4179,
"step": 396
},
{
"epoch": 0.6893109061313076,
"grad_norm": 0.14932155425139088,
"learning_rate": 8.303373616950408e-06,
"loss": 0.4417,
"step": 397
},
{
"epoch": 0.6910472056429734,
"grad_norm": 0.1436390734221764,
"learning_rate": 8.291976657541545e-06,
"loss": 0.4357,
"step": 398
},
{
"epoch": 0.6927835051546392,
"grad_norm": 0.16562398124925534,
"learning_rate": 8.28054942644397e-06,
"loss": 0.4485,
"step": 399
},
{
"epoch": 0.6945198046663049,
"grad_norm": 0.16781942148844772,
"learning_rate": 8.269092028737885e-06,
"loss": 0.4341,
"step": 400
},
{
"epoch": 0.6962561041779707,
"grad_norm": 0.14039547328113816,
"learning_rate": 8.257604569780898e-06,
"loss": 0.414,
"step": 401
},
{
"epoch": 0.6979924036896364,
"grad_norm": 0.13511539909730524,
"learning_rate": 8.246087155207041e-06,
"loss": 0.4151,
"step": 402
},
{
"epoch": 0.6997287032013022,
"grad_norm": 0.1749036371486231,
"learning_rate": 8.234539890925812e-06,
"loss": 0.4149,
"step": 403
},
{
"epoch": 0.701465002712968,
"grad_norm": 0.1386923926618094,
"learning_rate": 8.222962883121196e-06,
"loss": 0.429,
"step": 404
},
{
"epoch": 0.7032013022246337,
"grad_norm": 0.18428690503297337,
"learning_rate": 8.21135623825068e-06,
"loss": 0.44,
"step": 405
},
{
"epoch": 0.7049376017362995,
"grad_norm": 0.1452278087108832,
"learning_rate": 8.19972006304429e-06,
"loss": 0.4179,
"step": 406
},
{
"epoch": 0.7066739012479653,
"grad_norm": 0.15917166403787675,
"learning_rate": 8.188054464503591e-06,
"loss": 0.4289,
"step": 407
},
{
"epoch": 0.708410200759631,
"grad_norm": 0.15011317963396884,
"learning_rate": 8.176359549900725e-06,
"loss": 0.4149,
"step": 408
},
{
"epoch": 0.7101465002712968,
"grad_norm": 0.1652053561456685,
"learning_rate": 8.164635426777404e-06,
"loss": 0.4412,
"step": 409
},
{
"epoch": 0.7118827997829625,
"grad_norm": 0.16677143872271724,
"learning_rate": 8.152882202943933e-06,
"loss": 0.4397,
"step": 410
},
{
"epoch": 0.7136190992946283,
"grad_norm": 0.1404122061810744,
"learning_rate": 8.141099986478212e-06,
"loss": 0.4193,
"step": 411
},
{
"epoch": 0.7153553988062941,
"grad_norm": 0.15212329324056417,
"learning_rate": 8.129288885724752e-06,
"loss": 0.4374,
"step": 412
},
{
"epoch": 0.7170916983179598,
"grad_norm": 0.173228829337398,
"learning_rate": 8.117449009293668e-06,
"loss": 0.4075,
"step": 413
},
{
"epoch": 0.7188279978296256,
"grad_norm": 0.14887921410638563,
"learning_rate": 8.105580466059685e-06,
"loss": 0.421,
"step": 414
},
{
"epoch": 0.7205642973412913,
"grad_norm": 0.1341819963564282,
"learning_rate": 8.093683365161135e-06,
"loss": 0.4312,
"step": 415
},
{
"epoch": 0.7223005968529571,
"grad_norm": 0.15095004881427365,
"learning_rate": 8.081757815998958e-06,
"loss": 0.4348,
"step": 416
},
{
"epoch": 0.7240368963646229,
"grad_norm": 0.14673177767107912,
"learning_rate": 8.069803928235689e-06,
"loss": 0.4273,
"step": 417
},
{
"epoch": 0.7257731958762886,
"grad_norm": 0.1822633455744125,
"learning_rate": 8.057821811794457e-06,
"loss": 0.4479,
"step": 418
},
{
"epoch": 0.7275094953879544,
"grad_norm": 0.14795548056912192,
"learning_rate": 8.045811576857969e-06,
"loss": 0.425,
"step": 419
},
{
"epoch": 0.7292457948996202,
"grad_norm": 0.14905122363244133,
"learning_rate": 8.033773333867498e-06,
"loss": 0.4213,
"step": 420
},
{
"epoch": 0.7309820944112859,
"grad_norm": 0.1576303606399092,
"learning_rate": 8.021707193521865e-06,
"loss": 0.4537,
"step": 421
},
{
"epoch": 0.7327183939229517,
"grad_norm": 0.15229537034665708,
"learning_rate": 8.009613266776433e-06,
"loss": 0.4328,
"step": 422
},
{
"epoch": 0.7344546934346174,
"grad_norm": 0.14146675627654046,
"learning_rate": 7.997491664842067e-06,
"loss": 0.4228,
"step": 423
},
{
"epoch": 0.7361909929462832,
"grad_norm": 0.1494345927721846,
"learning_rate": 7.985342499184125e-06,
"loss": 0.4472,
"step": 424
},
{
"epoch": 0.737927292457949,
"grad_norm": 0.15638464912362302,
"learning_rate": 7.973165881521435e-06,
"loss": 0.4654,
"step": 425
},
{
"epoch": 0.7396635919696147,
"grad_norm": 0.1581657562507735,
"learning_rate": 7.960961923825255e-06,
"loss": 0.4307,
"step": 426
},
{
"epoch": 0.7413998914812805,
"grad_norm": 0.1597652030369101,
"learning_rate": 7.948730738318255e-06,
"loss": 0.4467,
"step": 427
},
{
"epoch": 0.7431361909929463,
"grad_norm": 0.13680894316088868,
"learning_rate": 7.936472437473482e-06,
"loss": 0.4196,
"step": 428
},
{
"epoch": 0.744872490504612,
"grad_norm": 0.13931230444695578,
"learning_rate": 7.924187134013323e-06,
"loss": 0.4139,
"step": 429
},
{
"epoch": 0.7466087900162778,
"grad_norm": 0.13099637928584668,
"learning_rate": 7.91187494090847e-06,
"loss": 0.4088,
"step": 430
},
{
"epoch": 0.7483450895279435,
"grad_norm": 0.14334829413945557,
"learning_rate": 7.899535971376881e-06,
"loss": 0.4164,
"step": 431
},
{
"epoch": 0.7500813890396093,
"grad_norm": 0.16796267070198684,
"learning_rate": 7.887170338882742e-06,
"loss": 0.4359,
"step": 432
},
{
"epoch": 0.7518176885512751,
"grad_norm": 0.16242036776843566,
"learning_rate": 7.874778157135416e-06,
"loss": 0.4258,
"step": 433
},
{
"epoch": 0.7535539880629408,
"grad_norm": 0.1385070248558403,
"learning_rate": 7.862359540088404e-06,
"loss": 0.4277,
"step": 434
},
{
"epoch": 0.7552902875746066,
"grad_norm": 0.13314274636073767,
"learning_rate": 7.849914601938302e-06,
"loss": 0.4084,
"step": 435
},
{
"epoch": 0.7570265870862724,
"grad_norm": 0.18197820963349937,
"learning_rate": 7.837443457123732e-06,
"loss": 0.4496,
"step": 436
},
{
"epoch": 0.7587628865979381,
"grad_norm": 0.14847810455059401,
"learning_rate": 7.824946220324313e-06,
"loss": 0.4249,
"step": 437
},
{
"epoch": 0.7604991861096039,
"grad_norm": 0.14144146539360863,
"learning_rate": 7.812423006459588e-06,
"loss": 0.4073,
"step": 438
},
{
"epoch": 0.7622354856212696,
"grad_norm": 0.16954210592768115,
"learning_rate": 7.799873930687979e-06,
"loss": 0.4162,
"step": 439
},
{
"epoch": 0.7639717851329354,
"grad_norm": 0.1567807710460927,
"learning_rate": 7.78729910840572e-06,
"loss": 0.4266,
"step": 440
},
{
"epoch": 0.7657080846446012,
"grad_norm": 0.13338113485510952,
"learning_rate": 7.774698655245802e-06,
"loss": 0.4221,
"step": 441
},
{
"epoch": 0.7674443841562669,
"grad_norm": 0.13011809766956603,
"learning_rate": 7.762072687076911e-06,
"loss": 0.4333,
"step": 442
},
{
"epoch": 0.7691806836679327,
"grad_norm": 0.1565450908841742,
"learning_rate": 7.749421320002349e-06,
"loss": 0.4406,
"step": 443
},
{
"epoch": 0.7709169831795984,
"grad_norm": 0.15302860162984966,
"learning_rate": 7.736744670358985e-06,
"loss": 0.449,
"step": 444
},
{
"epoch": 0.7726532826912642,
"grad_norm": 0.15249501154388767,
"learning_rate": 7.724042854716169e-06,
"loss": 0.4298,
"step": 445
},
{
"epoch": 0.77438958220293,
"grad_norm": 0.14043726604769854,
"learning_rate": 7.711315989874677e-06,
"loss": 0.4162,
"step": 446
},
{
"epoch": 0.7761258817145957,
"grad_norm": 0.15320635053093767,
"learning_rate": 7.698564192865617e-06,
"loss": 0.4242,
"step": 447
},
{
"epoch": 0.7778621812262615,
"grad_norm": 0.17989485147230902,
"learning_rate": 7.68578758094937e-06,
"loss": 0.4418,
"step": 448
},
{
"epoch": 0.7795984807379273,
"grad_norm": 0.1684510261895026,
"learning_rate": 7.6729862716145e-06,
"loss": 0.4337,
"step": 449
},
{
"epoch": 0.781334780249593,
"grad_norm": 0.15069039960003242,
"learning_rate": 7.660160382576683e-06,
"loss": 0.426,
"step": 450
},
{
"epoch": 0.7830710797612588,
"grad_norm": 0.13273283070755218,
"learning_rate": 7.64731003177762e-06,
"loss": 0.4251,
"step": 451
},
{
"epoch": 0.7848073792729245,
"grad_norm": 0.15560617906201166,
"learning_rate": 7.634435337383948e-06,
"loss": 0.4519,
"step": 452
},
{
"epoch": 0.7865436787845903,
"grad_norm": 0.14351368157137256,
"learning_rate": 7.621536417786159e-06,
"loss": 0.4314,
"step": 453
},
{
"epoch": 0.7882799782962561,
"grad_norm": 0.1599564797673948,
"learning_rate": 7.608613391597514e-06,
"loss": 0.4314,
"step": 454
},
{
"epoch": 0.7900162778079218,
"grad_norm": 0.13912046111141438,
"learning_rate": 7.595666377652948e-06,
"loss": 0.4195,
"step": 455
},
{
"epoch": 0.7917525773195876,
"grad_norm": 0.1686980909185228,
"learning_rate": 7.582695495007974e-06,
"loss": 0.4328,
"step": 456
},
{
"epoch": 0.7934888768312534,
"grad_norm": 0.16273258235117594,
"learning_rate": 7.56970086293759e-06,
"loss": 0.4572,
"step": 457
},
{
"epoch": 0.7952251763429191,
"grad_norm": 0.1512893886108004,
"learning_rate": 7.556682600935194e-06,
"loss": 0.4288,
"step": 458
},
{
"epoch": 0.7969614758545849,
"grad_norm": 0.15124774797909318,
"learning_rate": 7.543640828711467e-06,
"loss": 0.4157,
"step": 459
},
{
"epoch": 0.7986977753662506,
"grad_norm": 0.15932778545097698,
"learning_rate": 7.530575666193283e-06,
"loss": 0.4152,
"step": 460
},
{
"epoch": 0.8004340748779164,
"grad_norm": 0.13827246944553215,
"learning_rate": 7.5174872335226e-06,
"loss": 0.4267,
"step": 461
},
{
"epoch": 0.8021703743895822,
"grad_norm": 0.1633340969790764,
"learning_rate": 7.504375651055369e-06,
"loss": 0.43,
"step": 462
},
{
"epoch": 0.8039066739012479,
"grad_norm": 0.148737493055638,
"learning_rate": 7.491241039360404e-06,
"loss": 0.4201,
"step": 463
},
{
"epoch": 0.8056429734129137,
"grad_norm": 0.17090144278144342,
"learning_rate": 7.478083519218297e-06,
"loss": 0.4646,
"step": 464
},
{
"epoch": 0.8073792729245794,
"grad_norm": 0.16543183078759685,
"learning_rate": 7.464903211620291e-06,
"loss": 0.431,
"step": 465
},
{
"epoch": 0.8091155724362452,
"grad_norm": 0.14450967489218555,
"learning_rate": 7.451700237767177e-06,
"loss": 0.4409,
"step": 466
},
{
"epoch": 0.810851871947911,
"grad_norm": 0.14314840673680362,
"learning_rate": 7.438474719068174e-06,
"loss": 0.4355,
"step": 467
},
{
"epoch": 0.8125881714595767,
"grad_norm": 0.14758334398446482,
"learning_rate": 7.425226777139811e-06,
"loss": 0.4459,
"step": 468
},
{
"epoch": 0.8143244709712425,
"grad_norm": 0.14270670380654457,
"learning_rate": 7.4119565338048195e-06,
"loss": 0.4062,
"step": 469
},
{
"epoch": 0.8160607704829083,
"grad_norm": 0.13060107623226094,
"learning_rate": 7.3986641110909975e-06,
"loss": 0.4248,
"step": 470
},
{
"epoch": 0.817797069994574,
"grad_norm": 0.1388192439723675,
"learning_rate": 7.385349631230102e-06,
"loss": 0.4229,
"step": 471
},
{
"epoch": 0.8195333695062398,
"grad_norm": 0.14822306068748878,
"learning_rate": 7.372013216656715e-06,
"loss": 0.4241,
"step": 472
},
{
"epoch": 0.8212696690179055,
"grad_norm": 0.1383157739435485,
"learning_rate": 7.358654990007123e-06,
"loss": 0.4315,
"step": 473
},
{
"epoch": 0.8230059685295713,
"grad_norm": 0.16449865250363682,
"learning_rate": 7.3452750741181855e-06,
"loss": 0.4391,
"step": 474
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.14207031927514444,
"learning_rate": 7.331873592026212e-06,
"loss": 0.4232,
"step": 475
},
{
"epoch": 0.8264785675529028,
"grad_norm": 0.14589681162317597,
"learning_rate": 7.31845066696582e-06,
"loss": 0.4167,
"step": 476
},
{
"epoch": 0.8282148670645686,
"grad_norm": 0.1422796225146136,
"learning_rate": 7.305006422368811e-06,
"loss": 0.4309,
"step": 477
},
{
"epoch": 0.8299511665762344,
"grad_norm": 0.13818720279721158,
"learning_rate": 7.291540981863034e-06,
"loss": 0.4283,
"step": 478
},
{
"epoch": 0.8316874660879001,
"grad_norm": 0.14285698185091358,
"learning_rate": 7.278054469271245e-06,
"loss": 0.4273,
"step": 479
},
{
"epoch": 0.8334237655995659,
"grad_norm": 0.13966266346160003,
"learning_rate": 7.26454700860997e-06,
"loss": 0.4367,
"step": 480
},
{
"epoch": 0.8351600651112316,
"grad_norm": 0.1331588540524562,
"learning_rate": 7.251018724088367e-06,
"loss": 0.431,
"step": 481
},
{
"epoch": 0.8368963646228974,
"grad_norm": 0.1425328854150848,
"learning_rate": 7.237469740107078e-06,
"loss": 0.4335,
"step": 482
},
{
"epoch": 0.8386326641345632,
"grad_norm": 0.13148396093005682,
"learning_rate": 7.223900181257094e-06,
"loss": 0.3987,
"step": 483
},
{
"epoch": 0.8403689636462289,
"grad_norm": 0.13046232046429795,
"learning_rate": 7.2103101723186e-06,
"loss": 0.4059,
"step": 484
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.1443365626310558,
"learning_rate": 7.196699838259834e-06,
"loss": 0.4386,
"step": 485
},
{
"epoch": 0.8438415626695605,
"grad_norm": 0.14089367813530887,
"learning_rate": 7.183069304235935e-06,
"loss": 0.4219,
"step": 486
},
{
"epoch": 0.8455778621812262,
"grad_norm": 0.13167205606546367,
"learning_rate": 7.169418695587791e-06,
"loss": 0.4174,
"step": 487
},
{
"epoch": 0.847314161692892,
"grad_norm": 0.13933768890571246,
"learning_rate": 7.155748137840892e-06,
"loss": 0.4278,
"step": 488
},
{
"epoch": 0.8490504612045577,
"grad_norm": 0.12724653360437108,
"learning_rate": 7.142057756704168e-06,
"loss": 0.4293,
"step": 489
},
{
"epoch": 0.8507867607162235,
"grad_norm": 0.13861402433503245,
"learning_rate": 7.128347678068841e-06,
"loss": 0.4304,
"step": 490
},
{
"epoch": 0.8525230602278893,
"grad_norm": 0.1420863969782023,
"learning_rate": 7.1146180280072584e-06,
"loss": 0.4284,
"step": 491
},
{
"epoch": 0.854259359739555,
"grad_norm": 0.14172250449900636,
"learning_rate": 7.100868932771741e-06,
"loss": 0.4461,
"step": 492
},
{
"epoch": 0.8559956592512208,
"grad_norm": 0.16114613280891626,
"learning_rate": 7.087100518793421e-06,
"loss": 0.4233,
"step": 493
},
{
"epoch": 0.8577319587628865,
"grad_norm": 0.1471442217924641,
"learning_rate": 7.073312912681074e-06,
"loss": 0.4236,
"step": 494
},
{
"epoch": 0.8594682582745523,
"grad_norm": 0.14412078891445385,
"learning_rate": 7.059506241219964e-06,
"loss": 0.4331,
"step": 495
},
{
"epoch": 0.8612045577862181,
"grad_norm": 0.14590273185782915,
"learning_rate": 7.045680631370668e-06,
"loss": 0.409,
"step": 496
},
{
"epoch": 0.8629408572978838,
"grad_norm": 0.15307053625347286,
"learning_rate": 7.031836210267915e-06,
"loss": 0.4255,
"step": 497
},
{
"epoch": 0.8646771568095496,
"grad_norm": 0.15633948129523692,
"learning_rate": 7.0179731052194134e-06,
"loss": 0.4205,
"step": 498
},
{
"epoch": 0.8664134563212154,
"grad_norm": 0.1569427893027917,
"learning_rate": 7.004091443704681e-06,
"loss": 0.438,
"step": 499
},
{
"epoch": 0.8681497558328811,
"grad_norm": 0.14493419488810694,
"learning_rate": 6.990191353373876e-06,
"loss": 0.432,
"step": 500
},
{
"epoch": 0.8698860553445469,
"grad_norm": 0.13166025057509068,
"learning_rate": 6.976272962046619e-06,
"loss": 0.4263,
"step": 501
},
{
"epoch": 0.8716223548562126,
"grad_norm": 0.1487366571198014,
"learning_rate": 6.962336397710819e-06,
"loss": 0.4326,
"step": 502
},
{
"epoch": 0.8733586543678784,
"grad_norm": 0.15151379193315279,
"learning_rate": 6.948381788521498e-06,
"loss": 0.421,
"step": 503
},
{
"epoch": 0.8750949538795442,
"grad_norm": 0.1446369210011889,
"learning_rate": 6.9344092627996075e-06,
"loss": 0.4357,
"step": 504
},
{
"epoch": 0.8768312533912099,
"grad_norm": 0.13526985128108113,
"learning_rate": 6.920418949030856e-06,
"loss": 0.4226,
"step": 505
},
{
"epoch": 0.8785675529028757,
"grad_norm": 0.14209678942328663,
"learning_rate": 6.906410975864522e-06,
"loss": 0.436,
"step": 506
},
{
"epoch": 0.8803038524145415,
"grad_norm": 0.15545453377773016,
"learning_rate": 6.892385472112275e-06,
"loss": 0.4223,
"step": 507
},
{
"epoch": 0.8820401519262072,
"grad_norm": 0.160449156940923,
"learning_rate": 6.878342566746985e-06,
"loss": 0.4256,
"step": 508
},
{
"epoch": 0.883776451437873,
"grad_norm": 0.15483085985532163,
"learning_rate": 6.864282388901544e-06,
"loss": 0.434,
"step": 509
},
{
"epoch": 0.8855127509495387,
"grad_norm": 0.14591180973133458,
"learning_rate": 6.85020506786767e-06,
"loss": 0.4234,
"step": 510
},
{
"epoch": 0.8872490504612045,
"grad_norm": 0.13585311221734875,
"learning_rate": 6.836110733094728e-06,
"loss": 0.4287,
"step": 511
},
{
"epoch": 0.8889853499728704,
"grad_norm": 0.15429791758689432,
"learning_rate": 6.821999514188532e-06,
"loss": 0.4338,
"step": 512
},
{
"epoch": 0.8907216494845361,
"grad_norm": 0.13417432207738678,
"learning_rate": 6.807871540910155e-06,
"loss": 0.4147,
"step": 513
},
{
"epoch": 0.8924579489962019,
"grad_norm": 0.12318186174299123,
"learning_rate": 6.793726943174737e-06,
"loss": 0.4155,
"step": 514
},
{
"epoch": 0.8941942485078677,
"grad_norm": 0.14032720007338734,
"learning_rate": 6.779565851050292e-06,
"loss": 0.406,
"step": 515
},
{
"epoch": 0.8959305480195334,
"grad_norm": 0.1604151317342483,
"learning_rate": 6.765388394756504e-06,
"loss": 0.448,
"step": 516
},
{
"epoch": 0.8976668475311992,
"grad_norm": 0.13753780470400695,
"learning_rate": 6.751194704663544e-06,
"loss": 0.4327,
"step": 517
},
{
"epoch": 0.899403147042865,
"grad_norm": 0.16004678522601554,
"learning_rate": 6.736984911290853e-06,
"loss": 0.3995,
"step": 518
},
{
"epoch": 0.9011394465545307,
"grad_norm": 0.13803186927904562,
"learning_rate": 6.722759145305959e-06,
"loss": 0.431,
"step": 519
},
{
"epoch": 0.9028757460661965,
"grad_norm": 0.15500469621691793,
"learning_rate": 6.708517537523264e-06,
"loss": 0.4391,
"step": 520
},
{
"epoch": 0.9046120455778622,
"grad_norm": 0.15834488288721668,
"learning_rate": 6.694260218902845e-06,
"loss": 0.4096,
"step": 521
},
{
"epoch": 0.906348345089528,
"grad_norm": 0.14874399244800696,
"learning_rate": 6.6799873205492485e-06,
"loss": 0.4286,
"step": 522
},
{
"epoch": 0.9080846446011938,
"grad_norm": 0.12429177101740557,
"learning_rate": 6.665698973710289e-06,
"loss": 0.4072,
"step": 523
},
{
"epoch": 0.9098209441128595,
"grad_norm": 0.13519014730031864,
"learning_rate": 6.651395309775837e-06,
"loss": 0.4153,
"step": 524
},
{
"epoch": 0.9115572436245253,
"grad_norm": 0.16632287295859693,
"learning_rate": 6.637076460276612e-06,
"loss": 0.4248,
"step": 525
},
{
"epoch": 0.913293543136191,
"grad_norm": 0.17329750345670855,
"learning_rate": 6.622742556882976e-06,
"loss": 0.4352,
"step": 526
},
{
"epoch": 0.9150298426478568,
"grad_norm": 0.12671659925618628,
"learning_rate": 6.608393731403721e-06,
"loss": 0.4062,
"step": 527
},
{
"epoch": 0.9167661421595226,
"grad_norm": 0.14740755648196183,
"learning_rate": 6.5940301157848505e-06,
"loss": 0.4168,
"step": 528
},
{
"epoch": 0.9185024416711883,
"grad_norm": 0.14660381330269254,
"learning_rate": 6.579651842108381e-06,
"loss": 0.4154,
"step": 529
},
{
"epoch": 0.9202387411828541,
"grad_norm": 0.13079068927881393,
"learning_rate": 6.565259042591112e-06,
"loss": 0.4068,
"step": 530
},
{
"epoch": 0.9219750406945199,
"grad_norm": 0.16047124875668828,
"learning_rate": 6.5508518495834214e-06,
"loss": 0.447,
"step": 531
},
{
"epoch": 0.9237113402061856,
"grad_norm": 0.16056398869904115,
"learning_rate": 6.536430395568037e-06,
"loss": 0.4402,
"step": 532
},
{
"epoch": 0.9254476397178514,
"grad_norm": 0.14379051689279518,
"learning_rate": 6.521994813158834e-06,
"loss": 0.4255,
"step": 533
},
{
"epoch": 0.9271839392295171,
"grad_norm": 0.14092695024536514,
"learning_rate": 6.507545235099601e-06,
"loss": 0.4318,
"step": 534
},
{
"epoch": 0.9289202387411829,
"grad_norm": 0.12525720188556722,
"learning_rate": 6.493081794262823e-06,
"loss": 0.4099,
"step": 535
},
{
"epoch": 0.9306565382528487,
"grad_norm": 0.13733388828680687,
"learning_rate": 6.478604623648468e-06,
"loss": 0.4562,
"step": 536
},
{
"epoch": 0.9323928377645144,
"grad_norm": 0.15169527782692627,
"learning_rate": 6.464113856382752e-06,
"loss": 0.4388,
"step": 537
},
{
"epoch": 0.9341291372761802,
"grad_norm": 0.15659833497207304,
"learning_rate": 6.449609625716924e-06,
"loss": 0.4385,
"step": 538
},
{
"epoch": 0.935865436787846,
"grad_norm": 0.14056083766486266,
"learning_rate": 6.435092065026035e-06,
"loss": 0.4365,
"step": 539
},
{
"epoch": 0.9376017362995117,
"grad_norm": 0.14777123442183437,
"learning_rate": 6.420561307807713e-06,
"loss": 0.4399,
"step": 540
},
{
"epoch": 0.9393380358111775,
"grad_norm": 0.1638663833379513,
"learning_rate": 6.406017487680938e-06,
"loss": 0.4607,
"step": 541
},
{
"epoch": 0.9410743353228432,
"grad_norm": 0.1729654923685193,
"learning_rate": 6.391460738384808e-06,
"loss": 0.436,
"step": 542
},
{
"epoch": 0.942810634834509,
"grad_norm": 0.14203433714494784,
"learning_rate": 6.376891193777317e-06,
"loss": 0.4491,
"step": 543
},
{
"epoch": 0.9445469343461748,
"grad_norm": 0.13595845029555445,
"learning_rate": 6.3623089878341146e-06,
"loss": 0.4246,
"step": 544
},
{
"epoch": 0.9462832338578405,
"grad_norm": 0.14926991946937673,
"learning_rate": 6.3477142546472836e-06,
"loss": 0.4307,
"step": 545
},
{
"epoch": 0.9480195333695063,
"grad_norm": 0.14848175859544074,
"learning_rate": 6.333107128424098e-06,
"loss": 0.4285,
"step": 546
},
{
"epoch": 0.949755832881172,
"grad_norm": 0.13823091677225982,
"learning_rate": 6.318487743485797e-06,
"loss": 0.4129,
"step": 547
},
{
"epoch": 0.9514921323928378,
"grad_norm": 0.14290107135288815,
"learning_rate": 6.303856234266344e-06,
"loss": 0.4269,
"step": 548
},
{
"epoch": 0.9532284319045036,
"grad_norm": 0.15407365570533135,
"learning_rate": 6.28921273531119e-06,
"loss": 0.4332,
"step": 549
},
{
"epoch": 0.9549647314161693,
"grad_norm": 0.1478352145654114,
"learning_rate": 6.274557381276045e-06,
"loss": 0.4278,
"step": 550
},
{
"epoch": 0.9567010309278351,
"grad_norm": 0.13620053200124488,
"learning_rate": 6.259890306925627e-06,
"loss": 0.425,
"step": 551
},
{
"epoch": 0.9584373304395009,
"grad_norm": 0.14784390477062037,
"learning_rate": 6.245211647132433e-06,
"loss": 0.4354,
"step": 552
},
{
"epoch": 0.9601736299511666,
"grad_norm": 0.13789859062305407,
"learning_rate": 6.230521536875494e-06,
"loss": 0.4119,
"step": 553
},
{
"epoch": 0.9619099294628324,
"grad_norm": 0.13714602492626404,
"learning_rate": 6.215820111239137e-06,
"loss": 0.4407,
"step": 554
},
{
"epoch": 0.9636462289744981,
"grad_norm": 0.1352836476917812,
"learning_rate": 6.201107505411736e-06,
"loss": 0.4262,
"step": 555
},
{
"epoch": 0.9653825284861639,
"grad_norm": 0.14817555775463528,
"learning_rate": 6.186383854684479e-06,
"loss": 0.4263,
"step": 556
},
{
"epoch": 0.9671188279978297,
"grad_norm": 0.13028353307404736,
"learning_rate": 6.171649294450113e-06,
"loss": 0.4369,
"step": 557
},
{
"epoch": 0.9688551275094954,
"grad_norm": 0.129005775473133,
"learning_rate": 6.156903960201709e-06,
"loss": 0.4037,
"step": 558
},
{
"epoch": 0.9705914270211612,
"grad_norm": 0.14131727755712598,
"learning_rate": 6.142147987531407e-06,
"loss": 0.4363,
"step": 559
},
{
"epoch": 0.972327726532827,
"grad_norm": 0.1492506986288016,
"learning_rate": 6.12738151212918e-06,
"loss": 0.4313,
"step": 560
},
{
"epoch": 0.9740640260444927,
"grad_norm": 0.14245819902062826,
"learning_rate": 6.112604669781572e-06,
"loss": 0.4424,
"step": 561
},
{
"epoch": 0.9758003255561585,
"grad_norm": 0.1443083695557364,
"learning_rate": 6.097817596370465e-06,
"loss": 0.4295,
"step": 562
},
{
"epoch": 0.9775366250678242,
"grad_norm": 0.13360371562359302,
"learning_rate": 6.083020427871818e-06,
"loss": 0.4347,
"step": 563
},
{
"epoch": 0.97927292457949,
"grad_norm": 0.1374915267856843,
"learning_rate": 6.0682133003544165e-06,
"loss": 0.4355,
"step": 564
},
{
"epoch": 0.9810092240911558,
"grad_norm": 0.1392477071475812,
"learning_rate": 6.053396349978632e-06,
"loss": 0.4294,
"step": 565
},
{
"epoch": 0.9827455236028215,
"grad_norm": 0.14836695769751443,
"learning_rate": 6.038569712995161e-06,
"loss": 0.4125,
"step": 566
},
{
"epoch": 0.9844818231144873,
"grad_norm": 0.14807766260570912,
"learning_rate": 6.02373352574377e-06,
"loss": 0.4117,
"step": 567
},
{
"epoch": 0.986218122626153,
"grad_norm": 0.1383315377874369,
"learning_rate": 6.008887924652053e-06,
"loss": 0.4412,
"step": 568
},
{
"epoch": 0.9879544221378188,
"grad_norm": 0.14253538031780663,
"learning_rate": 5.994033046234163e-06,
"loss": 0.4326,
"step": 569
},
{
"epoch": 0.9896907216494846,
"grad_norm": 0.14622116436494234,
"learning_rate": 5.979169027089568e-06,
"loss": 0.4067,
"step": 570
},
{
"epoch": 0.9914270211611503,
"grad_norm": 0.14488845841033232,
"learning_rate": 5.9642960039017875e-06,
"loss": 0.4374,
"step": 571
},
{
"epoch": 0.9931633206728161,
"grad_norm": 0.1362868479844263,
"learning_rate": 5.949414113437142e-06,
"loss": 0.4358,
"step": 572
},
{
"epoch": 0.9948996201844819,
"grad_norm": 0.1383976044726672,
"learning_rate": 5.934523492543489e-06,
"loss": 0.4185,
"step": 573
},
{
"epoch": 0.9966359196961476,
"grad_norm": 0.13345431171412198,
"learning_rate": 5.919624278148969e-06,
"loss": 0.4216,
"step": 574
},
{
"epoch": 0.9983722192078134,
"grad_norm": 0.13508419957425205,
"learning_rate": 5.904716607260743e-06,
"loss": 0.4113,
"step": 575
},
{
"epoch": 1.0,
"grad_norm": 0.14703530488547564,
"learning_rate": 5.889800616963738e-06,
"loss": 0.4309,
"step": 576
},
{
"epoch": 1.0017362995116659,
"grad_norm": 0.13820635061977254,
"learning_rate": 5.874876444419377e-06,
"loss": 0.4281,
"step": 577
},
{
"epoch": 1.0034725990233315,
"grad_norm": 0.1399834662944234,
"learning_rate": 5.8599442268643325e-06,
"loss": 0.4093,
"step": 578
},
{
"epoch": 1.0052088985349974,
"grad_norm": 0.14740466502395172,
"learning_rate": 5.8450041016092465e-06,
"loss": 0.4362,
"step": 579
},
{
"epoch": 1.006945198046663,
"grad_norm": 0.14722268227038388,
"learning_rate": 5.830056206037482e-06,
"loss": 0.4148,
"step": 580
},
{
"epoch": 1.008681497558329,
"grad_norm": 0.1489173191512877,
"learning_rate": 5.815100677603854e-06,
"loss": 0.4079,
"step": 581
},
{
"epoch": 1.0104177970699946,
"grad_norm": 0.13701264794437995,
"learning_rate": 5.800137653833368e-06,
"loss": 0.4104,
"step": 582
},
{
"epoch": 1.0121540965816604,
"grad_norm": 0.14149209374235472,
"learning_rate": 5.785167272319948e-06,
"loss": 0.4143,
"step": 583
},
{
"epoch": 1.013890396093326,
"grad_norm": 0.13608710059669457,
"learning_rate": 5.7701896707251824e-06,
"loss": 0.4133,
"step": 584
},
{
"epoch": 1.015626695604992,
"grad_norm": 0.127667185519214,
"learning_rate": 5.75520498677705e-06,
"loss": 0.3983,
"step": 585
},
{
"epoch": 1.0173629951166576,
"grad_norm": 0.14952422157976003,
"learning_rate": 5.740213358268658e-06,
"loss": 0.4063,
"step": 586
},
{
"epoch": 1.0190992946283235,
"grad_norm": 0.17330839118924732,
"learning_rate": 5.72521492305697e-06,
"loss": 0.4265,
"step": 587
},
{
"epoch": 1.0208355941399891,
"grad_norm": 0.1369542169535847,
"learning_rate": 5.710209819061544e-06,
"loss": 0.4085,
"step": 588
},
{
"epoch": 1.022571893651655,
"grad_norm": 0.14021766398680688,
"learning_rate": 5.695198184263259e-06,
"loss": 0.402,
"step": 589
},
{
"epoch": 1.0243081931633207,
"grad_norm": 0.1273275336014157,
"learning_rate": 5.680180156703052e-06,
"loss": 0.4158,
"step": 590
},
{
"epoch": 1.0260444926749865,
"grad_norm": 0.1447814022061172,
"learning_rate": 5.665155874480639e-06,
"loss": 0.402,
"step": 591
},
{
"epoch": 1.0277807921866522,
"grad_norm": 0.13493488857954397,
"learning_rate": 5.65012547575326e-06,
"loss": 0.4124,
"step": 592
},
{
"epoch": 1.029517091698318,
"grad_norm": 0.1419298182476474,
"learning_rate": 5.635089098734394e-06,
"loss": 0.4328,
"step": 593
},
{
"epoch": 1.0312533912099837,
"grad_norm": 0.1259898887932693,
"learning_rate": 5.620046881692496e-06,
"loss": 0.4082,
"step": 594
},
{
"epoch": 1.0329896907216496,
"grad_norm": 0.14566690869149043,
"learning_rate": 5.604998962949721e-06,
"loss": 0.4082,
"step": 595
},
{
"epoch": 1.0347259902333152,
"grad_norm": 0.1437692130612652,
"learning_rate": 5.5899454808806604e-06,
"loss": 0.4336,
"step": 596
},
{
"epoch": 1.0364622897449811,
"grad_norm": 0.13609460091136136,
"learning_rate": 5.574886573911056e-06,
"loss": 0.4329,
"step": 597
},
{
"epoch": 1.0381985892566468,
"grad_norm": 0.13077463591633914,
"learning_rate": 5.559822380516539e-06,
"loss": 0.397,
"step": 598
},
{
"epoch": 1.0399348887683126,
"grad_norm": 0.14243098008221436,
"learning_rate": 5.5447530392213545e-06,
"loss": 0.4047,
"step": 599
},
{
"epoch": 1.0416711882799783,
"grad_norm": 0.1344847481406696,
"learning_rate": 5.529678688597081e-06,
"loss": 0.4043,
"step": 600
},
{
"epoch": 1.0434074877916442,
"grad_norm": 0.14785465016750507,
"learning_rate": 5.514599467261363e-06,
"loss": 0.4023,
"step": 601
},
{
"epoch": 1.0451437873033098,
"grad_norm": 0.14094496078005248,
"learning_rate": 5.4995155138766345e-06,
"loss": 0.4009,
"step": 602
},
{
"epoch": 1.0468800868149757,
"grad_norm": 0.12656563162402626,
"learning_rate": 5.484426967148843e-06,
"loss": 0.4124,
"step": 603
},
{
"epoch": 1.0486163863266413,
"grad_norm": 0.14436518143142452,
"learning_rate": 5.469333965826174e-06,
"loss": 0.4091,
"step": 604
},
{
"epoch": 1.0503526858383072,
"grad_norm": 0.1266094695639352,
"learning_rate": 5.454236648697776e-06,
"loss": 0.4238,
"step": 605
},
{
"epoch": 1.0520889853499729,
"grad_norm": 0.1312935404022273,
"learning_rate": 5.439135154592486e-06,
"loss": 0.4017,
"step": 606
},
{
"epoch": 1.0538252848616387,
"grad_norm": 0.14228976847722097,
"learning_rate": 5.4240296223775465e-06,
"loss": 0.4155,
"step": 607
},
{
"epoch": 1.0555615843733044,
"grad_norm": 0.12839472792831685,
"learning_rate": 5.4089201909573376e-06,
"loss": 0.3921,
"step": 608
},
{
"epoch": 1.0572978838849703,
"grad_norm": 0.12265551035079586,
"learning_rate": 5.3938069992720894e-06,
"loss": 0.3843,
"step": 609
},
{
"epoch": 1.059034183396636,
"grad_norm": 0.12358555247985234,
"learning_rate": 5.378690186296617e-06,
"loss": 0.403,
"step": 610
},
{
"epoch": 1.0607704829083018,
"grad_norm": 0.13923611661731283,
"learning_rate": 5.363569891039027e-06,
"loss": 0.4303,
"step": 611
},
{
"epoch": 1.0625067824199674,
"grad_norm": 0.1466000832580553,
"learning_rate": 5.348446252539457e-06,
"loss": 0.4136,
"step": 612
},
{
"epoch": 1.0642430819316333,
"grad_norm": 0.13908426064129698,
"learning_rate": 5.333319409868777e-06,
"loss": 0.4287,
"step": 613
},
{
"epoch": 1.065979381443299,
"grad_norm": 0.1268183562981206,
"learning_rate": 5.318189502127332e-06,
"loss": 0.3916,
"step": 614
},
{
"epoch": 1.0677156809549648,
"grad_norm": 0.13537891316641507,
"learning_rate": 5.303056668443645e-06,
"loss": 0.4132,
"step": 615
},
{
"epoch": 1.0694519804666305,
"grad_norm": 0.12929905525099036,
"learning_rate": 5.287921047973149e-06,
"loss": 0.4098,
"step": 616
},
{
"epoch": 1.0711882799782964,
"grad_norm": 0.13243953145888157,
"learning_rate": 5.272782779896898e-06,
"loss": 0.4243,
"step": 617
},
{
"epoch": 1.072924579489962,
"grad_norm": 0.1399328685213442,
"learning_rate": 5.257642003420298e-06,
"loss": 0.4159,
"step": 618
},
{
"epoch": 1.0746608790016279,
"grad_norm": 0.13181582787899612,
"learning_rate": 5.242498857771816e-06,
"loss": 0.4061,
"step": 619
},
{
"epoch": 1.0763971785132935,
"grad_norm": 0.15243301626899763,
"learning_rate": 5.2273534822017105e-06,
"loss": 0.4104,
"step": 620
},
{
"epoch": 1.0781334780249594,
"grad_norm": 0.13165320866583394,
"learning_rate": 5.212206015980742e-06,
"loss": 0.3981,
"step": 621
},
{
"epoch": 1.079869777536625,
"grad_norm": 0.14715452781968524,
"learning_rate": 5.197056598398897e-06,
"loss": 0.4168,
"step": 622
},
{
"epoch": 1.081606077048291,
"grad_norm": 0.13599260807794022,
"learning_rate": 5.181905368764102e-06,
"loss": 0.4326,
"step": 623
},
{
"epoch": 1.0833423765599566,
"grad_norm": 0.14589346570188805,
"learning_rate": 5.166752466400954e-06,
"loss": 0.4112,
"step": 624
},
{
"epoch": 1.0850786760716224,
"grad_norm": 0.12995220413381026,
"learning_rate": 5.151598030649425e-06,
"loss": 0.3986,
"step": 625
},
{
"epoch": 1.086814975583288,
"grad_norm": 0.1329355849474862,
"learning_rate": 5.13644220086359e-06,
"loss": 0.4012,
"step": 626
},
{
"epoch": 1.088551275094954,
"grad_norm": 0.13589628456048708,
"learning_rate": 5.121285116410344e-06,
"loss": 0.4008,
"step": 627
},
{
"epoch": 1.0902875746066196,
"grad_norm": 0.14045644426905074,
"learning_rate": 5.106126916668118e-06,
"loss": 0.4111,
"step": 628
},
{
"epoch": 1.0920238741182855,
"grad_norm": 0.14551962000345375,
"learning_rate": 5.090967741025599e-06,
"loss": 0.4006,
"step": 629
},
{
"epoch": 1.0937601736299511,
"grad_norm": 0.13705310011406346,
"learning_rate": 5.075807728880447e-06,
"loss": 0.4281,
"step": 630
},
{
"epoch": 1.095496473141617,
"grad_norm": 0.12740292576758674,
"learning_rate": 5.060647019638016e-06,
"loss": 0.4007,
"step": 631
},
{
"epoch": 1.0972327726532827,
"grad_norm": 0.12099875797969191,
"learning_rate": 5.04548575271007e-06,
"loss": 0.415,
"step": 632
},
{
"epoch": 1.0989690721649485,
"grad_norm": 0.13398256820011353,
"learning_rate": 5.030324067513499e-06,
"loss": 0.3973,
"step": 633
},
{
"epoch": 1.1007053716766142,
"grad_norm": 0.1348779537529425,
"learning_rate": 5.015162103469042e-06,
"loss": 0.4065,
"step": 634
},
{
"epoch": 1.10244167118828,
"grad_norm": 0.12678399726307263,
"learning_rate": 5e-06,
"loss": 0.4141,
"step": 635
},
{
"epoch": 1.1041779706999457,
"grad_norm": 0.13577535863115883,
"learning_rate": 4.984837896530959e-06,
"loss": 0.4109,
"step": 636
},
{
"epoch": 1.1059142702116116,
"grad_norm": 0.13470321160616094,
"learning_rate": 4.969675932486503e-06,
"loss": 0.4086,
"step": 637
},
{
"epoch": 1.1076505697232772,
"grad_norm": 0.14286962783803234,
"learning_rate": 4.954514247289931e-06,
"loss": 0.403,
"step": 638
},
{
"epoch": 1.1093868692349431,
"grad_norm": 0.12644018098118473,
"learning_rate": 4.939352980361985e-06,
"loss": 0.4201,
"step": 639
},
{
"epoch": 1.1111231687466088,
"grad_norm": 0.1325504479364434,
"learning_rate": 4.924192271119554e-06,
"loss": 0.4013,
"step": 640
},
{
"epoch": 1.1128594682582746,
"grad_norm": 0.1291532073174894,
"learning_rate": 4.909032258974403e-06,
"loss": 0.4086,
"step": 641
},
{
"epoch": 1.1145957677699403,
"grad_norm": 0.13268107816932617,
"learning_rate": 4.8938730833318825e-06,
"loss": 0.4229,
"step": 642
},
{
"epoch": 1.1163320672816062,
"grad_norm": 0.14551167466783432,
"learning_rate": 4.878714883589657e-06,
"loss": 0.4049,
"step": 643
},
{
"epoch": 1.1180683667932718,
"grad_norm": 0.1458795187912891,
"learning_rate": 4.863557799136411e-06,
"loss": 0.4252,
"step": 644
},
{
"epoch": 1.1198046663049377,
"grad_norm": 0.16140943317028875,
"learning_rate": 4.848401969350577e-06,
"loss": 0.4115,
"step": 645
},
{
"epoch": 1.1215409658166033,
"grad_norm": 0.17325457371109407,
"learning_rate": 4.833247533599047e-06,
"loss": 0.4121,
"step": 646
},
{
"epoch": 1.1232772653282692,
"grad_norm": 0.13364198145727132,
"learning_rate": 4.8180946312359e-06,
"loss": 0.4039,
"step": 647
},
{
"epoch": 1.1250135648399349,
"grad_norm": 0.1281682816584959,
"learning_rate": 4.802943401601105e-06,
"loss": 0.4182,
"step": 648
},
{
"epoch": 1.1267498643516007,
"grad_norm": 0.14174013597508733,
"learning_rate": 4.78779398401926e-06,
"loss": 0.4263,
"step": 649
},
{
"epoch": 1.1284861638632664,
"grad_norm": 0.13774912972454795,
"learning_rate": 4.77264651779829e-06,
"loss": 0.4421,
"step": 650
},
{
"epoch": 1.1302224633749323,
"grad_norm": 0.14207068241290607,
"learning_rate": 4.757501142228186e-06,
"loss": 0.4089,
"step": 651
},
{
"epoch": 1.131958762886598,
"grad_norm": 0.12903740995178237,
"learning_rate": 4.742357996579704e-06,
"loss": 0.4163,
"step": 652
},
{
"epoch": 1.1336950623982638,
"grad_norm": 0.1350820578224657,
"learning_rate": 4.7272172201031055e-06,
"loss": 0.4041,
"step": 653
},
{
"epoch": 1.1354313619099294,
"grad_norm": 0.1329350427266318,
"learning_rate": 4.712078952026853e-06,
"loss": 0.4035,
"step": 654
},
{
"epoch": 1.1371676614215953,
"grad_norm": 0.14408458515913283,
"learning_rate": 4.696943331556357e-06,
"loss": 0.4153,
"step": 655
},
{
"epoch": 1.138903960933261,
"grad_norm": 0.1382968249352353,
"learning_rate": 4.6818104978726685e-06,
"loss": 0.4167,
"step": 656
},
{
"epoch": 1.1406402604449268,
"grad_norm": 0.1337490824001829,
"learning_rate": 4.666680590131225e-06,
"loss": 0.4021,
"step": 657
},
{
"epoch": 1.1423765599565925,
"grad_norm": 0.13624081308831007,
"learning_rate": 4.651553747460545e-06,
"loss": 0.4008,
"step": 658
},
{
"epoch": 1.1441128594682584,
"grad_norm": 0.1301703680610189,
"learning_rate": 4.6364301089609755e-06,
"loss": 0.4201,
"step": 659
},
{
"epoch": 1.145849158979924,
"grad_norm": 0.12869809645079283,
"learning_rate": 4.621309813703385e-06,
"loss": 0.4221,
"step": 660
},
{
"epoch": 1.1475854584915899,
"grad_norm": 0.15594955604265287,
"learning_rate": 4.606193000727913e-06,
"loss": 0.423,
"step": 661
},
{
"epoch": 1.1493217580032555,
"grad_norm": 0.14731027076680392,
"learning_rate": 4.591079809042664e-06,
"loss": 0.4244,
"step": 662
},
{
"epoch": 1.1510580575149214,
"grad_norm": 0.13957336559394706,
"learning_rate": 4.575970377622456e-06,
"loss": 0.4203,
"step": 663
},
{
"epoch": 1.152794357026587,
"grad_norm": 0.11562182093236165,
"learning_rate": 4.560864845407515e-06,
"loss": 0.4134,
"step": 664
},
{
"epoch": 1.154530656538253,
"grad_norm": 0.12916966436454652,
"learning_rate": 4.545763351302224e-06,
"loss": 0.4273,
"step": 665
},
{
"epoch": 1.1562669560499186,
"grad_norm": 0.1279961071644105,
"learning_rate": 4.530666034173827e-06,
"loss": 0.3993,
"step": 666
},
{
"epoch": 1.1580032555615845,
"grad_norm": 0.12327967071387225,
"learning_rate": 4.515573032851158e-06,
"loss": 0.4015,
"step": 667
},
{
"epoch": 1.15973955507325,
"grad_norm": 0.13127349833439605,
"learning_rate": 4.500484486123367e-06,
"loss": 0.4069,
"step": 668
},
{
"epoch": 1.161475854584916,
"grad_norm": 0.13059532586105568,
"learning_rate": 4.485400532738638e-06,
"loss": 0.4166,
"step": 669
},
{
"epoch": 1.1632121540965816,
"grad_norm": 0.12508095784691065,
"learning_rate": 4.47032131140292e-06,
"loss": 0.4104,
"step": 670
},
{
"epoch": 1.1649484536082475,
"grad_norm": 0.1515418268364036,
"learning_rate": 4.455246960778646e-06,
"loss": 0.4098,
"step": 671
},
{
"epoch": 1.1666847531199132,
"grad_norm": 0.12573956573460845,
"learning_rate": 4.4401776194834615e-06,
"loss": 0.3935,
"step": 672
},
{
"epoch": 1.168421052631579,
"grad_norm": 0.14929151845493635,
"learning_rate": 4.425113426088945e-06,
"loss": 0.4119,
"step": 673
},
{
"epoch": 1.1701573521432447,
"grad_norm": 0.14187289444160336,
"learning_rate": 4.410054519119341e-06,
"loss": 0.418,
"step": 674
},
{
"epoch": 1.1718936516549106,
"grad_norm": 0.14862945555637808,
"learning_rate": 4.395001037050278e-06,
"loss": 0.4257,
"step": 675
},
{
"epoch": 1.1736299511665762,
"grad_norm": 0.13996175733356186,
"learning_rate": 4.379953118307505e-06,
"loss": 0.4003,
"step": 676
},
{
"epoch": 1.175366250678242,
"grad_norm": 0.1255053214493474,
"learning_rate": 4.364910901265607e-06,
"loss": 0.4241,
"step": 677
},
{
"epoch": 1.1771025501899077,
"grad_norm": 0.13373405771489325,
"learning_rate": 4.3498745242467415e-06,
"loss": 0.4052,
"step": 678
},
{
"epoch": 1.1788388497015736,
"grad_norm": 0.1300518944364487,
"learning_rate": 4.334844125519363e-06,
"loss": 0.4237,
"step": 679
},
{
"epoch": 1.1805751492132392,
"grad_norm": 0.13269874204216286,
"learning_rate": 4.319819843296952e-06,
"loss": 0.4171,
"step": 680
},
{
"epoch": 1.1823114487249051,
"grad_norm": 0.1492214358278627,
"learning_rate": 4.3048018157367435e-06,
"loss": 0.4084,
"step": 681
},
{
"epoch": 1.1840477482365708,
"grad_norm": 0.1329430046793398,
"learning_rate": 4.289790180938459e-06,
"loss": 0.4217,
"step": 682
},
{
"epoch": 1.1857840477482366,
"grad_norm": 0.12941948682816526,
"learning_rate": 4.274785076943031e-06,
"loss": 0.4178,
"step": 683
},
{
"epoch": 1.1875203472599023,
"grad_norm": 0.13202812035471978,
"learning_rate": 4.259786641731344e-06,
"loss": 0.4095,
"step": 684
},
{
"epoch": 1.1892566467715682,
"grad_norm": 0.14494857456759286,
"learning_rate": 4.244795013222951e-06,
"loss": 0.4224,
"step": 685
},
{
"epoch": 1.1909929462832338,
"grad_norm": 0.13462792345729818,
"learning_rate": 4.229810329274819e-06,
"loss": 0.4239,
"step": 686
},
{
"epoch": 1.1927292457948997,
"grad_norm": 0.14566055483265097,
"learning_rate": 4.214832727680054e-06,
"loss": 0.4348,
"step": 687
},
{
"epoch": 1.1944655453065653,
"grad_norm": 0.13632018344344551,
"learning_rate": 4.199862346166635e-06,
"loss": 0.4142,
"step": 688
},
{
"epoch": 1.1962018448182312,
"grad_norm": 0.14011949501328627,
"learning_rate": 4.184899322396147e-06,
"loss": 0.4031,
"step": 689
},
{
"epoch": 1.1979381443298969,
"grad_norm": 0.14232733717051585,
"learning_rate": 4.16994379396252e-06,
"loss": 0.3988,
"step": 690
},
{
"epoch": 1.1996744438415627,
"grad_norm": 0.12566943014411622,
"learning_rate": 4.154995898390756e-06,
"loss": 0.4054,
"step": 691
},
{
"epoch": 1.2014107433532284,
"grad_norm": 0.14134431965453162,
"learning_rate": 4.140055773135671e-06,
"loss": 0.4208,
"step": 692
},
{
"epoch": 1.2031470428648943,
"grad_norm": 0.1331920255111448,
"learning_rate": 4.125123555580624e-06,
"loss": 0.4179,
"step": 693
},
{
"epoch": 1.20488334237656,
"grad_norm": 0.1302865467669287,
"learning_rate": 4.110199383036263e-06,
"loss": 0.4185,
"step": 694
},
{
"epoch": 1.2066196418882258,
"grad_norm": 0.119878175170804,
"learning_rate": 4.0952833927392585e-06,
"loss": 0.3943,
"step": 695
},
{
"epoch": 1.2083559413998914,
"grad_norm": 0.1289332139821464,
"learning_rate": 4.080375721851031e-06,
"loss": 0.4317,
"step": 696
},
{
"epoch": 1.2100922409115573,
"grad_norm": 0.1489851485752332,
"learning_rate": 4.0654765074565125e-06,
"loss": 0.4147,
"step": 697
},
{
"epoch": 1.211828540423223,
"grad_norm": 0.1359535916414997,
"learning_rate": 4.050585886562858e-06,
"loss": 0.4098,
"step": 698
},
{
"epoch": 1.2135648399348888,
"grad_norm": 0.13399659588917587,
"learning_rate": 4.035703996098214e-06,
"loss": 0.4073,
"step": 699
},
{
"epoch": 1.2153011394465545,
"grad_norm": 0.14010727123563935,
"learning_rate": 4.020830972910433e-06,
"loss": 0.4208,
"step": 700
},
{
"epoch": 1.2170374389582204,
"grad_norm": 0.13938234625949356,
"learning_rate": 4.00596695376584e-06,
"loss": 0.4115,
"step": 701
},
{
"epoch": 1.218773738469886,
"grad_norm": 0.1353808767288146,
"learning_rate": 3.991112075347948e-06,
"loss": 0.4068,
"step": 702
},
{
"epoch": 1.2205100379815519,
"grad_norm": 0.14275599742668682,
"learning_rate": 3.976266474256232e-06,
"loss": 0.4165,
"step": 703
},
{
"epoch": 1.2222463374932175,
"grad_norm": 0.1378621771922673,
"learning_rate": 3.96143028700484e-06,
"loss": 0.416,
"step": 704
},
{
"epoch": 1.2239826370048834,
"grad_norm": 0.13218109257570979,
"learning_rate": 3.94660365002137e-06,
"loss": 0.4118,
"step": 705
},
{
"epoch": 1.225718936516549,
"grad_norm": 0.13836744351326696,
"learning_rate": 3.931786699645584e-06,
"loss": 0.4158,
"step": 706
},
{
"epoch": 1.227455236028215,
"grad_norm": 0.12863460316637662,
"learning_rate": 3.916979572128185e-06,
"loss": 0.4145,
"step": 707
},
{
"epoch": 1.2291915355398806,
"grad_norm": 0.1487138212974801,
"learning_rate": 3.9021824036295355e-06,
"loss": 0.4462,
"step": 708
},
{
"epoch": 1.2309278350515465,
"grad_norm": 0.15238329064789952,
"learning_rate": 3.887395330218429e-06,
"loss": 0.4329,
"step": 709
},
{
"epoch": 1.232664134563212,
"grad_norm": 0.14000705269862404,
"learning_rate": 3.872618487870822e-06,
"loss": 0.4208,
"step": 710
},
{
"epoch": 1.234400434074878,
"grad_norm": 0.1337473888423507,
"learning_rate": 3.857852012468594e-06,
"loss": 0.3966,
"step": 711
},
{
"epoch": 1.2361367335865436,
"grad_norm": 0.13831080512228816,
"learning_rate": 3.843096039798293e-06,
"loss": 0.4136,
"step": 712
},
{
"epoch": 1.2378730330982095,
"grad_norm": 0.13958883535341216,
"learning_rate": 3.8283507055498886e-06,
"loss": 0.4088,
"step": 713
},
{
"epoch": 1.2396093326098752,
"grad_norm": 0.12998244574393739,
"learning_rate": 3.8136161453155225e-06,
"loss": 0.4082,
"step": 714
},
{
"epoch": 1.241345632121541,
"grad_norm": 0.12023935229932833,
"learning_rate": 3.798892494588265e-06,
"loss": 0.406,
"step": 715
},
{
"epoch": 1.2430819316332067,
"grad_norm": 0.13329454708178137,
"learning_rate": 3.784179888760864e-06,
"loss": 0.4108,
"step": 716
},
{
"epoch": 1.2448182311448726,
"grad_norm": 0.14362657202555368,
"learning_rate": 3.7694784631245066e-06,
"loss": 0.4153,
"step": 717
},
{
"epoch": 1.2465545306565382,
"grad_norm": 0.14234958856936142,
"learning_rate": 3.754788352867568e-06,
"loss": 0.4257,
"step": 718
},
{
"epoch": 1.248290830168204,
"grad_norm": 0.13113466271767915,
"learning_rate": 3.7401096930743753e-06,
"loss": 0.4068,
"step": 719
},
{
"epoch": 1.2500271296798697,
"grad_norm": 0.14575529914592233,
"learning_rate": 3.7254426187239567e-06,
"loss": 0.4272,
"step": 720
},
{
"epoch": 1.2517634291915356,
"grad_norm": 0.13600829322582764,
"learning_rate": 3.7107872646888115e-06,
"loss": 0.4162,
"step": 721
},
{
"epoch": 1.2534997287032013,
"grad_norm": 0.148400333733331,
"learning_rate": 3.696143765733658e-06,
"loss": 0.4298,
"step": 722
},
{
"epoch": 1.2552360282148671,
"grad_norm": 0.13942444090666167,
"learning_rate": 3.6815122565142034e-06,
"loss": 0.4056,
"step": 723
},
{
"epoch": 1.2569723277265328,
"grad_norm": 0.1445262305649223,
"learning_rate": 3.666892871575903e-06,
"loss": 0.4211,
"step": 724
},
{
"epoch": 1.2587086272381987,
"grad_norm": 0.12435903954866685,
"learning_rate": 3.6522857453527172e-06,
"loss": 0.4049,
"step": 725
},
{
"epoch": 1.2604449267498643,
"grad_norm": 0.14897501276229344,
"learning_rate": 3.6376910121658867e-06,
"loss": 0.4319,
"step": 726
},
{
"epoch": 1.2621812262615302,
"grad_norm": 0.1581301795554515,
"learning_rate": 3.623108806222684e-06,
"loss": 0.4227,
"step": 727
},
{
"epoch": 1.2639175257731958,
"grad_norm": 0.16087358209515815,
"learning_rate": 3.608539261615194e-06,
"loss": 0.4129,
"step": 728
},
{
"epoch": 1.2656538252848617,
"grad_norm": 0.14218267776525048,
"learning_rate": 3.5939825123190637e-06,
"loss": 0.4125,
"step": 729
},
{
"epoch": 1.2673901247965274,
"grad_norm": 0.1557900583782951,
"learning_rate": 3.5794386921922885e-06,
"loss": 0.427,
"step": 730
},
{
"epoch": 1.2691264243081932,
"grad_norm": 0.13287959721455264,
"learning_rate": 3.5649079349739656e-06,
"loss": 0.4337,
"step": 731
},
{
"epoch": 1.2708627238198589,
"grad_norm": 0.13684064235737806,
"learning_rate": 3.550390374283077e-06,
"loss": 0.4201,
"step": 732
},
{
"epoch": 1.2725990233315247,
"grad_norm": 0.13717376028853762,
"learning_rate": 3.5358861436172487e-06,
"loss": 0.4245,
"step": 733
},
{
"epoch": 1.2743353228431904,
"grad_norm": 0.12673772460863922,
"learning_rate": 3.521395376351534e-06,
"loss": 0.4302,
"step": 734
},
{
"epoch": 1.2760716223548563,
"grad_norm": 0.1452730674929177,
"learning_rate": 3.506918205737179e-06,
"loss": 0.4167,
"step": 735
},
{
"epoch": 1.277807921866522,
"grad_norm": 0.12966559464754768,
"learning_rate": 3.492454764900402e-06,
"loss": 0.4096,
"step": 736
},
{
"epoch": 1.2795442213781878,
"grad_norm": 0.126491624200071,
"learning_rate": 3.478005186841167e-06,
"loss": 0.4111,
"step": 737
},
{
"epoch": 1.2812805208898534,
"grad_norm": 0.1648290795025507,
"learning_rate": 3.4635696044319644e-06,
"loss": 0.4265,
"step": 738
},
{
"epoch": 1.2830168204015193,
"grad_norm": 0.13792714906019224,
"learning_rate": 3.4491481504165802e-06,
"loss": 0.4086,
"step": 739
},
{
"epoch": 1.284753119913185,
"grad_norm": 0.12851901581350092,
"learning_rate": 3.4347409574088896e-06,
"loss": 0.4153,
"step": 740
},
{
"epoch": 1.2864894194248508,
"grad_norm": 0.14055982724287153,
"learning_rate": 3.4203481578916197e-06,
"loss": 0.3958,
"step": 741
},
{
"epoch": 1.2882257189365165,
"grad_norm": 0.14211923802570475,
"learning_rate": 3.4059698842151516e-06,
"loss": 0.4253,
"step": 742
},
{
"epoch": 1.2899620184481824,
"grad_norm": 0.14505225134761637,
"learning_rate": 3.3916062685962813e-06,
"loss": 0.4086,
"step": 743
},
{
"epoch": 1.291698317959848,
"grad_norm": 0.1378622302172767,
"learning_rate": 3.377257443117027e-06,
"loss": 0.4145,
"step": 744
},
{
"epoch": 1.293434617471514,
"grad_norm": 0.12570469560505018,
"learning_rate": 3.3629235397233894e-06,
"loss": 0.4028,
"step": 745
},
{
"epoch": 1.2951709169831795,
"grad_norm": 0.1465008931089143,
"learning_rate": 3.3486046902241663e-06,
"loss": 0.4385,
"step": 746
},
{
"epoch": 1.2969072164948454,
"grad_norm": 0.13318963224172997,
"learning_rate": 3.3343010262897125e-06,
"loss": 0.4129,
"step": 747
},
{
"epoch": 1.298643516006511,
"grad_norm": 0.13025976099827086,
"learning_rate": 3.3200126794507544e-06,
"loss": 0.4254,
"step": 748
},
{
"epoch": 1.300379815518177,
"grad_norm": 0.13522951465238042,
"learning_rate": 3.305739781097157e-06,
"loss": 0.4358,
"step": 749
},
{
"epoch": 1.3021161150298426,
"grad_norm": 0.13796244449987072,
"learning_rate": 3.2914824624767384e-06,
"loss": 0.4173,
"step": 750
},
{
"epoch": 1.3038524145415085,
"grad_norm": 0.12998370880403234,
"learning_rate": 3.2772408546940413e-06,
"loss": 0.4258,
"step": 751
},
{
"epoch": 1.3055887140531741,
"grad_norm": 0.12608502139488215,
"learning_rate": 3.263015088709147e-06,
"loss": 0.3888,
"step": 752
},
{
"epoch": 1.30732501356484,
"grad_norm": 0.12057418198065353,
"learning_rate": 3.248805295336458e-06,
"loss": 0.4178,
"step": 753
},
{
"epoch": 1.3090613130765056,
"grad_norm": 0.13684112107904475,
"learning_rate": 3.234611605243496e-06,
"loss": 0.4212,
"step": 754
},
{
"epoch": 1.3107976125881715,
"grad_norm": 0.12993759903594548,
"learning_rate": 3.2204341489497098e-06,
"loss": 0.4131,
"step": 755
},
{
"epoch": 1.3125339120998372,
"grad_norm": 0.13405591819120782,
"learning_rate": 3.206273056825263e-06,
"loss": 0.4193,
"step": 756
},
{
"epoch": 1.314270211611503,
"grad_norm": 0.11848443460254894,
"learning_rate": 3.192128459089846e-06,
"loss": 0.407,
"step": 757
},
{
"epoch": 1.3160065111231687,
"grad_norm": 0.1283026303176609,
"learning_rate": 3.178000485811469e-06,
"loss": 0.4043,
"step": 758
},
{
"epoch": 1.3177428106348346,
"grad_norm": 0.134503429002085,
"learning_rate": 3.1638892669052725e-06,
"loss": 0.4233,
"step": 759
},
{
"epoch": 1.3194791101465002,
"grad_norm": 0.1304647334011823,
"learning_rate": 3.149794932132331e-06,
"loss": 0.3977,
"step": 760
},
{
"epoch": 1.321215409658166,
"grad_norm": 0.12921517217947945,
"learning_rate": 3.1357176110984578e-06,
"loss": 0.4148,
"step": 761
},
{
"epoch": 1.3229517091698317,
"grad_norm": 0.13650111546563745,
"learning_rate": 3.1216574332530153e-06,
"loss": 0.4322,
"step": 762
},
{
"epoch": 1.3246880086814976,
"grad_norm": 0.12200301128989274,
"learning_rate": 3.107614527887727e-06,
"loss": 0.4208,
"step": 763
},
{
"epoch": 1.3264243081931633,
"grad_norm": 0.1385711764976705,
"learning_rate": 3.093589024135478e-06,
"loss": 0.4124,
"step": 764
},
{
"epoch": 1.3281606077048291,
"grad_norm": 0.12826365451603902,
"learning_rate": 3.079581050969146e-06,
"loss": 0.42,
"step": 765
},
{
"epoch": 1.3298969072164948,
"grad_norm": 0.1353306325888454,
"learning_rate": 3.0655907372003945e-06,
"loss": 0.4164,
"step": 766
},
{
"epoch": 1.3316332067281607,
"grad_norm": 0.1378424644755559,
"learning_rate": 3.0516182114785044e-06,
"loss": 0.4159,
"step": 767
},
{
"epoch": 1.3333695062398263,
"grad_norm": 0.1285283517829389,
"learning_rate": 3.0376636022891813e-06,
"loss": 0.4126,
"step": 768
},
{
"epoch": 1.3351058057514922,
"grad_norm": 0.13426392864303094,
"learning_rate": 3.0237270379533823e-06,
"loss": 0.4115,
"step": 769
},
{
"epoch": 1.3368421052631578,
"grad_norm": 0.12430141285628485,
"learning_rate": 3.0098086466261244e-06,
"loss": 0.4074,
"step": 770
},
{
"epoch": 1.3385784047748237,
"grad_norm": 0.1296003321288165,
"learning_rate": 2.9959085562953207e-06,
"loss": 0.4074,
"step": 771
},
{
"epoch": 1.3403147042864894,
"grad_norm": 0.12858505693047656,
"learning_rate": 2.9820268947805886e-06,
"loss": 0.4152,
"step": 772
},
{
"epoch": 1.3420510037981552,
"grad_norm": 0.12197621327889531,
"learning_rate": 2.968163789732087e-06,
"loss": 0.3858,
"step": 773
},
{
"epoch": 1.3437873033098209,
"grad_norm": 0.12881358416491176,
"learning_rate": 2.954319368629333e-06,
"loss": 0.4001,
"step": 774
},
{
"epoch": 1.3455236028214868,
"grad_norm": 0.130976313094142,
"learning_rate": 2.9404937587800374e-06,
"loss": 0.3965,
"step": 775
},
{
"epoch": 1.3472599023331524,
"grad_norm": 0.14058834884031382,
"learning_rate": 2.9266870873189275e-06,
"loss": 0.4283,
"step": 776
},
{
"epoch": 1.3489962018448183,
"grad_norm": 0.14263920518768747,
"learning_rate": 2.912899481206582e-06,
"loss": 0.4043,
"step": 777
},
{
"epoch": 1.350732501356484,
"grad_norm": 0.12084831525686957,
"learning_rate": 2.89913106722826e-06,
"loss": 0.4154,
"step": 778
},
{
"epoch": 1.3524688008681498,
"grad_norm": 0.1354875286137426,
"learning_rate": 2.8853819719927432e-06,
"loss": 0.4206,
"step": 779
},
{
"epoch": 1.3542051003798155,
"grad_norm": 0.13251361102306228,
"learning_rate": 2.871652321931161e-06,
"loss": 0.4007,
"step": 780
},
{
"epoch": 1.3559413998914813,
"grad_norm": 0.13099944844294423,
"learning_rate": 2.8579422432958316e-06,
"loss": 0.4207,
"step": 781
},
{
"epoch": 1.357677699403147,
"grad_norm": 0.12769218412087335,
"learning_rate": 2.8442518621591085e-06,
"loss": 0.4183,
"step": 782
},
{
"epoch": 1.3594139989148128,
"grad_norm": 0.14309184331634017,
"learning_rate": 2.83058130441221e-06,
"loss": 0.4097,
"step": 783
},
{
"epoch": 1.3611502984264785,
"grad_norm": 0.16868092956785954,
"learning_rate": 2.8169306957640675e-06,
"loss": 0.4255,
"step": 784
},
{
"epoch": 1.3628865979381444,
"grad_norm": 0.14472137695357806,
"learning_rate": 2.803300161740166e-06,
"loss": 0.3991,
"step": 785
},
{
"epoch": 1.36462289744981,
"grad_norm": 0.1260246338457178,
"learning_rate": 2.7896898276814005e-06,
"loss": 0.4251,
"step": 786
},
{
"epoch": 1.366359196961476,
"grad_norm": 0.12995354011841623,
"learning_rate": 2.7760998187429067e-06,
"loss": 0.3945,
"step": 787
},
{
"epoch": 1.3680954964731415,
"grad_norm": 0.13800691020421174,
"learning_rate": 2.7625302598929226e-06,
"loss": 0.4121,
"step": 788
},
{
"epoch": 1.3698317959848074,
"grad_norm": 0.12589367394524512,
"learning_rate": 2.748981275911633e-06,
"loss": 0.4116,
"step": 789
},
{
"epoch": 1.371568095496473,
"grad_norm": 0.13086859376723847,
"learning_rate": 2.73545299139003e-06,
"loss": 0.417,
"step": 790
},
{
"epoch": 1.373304395008139,
"grad_norm": 0.12779840511323343,
"learning_rate": 2.7219455307287557e-06,
"loss": 0.4007,
"step": 791
},
{
"epoch": 1.3750406945198046,
"grad_norm": 0.13743539379081776,
"learning_rate": 2.7084590181369675e-06,
"loss": 0.4165,
"step": 792
},
{
"epoch": 1.3767769940314705,
"grad_norm": 0.14005665536498818,
"learning_rate": 2.69499357763119e-06,
"loss": 0.4112,
"step": 793
},
{
"epoch": 1.3785132935431361,
"grad_norm": 0.12255718957071185,
"learning_rate": 2.6815493330341822e-06,
"loss": 0.4191,
"step": 794
},
{
"epoch": 1.380249593054802,
"grad_norm": 0.13565587855667666,
"learning_rate": 2.6681264079737907e-06,
"loss": 0.4065,
"step": 795
},
{
"epoch": 1.3819858925664676,
"grad_norm": 0.13419326992967473,
"learning_rate": 2.6547249258818162e-06,
"loss": 0.4119,
"step": 796
},
{
"epoch": 1.3837221920781335,
"grad_norm": 0.13136822894753178,
"learning_rate": 2.641345009992878e-06,
"loss": 0.4153,
"step": 797
},
{
"epoch": 1.3854584915897992,
"grad_norm": 0.14176326400487876,
"learning_rate": 2.627986783343287e-06,
"loss": 0.4096,
"step": 798
},
{
"epoch": 1.387194791101465,
"grad_norm": 0.12839978347380843,
"learning_rate": 2.6146503687699005e-06,
"loss": 0.4044,
"step": 799
},
{
"epoch": 1.3889310906131307,
"grad_norm": 0.1355048536747864,
"learning_rate": 2.601335888909005e-06,
"loss": 0.4163,
"step": 800
},
{
"epoch": 1.3906673901247966,
"grad_norm": 0.12545466668714947,
"learning_rate": 2.5880434661951826e-06,
"loss": 0.4119,
"step": 801
},
{
"epoch": 1.3924036896364622,
"grad_norm": 0.13036763191690007,
"learning_rate": 2.5747732228601903e-06,
"loss": 0.4047,
"step": 802
},
{
"epoch": 1.394139989148128,
"grad_norm": 0.1379598785530406,
"learning_rate": 2.5615252809318287e-06,
"loss": 0.4326,
"step": 803
},
{
"epoch": 1.3958762886597937,
"grad_norm": 0.14077141190929993,
"learning_rate": 2.5482997622328252e-06,
"loss": 0.3938,
"step": 804
},
{
"epoch": 1.3976125881714596,
"grad_norm": 0.1468546409601843,
"learning_rate": 2.5350967883797095e-06,
"loss": 0.4301,
"step": 805
},
{
"epoch": 1.3993488876831253,
"grad_norm": 0.13961508755488275,
"learning_rate": 2.5219164807817055e-06,
"loss": 0.4132,
"step": 806
},
{
"epoch": 1.4010851871947911,
"grad_norm": 0.12741589875103143,
"learning_rate": 2.508758960639599e-06,
"loss": 0.4286,
"step": 807
},
{
"epoch": 1.4028214867064568,
"grad_norm": 0.1322065820346883,
"learning_rate": 2.495624348944633e-06,
"loss": 0.4,
"step": 808
},
{
"epoch": 1.4045577862181227,
"grad_norm": 0.1279223022976378,
"learning_rate": 2.4825127664774008e-06,
"loss": 0.4032,
"step": 809
},
{
"epoch": 1.4062940857297883,
"grad_norm": 0.1299005206195399,
"learning_rate": 2.469424333806718e-06,
"loss": 0.4145,
"step": 810
},
{
"epoch": 1.4080303852414542,
"grad_norm": 0.12318119514845602,
"learning_rate": 2.456359171288534e-06,
"loss": 0.4119,
"step": 811
},
{
"epoch": 1.4097666847531198,
"grad_norm": 0.11803503465881546,
"learning_rate": 2.443317399064806e-06,
"loss": 0.3971,
"step": 812
},
{
"epoch": 1.4115029842647857,
"grad_norm": 0.12120636381131922,
"learning_rate": 2.4302991370624106e-06,
"loss": 0.4154,
"step": 813
},
{
"epoch": 1.4132392837764514,
"grad_norm": 0.14357692984029835,
"learning_rate": 2.4173045049920276e-06,
"loss": 0.4153,
"step": 814
},
{
"epoch": 1.4149755832881172,
"grad_norm": 0.13853494299667582,
"learning_rate": 2.4043336223470525e-06,
"loss": 0.4122,
"step": 815
},
{
"epoch": 1.4167118827997829,
"grad_norm": 0.12762107651550092,
"learning_rate": 2.3913866084024857e-06,
"loss": 0.3934,
"step": 816
},
{
"epoch": 1.4184481823114488,
"grad_norm": 0.14007255270207977,
"learning_rate": 2.3784635822138424e-06,
"loss": 0.4069,
"step": 817
},
{
"epoch": 1.4201844818231144,
"grad_norm": 0.13584088672858124,
"learning_rate": 2.365564662616053e-06,
"loss": 0.4198,
"step": 818
},
{
"epoch": 1.4219207813347803,
"grad_norm": 0.12697303599490736,
"learning_rate": 2.3526899682223813e-06,
"loss": 0.3899,
"step": 819
},
{
"epoch": 1.423657080846446,
"grad_norm": 0.12679484994880091,
"learning_rate": 2.339839617423318e-06,
"loss": 0.4197,
"step": 820
},
{
"epoch": 1.4253933803581118,
"grad_norm": 0.12876866513716562,
"learning_rate": 2.3270137283855022e-06,
"loss": 0.3991,
"step": 821
},
{
"epoch": 1.4271296798697775,
"grad_norm": 0.12836114905719587,
"learning_rate": 2.3142124190506315e-06,
"loss": 0.4021,
"step": 822
},
{
"epoch": 1.4288659793814433,
"grad_norm": 0.12995546010077,
"learning_rate": 2.3014358071343844e-06,
"loss": 0.414,
"step": 823
},
{
"epoch": 1.430602278893109,
"grad_norm": 0.13519331292147682,
"learning_rate": 2.288684010125325e-06,
"loss": 0.4246,
"step": 824
},
{
"epoch": 1.4323385784047749,
"grad_norm": 0.12538832218788362,
"learning_rate": 2.2759571452838325e-06,
"loss": 0.3945,
"step": 825
},
{
"epoch": 1.4340748779164405,
"grad_norm": 0.12565420481547468,
"learning_rate": 2.2632553296410172e-06,
"loss": 0.3893,
"step": 826
},
{
"epoch": 1.4358111774281064,
"grad_norm": 0.14011228845933368,
"learning_rate": 2.2505786799976527e-06,
"loss": 0.4024,
"step": 827
},
{
"epoch": 1.437547476939772,
"grad_norm": 0.13362405741803038,
"learning_rate": 2.2379273129230916e-06,
"loss": 0.4104,
"step": 828
},
{
"epoch": 1.439283776451438,
"grad_norm": 0.12278040688438624,
"learning_rate": 2.2253013447541993e-06,
"loss": 0.3996,
"step": 829
},
{
"epoch": 1.4410200759631036,
"grad_norm": 0.12707888516785454,
"learning_rate": 2.2127008915942817e-06,
"loss": 0.4256,
"step": 830
},
{
"epoch": 1.4427563754747694,
"grad_norm": 0.12297647001266902,
"learning_rate": 2.2001260693120236e-06,
"loss": 0.4116,
"step": 831
},
{
"epoch": 1.444492674986435,
"grad_norm": 0.1316400193577434,
"learning_rate": 2.1875769935404145e-06,
"loss": 0.4257,
"step": 832
},
{
"epoch": 1.446228974498101,
"grad_norm": 0.12592051019559328,
"learning_rate": 2.17505377967569e-06,
"loss": 0.4002,
"step": 833
},
{
"epoch": 1.4479652740097668,
"grad_norm": 0.13307011942276165,
"learning_rate": 2.1625565428762687e-06,
"loss": 0.4143,
"step": 834
},
{
"epoch": 1.4497015735214325,
"grad_norm": 0.13261132323467073,
"learning_rate": 2.1500853980616997e-06,
"loss": 0.4103,
"step": 835
},
{
"epoch": 1.4514378730330981,
"grad_norm": 0.13736208640453593,
"learning_rate": 2.1376404599115963e-06,
"loss": 0.4076,
"step": 836
},
{
"epoch": 1.453174172544764,
"grad_norm": 0.14043076559730674,
"learning_rate": 2.125221842864585e-06,
"loss": 0.4208,
"step": 837
},
{
"epoch": 1.4549104720564299,
"grad_norm": 0.12533168615565918,
"learning_rate": 2.1128296611172593e-06,
"loss": 0.4137,
"step": 838
},
{
"epoch": 1.4566467715680955,
"grad_norm": 0.11887754452981136,
"learning_rate": 2.10046402862312e-06,
"loss": 0.4174,
"step": 839
},
{
"epoch": 1.4583830710797612,
"grad_norm": 0.1365498648247696,
"learning_rate": 2.0881250590915316e-06,
"loss": 0.4125,
"step": 840
},
{
"epoch": 1.460119370591427,
"grad_norm": 0.12267517652888206,
"learning_rate": 2.075812865986677e-06,
"loss": 0.4091,
"step": 841
},
{
"epoch": 1.461855670103093,
"grad_norm": 0.13012211466376844,
"learning_rate": 2.0635275625265187e-06,
"loss": 0.4311,
"step": 842
},
{
"epoch": 1.4635919696147586,
"grad_norm": 0.129498733682449,
"learning_rate": 2.051269261681745e-06,
"loss": 0.4224,
"step": 843
},
{
"epoch": 1.4653282691264242,
"grad_norm": 0.12190967445012241,
"learning_rate": 2.039038076174748e-06,
"loss": 0.4155,
"step": 844
},
{
"epoch": 1.46706456863809,
"grad_norm": 0.13087222637882714,
"learning_rate": 2.0268341184785674e-06,
"loss": 0.4118,
"step": 845
},
{
"epoch": 1.468800868149756,
"grad_norm": 0.134418294508292,
"learning_rate": 2.0146575008158765e-06,
"loss": 0.4269,
"step": 846
},
{
"epoch": 1.4705371676614216,
"grad_norm": 0.12743480518428843,
"learning_rate": 2.0025083351579337e-06,
"loss": 0.4006,
"step": 847
},
{
"epoch": 1.4722734671730873,
"grad_norm": 0.12625799127877016,
"learning_rate": 1.990386733223569e-06,
"loss": 0.4217,
"step": 848
},
{
"epoch": 1.4740097666847531,
"grad_norm": 0.13796255034449323,
"learning_rate": 1.978292806478134e-06,
"loss": 0.4329,
"step": 849
},
{
"epoch": 1.475746066196419,
"grad_norm": 0.13577869591390213,
"learning_rate": 1.9662266661325038e-06,
"loss": 0.4108,
"step": 850
},
{
"epoch": 1.4774823657080847,
"grad_norm": 0.12374125989207808,
"learning_rate": 1.9541884231420304e-06,
"loss": 0.4058,
"step": 851
},
{
"epoch": 1.4792186652197503,
"grad_norm": 0.13167015482453784,
"learning_rate": 1.9421781882055447e-06,
"loss": 0.4146,
"step": 852
},
{
"epoch": 1.4809549647314162,
"grad_norm": 0.1332394699372241,
"learning_rate": 1.930196071764312e-06,
"loss": 0.425,
"step": 853
},
{
"epoch": 1.482691264243082,
"grad_norm": 0.1268455605647913,
"learning_rate": 1.918242184001044e-06,
"loss": 0.3957,
"step": 854
},
{
"epoch": 1.4844275637547477,
"grad_norm": 0.13517298465365898,
"learning_rate": 1.906316634838865e-06,
"loss": 0.4179,
"step": 855
},
{
"epoch": 1.4861638632664134,
"grad_norm": 0.12480497326109305,
"learning_rate": 1.8944195339403176e-06,
"loss": 0.4163,
"step": 856
},
{
"epoch": 1.4879001627780792,
"grad_norm": 0.15655713241339458,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.418,
"step": 857
},
{
"epoch": 1.4896364622897451,
"grad_norm": 0.12127548342469688,
"learning_rate": 1.8707111142752493e-06,
"loss": 0.395,
"step": 858
},
{
"epoch": 1.4913727618014108,
"grad_norm": 0.12456923098315939,
"learning_rate": 1.8589000135217882e-06,
"loss": 0.4027,
"step": 859
},
{
"epoch": 1.4931090613130764,
"grad_norm": 0.11957509588071376,
"learning_rate": 1.8471177970560712e-06,
"loss": 0.409,
"step": 860
},
{
"epoch": 1.4948453608247423,
"grad_norm": 0.12209458380394814,
"learning_rate": 1.8353645732225977e-06,
"loss": 0.3863,
"step": 861
},
{
"epoch": 1.4965816603364082,
"grad_norm": 0.1273098707036207,
"learning_rate": 1.8236404500992767e-06,
"loss": 0.3968,
"step": 862
},
{
"epoch": 1.4983179598480738,
"grad_norm": 0.12369245264759446,
"learning_rate": 1.8119455354964088e-06,
"loss": 0.4233,
"step": 863
},
{
"epoch": 1.5000542593597395,
"grad_norm": 0.13398660997805997,
"learning_rate": 1.8002799369557121e-06,
"loss": 0.4153,
"step": 864
},
{
"epoch": 1.5017905588714053,
"grad_norm": 0.1384801086046812,
"learning_rate": 1.7886437617493206e-06,
"loss": 0.4333,
"step": 865
},
{
"epoch": 1.5035268583830712,
"grad_norm": 0.11746726161406841,
"learning_rate": 1.7770371168788042e-06,
"loss": 0.4087,
"step": 866
},
{
"epoch": 1.5052631578947369,
"grad_norm": 0.12377311929628279,
"learning_rate": 1.765460109074188e-06,
"loss": 0.4066,
"step": 867
},
{
"epoch": 1.5069994574064025,
"grad_norm": 0.12518207143985072,
"learning_rate": 1.7539128447929603e-06,
"loss": 0.4275,
"step": 868
},
{
"epoch": 1.5087357569180684,
"grad_norm": 0.12617569143392077,
"learning_rate": 1.7423954302191047e-06,
"loss": 0.4083,
"step": 869
},
{
"epoch": 1.5104720564297343,
"grad_norm": 0.1236554713478632,
"learning_rate": 1.7309079712621152e-06,
"loss": 0.3962,
"step": 870
},
{
"epoch": 1.5122083559414,
"grad_norm": 0.1246822344076014,
"learning_rate": 1.7194505735560307e-06,
"loss": 0.4099,
"step": 871
},
{
"epoch": 1.5139446554530656,
"grad_norm": 0.1299322275534694,
"learning_rate": 1.7080233424584553e-06,
"loss": 0.4215,
"step": 872
},
{
"epoch": 1.5156809549647314,
"grad_norm": 0.13649358878274787,
"learning_rate": 1.6966263830495939e-06,
"loss": 0.429,
"step": 873
},
{
"epoch": 1.5174172544763973,
"grad_norm": 0.13493202861295495,
"learning_rate": 1.6852598001312836e-06,
"loss": 0.4203,
"step": 874
},
{
"epoch": 1.519153553988063,
"grad_norm": 0.1300026060628514,
"learning_rate": 1.6739236982260377e-06,
"loss": 0.4176,
"step": 875
},
{
"epoch": 1.5208898534997286,
"grad_norm": 0.13491527997309305,
"learning_rate": 1.662618181576071e-06,
"loss": 0.4156,
"step": 876
},
{
"epoch": 1.5226261530113945,
"grad_norm": 0.11300836408342123,
"learning_rate": 1.6513433541423529e-06,
"loss": 0.3902,
"step": 877
},
{
"epoch": 1.5243624525230604,
"grad_norm": 0.12174204501469876,
"learning_rate": 1.6400993196036441e-06,
"loss": 0.4242,
"step": 878
},
{
"epoch": 1.526098752034726,
"grad_norm": 0.13546333027439583,
"learning_rate": 1.6288861813555511e-06,
"loss": 0.4227,
"step": 879
},
{
"epoch": 1.5278350515463917,
"grad_norm": 0.12127618299527383,
"learning_rate": 1.6177040425095664e-06,
"loss": 0.414,
"step": 880
},
{
"epoch": 1.5295713510580575,
"grad_norm": 0.13012169318108746,
"learning_rate": 1.6065530058921253e-06,
"loss": 0.4277,
"step": 881
},
{
"epoch": 1.5313076505697234,
"grad_norm": 0.1423957843203789,
"learning_rate": 1.5954331740436591e-06,
"loss": 0.4005,
"step": 882
},
{
"epoch": 1.533043950081389,
"grad_norm": 0.12506933416673377,
"learning_rate": 1.5843446492176562e-06,
"loss": 0.3961,
"step": 883
},
{
"epoch": 1.5347802495930547,
"grad_norm": 0.13366751188467238,
"learning_rate": 1.5732875333797143e-06,
"loss": 0.4149,
"step": 884
},
{
"epoch": 1.5365165491047206,
"grad_norm": 0.12554028072879378,
"learning_rate": 1.562261928206608e-06,
"loss": 0.4123,
"step": 885
},
{
"epoch": 1.5382528486163864,
"grad_norm": 0.12208173618819541,
"learning_rate": 1.551267935085351e-06,
"loss": 0.3962,
"step": 886
},
{
"epoch": 1.539989148128052,
"grad_norm": 0.12016508007857718,
"learning_rate": 1.5403056551122697e-06,
"loss": 0.4124,
"step": 887
},
{
"epoch": 1.5417254476397177,
"grad_norm": 0.13559374517942055,
"learning_rate": 1.5293751890920649e-06,
"loss": 0.4134,
"step": 888
},
{
"epoch": 1.5434617471513836,
"grad_norm": 0.11686815344911052,
"learning_rate": 1.5184766375368914e-06,
"loss": 0.4016,
"step": 889
},
{
"epoch": 1.5451980466630495,
"grad_norm": 0.13384389935838778,
"learning_rate": 1.5076101006654286e-06,
"loss": 0.405,
"step": 890
},
{
"epoch": 1.5469343461747151,
"grad_norm": 0.13607774222673785,
"learning_rate": 1.4967756784019666e-06,
"loss": 0.4131,
"step": 891
},
{
"epoch": 1.5486706456863808,
"grad_norm": 0.12143950640173919,
"learning_rate": 1.485973470375479e-06,
"loss": 0.4393,
"step": 892
},
{
"epoch": 1.5504069451980467,
"grad_norm": 0.13447482018669238,
"learning_rate": 1.4752035759187106e-06,
"loss": 0.3989,
"step": 893
},
{
"epoch": 1.5521432447097125,
"grad_norm": 0.12219257242260986,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.4136,
"step": 894
},
{
"epoch": 1.5538795442213782,
"grad_norm": 0.11918499493508868,
"learning_rate": 1.4537611235586863e-06,
"loss": 0.4105,
"step": 895
},
{
"epoch": 1.5556158437330438,
"grad_norm": 0.12248886593105629,
"learning_rate": 1.4430887628315715e-06,
"loss": 0.4122,
"step": 896
},
{
"epoch": 1.5573521432447097,
"grad_norm": 0.1220004487241072,
"learning_rate": 1.4324491100246386e-06,
"loss": 0.3947,
"step": 897
},
{
"epoch": 1.5590884427563756,
"grad_norm": 0.11598397128698863,
"learning_rate": 1.4218422629758405e-06,
"loss": 0.4004,
"step": 898
},
{
"epoch": 1.5608247422680412,
"grad_norm": 0.12691720000176912,
"learning_rate": 1.4112683192214598e-06,
"loss": 0.4338,
"step": 899
},
{
"epoch": 1.562561041779707,
"grad_norm": 0.13316611138281842,
"learning_rate": 1.40072737599522e-06,
"loss": 0.4132,
"step": 900
},
{
"epoch": 1.5642973412913728,
"grad_norm": 0.13213292659724496,
"learning_rate": 1.390219530227378e-06,
"loss": 0.4159,
"step": 901
},
{
"epoch": 1.5660336408030386,
"grad_norm": 0.12083065391743368,
"learning_rate": 1.3797448785438433e-06,
"loss": 0.4142,
"step": 902
},
{
"epoch": 1.5677699403147043,
"grad_norm": 0.12191223406786662,
"learning_rate": 1.369303517265283e-06,
"loss": 0.4132,
"step": 903
},
{
"epoch": 1.56950623982637,
"grad_norm": 0.13900771994376313,
"learning_rate": 1.358895542406245e-06,
"loss": 0.4261,
"step": 904
},
{
"epoch": 1.5712425393380358,
"grad_norm": 0.1278302066364733,
"learning_rate": 1.348521049674264e-06,
"loss": 0.4066,
"step": 905
},
{
"epoch": 1.5729788388497017,
"grad_norm": 0.12945116378479313,
"learning_rate": 1.3381801344689876e-06,
"loss": 0.4089,
"step": 906
},
{
"epoch": 1.5747151383613673,
"grad_norm": 0.12916747174933954,
"learning_rate": 1.3278728918812978e-06,
"loss": 0.4225,
"step": 907
},
{
"epoch": 1.576451437873033,
"grad_norm": 0.11927642012866999,
"learning_rate": 1.3175994166924394e-06,
"loss": 0.4294,
"step": 908
},
{
"epoch": 1.5781877373846989,
"grad_norm": 0.1206412178443493,
"learning_rate": 1.3073598033731427e-06,
"loss": 0.4026,
"step": 909
},
{
"epoch": 1.5799240368963647,
"grad_norm": 0.13593801989098223,
"learning_rate": 1.2971541460827597e-06,
"loss": 0.4259,
"step": 910
},
{
"epoch": 1.5816603364080304,
"grad_norm": 0.13696333152462678,
"learning_rate": 1.2869825386683938e-06,
"loss": 0.4084,
"step": 911
},
{
"epoch": 1.583396635919696,
"grad_norm": 0.13284306946184443,
"learning_rate": 1.2768450746640448e-06,
"loss": 0.4192,
"step": 912
},
{
"epoch": 1.585132935431362,
"grad_norm": 0.13243455602958631,
"learning_rate": 1.2667418472897386e-06,
"loss": 0.4193,
"step": 913
},
{
"epoch": 1.5868692349430278,
"grad_norm": 0.1225346669572518,
"learning_rate": 1.2566729494506768e-06,
"loss": 0.3819,
"step": 914
},
{
"epoch": 1.5886055344546934,
"grad_norm": 0.12964877271023698,
"learning_rate": 1.246638473736378e-06,
"loss": 0.4087,
"step": 915
},
{
"epoch": 1.590341833966359,
"grad_norm": 0.1341324486851053,
"learning_rate": 1.236638512419835e-06,
"loss": 0.3998,
"step": 916
},
{
"epoch": 1.592078133478025,
"grad_norm": 0.12992187668983388,
"learning_rate": 1.2266731574566536e-06,
"loss": 0.4248,
"step": 917
},
{
"epoch": 1.5938144329896908,
"grad_norm": 0.1282491585530033,
"learning_rate": 1.2167425004842171e-06,
"loss": 0.4162,
"step": 918
},
{
"epoch": 1.5955507325013565,
"grad_norm": 0.12186709446939265,
"learning_rate": 1.2068466328208368e-06,
"loss": 0.4291,
"step": 919
},
{
"epoch": 1.5972870320130221,
"grad_norm": 0.14142890451759685,
"learning_rate": 1.196985645464921e-06,
"loss": 0.419,
"step": 920
},
{
"epoch": 1.599023331524688,
"grad_norm": 0.14338876457298588,
"learning_rate": 1.1871596290941278e-06,
"loss": 0.4077,
"step": 921
},
{
"epoch": 1.6007596310363539,
"grad_norm": 0.1305962229366433,
"learning_rate": 1.1773686740645384e-06,
"loss": 0.4071,
"step": 922
},
{
"epoch": 1.6024959305480195,
"grad_norm": 0.11435580814232883,
"learning_rate": 1.1676128704098222e-06,
"loss": 0.4057,
"step": 923
},
{
"epoch": 1.6042322300596852,
"grad_norm": 0.11670856676886189,
"learning_rate": 1.1578923078404152e-06,
"loss": 0.4058,
"step": 924
},
{
"epoch": 1.605968529571351,
"grad_norm": 0.124581473605881,
"learning_rate": 1.1482070757426855e-06,
"loss": 0.418,
"step": 925
},
{
"epoch": 1.607704829083017,
"grad_norm": 0.11983010369768266,
"learning_rate": 1.1385572631781178e-06,
"loss": 0.4045,
"step": 926
},
{
"epoch": 1.6094411285946826,
"grad_norm": 0.124700646599233,
"learning_rate": 1.1289429588824962e-06,
"loss": 0.3989,
"step": 927
},
{
"epoch": 1.6111774281063482,
"grad_norm": 0.13817353344869943,
"learning_rate": 1.1193642512650805e-06,
"loss": 0.4086,
"step": 928
},
{
"epoch": 1.612913727618014,
"grad_norm": 0.12049426083605701,
"learning_rate": 1.1098212284078037e-06,
"loss": 0.3997,
"step": 929
},
{
"epoch": 1.61465002712968,
"grad_norm": 0.11944095499572413,
"learning_rate": 1.1003139780644467e-06,
"loss": 0.4146,
"step": 930
},
{
"epoch": 1.6163863266413456,
"grad_norm": 0.12874490049668783,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.4078,
"step": 931
},
{
"epoch": 1.6181226261530113,
"grad_norm": 0.1200731058980104,
"learning_rate": 1.0814071442890983e-06,
"loss": 0.4134,
"step": 932
},
{
"epoch": 1.6198589256646772,
"grad_norm": 0.12656380110216298,
"learning_rate": 1.07200773471672e-06,
"loss": 0.4168,
"step": 933
},
{
"epoch": 1.621595225176343,
"grad_norm": 0.12268627183607625,
"learning_rate": 1.0626444453758895e-06,
"loss": 0.4098,
"step": 934
},
{
"epoch": 1.6233315246880087,
"grad_norm": 0.12230549063662255,
"learning_rate": 1.053317362367639e-06,
"loss": 0.4099,
"step": 935
},
{
"epoch": 1.6250678241996743,
"grad_norm": 0.12897798951819442,
"learning_rate": 1.0440265714600573e-06,
"loss": 0.4109,
"step": 936
},
{
"epoch": 1.6268041237113402,
"grad_norm": 0.1296337132331582,
"learning_rate": 1.0347721580875125e-06,
"loss": 0.4344,
"step": 937
},
{
"epoch": 1.628540423223006,
"grad_norm": 0.12301012837827231,
"learning_rate": 1.0255542073498487e-06,
"loss": 0.4119,
"step": 938
},
{
"epoch": 1.6302767227346717,
"grad_norm": 0.13806847181548138,
"learning_rate": 1.016372804011625e-06,
"loss": 0.4224,
"step": 939
},
{
"epoch": 1.6320130222463374,
"grad_norm": 0.11912380035465843,
"learning_rate": 1.0072280325013185e-06,
"loss": 0.3886,
"step": 940
},
{
"epoch": 1.6337493217580032,
"grad_norm": 0.12481979295903664,
"learning_rate": 9.981199769105605e-07,
"loss": 0.411,
"step": 941
},
{
"epoch": 1.6354856212696691,
"grad_norm": 0.12010701170379462,
"learning_rate": 9.890487209933497e-07,
"loss": 0.4158,
"step": 942
},
{
"epoch": 1.6372219207813348,
"grad_norm": 0.1271131768562924,
"learning_rate": 9.80014348165298e-07,
"loss": 0.4135,
"step": 943
},
{
"epoch": 1.6389582202930004,
"grad_norm": 0.11686752698079612,
"learning_rate": 9.710169415028492e-07,
"loss": 0.3941,
"step": 944
},
{
"epoch": 1.6406945198046663,
"grad_norm": 0.13442063233496024,
"learning_rate": 9.62056583742527e-07,
"loss": 0.428,
"step": 945
},
{
"epoch": 1.6424308193163322,
"grad_norm": 0.1367443089530073,
"learning_rate": 9.531333572801604e-07,
"loss": 0.4306,
"step": 946
},
{
"epoch": 1.6441671188279978,
"grad_norm": 0.12926844185744127,
"learning_rate": 9.442473441701422e-07,
"loss": 0.4075,
"step": 947
},
{
"epoch": 1.6459034183396635,
"grad_norm": 0.138290795761905,
"learning_rate": 9.353986261246606e-07,
"loss": 0.4068,
"step": 948
},
{
"epoch": 1.6476397178513293,
"grad_norm": 0.13448081054207545,
"learning_rate": 9.26587284512957e-07,
"loss": 0.4116,
"step": 949
},
{
"epoch": 1.6493760173629952,
"grad_norm": 0.12689121517003302,
"learning_rate": 9.178134003605721e-07,
"loss": 0.4095,
"step": 950
},
{
"epoch": 1.6511123168746609,
"grad_norm": 0.11431606376406304,
"learning_rate": 9.090770543486033e-07,
"loss": 0.4055,
"step": 951
},
{
"epoch": 1.6528486163863265,
"grad_norm": 0.129398563963692,
"learning_rate": 9.003783268129612e-07,
"loss": 0.42,
"step": 952
},
{
"epoch": 1.6545849158979924,
"grad_norm": 0.12584472294549326,
"learning_rate": 8.917172977436356e-07,
"loss": 0.4099,
"step": 953
},
{
"epoch": 1.6563212154096583,
"grad_norm": 0.13257419772829573,
"learning_rate": 8.830940467839538e-07,
"loss": 0.4181,
"step": 954
},
{
"epoch": 1.658057514921324,
"grad_norm": 0.12623087863431037,
"learning_rate": 8.745086532298497e-07,
"loss": 0.4154,
"step": 955
},
{
"epoch": 1.6597938144329896,
"grad_norm": 0.1323049844837645,
"learning_rate": 8.659611960291397e-07,
"loss": 0.4227,
"step": 956
},
{
"epoch": 1.6615301139446554,
"grad_norm": 0.11871513898323233,
"learning_rate": 8.574517537807897e-07,
"loss": 0.3906,
"step": 957
},
{
"epoch": 1.6632664134563213,
"grad_norm": 0.13398389612030392,
"learning_rate": 8.48980404734196e-07,
"loss": 0.4231,
"step": 958
},
{
"epoch": 1.665002712967987,
"grad_norm": 0.12019923307656202,
"learning_rate": 8.40547226788464e-07,
"loss": 0.4232,
"step": 959
},
{
"epoch": 1.6667390124796526,
"grad_norm": 0.1275078036550687,
"learning_rate": 8.321522974916968e-07,
"loss": 0.4127,
"step": 960
},
{
"epoch": 1.6684753119913185,
"grad_norm": 0.12244677864657005,
"learning_rate": 8.237956940402758e-07,
"loss": 0.4091,
"step": 961
},
{
"epoch": 1.6702116115029844,
"grad_norm": 0.12771023412092483,
"learning_rate": 8.154774932781523e-07,
"loss": 0.4232,
"step": 962
},
{
"epoch": 1.67194791101465,
"grad_norm": 0.1279342671525142,
"learning_rate": 8.071977716961432e-07,
"loss": 0.4164,
"step": 963
},
{
"epoch": 1.6736842105263157,
"grad_norm": 0.11698740747617209,
"learning_rate": 7.989566054312286e-07,
"loss": 0.3993,
"step": 964
},
{
"epoch": 1.6754205100379815,
"grad_norm": 0.11905130215832907,
"learning_rate": 7.907540702658456e-07,
"loss": 0.3869,
"step": 965
},
{
"epoch": 1.6771568095496474,
"grad_norm": 0.1299614359355832,
"learning_rate": 7.82590241627198e-07,
"loss": 0.416,
"step": 966
},
{
"epoch": 1.678893109061313,
"grad_norm": 0.11874306176345516,
"learning_rate": 7.744651945865572e-07,
"loss": 0.4272,
"step": 967
},
{
"epoch": 1.6806294085729787,
"grad_norm": 0.12183634974576847,
"learning_rate": 7.663790038585794e-07,
"loss": 0.3941,
"step": 968
},
{
"epoch": 1.6823657080846446,
"grad_norm": 0.11711707630925917,
"learning_rate": 7.583317438006094e-07,
"loss": 0.4115,
"step": 969
},
{
"epoch": 1.6841020075963105,
"grad_norm": 0.11537841404891344,
"learning_rate": 7.503234884120031e-07,
"loss": 0.4166,
"step": 970
},
{
"epoch": 1.685838307107976,
"grad_norm": 0.12352881324092035,
"learning_rate": 7.423543113334436e-07,
"loss": 0.3987,
"step": 971
},
{
"epoch": 1.6875746066196418,
"grad_norm": 0.13446570147296136,
"learning_rate": 7.344242858462697e-07,
"loss": 0.4293,
"step": 972
},
{
"epoch": 1.6893109061313076,
"grad_norm": 0.12555237261456514,
"learning_rate": 7.265334848717931e-07,
"loss": 0.4024,
"step": 973
},
{
"epoch": 1.6910472056429735,
"grad_norm": 0.12022020939360528,
"learning_rate": 7.186819809706358e-07,
"loss": 0.4082,
"step": 974
},
{
"epoch": 1.6927835051546392,
"grad_norm": 0.12022778005709792,
"learning_rate": 7.108698463420577e-07,
"loss": 0.4206,
"step": 975
},
{
"epoch": 1.6945198046663048,
"grad_norm": 0.13192095245849325,
"learning_rate": 7.030971528232983e-07,
"loss": 0.4258,
"step": 976
},
{
"epoch": 1.6962561041779707,
"grad_norm": 0.1288027515410068,
"learning_rate": 6.953639718889077e-07,
"loss": 0.4277,
"step": 977
},
{
"epoch": 1.6979924036896366,
"grad_norm": 0.1257765886436919,
"learning_rate": 6.876703746500984e-07,
"loss": 0.4241,
"step": 978
},
{
"epoch": 1.6997287032013022,
"grad_norm": 0.11829898374820907,
"learning_rate": 6.800164318540836e-07,
"loss": 0.3969,
"step": 979
},
{
"epoch": 1.7014650027129679,
"grad_norm": 0.12343280352654452,
"learning_rate": 6.724022138834341e-07,
"loss": 0.4394,
"step": 980
},
{
"epoch": 1.7032013022246337,
"grad_norm": 0.12584810369777363,
"learning_rate": 6.648277907554235e-07,
"loss": 0.4086,
"step": 981
},
{
"epoch": 1.7049376017362996,
"grad_norm": 0.12220526205664348,
"learning_rate": 6.572932321213921e-07,
"loss": 0.4149,
"step": 982
},
{
"epoch": 1.7066739012479653,
"grad_norm": 0.11264723449198615,
"learning_rate": 6.497986072660989e-07,
"loss": 0.3946,
"step": 983
},
{
"epoch": 1.708410200759631,
"grad_norm": 0.11486638829519887,
"learning_rate": 6.423439851070884e-07,
"loss": 0.4029,
"step": 984
},
{
"epoch": 1.7101465002712968,
"grad_norm": 0.12297059784409017,
"learning_rate": 6.349294341940593e-07,
"loss": 0.4178,
"step": 985
},
{
"epoch": 1.7118827997829626,
"grad_norm": 0.12946935380664815,
"learning_rate": 6.275550227082278e-07,
"loss": 0.4286,
"step": 986
},
{
"epoch": 1.7136190992946283,
"grad_norm": 0.13174117409809644,
"learning_rate": 6.202208184617065e-07,
"loss": 0.4129,
"step": 987
},
{
"epoch": 1.715355398806294,
"grad_norm": 0.1260009778700492,
"learning_rate": 6.129268888968759e-07,
"loss": 0.4226,
"step": 988
},
{
"epoch": 1.7170916983179598,
"grad_norm": 0.13334490497918267,
"learning_rate": 6.056733010857713e-07,
"loss": 0.4169,
"step": 989
},
{
"epoch": 1.7188279978296257,
"grad_norm": 0.12492912736636083,
"learning_rate": 5.984601217294567e-07,
"loss": 0.4414,
"step": 990
},
{
"epoch": 1.7205642973412913,
"grad_norm": 0.12227556231870458,
"learning_rate": 5.91287417157419e-07,
"loss": 0.4155,
"step": 991
},
{
"epoch": 1.722300596852957,
"grad_norm": 0.12166850206869084,
"learning_rate": 5.841552533269534e-07,
"loss": 0.4155,
"step": 992
},
{
"epoch": 1.7240368963646229,
"grad_norm": 0.11635001857121037,
"learning_rate": 5.770636958225617e-07,
"loss": 0.4153,
"step": 993
},
{
"epoch": 1.7257731958762887,
"grad_norm": 0.12318020561314444,
"learning_rate": 5.700128098553436e-07,
"loss": 0.4072,
"step": 994
},
{
"epoch": 1.7275094953879544,
"grad_norm": 0.12603305151655275,
"learning_rate": 5.630026602624011e-07,
"loss": 0.4222,
"step": 995
},
{
"epoch": 1.72924579489962,
"grad_norm": 0.11621559170055032,
"learning_rate": 5.560333115062389e-07,
"loss": 0.3941,
"step": 996
},
{
"epoch": 1.730982094411286,
"grad_norm": 0.12012197885101744,
"learning_rate": 5.491048276741784e-07,
"loss": 0.4274,
"step": 997
},
{
"epoch": 1.7327183939229518,
"grad_norm": 0.13911959663978574,
"learning_rate": 5.422172724777586e-07,
"loss": 0.4252,
"step": 998
},
{
"epoch": 1.7344546934346174,
"grad_norm": 0.1248001547802355,
"learning_rate": 5.353707092521581e-07,
"loss": 0.4057,
"step": 999
},
{
"epoch": 1.736190992946283,
"grad_norm": 0.12501251772698127,
"learning_rate": 5.285652009556075e-07,
"loss": 0.4318,
"step": 1000
},
{
"epoch": 1.737927292457949,
"grad_norm": 0.13319821322804037,
"learning_rate": 5.218008101688172e-07,
"loss": 0.4211,
"step": 1001
},
{
"epoch": 1.7396635919696148,
"grad_norm": 0.13793608628093898,
"learning_rate": 5.150775990943924e-07,
"loss": 0.4161,
"step": 1002
},
{
"epoch": 1.7413998914812805,
"grad_norm": 0.11965611546509496,
"learning_rate": 5.083956295562704e-07,
"loss": 0.4147,
"step": 1003
},
{
"epoch": 1.7431361909929461,
"grad_norm": 0.11638856925820777,
"learning_rate": 5.017549629991437e-07,
"loss": 0.4156,
"step": 1004
},
{
"epoch": 1.744872490504612,
"grad_norm": 0.12049432353418484,
"learning_rate": 4.951556604879049e-07,
"loss": 0.4076,
"step": 1005
},
{
"epoch": 1.746608790016278,
"grad_norm": 0.12657454111882746,
"learning_rate": 4.885977827070748e-07,
"loss": 0.436,
"step": 1006
},
{
"epoch": 1.7483450895279435,
"grad_norm": 0.1271471881131416,
"learning_rate": 4.820813899602506e-07,
"loss": 0.4137,
"step": 1007
},
{
"epoch": 1.7500813890396092,
"grad_norm": 0.1194948081487038,
"learning_rate": 4.756065421695499e-07,
"loss": 0.3982,
"step": 1008
},
{
"epoch": 1.751817688551275,
"grad_norm": 0.12495098812494451,
"learning_rate": 4.6917329887506133e-07,
"loss": 0.4353,
"step": 1009
},
{
"epoch": 1.753553988062941,
"grad_norm": 0.1151883405922238,
"learning_rate": 4.6278171923429207e-07,
"loss": 0.391,
"step": 1010
},
{
"epoch": 1.7552902875746066,
"grad_norm": 0.1331087193647889,
"learning_rate": 4.5643186202162904e-07,
"loss": 0.412,
"step": 1011
},
{
"epoch": 1.7570265870862722,
"grad_norm": 0.11920565550399728,
"learning_rate": 4.501237856277979e-07,
"loss": 0.407,
"step": 1012
},
{
"epoch": 1.7587628865979381,
"grad_norm": 0.12336162708275682,
"learning_rate": 4.43857548059321e-07,
"loss": 0.4104,
"step": 1013
},
{
"epoch": 1.760499186109604,
"grad_norm": 0.1177681245029087,
"learning_rate": 4.376332069379929e-07,
"loss": 0.4033,
"step": 1014
},
{
"epoch": 1.7622354856212696,
"grad_norm": 0.11923987199445626,
"learning_rate": 4.3145081950033915e-07,
"loss": 0.406,
"step": 1015
},
{
"epoch": 1.7639717851329353,
"grad_norm": 0.12030800995449681,
"learning_rate": 4.2531044259710217e-07,
"loss": 0.4142,
"step": 1016
},
{
"epoch": 1.7657080846446012,
"grad_norm": 0.13356196556406277,
"learning_rate": 4.192121326927073e-07,
"loss": 0.4157,
"step": 1017
},
{
"epoch": 1.767444384156267,
"grad_norm": 0.11767092243929073,
"learning_rate": 4.131559458647544e-07,
"loss": 0.3966,
"step": 1018
},
{
"epoch": 1.7691806836679327,
"grad_norm": 0.11976922663082386,
"learning_rate": 4.0714193780348965e-07,
"loss": 0.3953,
"step": 1019
},
{
"epoch": 1.7709169831795983,
"grad_norm": 0.11569192386039726,
"learning_rate": 4.0117016381130636e-07,
"loss": 0.4009,
"step": 1020
},
{
"epoch": 1.7726532826912642,
"grad_norm": 0.12306395373992947,
"learning_rate": 3.952406788022267e-07,
"loss": 0.3994,
"step": 1021
},
{
"epoch": 1.77438958220293,
"grad_norm": 0.12551376699711242,
"learning_rate": 3.89353537301404e-07,
"loss": 0.4179,
"step": 1022
},
{
"epoch": 1.7761258817145957,
"grad_norm": 0.1242165395810953,
"learning_rate": 3.8350879344461134e-07,
"loss": 0.4278,
"step": 1023
},
{
"epoch": 1.7778621812262614,
"grad_norm": 0.11672137446032638,
"learning_rate": 3.7770650097775805e-07,
"loss": 0.4059,
"step": 1024
},
{
"epoch": 1.7795984807379273,
"grad_norm": 0.14417998618582262,
"learning_rate": 3.71946713256382e-07,
"loss": 0.4018,
"step": 1025
},
{
"epoch": 1.7813347802495931,
"grad_norm": 0.11215087093514516,
"learning_rate": 3.6622948324516796e-07,
"loss": 0.4339,
"step": 1026
},
{
"epoch": 1.7830710797612588,
"grad_norm": 0.11670782056107784,
"learning_rate": 3.6055486351745327e-07,
"loss": 0.4165,
"step": 1027
},
{
"epoch": 1.7848073792729244,
"grad_norm": 0.11642631630152035,
"learning_rate": 3.549229062547532e-07,
"loss": 0.3922,
"step": 1028
},
{
"epoch": 1.7865436787845903,
"grad_norm": 0.13084447236943444,
"learning_rate": 3.4933366324627183e-07,
"loss": 0.4372,
"step": 1029
},
{
"epoch": 1.7882799782962562,
"grad_norm": 0.12019833156490904,
"learning_rate": 3.4378718588843395e-07,
"loss": 0.3918,
"step": 1030
},
{
"epoch": 1.7900162778079218,
"grad_norm": 0.12401178225076169,
"learning_rate": 3.3828352518440464e-07,
"loss": 0.4288,
"step": 1031
},
{
"epoch": 1.7917525773195875,
"grad_norm": 0.11609111078950146,
"learning_rate": 3.328227317436278e-07,
"loss": 0.4066,
"step": 1032
},
{
"epoch": 1.7934888768312534,
"grad_norm": 0.12936793468009347,
"learning_rate": 3.274048557813553e-07,
"loss": 0.4284,
"step": 1033
},
{
"epoch": 1.7952251763429192,
"grad_norm": 0.13160087563042192,
"learning_rate": 3.220299471181898e-07,
"loss": 0.4141,
"step": 1034
},
{
"epoch": 1.7969614758545849,
"grad_norm": 0.12367898334775716,
"learning_rate": 3.1669805517961896e-07,
"loss": 0.4163,
"step": 1035
},
{
"epoch": 1.7986977753662505,
"grad_norm": 0.11825101838963192,
"learning_rate": 3.1140922899557115e-07,
"loss": 0.3847,
"step": 1036
},
{
"epoch": 1.8004340748779164,
"grad_norm": 0.11677455293491364,
"learning_rate": 3.061635171999566e-07,
"loss": 0.3883,
"step": 1037
},
{
"epoch": 1.8021703743895823,
"grad_norm": 0.12161061205938878,
"learning_rate": 3.0096096803022445e-07,
"loss": 0.4208,
"step": 1038
},
{
"epoch": 1.803906673901248,
"grad_norm": 0.11698975276704002,
"learning_rate": 2.9580162932691726e-07,
"loss": 0.4057,
"step": 1039
},
{
"epoch": 1.8056429734129136,
"grad_norm": 0.11411535406581336,
"learning_rate": 2.906855485332305e-07,
"loss": 0.4118,
"step": 1040
},
{
"epoch": 1.8073792729245794,
"grad_norm": 0.11562855972013814,
"learning_rate": 2.85612772694579e-07,
"loss": 0.3985,
"step": 1041
},
{
"epoch": 1.8091155724362453,
"grad_norm": 0.1180653895313076,
"learning_rate": 2.8058334845816214e-07,
"loss": 0.4206,
"step": 1042
},
{
"epoch": 1.810851871947911,
"grad_norm": 0.1131823910445962,
"learning_rate": 2.7559732207253554e-07,
"loss": 0.3917,
"step": 1043
},
{
"epoch": 1.8125881714595766,
"grad_norm": 0.11671701219857006,
"learning_rate": 2.706547393871839e-07,
"loss": 0.427,
"step": 1044
},
{
"epoch": 1.8143244709712425,
"grad_norm": 0.12812859020701226,
"learning_rate": 2.6575564585210487e-07,
"loss": 0.4095,
"step": 1045
},
{
"epoch": 1.8160607704829084,
"grad_norm": 0.12292614122195747,
"learning_rate": 2.609000865173844e-07,
"loss": 0.4074,
"step": 1046
},
{
"epoch": 1.817797069994574,
"grad_norm": 0.11918461838119136,
"learning_rate": 2.5608810603278634e-07,
"loss": 0.4144,
"step": 1047
},
{
"epoch": 1.8195333695062397,
"grad_norm": 0.12189620357247144,
"learning_rate": 2.5131974864734063e-07,
"loss": 0.4082,
"step": 1048
},
{
"epoch": 1.8212696690179055,
"grad_norm": 0.11828054439793066,
"learning_rate": 2.4659505820893827e-07,
"loss": 0.4083,
"step": 1049
},
{
"epoch": 1.8230059685295714,
"grad_norm": 0.1158006635449286,
"learning_rate": 2.4191407816392565e-07,
"loss": 0.3963,
"step": 1050
},
{
"epoch": 1.824742268041237,
"grad_norm": 0.12599491824541956,
"learning_rate": 2.3727685155670587e-07,
"loss": 0.4281,
"step": 1051
},
{
"epoch": 1.8264785675529027,
"grad_norm": 0.1298331569600777,
"learning_rate": 2.3268342102934216e-07,
"loss": 0.4222,
"step": 1052
},
{
"epoch": 1.8282148670645686,
"grad_norm": 0.126821957156595,
"learning_rate": 2.2813382882116986e-07,
"loss": 0.4139,
"step": 1053
},
{
"epoch": 1.8299511665762345,
"grad_norm": 0.13079207049274877,
"learning_rate": 2.2362811676840123e-07,
"loss": 0.4451,
"step": 1054
},
{
"epoch": 1.8316874660879001,
"grad_norm": 0.12114459628543015,
"learning_rate": 2.1916632630374579e-07,
"loss": 0.3974,
"step": 1055
},
{
"epoch": 1.8334237655995658,
"grad_norm": 0.1140732521419423,
"learning_rate": 2.1474849845602773e-07,
"loss": 0.4047,
"step": 1056
},
{
"epoch": 1.8351600651112316,
"grad_norm": 0.11526093835726821,
"learning_rate": 2.1037467384981024e-07,
"loss": 0.3992,
"step": 1057
},
{
"epoch": 1.8368963646228975,
"grad_norm": 0.12228612221156158,
"learning_rate": 2.0604489270501847e-07,
"loss": 0.4059,
"step": 1058
},
{
"epoch": 1.8386326641345632,
"grad_norm": 0.12477417041122149,
"learning_rate": 2.0175919483657213e-07,
"loss": 0.4116,
"step": 1059
},
{
"epoch": 1.8403689636462288,
"grad_norm": 0.12862208203106815,
"learning_rate": 1.975176196540185e-07,
"loss": 0.4177,
"step": 1060
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.12578544940193018,
"learning_rate": 1.933202061611722e-07,
"loss": 0.4095,
"step": 1061
},
{
"epoch": 1.8438415626695606,
"grad_norm": 0.12325651839838077,
"learning_rate": 1.8916699295575324e-07,
"loss": 0.4175,
"step": 1062
},
{
"epoch": 1.8455778621812262,
"grad_norm": 0.12464746665679002,
"learning_rate": 1.8505801822903459e-07,
"loss": 0.4196,
"step": 1063
},
{
"epoch": 1.8473141616928919,
"grad_norm": 0.13032362019407107,
"learning_rate": 1.8099331976548785e-07,
"loss": 0.3897,
"step": 1064
},
{
"epoch": 1.8490504612045577,
"grad_norm": 0.11289571970838852,
"learning_rate": 1.769729349424415e-07,
"loss": 0.3845,
"step": 1065
},
{
"epoch": 1.8507867607162236,
"grad_norm": 0.11655854527319932,
"learning_rate": 1.729969007297305e-07,
"loss": 0.4305,
"step": 1066
},
{
"epoch": 1.8525230602278893,
"grad_norm": 0.1261126469926069,
"learning_rate": 1.6906525368936055e-07,
"loss": 0.4262,
"step": 1067
},
{
"epoch": 1.854259359739555,
"grad_norm": 0.12541632651842094,
"learning_rate": 1.6517802997517262e-07,
"loss": 0.4013,
"step": 1068
},
{
"epoch": 1.8559956592512208,
"grad_norm": 0.12921619899241918,
"learning_rate": 1.6133526533250566e-07,
"loss": 0.4262,
"step": 1069
},
{
"epoch": 1.8577319587628867,
"grad_norm": 0.11908846405888013,
"learning_rate": 1.5753699509787336e-07,
"loss": 0.4355,
"step": 1070
},
{
"epoch": 1.8594682582745523,
"grad_norm": 0.12440968519612089,
"learning_rate": 1.5378325419863504e-07,
"loss": 0.4248,
"step": 1071
},
{
"epoch": 1.861204557786218,
"grad_norm": 0.11926491117578981,
"learning_rate": 1.5007407715267762e-07,
"loss": 0.4058,
"step": 1072
},
{
"epoch": 1.8629408572978838,
"grad_norm": 0.142881003325884,
"learning_rate": 1.4640949806809523e-07,
"loss": 0.4413,
"step": 1073
},
{
"epoch": 1.8646771568095497,
"grad_norm": 0.11865749007897276,
"learning_rate": 1.4278955064287948e-07,
"loss": 0.4345,
"step": 1074
},
{
"epoch": 1.8664134563212154,
"grad_norm": 0.11784186117435934,
"learning_rate": 1.3921426816460525e-07,
"loss": 0.43,
"step": 1075
},
{
"epoch": 1.868149755832881,
"grad_norm": 0.12206259247615907,
"learning_rate": 1.3568368351012718e-07,
"loss": 0.4194,
"step": 1076
},
{
"epoch": 1.8698860553445469,
"grad_norm": 0.1281570795041499,
"learning_rate": 1.3219782914527633e-07,
"loss": 0.4145,
"step": 1077
},
{
"epoch": 1.8716223548562128,
"grad_norm": 0.11970320146740036,
"learning_rate": 1.287567371245635e-07,
"loss": 0.4027,
"step": 1078
},
{
"epoch": 1.8733586543678784,
"grad_norm": 0.12848469392555553,
"learning_rate": 1.253604390908819e-07,
"loss": 0.4321,
"step": 1079
},
{
"epoch": 1.875094953879544,
"grad_norm": 0.12770177634383664,
"learning_rate": 1.2200896627521718e-07,
"loss": 0.4386,
"step": 1080
},
{
"epoch": 1.87683125339121,
"grad_norm": 0.12151906694858564,
"learning_rate": 1.1870234949636072e-07,
"loss": 0.4159,
"step": 1081
},
{
"epoch": 1.8785675529028758,
"grad_norm": 0.127017831403236,
"learning_rate": 1.154406191606261e-07,
"loss": 0.3994,
"step": 1082
},
{
"epoch": 1.8803038524145415,
"grad_norm": 0.1262661612491107,
"learning_rate": 1.1222380526156929e-07,
"loss": 0.4342,
"step": 1083
},
{
"epoch": 1.882040151926207,
"grad_norm": 0.1171509178238538,
"learning_rate": 1.090519373797122e-07,
"loss": 0.4226,
"step": 1084
},
{
"epoch": 1.883776451437873,
"grad_norm": 0.13097472029104612,
"learning_rate": 1.0592504468227127e-07,
"loss": 0.4126,
"step": 1085
},
{
"epoch": 1.8855127509495389,
"grad_norm": 0.1187689132650043,
"learning_rate": 1.0284315592289041e-07,
"loss": 0.4216,
"step": 1086
},
{
"epoch": 1.8872490504612045,
"grad_norm": 0.11632907820386926,
"learning_rate": 9.98062994413751e-08,
"loss": 0.4083,
"step": 1087
},
{
"epoch": 1.8889853499728704,
"grad_norm": 0.11874619451388094,
"learning_rate": 9.681450316343155e-08,
"loss": 0.4076,
"step": 1088
},
{
"epoch": 1.890721649484536,
"grad_norm": 0.11553420260775356,
"learning_rate": 9.386779460041018e-08,
"loss": 0.4022,
"step": 1089
},
{
"epoch": 1.892457948996202,
"grad_norm": 0.12777258119761628,
"learning_rate": 9.096620084905472e-08,
"loss": 0.4215,
"step": 1090
},
{
"epoch": 1.8941942485078678,
"grad_norm": 0.1299790103661572,
"learning_rate": 8.810974859124966e-08,
"loss": 0.4195,
"step": 1091
},
{
"epoch": 1.8959305480195334,
"grad_norm": 0.12870008807740843,
"learning_rate": 8.529846409377707e-08,
"loss": 0.4183,
"step": 1092
},
{
"epoch": 1.897666847531199,
"grad_norm": 0.11557600699360503,
"learning_rate": 8.253237320807461e-08,
"loss": 0.4012,
"step": 1093
},
{
"epoch": 1.899403147042865,
"grad_norm": 0.1295692835277435,
"learning_rate": 7.981150136999793e-08,
"loss": 0.4205,
"step": 1094
},
{
"epoch": 1.9011394465545308,
"grad_norm": 0.11763339137349027,
"learning_rate": 7.71358735995864e-08,
"loss": 0.4137,
"step": 1095
},
{
"epoch": 1.9028757460661965,
"grad_norm": 0.12209014747781179,
"learning_rate": 7.450551450083277e-08,
"loss": 0.415,
"step": 1096
},
{
"epoch": 1.9046120455778621,
"grad_norm": 0.11983015977982606,
"learning_rate": 7.192044826145772e-08,
"loss": 0.3991,
"step": 1097
},
{
"epoch": 1.906348345089528,
"grad_norm": 0.1207622356665182,
"learning_rate": 6.938069865268737e-08,
"loss": 0.4126,
"step": 1098
},
{
"epoch": 1.9080846446011939,
"grad_norm": 0.12025369875642083,
"learning_rate": 6.688628902903393e-08,
"loss": 0.4117,
"step": 1099
},
{
"epoch": 1.9098209441128595,
"grad_norm": 0.12948153340332494,
"learning_rate": 6.443724232808146e-08,
"loss": 0.4252,
"step": 1100
},
{
"epoch": 1.9115572436245252,
"grad_norm": 0.11631844929689528,
"learning_rate": 6.203358107027491e-08,
"loss": 0.4177,
"step": 1101
},
{
"epoch": 1.913293543136191,
"grad_norm": 0.12021451757580208,
"learning_rate": 5.967532735871306e-08,
"loss": 0.42,
"step": 1102
},
{
"epoch": 1.915029842647857,
"grad_norm": 0.12120333242584432,
"learning_rate": 5.736250287894651e-08,
"loss": 0.4301,
"step": 1103
},
{
"epoch": 1.9167661421595226,
"grad_norm": 0.11959409905136235,
"learning_rate": 5.509512889877333e-08,
"loss": 0.4068,
"step": 1104
},
{
"epoch": 1.9185024416711882,
"grad_norm": 0.12331314438716552,
"learning_rate": 5.2873226268052026e-08,
"loss": 0.4026,
"step": 1105
},
{
"epoch": 1.920238741182854,
"grad_norm": 0.12692833064254694,
"learning_rate": 5.069681541850058e-08,
"loss": 0.421,
"step": 1106
},
{
"epoch": 1.92197504069452,
"grad_norm": 0.11247262830318959,
"learning_rate": 4.856591636351604e-08,
"loss": 0.4053,
"step": 1107
},
{
"epoch": 1.9237113402061856,
"grad_norm": 0.12245938252177607,
"learning_rate": 4.648054869798524e-08,
"loss": 0.4167,
"step": 1108
},
{
"epoch": 1.9254476397178513,
"grad_norm": 0.12475135088194052,
"learning_rate": 4.444073159810769e-08,
"loss": 0.4021,
"step": 1109
},
{
"epoch": 1.9271839392295171,
"grad_norm": 0.11574725410724895,
"learning_rate": 4.244648382121852e-08,
"loss": 0.408,
"step": 1110
},
{
"epoch": 1.928920238741183,
"grad_norm": 0.12368144526647107,
"learning_rate": 4.0497823705615836e-08,
"loss": 0.4117,
"step": 1111
},
{
"epoch": 1.9306565382528487,
"grad_norm": 0.1206154036080435,
"learning_rate": 3.859476917039029e-08,
"loss": 0.4091,
"step": 1112
},
{
"epoch": 1.9323928377645143,
"grad_norm": 0.11695836630567202,
"learning_rate": 3.673733771526466e-08,
"loss": 0.4138,
"step": 1113
},
{
"epoch": 1.9341291372761802,
"grad_norm": 0.11933510110855337,
"learning_rate": 3.492554642042789e-08,
"loss": 0.3976,
"step": 1114
},
{
"epoch": 1.935865436787846,
"grad_norm": 0.11621387774141514,
"learning_rate": 3.315941194638239e-08,
"loss": 0.4192,
"step": 1115
},
{
"epoch": 1.9376017362995117,
"grad_norm": 0.11785093875152831,
"learning_rate": 3.143895053378698e-08,
"loss": 0.4088,
"step": 1116
},
{
"epoch": 1.9393380358111774,
"grad_norm": 0.11954445365824508,
"learning_rate": 2.976417800331144e-08,
"loss": 0.4071,
"step": 1117
},
{
"epoch": 1.9410743353228432,
"grad_norm": 0.11557274305022142,
"learning_rate": 2.8135109755487723e-08,
"loss": 0.3902,
"step": 1118
},
{
"epoch": 1.942810634834509,
"grad_norm": 0.1298782178121965,
"learning_rate": 2.6551760770569534e-08,
"loss": 0.4134,
"step": 1119
},
{
"epoch": 1.9445469343461748,
"grad_norm": 0.1179201998294052,
"learning_rate": 2.501414560839577e-08,
"loss": 0.415,
"step": 1120
},
{
"epoch": 1.9462832338578404,
"grad_norm": 0.12529243637432458,
"learning_rate": 2.352227840825394e-08,
"loss": 0.4207,
"step": 1121
},
{
"epoch": 1.9480195333695063,
"grad_norm": 0.11761944161526862,
"learning_rate": 2.2076172888753632e-08,
"loss": 0.4171,
"step": 1122
},
{
"epoch": 1.9497558328811722,
"grad_norm": 0.13326349889687525,
"learning_rate": 2.067584234769715e-08,
"loss": 0.4397,
"step": 1123
},
{
"epoch": 1.9514921323928378,
"grad_norm": 0.12285157394273083,
"learning_rate": 1.9321299661959614e-08,
"loss": 0.4151,
"step": 1124
},
{
"epoch": 1.9532284319045035,
"grad_norm": 0.11927335798670852,
"learning_rate": 1.8012557287367394e-08,
"loss": 0.399,
"step": 1125
},
{
"epoch": 1.9549647314161693,
"grad_norm": 0.11652679857972266,
"learning_rate": 1.674962725858875e-08,
"loss": 0.4194,
"step": 1126
},
{
"epoch": 1.9567010309278352,
"grad_norm": 0.11860055221544642,
"learning_rate": 1.553252118901727e-08,
"loss": 0.4179,
"step": 1127
},
{
"epoch": 1.9584373304395009,
"grad_norm": 0.11695805242109508,
"learning_rate": 1.4361250270670257e-08,
"loss": 0.4183,
"step": 1128
},
{
"epoch": 1.9601736299511665,
"grad_norm": 0.13215770831912252,
"learning_rate": 1.3235825274081626e-08,
"loss": 0.4293,
"step": 1129
},
{
"epoch": 1.9619099294628324,
"grad_norm": 0.12055150409768445,
"learning_rate": 1.2156256548205292e-08,
"loss": 0.4051,
"step": 1130
},
{
"epoch": 1.9636462289744983,
"grad_norm": 0.11537922403279285,
"learning_rate": 1.1122554020320252e-08,
"loss": 0.4317,
"step": 1131
},
{
"epoch": 1.965382528486164,
"grad_norm": 0.13083088141242724,
"learning_rate": 1.0134727195937332e-08,
"loss": 0.4079,
"step": 1132
},
{
"epoch": 1.9671188279978296,
"grad_norm": 0.11720738515436362,
"learning_rate": 9.192785158713691e-09,
"loss": 0.3998,
"step": 1133
},
{
"epoch": 1.9688551275094954,
"grad_norm": 0.11709302268501304,
"learning_rate": 8.296736570367337e-09,
"loss": 0.4006,
"step": 1134
},
{
"epoch": 1.9705914270211613,
"grad_norm": 0.11420680025829363,
"learning_rate": 7.446589670599968e-09,
"loss": 0.3982,
"step": 1135
},
{
"epoch": 1.972327726532827,
"grad_norm": 0.1260826669146985,
"learning_rate": 6.642352277019815e-09,
"loss": 0.4159,
"step": 1136
},
{
"epoch": 1.9740640260444926,
"grad_norm": 0.12070921112427768,
"learning_rate": 5.884031785068356e-09,
"loss": 0.4111,
"step": 1137
},
{
"epoch": 1.9758003255561585,
"grad_norm": 0.12073001784468888,
"learning_rate": 5.17163516795538e-09,
"loss": 0.4108,
"step": 1138
},
{
"epoch": 1.9775366250678243,
"grad_norm": 0.12518510846802877,
"learning_rate": 4.505168976592922e-09,
"loss": 0.4103,
"step": 1139
},
{
"epoch": 1.97927292457949,
"grad_norm": 0.12321739657536239,
"learning_rate": 3.884639339534202e-09,
"loss": 0.4066,
"step": 1140
},
{
"epoch": 1.9810092240911557,
"grad_norm": 0.11741501996048638,
"learning_rate": 3.3100519629203353e-09,
"loss": 0.3731,
"step": 1141
},
{
"epoch": 1.9827455236028215,
"grad_norm": 0.11667218226770629,
"learning_rate": 2.781412130424266e-09,
"loss": 0.3942,
"step": 1142
},
{
"epoch": 1.9844818231144874,
"grad_norm": 0.11589729089722366,
"learning_rate": 2.298724703204691e-09,
"loss": 0.3943,
"step": 1143
},
{
"epoch": 1.986218122626153,
"grad_norm": 0.12645976313375637,
"learning_rate": 1.861994119860544e-09,
"loss": 0.4108,
"step": 1144
},
{
"epoch": 1.9879544221378187,
"grad_norm": 0.12047556198437237,
"learning_rate": 1.471224396389359e-09,
"loss": 0.4021,
"step": 1145
},
{
"epoch": 1.9896907216494846,
"grad_norm": 0.11686013457027536,
"learning_rate": 1.1264191261528557e-09,
"loss": 0.4092,
"step": 1146
},
{
"epoch": 1.9914270211611504,
"grad_norm": 0.1321783887220322,
"learning_rate": 8.275814798408554e-10,
"loss": 0.418,
"step": 1147
},
{
"epoch": 1.993163320672816,
"grad_norm": 0.1178482673584452,
"learning_rate": 5.747142054429722e-10,
"loss": 0.4088,
"step": 1148
},
{
"epoch": 1.9948996201844817,
"grad_norm": 0.1205943485930681,
"learning_rate": 3.678196282252966e-10,
"loss": 0.4181,
"step": 1149
},
{
"epoch": 1.9966359196961476,
"grad_norm": 0.1216930863569034,
"learning_rate": 2.0689965070652686e-10,
"loss": 0.4379,
"step": 1150
},
{
"epoch": 1.9983722192078135,
"grad_norm": 0.12024784790971259,
"learning_rate": 9.195575264242529e-11,
"loss": 0.3954,
"step": 1151
},
{
"epoch": 2.0,
"grad_norm": 0.12663974138485112,
"learning_rate": 2.2988991009720295e-11,
"loss": 0.4082,
"step": 1152
},
{
"epoch": 2.0,
"step": 1152,
"total_flos": 2415371294343168.0,
"train_loss": 0.4241238099574629,
"train_runtime": 41892.0315,
"train_samples_per_second": 1.76,
"train_steps_per_second": 0.027
}
],
"logging_steps": 1,
"max_steps": 1152,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2415371294343168.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}