random_1wSwi8O2rX6jQpag / trainer_state.json
cutelemonlili's picture
Add files using upload-large-folder tool
e20774e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1142,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017513134851138354,
"grad_norm": 1.6189321527508724,
"learning_rate": 9.99998108060379e-06,
"loss": 0.2516,
"step": 1
},
{
"epoch": 0.0035026269702276708,
"grad_norm": 3.2205596273078165,
"learning_rate": 9.999924322558328e-06,
"loss": 0.4272,
"step": 2
},
{
"epoch": 0.005253940455341506,
"grad_norm": 1.9512359773972658,
"learning_rate": 9.99982972629315e-06,
"loss": 0.3537,
"step": 3
},
{
"epoch": 0.0070052539404553416,
"grad_norm": 2.303082988314436,
"learning_rate": 9.99969729252414e-06,
"loss": 0.4041,
"step": 4
},
{
"epoch": 0.008756567425569177,
"grad_norm": 1.7602753076895619,
"learning_rate": 9.999527022253521e-06,
"loss": 0.282,
"step": 5
},
{
"epoch": 0.010507880910683012,
"grad_norm": 1.472660482986906,
"learning_rate": 9.999318916769858e-06,
"loss": 0.3178,
"step": 6
},
{
"epoch": 0.012259194395796848,
"grad_norm": 1.5120175005465801,
"learning_rate": 9.999072977648042e-06,
"loss": 0.2982,
"step": 7
},
{
"epoch": 0.014010507880910683,
"grad_norm": 1.552294241363048,
"learning_rate": 9.998789206749284e-06,
"loss": 0.2406,
"step": 8
},
{
"epoch": 0.01576182136602452,
"grad_norm": 1.4355383392258896,
"learning_rate": 9.998467606221091e-06,
"loss": 0.3063,
"step": 9
},
{
"epoch": 0.017513134851138354,
"grad_norm": 1.1505639079486283,
"learning_rate": 9.998108178497259e-06,
"loss": 0.2002,
"step": 10
},
{
"epoch": 0.01926444833625219,
"grad_norm": 1.478144193504059,
"learning_rate": 9.99771092629785e-06,
"loss": 0.249,
"step": 11
},
{
"epoch": 0.021015761821366025,
"grad_norm": 1.2422830596341097,
"learning_rate": 9.997275852629172e-06,
"loss": 0.2275,
"step": 12
},
{
"epoch": 0.02276707530647986,
"grad_norm": 1.176007388166311,
"learning_rate": 9.99680296078376e-06,
"loss": 0.1875,
"step": 13
},
{
"epoch": 0.024518388791593695,
"grad_norm": 1.5860889058713947,
"learning_rate": 9.996292254340342e-06,
"loss": 0.2827,
"step": 14
},
{
"epoch": 0.02626970227670753,
"grad_norm": 1.708745706115137,
"learning_rate": 9.995743737163823e-06,
"loss": 0.2909,
"step": 15
},
{
"epoch": 0.028021015761821366,
"grad_norm": 1.8765102069789172,
"learning_rate": 9.99515741340525e-06,
"loss": 0.1961,
"step": 16
},
{
"epoch": 0.0297723292469352,
"grad_norm": 1.4334195277716413,
"learning_rate": 9.994533287501775e-06,
"loss": 0.2652,
"step": 17
},
{
"epoch": 0.03152364273204904,
"grad_norm": 1.140465695934814,
"learning_rate": 9.993871364176637e-06,
"loss": 0.2149,
"step": 18
},
{
"epoch": 0.03327495621716287,
"grad_norm": 1.1401794918473909,
"learning_rate": 9.993171648439109e-06,
"loss": 0.1913,
"step": 19
},
{
"epoch": 0.03502626970227671,
"grad_norm": 1.3063665613307132,
"learning_rate": 9.992434145584471e-06,
"loss": 0.2375,
"step": 20
},
{
"epoch": 0.03677758318739054,
"grad_norm": 1.1007379852192687,
"learning_rate": 9.991658861193966e-06,
"loss": 0.2276,
"step": 21
},
{
"epoch": 0.03852889667250438,
"grad_norm": 1.3635848002372042,
"learning_rate": 9.99084580113476e-06,
"loss": 0.2432,
"step": 22
},
{
"epoch": 0.040280210157618214,
"grad_norm": 1.437328015329854,
"learning_rate": 9.989994971559897e-06,
"loss": 0.2815,
"step": 23
},
{
"epoch": 0.04203152364273205,
"grad_norm": 1.2255764233144246,
"learning_rate": 9.989106378908246e-06,
"loss": 0.2028,
"step": 24
},
{
"epoch": 0.043782837127845885,
"grad_norm": 0.934261492585796,
"learning_rate": 9.988180029904463e-06,
"loss": 0.1948,
"step": 25
},
{
"epoch": 0.04553415061295972,
"grad_norm": 1.3236962758603368,
"learning_rate": 9.987215931558935e-06,
"loss": 0.2235,
"step": 26
},
{
"epoch": 0.047285464098073555,
"grad_norm": 1.389095598869151,
"learning_rate": 9.986214091167726e-06,
"loss": 0.2542,
"step": 27
},
{
"epoch": 0.04903677758318739,
"grad_norm": 1.2087489144763743,
"learning_rate": 9.985174516312519e-06,
"loss": 0.2045,
"step": 28
},
{
"epoch": 0.050788091068301226,
"grad_norm": 1.0708229969119525,
"learning_rate": 9.984097214860566e-06,
"loss": 0.2343,
"step": 29
},
{
"epoch": 0.05253940455341506,
"grad_norm": 1.0781105753548819,
"learning_rate": 9.982982194964626e-06,
"loss": 0.1994,
"step": 30
},
{
"epoch": 0.0542907180385289,
"grad_norm": 1.2715702567474565,
"learning_rate": 9.981829465062898e-06,
"loss": 0.2553,
"step": 31
},
{
"epoch": 0.05604203152364273,
"grad_norm": 0.9520552503997382,
"learning_rate": 9.980639033878965e-06,
"loss": 0.1844,
"step": 32
},
{
"epoch": 0.05779334500875657,
"grad_norm": 1.2877099584096807,
"learning_rate": 9.979410910421724e-06,
"loss": 0.232,
"step": 33
},
{
"epoch": 0.0595446584938704,
"grad_norm": 1.364716904790534,
"learning_rate": 9.978145103985315e-06,
"loss": 0.1983,
"step": 34
},
{
"epoch": 0.06129597197898424,
"grad_norm": 1.0906698260991894,
"learning_rate": 9.976841624149054e-06,
"loss": 0.1903,
"step": 35
},
{
"epoch": 0.06304728546409807,
"grad_norm": 1.09412669488779,
"learning_rate": 9.975500480777364e-06,
"loss": 0.1919,
"step": 36
},
{
"epoch": 0.0647985989492119,
"grad_norm": 1.2010020335474978,
"learning_rate": 9.974121684019695e-06,
"loss": 0.1608,
"step": 37
},
{
"epoch": 0.06654991243432574,
"grad_norm": 0.8480134607436435,
"learning_rate": 9.972705244310445e-06,
"loss": 0.1524,
"step": 38
},
{
"epoch": 0.06830122591943957,
"grad_norm": 1.2269951958743044,
"learning_rate": 9.97125117236889e-06,
"loss": 0.2164,
"step": 39
},
{
"epoch": 0.07005253940455342,
"grad_norm": 1.5214916016344373,
"learning_rate": 9.969759479199093e-06,
"loss": 0.2573,
"step": 40
},
{
"epoch": 0.07180385288966724,
"grad_norm": 1.3616317930694422,
"learning_rate": 9.968230176089828e-06,
"loss": 0.2272,
"step": 41
},
{
"epoch": 0.07355516637478109,
"grad_norm": 1.1932422020281335,
"learning_rate": 9.966663274614495e-06,
"loss": 0.1567,
"step": 42
},
{
"epoch": 0.07530647985989491,
"grad_norm": 0.8716927387571308,
"learning_rate": 9.965058786631022e-06,
"loss": 0.2293,
"step": 43
},
{
"epoch": 0.07705779334500876,
"grad_norm": 1.3932388141794017,
"learning_rate": 9.963416724281787e-06,
"loss": 0.2286,
"step": 44
},
{
"epoch": 0.07880910683012259,
"grad_norm": 1.2309788186288007,
"learning_rate": 9.961737099993523e-06,
"loss": 0.2633,
"step": 45
},
{
"epoch": 0.08056042031523643,
"grad_norm": 1.0195652741277668,
"learning_rate": 9.960019926477218e-06,
"loss": 0.1664,
"step": 46
},
{
"epoch": 0.08231173380035026,
"grad_norm": 0.805855610816426,
"learning_rate": 9.958265216728032e-06,
"loss": 0.1538,
"step": 47
},
{
"epoch": 0.0840630472854641,
"grad_norm": 0.9295478292913552,
"learning_rate": 9.956472984025179e-06,
"loss": 0.154,
"step": 48
},
{
"epoch": 0.08581436077057793,
"grad_norm": 1.1182679627713559,
"learning_rate": 9.954643241931845e-06,
"loss": 0.1946,
"step": 49
},
{
"epoch": 0.08756567425569177,
"grad_norm": 0.9976680326317856,
"learning_rate": 9.952776004295077e-06,
"loss": 0.1705,
"step": 50
},
{
"epoch": 0.0893169877408056,
"grad_norm": 0.9178402665093803,
"learning_rate": 9.95087128524568e-06,
"loss": 0.1583,
"step": 51
},
{
"epoch": 0.09106830122591944,
"grad_norm": 0.9494960096058251,
"learning_rate": 9.948929099198104e-06,
"loss": 0.1779,
"step": 52
},
{
"epoch": 0.09281961471103327,
"grad_norm": 1.546996916824689,
"learning_rate": 9.946949460850346e-06,
"loss": 0.2514,
"step": 53
},
{
"epoch": 0.09457092819614711,
"grad_norm": 0.9923825116181026,
"learning_rate": 9.944932385183831e-06,
"loss": 0.1989,
"step": 54
},
{
"epoch": 0.09632224168126094,
"grad_norm": 1.0402809673797797,
"learning_rate": 9.9428778874633e-06,
"loss": 0.2107,
"step": 55
},
{
"epoch": 0.09807355516637478,
"grad_norm": 1.0825018590876776,
"learning_rate": 9.940785983236696e-06,
"loss": 0.1676,
"step": 56
},
{
"epoch": 0.09982486865148861,
"grad_norm": 1.379406625027056,
"learning_rate": 9.938656688335045e-06,
"loss": 0.1991,
"step": 57
},
{
"epoch": 0.10157618213660245,
"grad_norm": 1.0500448305369041,
"learning_rate": 9.936490018872336e-06,
"loss": 0.1695,
"step": 58
},
{
"epoch": 0.10332749562171628,
"grad_norm": 1.1035051286979576,
"learning_rate": 9.934285991245399e-06,
"loss": 0.1939,
"step": 59
},
{
"epoch": 0.10507880910683012,
"grad_norm": 0.9812842055285581,
"learning_rate": 9.932044622133785e-06,
"loss": 0.1937,
"step": 60
},
{
"epoch": 0.10683012259194395,
"grad_norm": 0.9527544704706223,
"learning_rate": 9.929765928499635e-06,
"loss": 0.171,
"step": 61
},
{
"epoch": 0.1085814360770578,
"grad_norm": 1.302925637235369,
"learning_rate": 9.927449927587549e-06,
"loss": 0.278,
"step": 62
},
{
"epoch": 0.11033274956217162,
"grad_norm": 0.917614726426997,
"learning_rate": 9.925096636924465e-06,
"loss": 0.2583,
"step": 63
},
{
"epoch": 0.11208406304728546,
"grad_norm": 1.1260675596481124,
"learning_rate": 9.922706074319517e-06,
"loss": 0.1954,
"step": 64
},
{
"epoch": 0.1138353765323993,
"grad_norm": 0.9735158179996255,
"learning_rate": 9.920278257863904e-06,
"loss": 0.1688,
"step": 65
},
{
"epoch": 0.11558669001751314,
"grad_norm": 1.2032784643650782,
"learning_rate": 9.917813205930758e-06,
"loss": 0.2189,
"step": 66
},
{
"epoch": 0.11733800350262696,
"grad_norm": 1.1710648620955906,
"learning_rate": 9.915310937174995e-06,
"loss": 0.2043,
"step": 67
},
{
"epoch": 0.1190893169877408,
"grad_norm": 1.0468308734796317,
"learning_rate": 9.91277147053318e-06,
"loss": 0.1912,
"step": 68
},
{
"epoch": 0.12084063047285463,
"grad_norm": 1.0767180014359876,
"learning_rate": 9.910194825223384e-06,
"loss": 0.1916,
"step": 69
},
{
"epoch": 0.12259194395796848,
"grad_norm": 1.157850250531908,
"learning_rate": 9.90758102074504e-06,
"loss": 0.1692,
"step": 70
},
{
"epoch": 0.1243432574430823,
"grad_norm": 1.1978899308041546,
"learning_rate": 9.90493007687878e-06,
"loss": 0.1904,
"step": 71
},
{
"epoch": 0.12609457092819615,
"grad_norm": 1.3308506120726233,
"learning_rate": 9.902242013686316e-06,
"loss": 0.2215,
"step": 72
},
{
"epoch": 0.12784588441331,
"grad_norm": 1.186862457320643,
"learning_rate": 9.899516851510256e-06,
"loss": 0.2086,
"step": 73
},
{
"epoch": 0.1295971978984238,
"grad_norm": 0.9921699627050491,
"learning_rate": 9.89675461097397e-06,
"loss": 0.1942,
"step": 74
},
{
"epoch": 0.13134851138353765,
"grad_norm": 1.1944882918690307,
"learning_rate": 9.893955312981428e-06,
"loss": 0.1996,
"step": 75
},
{
"epoch": 0.1330998248686515,
"grad_norm": 1.3595222631043964,
"learning_rate": 9.89111897871704e-06,
"loss": 0.2493,
"step": 76
},
{
"epoch": 0.13485113835376533,
"grad_norm": 1.311168457583758,
"learning_rate": 9.888245629645502e-06,
"loss": 0.3023,
"step": 77
},
{
"epoch": 0.13660245183887915,
"grad_norm": 1.2264268547734622,
"learning_rate": 9.885335287511621e-06,
"loss": 0.2375,
"step": 78
},
{
"epoch": 0.138353765323993,
"grad_norm": 1.0615586907581638,
"learning_rate": 9.882387974340166e-06,
"loss": 0.2196,
"step": 79
},
{
"epoch": 0.14010507880910683,
"grad_norm": 1.2113698183185242,
"learning_rate": 9.879403712435692e-06,
"loss": 0.2094,
"step": 80
},
{
"epoch": 0.14185639229422067,
"grad_norm": 1.6599951150659369,
"learning_rate": 9.876382524382372e-06,
"loss": 0.2155,
"step": 81
},
{
"epoch": 0.1436077057793345,
"grad_norm": 1.1421227311156412,
"learning_rate": 9.873324433043825e-06,
"loss": 0.2082,
"step": 82
},
{
"epoch": 0.14535901926444833,
"grad_norm": 1.3566945133013182,
"learning_rate": 9.87022946156295e-06,
"loss": 0.221,
"step": 83
},
{
"epoch": 0.14711033274956217,
"grad_norm": 1.3051526600631753,
"learning_rate": 9.867097633361745e-06,
"loss": 0.1775,
"step": 84
},
{
"epoch": 0.14886164623467601,
"grad_norm": 0.8957821215184415,
"learning_rate": 9.863928972141127e-06,
"loss": 0.1868,
"step": 85
},
{
"epoch": 0.15061295971978983,
"grad_norm": 0.8391246575100022,
"learning_rate": 9.860723501880758e-06,
"loss": 0.1299,
"step": 86
},
{
"epoch": 0.15236427320490367,
"grad_norm": 1.0429245203527981,
"learning_rate": 9.857481246838867e-06,
"loss": 0.1631,
"step": 87
},
{
"epoch": 0.15411558669001751,
"grad_norm": 0.9670342133099205,
"learning_rate": 9.854202231552052e-06,
"loss": 0.186,
"step": 88
},
{
"epoch": 0.15586690017513136,
"grad_norm": 1.0108020188058704,
"learning_rate": 9.850886480835113e-06,
"loss": 0.2,
"step": 89
},
{
"epoch": 0.15761821366024517,
"grad_norm": 1.1527622631935206,
"learning_rate": 9.847534019780848e-06,
"loss": 0.2098,
"step": 90
},
{
"epoch": 0.159369527145359,
"grad_norm": 1.044844838210246,
"learning_rate": 9.844144873759874e-06,
"loss": 0.1915,
"step": 91
},
{
"epoch": 0.16112084063047286,
"grad_norm": 1.4573741072487383,
"learning_rate": 9.840719068420427e-06,
"loss": 0.2642,
"step": 92
},
{
"epoch": 0.1628721541155867,
"grad_norm": 1.091447061849621,
"learning_rate": 9.837256629688177e-06,
"loss": 0.1863,
"step": 93
},
{
"epoch": 0.1646234676007005,
"grad_norm": 1.252517589764872,
"learning_rate": 9.833757583766025e-06,
"loss": 0.2034,
"step": 94
},
{
"epoch": 0.16637478108581435,
"grad_norm": 1.1107281185356483,
"learning_rate": 9.830221957133903e-06,
"loss": 0.1832,
"step": 95
},
{
"epoch": 0.1681260945709282,
"grad_norm": 0.976784716781447,
"learning_rate": 9.82664977654858e-06,
"loss": 0.1326,
"step": 96
},
{
"epoch": 0.16987740805604204,
"grad_norm": 1.19891512446026,
"learning_rate": 9.823041069043457e-06,
"loss": 0.191,
"step": 97
},
{
"epoch": 0.17162872154115585,
"grad_norm": 1.1984485781115666,
"learning_rate": 9.81939586192836e-06,
"loss": 0.2456,
"step": 98
},
{
"epoch": 0.1733800350262697,
"grad_norm": 1.978454735318979,
"learning_rate": 9.815714182789335e-06,
"loss": 0.3018,
"step": 99
},
{
"epoch": 0.17513134851138354,
"grad_norm": 1.016573147746523,
"learning_rate": 9.811996059488445e-06,
"loss": 0.2071,
"step": 100
},
{
"epoch": 0.17688266199649738,
"grad_norm": 1.1362427606530137,
"learning_rate": 9.808241520163542e-06,
"loss": 0.1999,
"step": 101
},
{
"epoch": 0.1786339754816112,
"grad_norm": 1.230279571459468,
"learning_rate": 9.804450593228079e-06,
"loss": 0.1463,
"step": 102
},
{
"epoch": 0.18038528896672504,
"grad_norm": 0.868837318204682,
"learning_rate": 9.800623307370874e-06,
"loss": 0.1376,
"step": 103
},
{
"epoch": 0.18213660245183888,
"grad_norm": 0.972725973754719,
"learning_rate": 9.7967596915559e-06,
"loss": 0.1922,
"step": 104
},
{
"epoch": 0.18388791593695272,
"grad_norm": 0.8173644404176668,
"learning_rate": 9.792859775022069e-06,
"loss": 0.1299,
"step": 105
},
{
"epoch": 0.18563922942206654,
"grad_norm": 1.0879870179146585,
"learning_rate": 9.788923587283008e-06,
"loss": 0.2063,
"step": 106
},
{
"epoch": 0.18739054290718038,
"grad_norm": 0.952468881220675,
"learning_rate": 9.784951158126836e-06,
"loss": 0.1661,
"step": 107
},
{
"epoch": 0.18914185639229422,
"grad_norm": 1.0914420674457646,
"learning_rate": 9.780942517615937e-06,
"loss": 0.1622,
"step": 108
},
{
"epoch": 0.19089316987740806,
"grad_norm": 1.4948764206782315,
"learning_rate": 9.776897696086734e-06,
"loss": 0.2464,
"step": 109
},
{
"epoch": 0.19264448336252188,
"grad_norm": 1.1347837540296297,
"learning_rate": 9.772816724149459e-06,
"loss": 0.159,
"step": 110
},
{
"epoch": 0.19439579684763572,
"grad_norm": 1.3238181457613138,
"learning_rate": 9.768699632687922e-06,
"loss": 0.2241,
"step": 111
},
{
"epoch": 0.19614711033274956,
"grad_norm": 1.042910339183979,
"learning_rate": 9.764546452859277e-06,
"loss": 0.1967,
"step": 112
},
{
"epoch": 0.1978984238178634,
"grad_norm": 1.1785097761069256,
"learning_rate": 9.760357216093788e-06,
"loss": 0.1961,
"step": 113
},
{
"epoch": 0.19964973730297722,
"grad_norm": 1.1403104414823952,
"learning_rate": 9.756131954094582e-06,
"loss": 0.2042,
"step": 114
},
{
"epoch": 0.20140105078809106,
"grad_norm": 1.4022044273328718,
"learning_rate": 9.751870698837428e-06,
"loss": 0.258,
"step": 115
},
{
"epoch": 0.2031523642732049,
"grad_norm": 1.1023466504712938,
"learning_rate": 9.747573482570471e-06,
"loss": 0.1867,
"step": 116
},
{
"epoch": 0.20490367775831875,
"grad_norm": 1.013716455534731,
"learning_rate": 9.74324033781401e-06,
"loss": 0.235,
"step": 117
},
{
"epoch": 0.20665499124343256,
"grad_norm": 1.1766050779187667,
"learning_rate": 9.738871297360233e-06,
"loss": 0.2042,
"step": 118
},
{
"epoch": 0.2084063047285464,
"grad_norm": 1.1003072535572958,
"learning_rate": 9.734466394272988e-06,
"loss": 0.1945,
"step": 119
},
{
"epoch": 0.21015761821366025,
"grad_norm": 1.4838997871202069,
"learning_rate": 9.730025661887517e-06,
"loss": 0.1961,
"step": 120
},
{
"epoch": 0.2119089316987741,
"grad_norm": 1.2740923322292086,
"learning_rate": 9.725549133810205e-06,
"loss": 0.1872,
"step": 121
},
{
"epoch": 0.2136602451838879,
"grad_norm": 1.0849396699876135,
"learning_rate": 9.721036843918343e-06,
"loss": 0.1887,
"step": 122
},
{
"epoch": 0.21541155866900175,
"grad_norm": 1.171434657200328,
"learning_rate": 9.716488826359848e-06,
"loss": 0.1719,
"step": 123
},
{
"epoch": 0.2171628721541156,
"grad_norm": 0.8144293180316944,
"learning_rate": 9.711905115553014e-06,
"loss": 0.1217,
"step": 124
},
{
"epoch": 0.21891418563922943,
"grad_norm": 0.8287059128714208,
"learning_rate": 9.707285746186262e-06,
"loss": 0.126,
"step": 125
},
{
"epoch": 0.22066549912434325,
"grad_norm": 1.0397181324925568,
"learning_rate": 9.702630753217865e-06,
"loss": 0.2018,
"step": 126
},
{
"epoch": 0.2224168126094571,
"grad_norm": 1.2093454647956197,
"learning_rate": 9.697940171875682e-06,
"loss": 0.1819,
"step": 127
},
{
"epoch": 0.22416812609457093,
"grad_norm": 0.971954663646255,
"learning_rate": 9.6932140376569e-06,
"loss": 0.1459,
"step": 128
},
{
"epoch": 0.22591943957968477,
"grad_norm": 0.9871770951957114,
"learning_rate": 9.688452386327764e-06,
"loss": 0.1597,
"step": 129
},
{
"epoch": 0.2276707530647986,
"grad_norm": 1.1914487524596076,
"learning_rate": 9.6836552539233e-06,
"loss": 0.1894,
"step": 130
},
{
"epoch": 0.22942206654991243,
"grad_norm": 1.3299445049184178,
"learning_rate": 9.678822676747048e-06,
"loss": 0.21,
"step": 131
},
{
"epoch": 0.23117338003502627,
"grad_norm": 0.9944443296898016,
"learning_rate": 9.673954691370782e-06,
"loss": 0.1933,
"step": 132
},
{
"epoch": 0.2329246935201401,
"grad_norm": 1.2879765089518327,
"learning_rate": 9.669051334634243e-06,
"loss": 0.2187,
"step": 133
},
{
"epoch": 0.23467600700525393,
"grad_norm": 1.1579504079908498,
"learning_rate": 9.66411264364485e-06,
"loss": 0.2593,
"step": 134
},
{
"epoch": 0.23642732049036777,
"grad_norm": 0.889979983641812,
"learning_rate": 9.659138655777422e-06,
"loss": 0.1599,
"step": 135
},
{
"epoch": 0.2381786339754816,
"grad_norm": 1.1392333425773118,
"learning_rate": 9.654129408673897e-06,
"loss": 0.2232,
"step": 136
},
{
"epoch": 0.23992994746059546,
"grad_norm": 0.927182815890979,
"learning_rate": 9.649084940243052e-06,
"loss": 0.1547,
"step": 137
},
{
"epoch": 0.24168126094570927,
"grad_norm": 1.0230754932044808,
"learning_rate": 9.644005288660204e-06,
"loss": 0.1552,
"step": 138
},
{
"epoch": 0.2434325744308231,
"grad_norm": 1.0117195711045635,
"learning_rate": 9.638890492366924e-06,
"loss": 0.1563,
"step": 139
},
{
"epoch": 0.24518388791593695,
"grad_norm": 1.4066019466468673,
"learning_rate": 9.633740590070763e-06,
"loss": 0.3144,
"step": 140
},
{
"epoch": 0.2469352014010508,
"grad_norm": 1.1513768439474406,
"learning_rate": 9.628555620744932e-06,
"loss": 0.1434,
"step": 141
},
{
"epoch": 0.2486865148861646,
"grad_norm": 1.0700849134487496,
"learning_rate": 9.62333562362803e-06,
"loss": 0.2161,
"step": 142
},
{
"epoch": 0.2504378283712785,
"grad_norm": 1.5059188434701325,
"learning_rate": 9.618080638223732e-06,
"loss": 0.2005,
"step": 143
},
{
"epoch": 0.2521891418563923,
"grad_norm": 0.9665925005797491,
"learning_rate": 9.612790704300501e-06,
"loss": 0.1828,
"step": 144
},
{
"epoch": 0.2539404553415061,
"grad_norm": 0.8229122412140533,
"learning_rate": 9.607465861891276e-06,
"loss": 0.1208,
"step": 145
},
{
"epoch": 0.25569176882662,
"grad_norm": 1.0488909683370946,
"learning_rate": 9.602106151293182e-06,
"loss": 0.2138,
"step": 146
},
{
"epoch": 0.2574430823117338,
"grad_norm": 1.2990902146116425,
"learning_rate": 9.596711613067212e-06,
"loss": 0.2018,
"step": 147
},
{
"epoch": 0.2591943957968476,
"grad_norm": 0.8880761437071694,
"learning_rate": 9.59128228803793e-06,
"loss": 0.1679,
"step": 148
},
{
"epoch": 0.2609457092819615,
"grad_norm": 0.8863876920056281,
"learning_rate": 9.585818217293155e-06,
"loss": 0.1488,
"step": 149
},
{
"epoch": 0.2626970227670753,
"grad_norm": 1.252288840375211,
"learning_rate": 9.580319442183654e-06,
"loss": 0.2294,
"step": 150
},
{
"epoch": 0.26444833625218916,
"grad_norm": 1.1271115490692163,
"learning_rate": 9.574786004322831e-06,
"loss": 0.2005,
"step": 151
},
{
"epoch": 0.266199649737303,
"grad_norm": 0.8958292187859824,
"learning_rate": 9.569217945586406e-06,
"loss": 0.1515,
"step": 152
},
{
"epoch": 0.2679509632224168,
"grad_norm": 1.1432682883821648,
"learning_rate": 9.563615308112106e-06,
"loss": 0.1727,
"step": 153
},
{
"epoch": 0.26970227670753066,
"grad_norm": 0.8860825973578063,
"learning_rate": 9.557978134299332e-06,
"loss": 0.1561,
"step": 154
},
{
"epoch": 0.2714535901926445,
"grad_norm": 0.9458112069640355,
"learning_rate": 9.552306466808861e-06,
"loss": 0.173,
"step": 155
},
{
"epoch": 0.2732049036777583,
"grad_norm": 1.1059388701307742,
"learning_rate": 9.546600348562499e-06,
"loss": 0.1939,
"step": 156
},
{
"epoch": 0.27495621716287216,
"grad_norm": 1.3621793677790732,
"learning_rate": 9.54085982274277e-06,
"loss": 0.2033,
"step": 157
},
{
"epoch": 0.276707530647986,
"grad_norm": 1.049168730239092,
"learning_rate": 9.535084932792588e-06,
"loss": 0.2193,
"step": 158
},
{
"epoch": 0.27845884413309985,
"grad_norm": 0.8987325354112385,
"learning_rate": 9.529275722414926e-06,
"loss": 0.149,
"step": 159
},
{
"epoch": 0.28021015761821366,
"grad_norm": 1.1205614005555482,
"learning_rate": 9.523432235572485e-06,
"loss": 0.1715,
"step": 160
},
{
"epoch": 0.2819614711033275,
"grad_norm": 1.1119065744262058,
"learning_rate": 9.517554516487361e-06,
"loss": 0.2139,
"step": 161
},
{
"epoch": 0.28371278458844135,
"grad_norm": 1.2120128153983245,
"learning_rate": 9.511642609640714e-06,
"loss": 0.2055,
"step": 162
},
{
"epoch": 0.28546409807355516,
"grad_norm": 1.0381552882652774,
"learning_rate": 9.505696559772427e-06,
"loss": 0.1521,
"step": 163
},
{
"epoch": 0.287215411558669,
"grad_norm": 1.0616048587072129,
"learning_rate": 9.499716411880767e-06,
"loss": 0.1438,
"step": 164
},
{
"epoch": 0.28896672504378285,
"grad_norm": 1.465227437163341,
"learning_rate": 9.493702211222052e-06,
"loss": 0.1939,
"step": 165
},
{
"epoch": 0.29071803852889666,
"grad_norm": 1.2171201787031805,
"learning_rate": 9.4876540033103e-06,
"loss": 0.1542,
"step": 166
},
{
"epoch": 0.29246935201401053,
"grad_norm": 0.9937562205583209,
"learning_rate": 9.481571833916884e-06,
"loss": 0.1822,
"step": 167
},
{
"epoch": 0.29422066549912435,
"grad_norm": 1.139810203249971,
"learning_rate": 9.475455749070198e-06,
"loss": 0.2018,
"step": 168
},
{
"epoch": 0.29597197898423816,
"grad_norm": 1.2507741492130755,
"learning_rate": 9.469305795055292e-06,
"loss": 0.2314,
"step": 169
},
{
"epoch": 0.29772329246935203,
"grad_norm": 1.6644506023322219,
"learning_rate": 9.463122018413533e-06,
"loss": 0.1912,
"step": 170
},
{
"epoch": 0.29947460595446584,
"grad_norm": 0.8574109893402403,
"learning_rate": 9.45690446594225e-06,
"loss": 0.1236,
"step": 171
},
{
"epoch": 0.30122591943957966,
"grad_norm": 0.9262386331879862,
"learning_rate": 9.450653184694378e-06,
"loss": 0.2005,
"step": 172
},
{
"epoch": 0.30297723292469353,
"grad_norm": 0.7994796498712383,
"learning_rate": 9.444368221978102e-06,
"loss": 0.1488,
"step": 173
},
{
"epoch": 0.30472854640980734,
"grad_norm": 0.9739129710543231,
"learning_rate": 9.438049625356506e-06,
"loss": 0.2011,
"step": 174
},
{
"epoch": 0.3064798598949212,
"grad_norm": 1.7022881013682905,
"learning_rate": 9.431697442647199e-06,
"loss": 0.286,
"step": 175
},
{
"epoch": 0.30823117338003503,
"grad_norm": 0.9765370631504982,
"learning_rate": 9.425311721921967e-06,
"loss": 0.1455,
"step": 176
},
{
"epoch": 0.30998248686514884,
"grad_norm": 1.1358714034120214,
"learning_rate": 9.418892511506404e-06,
"loss": 0.1664,
"step": 177
},
{
"epoch": 0.3117338003502627,
"grad_norm": 1.0393192528807746,
"learning_rate": 9.412439859979543e-06,
"loss": 0.162,
"step": 178
},
{
"epoch": 0.3134851138353765,
"grad_norm": 1.096045737373684,
"learning_rate": 9.405953816173491e-06,
"loss": 0.1431,
"step": 179
},
{
"epoch": 0.31523642732049034,
"grad_norm": 0.7637588357172228,
"learning_rate": 9.399434429173063e-06,
"loss": 0.1522,
"step": 180
},
{
"epoch": 0.3169877408056042,
"grad_norm": 1.1511627286236419,
"learning_rate": 9.392881748315403e-06,
"loss": 0.23,
"step": 181
},
{
"epoch": 0.318739054290718,
"grad_norm": 1.296531852821544,
"learning_rate": 9.38629582318962e-06,
"loss": 0.1559,
"step": 182
},
{
"epoch": 0.3204903677758319,
"grad_norm": 1.0245849374018412,
"learning_rate": 9.379676703636402e-06,
"loss": 0.2058,
"step": 183
},
{
"epoch": 0.3222416812609457,
"grad_norm": 1.1325995765106882,
"learning_rate": 9.373024439747648e-06,
"loss": 0.1798,
"step": 184
},
{
"epoch": 0.3239929947460595,
"grad_norm": 0.955274718027506,
"learning_rate": 9.366339081866085e-06,
"loss": 0.1318,
"step": 185
},
{
"epoch": 0.3257443082311734,
"grad_norm": 1.0960146910727295,
"learning_rate": 9.359620680584889e-06,
"loss": 0.2125,
"step": 186
},
{
"epoch": 0.3274956217162872,
"grad_norm": 0.9976010826462164,
"learning_rate": 9.352869286747295e-06,
"loss": 0.1744,
"step": 187
},
{
"epoch": 0.329246935201401,
"grad_norm": 1.0033232475788938,
"learning_rate": 9.34608495144622e-06,
"loss": 0.1712,
"step": 188
},
{
"epoch": 0.3309982486865149,
"grad_norm": 1.2252146471565943,
"learning_rate": 9.33926772602388e-06,
"loss": 0.1983,
"step": 189
},
{
"epoch": 0.3327495621716287,
"grad_norm": 1.1760605780656463,
"learning_rate": 9.332417662071386e-06,
"loss": 0.1666,
"step": 190
},
{
"epoch": 0.3345008756567426,
"grad_norm": 1.3456892597616057,
"learning_rate": 9.32553481142837e-06,
"loss": 0.1829,
"step": 191
},
{
"epoch": 0.3362521891418564,
"grad_norm": 1.2660460244764533,
"learning_rate": 9.31861922618258e-06,
"loss": 0.2458,
"step": 192
},
{
"epoch": 0.3380035026269702,
"grad_norm": 1.0534767359485842,
"learning_rate": 9.311670958669502e-06,
"loss": 0.1874,
"step": 193
},
{
"epoch": 0.3397548161120841,
"grad_norm": 0.8458278928313304,
"learning_rate": 9.304690061471937e-06,
"loss": 0.1667,
"step": 194
},
{
"epoch": 0.3415061295971979,
"grad_norm": 0.9903682726455234,
"learning_rate": 9.297676587419638e-06,
"loss": 0.2062,
"step": 195
},
{
"epoch": 0.3432574430823117,
"grad_norm": 1.1314930714580482,
"learning_rate": 9.290630589588876e-06,
"loss": 0.1794,
"step": 196
},
{
"epoch": 0.3450087565674256,
"grad_norm": 1.463207590968482,
"learning_rate": 9.283552121302064e-06,
"loss": 0.2053,
"step": 197
},
{
"epoch": 0.3467600700525394,
"grad_norm": 1.0264128857179728,
"learning_rate": 9.276441236127343e-06,
"loss": 0.1463,
"step": 198
},
{
"epoch": 0.34851138353765326,
"grad_norm": 1.1530074129076198,
"learning_rate": 9.269297987878168e-06,
"loss": 0.1918,
"step": 199
},
{
"epoch": 0.3502626970227671,
"grad_norm": 0.9958162716820419,
"learning_rate": 9.262122430612922e-06,
"loss": 0.1474,
"step": 200
},
{
"epoch": 0.3520140105078809,
"grad_norm": 1.0886838150262181,
"learning_rate": 9.254914618634487e-06,
"loss": 0.2175,
"step": 201
},
{
"epoch": 0.35376532399299476,
"grad_norm": 1.1036143709701502,
"learning_rate": 9.247674606489843e-06,
"loss": 0.141,
"step": 202
},
{
"epoch": 0.3555166374781086,
"grad_norm": 1.0910090805563288,
"learning_rate": 9.240402448969655e-06,
"loss": 0.1638,
"step": 203
},
{
"epoch": 0.3572679509632224,
"grad_norm": 1.0496526329879359,
"learning_rate": 9.233098201107854e-06,
"loss": 0.1745,
"step": 204
},
{
"epoch": 0.35901926444833626,
"grad_norm": 1.0259634242913862,
"learning_rate": 9.225761918181224e-06,
"loss": 0.1554,
"step": 205
},
{
"epoch": 0.3607705779334501,
"grad_norm": 0.9318692908968823,
"learning_rate": 9.218393655708981e-06,
"loss": 0.1598,
"step": 206
},
{
"epoch": 0.36252189141856395,
"grad_norm": 1.0675242528447997,
"learning_rate": 9.210993469452357e-06,
"loss": 0.2542,
"step": 207
},
{
"epoch": 0.36427320490367776,
"grad_norm": 1.0231337468264674,
"learning_rate": 9.203561415414174e-06,
"loss": 0.1377,
"step": 208
},
{
"epoch": 0.3660245183887916,
"grad_norm": 0.8172962309380866,
"learning_rate": 9.196097549838422e-06,
"loss": 0.1337,
"step": 209
},
{
"epoch": 0.36777583187390545,
"grad_norm": 1.3437004411676805,
"learning_rate": 9.188601929209836e-06,
"loss": 0.2003,
"step": 210
},
{
"epoch": 0.36952714535901926,
"grad_norm": 0.9859223201620534,
"learning_rate": 9.181074610253457e-06,
"loss": 0.1246,
"step": 211
},
{
"epoch": 0.3712784588441331,
"grad_norm": 1.1821125005665047,
"learning_rate": 9.173515649934222e-06,
"loss": 0.1918,
"step": 212
},
{
"epoch": 0.37302977232924694,
"grad_norm": 1.3158062961704573,
"learning_rate": 9.165925105456513e-06,
"loss": 0.2639,
"step": 213
},
{
"epoch": 0.37478108581436076,
"grad_norm": 1.3388147444695622,
"learning_rate": 9.15830303426374e-06,
"loss": 0.2246,
"step": 214
},
{
"epoch": 0.37653239929947463,
"grad_norm": 0.8947375382446747,
"learning_rate": 9.150649494037895e-06,
"loss": 0.1225,
"step": 215
},
{
"epoch": 0.37828371278458844,
"grad_norm": 1.103951490161312,
"learning_rate": 9.142964542699124e-06,
"loss": 0.2026,
"step": 216
},
{
"epoch": 0.38003502626970226,
"grad_norm": 1.233924530688903,
"learning_rate": 9.135248238405282e-06,
"loss": 0.2086,
"step": 217
},
{
"epoch": 0.38178633975481613,
"grad_norm": 1.1719877250784767,
"learning_rate": 9.127500639551497e-06,
"loss": 0.1795,
"step": 218
},
{
"epoch": 0.38353765323992994,
"grad_norm": 1.255963695109249,
"learning_rate": 9.119721804769723e-06,
"loss": 0.1862,
"step": 219
},
{
"epoch": 0.38528896672504376,
"grad_norm": 1.1459951961009553,
"learning_rate": 9.111911792928308e-06,
"loss": 0.1966,
"step": 220
},
{
"epoch": 0.38704028021015763,
"grad_norm": 1.1761484428825753,
"learning_rate": 9.10407066313153e-06,
"loss": 0.1868,
"step": 221
},
{
"epoch": 0.38879159369527144,
"grad_norm": 1.118245234998823,
"learning_rate": 9.096198474719169e-06,
"loss": 0.189,
"step": 222
},
{
"epoch": 0.3905429071803853,
"grad_norm": 1.1403077273341966,
"learning_rate": 9.088295287266042e-06,
"loss": 0.1752,
"step": 223
},
{
"epoch": 0.3922942206654991,
"grad_norm": 1.1241899523938508,
"learning_rate": 9.080361160581569e-06,
"loss": 0.163,
"step": 224
},
{
"epoch": 0.39404553415061294,
"grad_norm": 1.1244116274618798,
"learning_rate": 9.0723961547093e-06,
"loss": 0.1644,
"step": 225
},
{
"epoch": 0.3957968476357268,
"grad_norm": 1.1452039316203921,
"learning_rate": 9.064400329926476e-06,
"loss": 0.1677,
"step": 226
},
{
"epoch": 0.3975481611208406,
"grad_norm": 0.9169411289803756,
"learning_rate": 9.05637374674357e-06,
"loss": 0.1499,
"step": 227
},
{
"epoch": 0.39929947460595444,
"grad_norm": 1.1320705178616042,
"learning_rate": 9.048316465903823e-06,
"loss": 0.1532,
"step": 228
},
{
"epoch": 0.4010507880910683,
"grad_norm": 1.1148728898419542,
"learning_rate": 9.04022854838279e-06,
"loss": 0.1745,
"step": 229
},
{
"epoch": 0.4028021015761821,
"grad_norm": 0.9892668796106193,
"learning_rate": 9.032110055387881e-06,
"loss": 0.192,
"step": 230
},
{
"epoch": 0.404553415061296,
"grad_norm": 1.2041757680734857,
"learning_rate": 9.023961048357885e-06,
"loss": 0.2152,
"step": 231
},
{
"epoch": 0.4063047285464098,
"grad_norm": 1.1128606421374905,
"learning_rate": 9.015781588962524e-06,
"loss": 0.273,
"step": 232
},
{
"epoch": 0.4080560420315236,
"grad_norm": 1.0110467377360022,
"learning_rate": 9.007571739101968e-06,
"loss": 0.1625,
"step": 233
},
{
"epoch": 0.4098073555166375,
"grad_norm": 2.1812012536277385,
"learning_rate": 8.999331560906382e-06,
"loss": 0.437,
"step": 234
},
{
"epoch": 0.4115586690017513,
"grad_norm": 1.2026156956732903,
"learning_rate": 8.991061116735437e-06,
"loss": 0.1843,
"step": 235
},
{
"epoch": 0.4133099824868651,
"grad_norm": 0.9335774970781566,
"learning_rate": 8.982760469177865e-06,
"loss": 0.1648,
"step": 236
},
{
"epoch": 0.415061295971979,
"grad_norm": 0.9632646131071537,
"learning_rate": 8.974429681050957e-06,
"loss": 0.234,
"step": 237
},
{
"epoch": 0.4168126094570928,
"grad_norm": 1.1628290027919184,
"learning_rate": 8.966068815400108e-06,
"loss": 0.243,
"step": 238
},
{
"epoch": 0.4185639229422067,
"grad_norm": 0.9926771048841662,
"learning_rate": 8.957677935498328e-06,
"loss": 0.2181,
"step": 239
},
{
"epoch": 0.4203152364273205,
"grad_norm": 1.2100489148340519,
"learning_rate": 8.949257104845772e-06,
"loss": 0.1799,
"step": 240
},
{
"epoch": 0.4220665499124343,
"grad_norm": 0.9686376813257808,
"learning_rate": 8.94080638716925e-06,
"loss": 0.1412,
"step": 241
},
{
"epoch": 0.4238178633975482,
"grad_norm": 1.0304072209793849,
"learning_rate": 8.932325846421755e-06,
"loss": 0.1608,
"step": 242
},
{
"epoch": 0.425569176882662,
"grad_norm": 1.1068555551888453,
"learning_rate": 8.923815546781968e-06,
"loss": 0.1736,
"step": 243
},
{
"epoch": 0.4273204903677758,
"grad_norm": 1.1406345353775849,
"learning_rate": 8.915275552653786e-06,
"loss": 0.1856,
"step": 244
},
{
"epoch": 0.4290718038528897,
"grad_norm": 0.824906662216677,
"learning_rate": 8.906705928665818e-06,
"loss": 0.1241,
"step": 245
},
{
"epoch": 0.4308231173380035,
"grad_norm": 0.9817782743515178,
"learning_rate": 8.898106739670908e-06,
"loss": 0.1391,
"step": 246
},
{
"epoch": 0.43257443082311736,
"grad_norm": 1.0061540033779128,
"learning_rate": 8.889478050745646e-06,
"loss": 0.1487,
"step": 247
},
{
"epoch": 0.4343257443082312,
"grad_norm": 1.319181249331738,
"learning_rate": 8.88081992718986e-06,
"loss": 0.2516,
"step": 248
},
{
"epoch": 0.436077057793345,
"grad_norm": 1.0168251468902272,
"learning_rate": 8.872132434526144e-06,
"loss": 0.1601,
"step": 249
},
{
"epoch": 0.43782837127845886,
"grad_norm": 0.903893586373318,
"learning_rate": 8.863415638499341e-06,
"loss": 0.1675,
"step": 250
},
{
"epoch": 0.4395796847635727,
"grad_norm": 1.5466100134300305,
"learning_rate": 8.854669605076058e-06,
"loss": 0.2004,
"step": 251
},
{
"epoch": 0.4413309982486865,
"grad_norm": 0.8990563567570562,
"learning_rate": 8.845894400444163e-06,
"loss": 0.1505,
"step": 252
},
{
"epoch": 0.44308231173380036,
"grad_norm": 1.2358463965854751,
"learning_rate": 8.837090091012289e-06,
"loss": 0.188,
"step": 253
},
{
"epoch": 0.4448336252189142,
"grad_norm": 0.8631731423566544,
"learning_rate": 8.82825674340932e-06,
"loss": 0.1085,
"step": 254
},
{
"epoch": 0.44658493870402804,
"grad_norm": 0.851054134724772,
"learning_rate": 8.819394424483898e-06,
"loss": 0.1553,
"step": 255
},
{
"epoch": 0.44833625218914186,
"grad_norm": 0.9747188919128223,
"learning_rate": 8.810503201303914e-06,
"loss": 0.1429,
"step": 256
},
{
"epoch": 0.4500875656742557,
"grad_norm": 1.1817384021065651,
"learning_rate": 8.801583141155993e-06,
"loss": 0.1714,
"step": 257
},
{
"epoch": 0.45183887915936954,
"grad_norm": 1.1497461858404032,
"learning_rate": 8.792634311545002e-06,
"loss": 0.1654,
"step": 258
},
{
"epoch": 0.45359019264448336,
"grad_norm": 1.0653546734905857,
"learning_rate": 8.78365678019352e-06,
"loss": 0.1561,
"step": 259
},
{
"epoch": 0.4553415061295972,
"grad_norm": 0.9098419050905309,
"learning_rate": 8.774650615041332e-06,
"loss": 0.1446,
"step": 260
},
{
"epoch": 0.45709281961471104,
"grad_norm": 1.3562833349770014,
"learning_rate": 8.765615884244925e-06,
"loss": 0.1887,
"step": 261
},
{
"epoch": 0.45884413309982486,
"grad_norm": 1.584928142177982,
"learning_rate": 8.75655265617696e-06,
"loss": 0.2126,
"step": 262
},
{
"epoch": 0.46059544658493873,
"grad_norm": 1.25237114313114,
"learning_rate": 8.747460999425755e-06,
"loss": 0.1999,
"step": 263
},
{
"epoch": 0.46234676007005254,
"grad_norm": 1.0137290535234078,
"learning_rate": 8.738340982794775e-06,
"loss": 0.1567,
"step": 264
},
{
"epoch": 0.46409807355516636,
"grad_norm": 0.9990455785944722,
"learning_rate": 8.729192675302104e-06,
"loss": 0.1817,
"step": 265
},
{
"epoch": 0.4658493870402802,
"grad_norm": 0.9098201735226615,
"learning_rate": 8.720016146179921e-06,
"loss": 0.181,
"step": 266
},
{
"epoch": 0.46760070052539404,
"grad_norm": 0.8339374481166864,
"learning_rate": 8.710811464873984e-06,
"loss": 0.13,
"step": 267
},
{
"epoch": 0.46935201401050786,
"grad_norm": 0.8247199593689756,
"learning_rate": 8.701578701043097e-06,
"loss": 0.116,
"step": 268
},
{
"epoch": 0.4711033274956217,
"grad_norm": 1.2251926045855088,
"learning_rate": 8.692317924558586e-06,
"loss": 0.2267,
"step": 269
},
{
"epoch": 0.47285464098073554,
"grad_norm": 1.323949068367743,
"learning_rate": 8.683029205503772e-06,
"loss": 0.2413,
"step": 270
},
{
"epoch": 0.4746059544658494,
"grad_norm": 0.8694038746666335,
"learning_rate": 8.67371261417344e-06,
"loss": 0.1314,
"step": 271
},
{
"epoch": 0.4763572679509632,
"grad_norm": 1.4214200720496517,
"learning_rate": 8.664368221073297e-06,
"loss": 0.2074,
"step": 272
},
{
"epoch": 0.47810858143607704,
"grad_norm": 0.8164989303563484,
"learning_rate": 8.65499609691946e-06,
"loss": 0.1142,
"step": 273
},
{
"epoch": 0.4798598949211909,
"grad_norm": 0.9950353997188668,
"learning_rate": 8.645596312637895e-06,
"loss": 0.2059,
"step": 274
},
{
"epoch": 0.4816112084063047,
"grad_norm": 1.1741807018487027,
"learning_rate": 8.636168939363905e-06,
"loss": 0.1762,
"step": 275
},
{
"epoch": 0.48336252189141854,
"grad_norm": 1.1639496884105414,
"learning_rate": 8.62671404844157e-06,
"loss": 0.1946,
"step": 276
},
{
"epoch": 0.4851138353765324,
"grad_norm": 1.24592549170423,
"learning_rate": 8.617231711423222e-06,
"loss": 0.2481,
"step": 277
},
{
"epoch": 0.4868651488616462,
"grad_norm": 1.6788228325648948,
"learning_rate": 8.607722000068898e-06,
"loss": 0.2368,
"step": 278
},
{
"epoch": 0.4886164623467601,
"grad_norm": 0.9120565830493937,
"learning_rate": 8.598184986345797e-06,
"loss": 0.1528,
"step": 279
},
{
"epoch": 0.4903677758318739,
"grad_norm": 1.0407774323531322,
"learning_rate": 8.588620742427733e-06,
"loss": 0.1481,
"step": 280
},
{
"epoch": 0.4921190893169877,
"grad_norm": 0.8774818082676015,
"learning_rate": 8.579029340694596e-06,
"loss": 0.167,
"step": 281
},
{
"epoch": 0.4938704028021016,
"grad_norm": 0.7302486166777703,
"learning_rate": 8.569410853731799e-06,
"loss": 0.1339,
"step": 282
},
{
"epoch": 0.4956217162872154,
"grad_norm": 1.4977744846377492,
"learning_rate": 8.559765354329728e-06,
"loss": 0.2384,
"step": 283
},
{
"epoch": 0.4973730297723292,
"grad_norm": 1.3333579162401141,
"learning_rate": 8.55009291548319e-06,
"loss": 0.2047,
"step": 284
},
{
"epoch": 0.4991243432574431,
"grad_norm": 1.0371512553867137,
"learning_rate": 8.540393610390871e-06,
"loss": 0.2014,
"step": 285
},
{
"epoch": 0.500875656742557,
"grad_norm": 5.507551141823795,
"learning_rate": 8.530667512454765e-06,
"loss": 0.2963,
"step": 286
},
{
"epoch": 0.5026269702276708,
"grad_norm": 1.0814488316677124,
"learning_rate": 8.520914695279632e-06,
"loss": 0.1799,
"step": 287
},
{
"epoch": 0.5043782837127846,
"grad_norm": 1.0877989498247866,
"learning_rate": 8.511135232672442e-06,
"loss": 0.2273,
"step": 288
},
{
"epoch": 0.5061295971978984,
"grad_norm": 1.1546522986634278,
"learning_rate": 8.501329198641802e-06,
"loss": 0.1699,
"step": 289
},
{
"epoch": 0.5078809106830122,
"grad_norm": 0.9955478078874541,
"learning_rate": 8.491496667397409e-06,
"loss": 0.1616,
"step": 290
},
{
"epoch": 0.5096322241681261,
"grad_norm": 1.0749336259458449,
"learning_rate": 8.481637713349486e-06,
"loss": 0.2121,
"step": 291
},
{
"epoch": 0.51138353765324,
"grad_norm": 1.2722939684715049,
"learning_rate": 8.471752411108216e-06,
"loss": 0.1619,
"step": 292
},
{
"epoch": 0.5131348511383538,
"grad_norm": 1.31720477548223,
"learning_rate": 8.461840835483179e-06,
"loss": 0.2357,
"step": 293
},
{
"epoch": 0.5148861646234676,
"grad_norm": 1.0512404750610869,
"learning_rate": 8.451903061482787e-06,
"loss": 0.2039,
"step": 294
},
{
"epoch": 0.5166374781085814,
"grad_norm": 1.095035695042105,
"learning_rate": 8.44193916431371e-06,
"loss": 0.1386,
"step": 295
},
{
"epoch": 0.5183887915936952,
"grad_norm": 1.2777241490218374,
"learning_rate": 8.431949219380319e-06,
"loss": 0.2109,
"step": 296
},
{
"epoch": 0.5201401050788091,
"grad_norm": 1.0873909511926902,
"learning_rate": 8.421933302284102e-06,
"loss": 0.1584,
"step": 297
},
{
"epoch": 0.521891418563923,
"grad_norm": 0.9773041895311751,
"learning_rate": 8.411891488823102e-06,
"loss": 0.1512,
"step": 298
},
{
"epoch": 0.5236427320490368,
"grad_norm": 1.1584559931137173,
"learning_rate": 8.401823854991338e-06,
"loss": 0.1974,
"step": 299
},
{
"epoch": 0.5253940455341506,
"grad_norm": 0.9617777428924835,
"learning_rate": 8.391730476978229e-06,
"loss": 0.1536,
"step": 300
},
{
"epoch": 0.5271453590192644,
"grad_norm": 1.1847455857829146,
"learning_rate": 8.381611431168027e-06,
"loss": 0.1949,
"step": 301
},
{
"epoch": 0.5288966725043783,
"grad_norm": 1.0558725080734255,
"learning_rate": 8.37146679413922e-06,
"loss": 0.1873,
"step": 302
},
{
"epoch": 0.5306479859894921,
"grad_norm": 0.9708895644033165,
"learning_rate": 8.361296642663977e-06,
"loss": 0.1851,
"step": 303
},
{
"epoch": 0.532399299474606,
"grad_norm": 1.0628725551453226,
"learning_rate": 8.351101053707545e-06,
"loss": 0.169,
"step": 304
},
{
"epoch": 0.5341506129597198,
"grad_norm": 0.9196106999391176,
"learning_rate": 8.34088010442768e-06,
"loss": 0.159,
"step": 305
},
{
"epoch": 0.5359019264448336,
"grad_norm": 1.0255932075584107,
"learning_rate": 8.330633872174057e-06,
"loss": 0.1701,
"step": 306
},
{
"epoch": 0.5376532399299475,
"grad_norm": 1.0762624625214643,
"learning_rate": 8.320362434487688e-06,
"loss": 0.1644,
"step": 307
},
{
"epoch": 0.5394045534150613,
"grad_norm": 0.7633404300233255,
"learning_rate": 8.310065869100332e-06,
"loss": 0.123,
"step": 308
},
{
"epoch": 0.5411558669001751,
"grad_norm": 0.8699176281503174,
"learning_rate": 8.299744253933908e-06,
"loss": 0.1351,
"step": 309
},
{
"epoch": 0.542907180385289,
"grad_norm": 1.2275049904001125,
"learning_rate": 8.289397667099909e-06,
"loss": 0.1675,
"step": 310
},
{
"epoch": 0.5446584938704028,
"grad_norm": 1.2101635020956292,
"learning_rate": 8.279026186898805e-06,
"loss": 0.1738,
"step": 311
},
{
"epoch": 0.5464098073555166,
"grad_norm": 0.9106260951604767,
"learning_rate": 8.26862989181945e-06,
"loss": 0.1338,
"step": 312
},
{
"epoch": 0.5481611208406305,
"grad_norm": 1.1480139615147442,
"learning_rate": 8.258208860538498e-06,
"loss": 0.1958,
"step": 313
},
{
"epoch": 0.5499124343257443,
"grad_norm": 1.543005528446332,
"learning_rate": 8.247763171919795e-06,
"loss": 0.1611,
"step": 314
},
{
"epoch": 0.5516637478108581,
"grad_norm": 1.201047458056005,
"learning_rate": 8.237292905013792e-06,
"loss": 0.2219,
"step": 315
},
{
"epoch": 0.553415061295972,
"grad_norm": 0.9962169779650556,
"learning_rate": 8.226798139056938e-06,
"loss": 0.1751,
"step": 316
},
{
"epoch": 0.5551663747810858,
"grad_norm": 1.0391141659299086,
"learning_rate": 8.216278953471088e-06,
"loss": 0.1907,
"step": 317
},
{
"epoch": 0.5569176882661997,
"grad_norm": 0.8878841272733307,
"learning_rate": 8.205735427862897e-06,
"loss": 0.1392,
"step": 318
},
{
"epoch": 0.5586690017513135,
"grad_norm": 0.8815771076681298,
"learning_rate": 8.19516764202322e-06,
"loss": 0.2054,
"step": 319
},
{
"epoch": 0.5604203152364273,
"grad_norm": 1.3561357204987174,
"learning_rate": 8.184575675926511e-06,
"loss": 0.162,
"step": 320
},
{
"epoch": 0.5621716287215411,
"grad_norm": 1.1436127474045497,
"learning_rate": 8.173959609730209e-06,
"loss": 0.1553,
"step": 321
},
{
"epoch": 0.563922942206655,
"grad_norm": 0.8696831532462574,
"learning_rate": 8.16331952377414e-06,
"loss": 0.1614,
"step": 322
},
{
"epoch": 0.5656742556917689,
"grad_norm": 1.243551733126445,
"learning_rate": 8.152655498579903e-06,
"loss": 0.241,
"step": 323
},
{
"epoch": 0.5674255691768827,
"grad_norm": 1.0649177969550163,
"learning_rate": 8.141967614850265e-06,
"loss": 0.1368,
"step": 324
},
{
"epoch": 0.5691768826619965,
"grad_norm": 1.1762554635110793,
"learning_rate": 8.131255953468553e-06,
"loss": 0.2207,
"step": 325
},
{
"epoch": 0.5709281961471103,
"grad_norm": 1.2132153902490277,
"learning_rate": 8.120520595498029e-06,
"loss": 0.1887,
"step": 326
},
{
"epoch": 0.5726795096322241,
"grad_norm": 0.7399066544679602,
"learning_rate": 8.10976162218129e-06,
"loss": 0.1267,
"step": 327
},
{
"epoch": 0.574430823117338,
"grad_norm": 0.9899573210939664,
"learning_rate": 8.09897911493965e-06,
"loss": 0.1685,
"step": 328
},
{
"epoch": 0.5761821366024519,
"grad_norm": 1.3337051985288606,
"learning_rate": 8.088173155372517e-06,
"loss": 0.3282,
"step": 329
},
{
"epoch": 0.5779334500875657,
"grad_norm": 1.005083830754978,
"learning_rate": 8.077343825256783e-06,
"loss": 0.2075,
"step": 330
},
{
"epoch": 0.5796847635726795,
"grad_norm": 0.8802289529518278,
"learning_rate": 8.066491206546206e-06,
"loss": 0.1254,
"step": 331
},
{
"epoch": 0.5814360770577933,
"grad_norm": 1.2400889336140388,
"learning_rate": 8.055615381370781e-06,
"loss": 0.185,
"step": 332
},
{
"epoch": 0.5831873905429071,
"grad_norm": 0.8545769679622639,
"learning_rate": 8.044716432036126e-06,
"loss": 0.1352,
"step": 333
},
{
"epoch": 0.5849387040280211,
"grad_norm": 1.2907830562873122,
"learning_rate": 8.033794441022857e-06,
"loss": 0.2143,
"step": 334
},
{
"epoch": 0.5866900175131349,
"grad_norm": 1.2934031358525784,
"learning_rate": 8.022849490985966e-06,
"loss": 0.2373,
"step": 335
},
{
"epoch": 0.5884413309982487,
"grad_norm": 1.0494212000075114,
"learning_rate": 8.011881664754193e-06,
"loss": 0.21,
"step": 336
},
{
"epoch": 0.5901926444833625,
"grad_norm": 1.3713545247159777,
"learning_rate": 8.000891045329394e-06,
"loss": 0.1956,
"step": 337
},
{
"epoch": 0.5919439579684763,
"grad_norm": 1.0044217822469004,
"learning_rate": 7.989877715885925e-06,
"loss": 0.1455,
"step": 338
},
{
"epoch": 0.5936952714535902,
"grad_norm": 0.969157755217139,
"learning_rate": 7.97884175977e-06,
"loss": 0.1858,
"step": 339
},
{
"epoch": 0.5954465849387041,
"grad_norm": 0.9043027733845134,
"learning_rate": 7.967783260499073e-06,
"loss": 0.1307,
"step": 340
},
{
"epoch": 0.5971978984238179,
"grad_norm": 0.7943124415991508,
"learning_rate": 7.956702301761195e-06,
"loss": 0.1142,
"step": 341
},
{
"epoch": 0.5989492119089317,
"grad_norm": 0.9004231757301827,
"learning_rate": 7.945598967414386e-06,
"loss": 0.1908,
"step": 342
},
{
"epoch": 0.6007005253940455,
"grad_norm": 1.1105437759236843,
"learning_rate": 7.934473341485998e-06,
"loss": 0.2115,
"step": 343
},
{
"epoch": 0.6024518388791593,
"grad_norm": 0.9301936250430058,
"learning_rate": 7.92332550817208e-06,
"loss": 0.1647,
"step": 344
},
{
"epoch": 0.6042031523642732,
"grad_norm": 1.047767148226068,
"learning_rate": 7.912155551836743e-06,
"loss": 0.2355,
"step": 345
},
{
"epoch": 0.6059544658493871,
"grad_norm": 0.9613875662244317,
"learning_rate": 7.900963557011519e-06,
"loss": 0.171,
"step": 346
},
{
"epoch": 0.6077057793345009,
"grad_norm": 1.2164758034474923,
"learning_rate": 7.88974960839472e-06,
"loss": 0.155,
"step": 347
},
{
"epoch": 0.6094570928196147,
"grad_norm": 1.056374791415413,
"learning_rate": 7.878513790850805e-06,
"loss": 0.1732,
"step": 348
},
{
"epoch": 0.6112084063047285,
"grad_norm": 1.2496494090012524,
"learning_rate": 7.867256189409724e-06,
"loss": 0.1835,
"step": 349
},
{
"epoch": 0.6129597197898424,
"grad_norm": 0.6631622478113742,
"learning_rate": 7.855976889266288e-06,
"loss": 0.118,
"step": 350
},
{
"epoch": 0.6147110332749562,
"grad_norm": 0.834904245415055,
"learning_rate": 7.844675975779514e-06,
"loss": 0.1363,
"step": 351
},
{
"epoch": 0.6164623467600701,
"grad_norm": 1.0623504091270612,
"learning_rate": 7.833353534471988e-06,
"loss": 0.1341,
"step": 352
},
{
"epoch": 0.6182136602451839,
"grad_norm": 1.002067391498751,
"learning_rate": 7.82200965102921e-06,
"loss": 0.1388,
"step": 353
},
{
"epoch": 0.6199649737302977,
"grad_norm": 1.2517283119177405,
"learning_rate": 7.810644411298951e-06,
"loss": 0.2027,
"step": 354
},
{
"epoch": 0.6217162872154116,
"grad_norm": 0.9691375625821814,
"learning_rate": 7.799257901290597e-06,
"loss": 0.1918,
"step": 355
},
{
"epoch": 0.6234676007005254,
"grad_norm": 1.257025304991851,
"learning_rate": 7.787850207174512e-06,
"loss": 0.1984,
"step": 356
},
{
"epoch": 0.6252189141856392,
"grad_norm": 1.2989884834762626,
"learning_rate": 7.776421415281368e-06,
"loss": 0.2251,
"step": 357
},
{
"epoch": 0.626970227670753,
"grad_norm": 0.9608457991453937,
"learning_rate": 7.764971612101497e-06,
"loss": 0.1598,
"step": 358
},
{
"epoch": 0.6287215411558669,
"grad_norm": 1.055564436388202,
"learning_rate": 7.753500884284251e-06,
"loss": 0.1506,
"step": 359
},
{
"epoch": 0.6304728546409807,
"grad_norm": 0.8536479488591544,
"learning_rate": 7.742009318637323e-06,
"loss": 0.1023,
"step": 360
},
{
"epoch": 0.6322241681260946,
"grad_norm": 1.1845728329626128,
"learning_rate": 7.730497002126105e-06,
"loss": 0.1584,
"step": 361
},
{
"epoch": 0.6339754816112084,
"grad_norm": 0.8928523838563781,
"learning_rate": 7.718964021873035e-06,
"loss": 0.1084,
"step": 362
},
{
"epoch": 0.6357267950963222,
"grad_norm": 0.9835590981476029,
"learning_rate": 7.707410465156916e-06,
"loss": 0.1638,
"step": 363
},
{
"epoch": 0.637478108581436,
"grad_norm": 0.9201578437929335,
"learning_rate": 7.695836419412277e-06,
"loss": 0.1239,
"step": 364
},
{
"epoch": 0.6392294220665499,
"grad_norm": 0.728467005507231,
"learning_rate": 7.684241972228702e-06,
"loss": 0.1244,
"step": 365
},
{
"epoch": 0.6409807355516638,
"grad_norm": 0.918030927901545,
"learning_rate": 7.672627211350164e-06,
"loss": 0.1466,
"step": 366
},
{
"epoch": 0.6427320490367776,
"grad_norm": 1.2979769917204351,
"learning_rate": 7.660992224674371e-06,
"loss": 0.2255,
"step": 367
},
{
"epoch": 0.6444833625218914,
"grad_norm": 0.771343645637159,
"learning_rate": 7.649337100252091e-06,
"loss": 0.1293,
"step": 368
},
{
"epoch": 0.6462346760070052,
"grad_norm": 1.1242763102043734,
"learning_rate": 7.637661926286493e-06,
"loss": 0.2268,
"step": 369
},
{
"epoch": 0.647985989492119,
"grad_norm": 1.024519873593415,
"learning_rate": 7.625966791132469e-06,
"loss": 0.1664,
"step": 370
},
{
"epoch": 0.649737302977233,
"grad_norm": 0.9346453233331181,
"learning_rate": 7.614251783295981e-06,
"loss": 0.1493,
"step": 371
},
{
"epoch": 0.6514886164623468,
"grad_norm": 1.032430577115044,
"learning_rate": 7.602516991433376e-06,
"loss": 0.168,
"step": 372
},
{
"epoch": 0.6532399299474606,
"grad_norm": 1.2756452983998474,
"learning_rate": 7.590762504350729e-06,
"loss": 0.2004,
"step": 373
},
{
"epoch": 0.6549912434325744,
"grad_norm": 1.0952734861105042,
"learning_rate": 7.578988411003156e-06,
"loss": 0.2038,
"step": 374
},
{
"epoch": 0.6567425569176882,
"grad_norm": 0.9982958673516226,
"learning_rate": 7.567194800494154e-06,
"loss": 0.1722,
"step": 375
},
{
"epoch": 0.658493870402802,
"grad_norm": 1.2747360607076716,
"learning_rate": 7.555381762074918e-06,
"loss": 0.1977,
"step": 376
},
{
"epoch": 0.660245183887916,
"grad_norm": 1.48637993886011,
"learning_rate": 7.543549385143667e-06,
"loss": 0.2751,
"step": 377
},
{
"epoch": 0.6619964973730298,
"grad_norm": 1.0127675566111833,
"learning_rate": 7.531697759244978e-06,
"loss": 0.1556,
"step": 378
},
{
"epoch": 0.6637478108581436,
"grad_norm": 1.0835339789681326,
"learning_rate": 7.519826974069088e-06,
"loss": 0.1884,
"step": 379
},
{
"epoch": 0.6654991243432574,
"grad_norm": 0.9837633602612155,
"learning_rate": 7.507937119451234e-06,
"loss": 0.1823,
"step": 380
},
{
"epoch": 0.6672504378283712,
"grad_norm": 1.392684179495506,
"learning_rate": 7.496028285370966e-06,
"loss": 0.1912,
"step": 381
},
{
"epoch": 0.6690017513134852,
"grad_norm": 1.0916850999558239,
"learning_rate": 7.484100561951459e-06,
"loss": 0.2066,
"step": 382
},
{
"epoch": 0.670753064798599,
"grad_norm": 0.8680971975778679,
"learning_rate": 7.472154039458851e-06,
"loss": 0.1754,
"step": 383
},
{
"epoch": 0.6725043782837128,
"grad_norm": 1.0435891343464805,
"learning_rate": 7.460188808301532e-06,
"loss": 0.1318,
"step": 384
},
{
"epoch": 0.6742556917688266,
"grad_norm": 1.042398115064006,
"learning_rate": 7.448204959029484e-06,
"loss": 0.2022,
"step": 385
},
{
"epoch": 0.6760070052539404,
"grad_norm": 0.9731481469841672,
"learning_rate": 7.436202582333587e-06,
"loss": 0.13,
"step": 386
},
{
"epoch": 0.6777583187390543,
"grad_norm": 1.386034337965727,
"learning_rate": 7.4241817690449235e-06,
"loss": 0.2216,
"step": 387
},
{
"epoch": 0.6795096322241682,
"grad_norm": 1.2608682646343083,
"learning_rate": 7.41214261013411e-06,
"loss": 0.1966,
"step": 388
},
{
"epoch": 0.681260945709282,
"grad_norm": 1.1020654673893056,
"learning_rate": 7.40008519671059e-06,
"loss": 0.1762,
"step": 389
},
{
"epoch": 0.6830122591943958,
"grad_norm": 1.0019379870434075,
"learning_rate": 7.3880096200219585e-06,
"loss": 0.1436,
"step": 390
},
{
"epoch": 0.6847635726795096,
"grad_norm": 0.9922002827434665,
"learning_rate": 7.375915971453264e-06,
"loss": 0.159,
"step": 391
},
{
"epoch": 0.6865148861646234,
"grad_norm": 1.2392846396033714,
"learning_rate": 7.363804342526315e-06,
"loss": 0.1972,
"step": 392
},
{
"epoch": 0.6882661996497373,
"grad_norm": 0.8963807218487707,
"learning_rate": 7.3516748248989955e-06,
"loss": 0.1921,
"step": 393
},
{
"epoch": 0.6900175131348512,
"grad_norm": 1.1130741139227058,
"learning_rate": 7.339527510364567e-06,
"loss": 0.1459,
"step": 394
},
{
"epoch": 0.691768826619965,
"grad_norm": 0.8075594361508924,
"learning_rate": 7.327362490850971e-06,
"loss": 0.1379,
"step": 395
},
{
"epoch": 0.6935201401050788,
"grad_norm": 1.0062548542126621,
"learning_rate": 7.315179858420138e-06,
"loss": 0.1758,
"step": 396
},
{
"epoch": 0.6952714535901926,
"grad_norm": 0.9717163669556567,
"learning_rate": 7.302979705267286e-06,
"loss": 0.1538,
"step": 397
},
{
"epoch": 0.6970227670753065,
"grad_norm": 1.0413579747819772,
"learning_rate": 7.2907621237202275e-06,
"loss": 0.1535,
"step": 398
},
{
"epoch": 0.6987740805604203,
"grad_norm": 0.9776202668633781,
"learning_rate": 7.2785272062386715e-06,
"loss": 0.2237,
"step": 399
},
{
"epoch": 0.7005253940455342,
"grad_norm": 0.9919980255734399,
"learning_rate": 7.266275045413517e-06,
"loss": 0.1759,
"step": 400
},
{
"epoch": 0.702276707530648,
"grad_norm": 1.1333937112664494,
"learning_rate": 7.254005733966159e-06,
"loss": 0.26,
"step": 401
},
{
"epoch": 0.7040280210157618,
"grad_norm": 0.9978679010728084,
"learning_rate": 7.241719364747781e-06,
"loss": 0.146,
"step": 402
},
{
"epoch": 0.7057793345008757,
"grad_norm": 1.0310148677742963,
"learning_rate": 7.229416030738661e-06,
"loss": 0.1358,
"step": 403
},
{
"epoch": 0.7075306479859895,
"grad_norm": 0.9210280630631851,
"learning_rate": 7.217095825047455e-06,
"loss": 0.1368,
"step": 404
},
{
"epoch": 0.7092819614711033,
"grad_norm": 0.8458172902391173,
"learning_rate": 7.204758840910509e-06,
"loss": 0.1548,
"step": 405
},
{
"epoch": 0.7110332749562172,
"grad_norm": 1.0007666191428417,
"learning_rate": 7.192405171691138e-06,
"loss": 0.1358,
"step": 406
},
{
"epoch": 0.712784588441331,
"grad_norm": 1.1791422782221483,
"learning_rate": 7.180034910878926e-06,
"loss": 0.2027,
"step": 407
},
{
"epoch": 0.7145359019264448,
"grad_norm": 1.3622226936019624,
"learning_rate": 7.167648152089017e-06,
"loss": 0.1446,
"step": 408
},
{
"epoch": 0.7162872154115587,
"grad_norm": 1.260377975347423,
"learning_rate": 7.155244989061415e-06,
"loss": 0.1394,
"step": 409
},
{
"epoch": 0.7180385288966725,
"grad_norm": 0.7647098429722478,
"learning_rate": 7.142825515660259e-06,
"loss": 0.1436,
"step": 410
},
{
"epoch": 0.7197898423817863,
"grad_norm": 1.0806203976791042,
"learning_rate": 7.130389825873125e-06,
"loss": 0.1472,
"step": 411
},
{
"epoch": 0.7215411558669002,
"grad_norm": 1.160580416868669,
"learning_rate": 7.1179380138103105e-06,
"loss": 0.1709,
"step": 412
},
{
"epoch": 0.723292469352014,
"grad_norm": 1.2669865482460725,
"learning_rate": 7.105470173704121e-06,
"loss": 0.1692,
"step": 413
},
{
"epoch": 0.7250437828371279,
"grad_norm": 0.8979827897826866,
"learning_rate": 7.092986399908158e-06,
"loss": 0.1538,
"step": 414
},
{
"epoch": 0.7267950963222417,
"grad_norm": 1.1613876219039976,
"learning_rate": 7.08048678689661e-06,
"loss": 0.1627,
"step": 415
},
{
"epoch": 0.7285464098073555,
"grad_norm": 0.7088309545286363,
"learning_rate": 7.067971429263527e-06,
"loss": 0.0933,
"step": 416
},
{
"epoch": 0.7302977232924693,
"grad_norm": 0.9953579341793583,
"learning_rate": 7.055440421722113e-06,
"loss": 0.1546,
"step": 417
},
{
"epoch": 0.7320490367775832,
"grad_norm": 0.893498402926208,
"learning_rate": 7.042893859104008e-06,
"loss": 0.1647,
"step": 418
},
{
"epoch": 0.7338003502626971,
"grad_norm": 0.9606538728371828,
"learning_rate": 7.030331836358565e-06,
"loss": 0.1584,
"step": 419
},
{
"epoch": 0.7355516637478109,
"grad_norm": 0.9087697947323283,
"learning_rate": 7.017754448552141e-06,
"loss": 0.1489,
"step": 420
},
{
"epoch": 0.7373029772329247,
"grad_norm": 1.0980936242324373,
"learning_rate": 7.0051617908673685e-06,
"loss": 0.1909,
"step": 421
},
{
"epoch": 0.7390542907180385,
"grad_norm": 1.0679212943502743,
"learning_rate": 6.992553958602439e-06,
"loss": 0.1933,
"step": 422
},
{
"epoch": 0.7408056042031523,
"grad_norm": 1.0201469343275764,
"learning_rate": 6.979931047170382e-06,
"loss": 0.19,
"step": 423
},
{
"epoch": 0.7425569176882661,
"grad_norm": 1.153726254550419,
"learning_rate": 6.967293152098345e-06,
"loss": 0.2164,
"step": 424
},
{
"epoch": 0.7443082311733801,
"grad_norm": 0.9986655789682201,
"learning_rate": 6.954640369026861e-06,
"loss": 0.1681,
"step": 425
},
{
"epoch": 0.7460595446584939,
"grad_norm": 0.7808715269507883,
"learning_rate": 6.941972793709141e-06,
"loss": 0.1427,
"step": 426
},
{
"epoch": 0.7478108581436077,
"grad_norm": 0.9806851736858181,
"learning_rate": 6.929290522010332e-06,
"loss": 0.1558,
"step": 427
},
{
"epoch": 0.7495621716287215,
"grad_norm": 1.1151483469529613,
"learning_rate": 6.9165936499068065e-06,
"loss": 0.1851,
"step": 428
},
{
"epoch": 0.7513134851138353,
"grad_norm": 1.0226642337566068,
"learning_rate": 6.903882273485425e-06,
"loss": 0.1406,
"step": 429
},
{
"epoch": 0.7530647985989493,
"grad_norm": 1.2154853234808298,
"learning_rate": 6.891156488942812e-06,
"loss": 0.2281,
"step": 430
},
{
"epoch": 0.7548161120840631,
"grad_norm": 0.9124967173578454,
"learning_rate": 6.878416392584635e-06,
"loss": 0.1502,
"step": 431
},
{
"epoch": 0.7565674255691769,
"grad_norm": 1.4028442036532214,
"learning_rate": 6.865662080824864e-06,
"loss": 0.2161,
"step": 432
},
{
"epoch": 0.7583187390542907,
"grad_norm": 0.975649535916596,
"learning_rate": 6.852893650185051e-06,
"loss": 0.1655,
"step": 433
},
{
"epoch": 0.7600700525394045,
"grad_norm": 1.1154571470663182,
"learning_rate": 6.840111197293594e-06,
"loss": 0.2768,
"step": 434
},
{
"epoch": 0.7618213660245184,
"grad_norm": 1.2620416059011632,
"learning_rate": 6.8273148188850105e-06,
"loss": 0.2549,
"step": 435
},
{
"epoch": 0.7635726795096323,
"grad_norm": 0.9710561696847746,
"learning_rate": 6.814504611799202e-06,
"loss": 0.1068,
"step": 436
},
{
"epoch": 0.7653239929947461,
"grad_norm": 0.8701266642614385,
"learning_rate": 6.801680672980722e-06,
"loss": 0.1272,
"step": 437
},
{
"epoch": 0.7670753064798599,
"grad_norm": 1.1987353253288306,
"learning_rate": 6.788843099478041e-06,
"loss": 0.2027,
"step": 438
},
{
"epoch": 0.7688266199649737,
"grad_norm": 1.1590697629080275,
"learning_rate": 6.775991988442816e-06,
"loss": 0.2143,
"step": 439
},
{
"epoch": 0.7705779334500875,
"grad_norm": 1.0846847919101628,
"learning_rate": 6.763127437129151e-06,
"loss": 0.1705,
"step": 440
},
{
"epoch": 0.7723292469352014,
"grad_norm": 0.8580193306506241,
"learning_rate": 6.750249542892863e-06,
"loss": 0.175,
"step": 441
},
{
"epoch": 0.7740805604203153,
"grad_norm": 1.5775617568317908,
"learning_rate": 6.737358403190746e-06,
"loss": 0.2081,
"step": 442
},
{
"epoch": 0.7758318739054291,
"grad_norm": 0.9624673340593134,
"learning_rate": 6.724454115579832e-06,
"loss": 0.1098,
"step": 443
},
{
"epoch": 0.7775831873905429,
"grad_norm": 1.0607534980440978,
"learning_rate": 6.711536777716654e-06,
"loss": 0.1803,
"step": 444
},
{
"epoch": 0.7793345008756567,
"grad_norm": 1.1255975448505335,
"learning_rate": 6.698606487356503e-06,
"loss": 0.1872,
"step": 445
},
{
"epoch": 0.7810858143607706,
"grad_norm": 0.8928366518808328,
"learning_rate": 6.685663342352693e-06,
"loss": 0.1466,
"step": 446
},
{
"epoch": 0.7828371278458844,
"grad_norm": 1.0817838600370688,
"learning_rate": 6.6727074406558224e-06,
"loss": 0.1663,
"step": 447
},
{
"epoch": 0.7845884413309983,
"grad_norm": 1.1115633011949881,
"learning_rate": 6.659738880313025e-06,
"loss": 0.1598,
"step": 448
},
{
"epoch": 0.7863397548161121,
"grad_norm": 1.0231529929725236,
"learning_rate": 6.646757759467234e-06,
"loss": 0.155,
"step": 449
},
{
"epoch": 0.7880910683012259,
"grad_norm": 1.2411539450968938,
"learning_rate": 6.633764176356434e-06,
"loss": 0.1915,
"step": 450
},
{
"epoch": 0.7898423817863398,
"grad_norm": 1.0708800647966268,
"learning_rate": 6.620758229312927e-06,
"loss": 0.1385,
"step": 451
},
{
"epoch": 0.7915936952714536,
"grad_norm": 1.0683250200759222,
"learning_rate": 6.6077400167625784e-06,
"loss": 0.1663,
"step": 452
},
{
"epoch": 0.7933450087565674,
"grad_norm": 1.250391733683125,
"learning_rate": 6.594709637224075e-06,
"loss": 0.1996,
"step": 453
},
{
"epoch": 0.7950963222416813,
"grad_norm": 1.1798794300569202,
"learning_rate": 6.581667189308185e-06,
"loss": 0.146,
"step": 454
},
{
"epoch": 0.7968476357267951,
"grad_norm": 1.4398604943836388,
"learning_rate": 6.5686127717170015e-06,
"loss": 0.3225,
"step": 455
},
{
"epoch": 0.7985989492119089,
"grad_norm": 0.9313043884553712,
"learning_rate": 6.555546483243205e-06,
"loss": 0.1389,
"step": 456
},
{
"epoch": 0.8003502626970228,
"grad_norm": 1.1833167749805182,
"learning_rate": 6.542468422769311e-06,
"loss": 0.2136,
"step": 457
},
{
"epoch": 0.8021015761821366,
"grad_norm": 0.8648334244503824,
"learning_rate": 6.529378689266923e-06,
"loss": 0.1878,
"step": 458
},
{
"epoch": 0.8038528896672504,
"grad_norm": 0.9949458854079241,
"learning_rate": 6.516277381795984e-06,
"loss": 0.1497,
"step": 459
},
{
"epoch": 0.8056042031523643,
"grad_norm": 1.1191303302941829,
"learning_rate": 6.503164599504022e-06,
"loss": 0.1566,
"step": 460
},
{
"epoch": 0.8073555166374781,
"grad_norm": 1.1298654961289165,
"learning_rate": 6.490040441625407e-06,
"loss": 0.2017,
"step": 461
},
{
"epoch": 0.809106830122592,
"grad_norm": 1.0947278566055827,
"learning_rate": 6.476905007480597e-06,
"loss": 0.1525,
"step": 462
},
{
"epoch": 0.8108581436077058,
"grad_norm": 1.1323194203448552,
"learning_rate": 6.4637583964753855e-06,
"loss": 0.2241,
"step": 463
},
{
"epoch": 0.8126094570928196,
"grad_norm": 1.0459409005734945,
"learning_rate": 6.45060070810015e-06,
"loss": 0.1296,
"step": 464
},
{
"epoch": 0.8143607705779334,
"grad_norm": 1.1316198669484385,
"learning_rate": 6.437432041929097e-06,
"loss": 0.1621,
"step": 465
},
{
"epoch": 0.8161120840630472,
"grad_norm": 0.9491729705590622,
"learning_rate": 6.424252497619511e-06,
"loss": 0.1547,
"step": 466
},
{
"epoch": 0.8178633975481612,
"grad_norm": 1.0195018137674068,
"learning_rate": 6.4110621749110014e-06,
"loss": 0.1424,
"step": 467
},
{
"epoch": 0.819614711033275,
"grad_norm": 1.2170884375327042,
"learning_rate": 6.397861173624745e-06,
"loss": 0.2018,
"step": 468
},
{
"epoch": 0.8213660245183888,
"grad_norm": 1.5918684168233677,
"learning_rate": 6.384649593662733e-06,
"loss": 0.1759,
"step": 469
},
{
"epoch": 0.8231173380035026,
"grad_norm": 1.1612346799052706,
"learning_rate": 6.371427535007008e-06,
"loss": 0.1944,
"step": 470
},
{
"epoch": 0.8248686514886164,
"grad_norm": 1.01990361540596,
"learning_rate": 6.358195097718917e-06,
"loss": 0.2028,
"step": 471
},
{
"epoch": 0.8266199649737302,
"grad_norm": 0.9233804242151922,
"learning_rate": 6.344952381938354e-06,
"loss": 0.1768,
"step": 472
},
{
"epoch": 0.8283712784588442,
"grad_norm": 1.1968348714557342,
"learning_rate": 6.331699487882987e-06,
"loss": 0.1657,
"step": 473
},
{
"epoch": 0.830122591943958,
"grad_norm": 1.4115850068938127,
"learning_rate": 6.318436515847525e-06,
"loss": 0.2006,
"step": 474
},
{
"epoch": 0.8318739054290718,
"grad_norm": 0.9507911762810964,
"learning_rate": 6.30516356620293e-06,
"loss": 0.1495,
"step": 475
},
{
"epoch": 0.8336252189141856,
"grad_norm": 1.1430728607474907,
"learning_rate": 6.291880739395683e-06,
"loss": 0.1722,
"step": 476
},
{
"epoch": 0.8353765323992994,
"grad_norm": 1.343500691590876,
"learning_rate": 6.278588135947011e-06,
"loss": 0.2047,
"step": 477
},
{
"epoch": 0.8371278458844134,
"grad_norm": 1.2595243490259276,
"learning_rate": 6.265285856452123e-06,
"loss": 0.2214,
"step": 478
},
{
"epoch": 0.8388791593695272,
"grad_norm": 1.1669616297664058,
"learning_rate": 6.251974001579459e-06,
"loss": 0.1724,
"step": 479
},
{
"epoch": 0.840630472854641,
"grad_norm": 1.0844521569107943,
"learning_rate": 6.238652672069921e-06,
"loss": 0.1961,
"step": 480
},
{
"epoch": 0.8423817863397548,
"grad_norm": 0.8709327722146551,
"learning_rate": 6.225321968736114e-06,
"loss": 0.1118,
"step": 481
},
{
"epoch": 0.8441330998248686,
"grad_norm": 1.0680058091791873,
"learning_rate": 6.211981992461583e-06,
"loss": 0.1579,
"step": 482
},
{
"epoch": 0.8458844133099825,
"grad_norm": 1.0598391865687347,
"learning_rate": 6.1986328442000425e-06,
"loss": 0.2064,
"step": 483
},
{
"epoch": 0.8476357267950964,
"grad_norm": 0.8766034007901955,
"learning_rate": 6.185274624974627e-06,
"loss": 0.1729,
"step": 484
},
{
"epoch": 0.8493870402802102,
"grad_norm": 1.0572455452106584,
"learning_rate": 6.1719074358771095e-06,
"loss": 0.1506,
"step": 485
},
{
"epoch": 0.851138353765324,
"grad_norm": 1.191222790209331,
"learning_rate": 6.158531378067151e-06,
"loss": 0.2654,
"step": 486
},
{
"epoch": 0.8528896672504378,
"grad_norm": 1.4428690121550316,
"learning_rate": 6.145146552771526e-06,
"loss": 0.1961,
"step": 487
},
{
"epoch": 0.8546409807355516,
"grad_norm": 1.4737339805651568,
"learning_rate": 6.13175306128336e-06,
"loss": 0.2061,
"step": 488
},
{
"epoch": 0.8563922942206655,
"grad_norm": 0.8182297459502176,
"learning_rate": 6.118351004961361e-06,
"loss": 0.1507,
"step": 489
},
{
"epoch": 0.8581436077057794,
"grad_norm": 0.8288780085158164,
"learning_rate": 6.104940485229055e-06,
"loss": 0.13,
"step": 490
},
{
"epoch": 0.8598949211908932,
"grad_norm": 1.1292187916757546,
"learning_rate": 6.091521603574016e-06,
"loss": 0.1324,
"step": 491
},
{
"epoch": 0.861646234676007,
"grad_norm": 1.0078907033539166,
"learning_rate": 6.0780944615471016e-06,
"loss": 0.1468,
"step": 492
},
{
"epoch": 0.8633975481611208,
"grad_norm": 1.182790806253496,
"learning_rate": 6.064659160761676e-06,
"loss": 0.1444,
"step": 493
},
{
"epoch": 0.8651488616462347,
"grad_norm": 1.2426011389183171,
"learning_rate": 6.051215802892855e-06,
"loss": 0.1864,
"step": 494
},
{
"epoch": 0.8669001751313485,
"grad_norm": 0.9102336236721412,
"learning_rate": 6.03776448967672e-06,
"loss": 0.1639,
"step": 495
},
{
"epoch": 0.8686514886164624,
"grad_norm": 0.8523410941646513,
"learning_rate": 6.024305322909565e-06,
"loss": 0.1501,
"step": 496
},
{
"epoch": 0.8704028021015762,
"grad_norm": 0.8389091979384604,
"learning_rate": 6.0108384044471115e-06,
"loss": 0.1543,
"step": 497
},
{
"epoch": 0.87215411558669,
"grad_norm": 0.9283364061815252,
"learning_rate": 5.997363836203744e-06,
"loss": 0.179,
"step": 498
},
{
"epoch": 0.8739054290718039,
"grad_norm": 1.610431915265918,
"learning_rate": 5.983881720151743e-06,
"loss": 0.211,
"step": 499
},
{
"epoch": 0.8756567425569177,
"grad_norm": 0.9193134177440736,
"learning_rate": 5.970392158320505e-06,
"loss": 0.171,
"step": 500
},
{
"epoch": 0.8756567425569177,
"eval_loss": 0.18447460234165192,
"eval_runtime": 1.9261,
"eval_samples_per_second": 24.401,
"eval_steps_per_second": 6.23,
"step": 500
},
{
"epoch": 0.8774080560420315,
"grad_norm": 1.5249299930362135,
"learning_rate": 5.956895252795778e-06,
"loss": 0.2216,
"step": 501
},
{
"epoch": 0.8791593695271454,
"grad_norm": 0.9421038979843945,
"learning_rate": 5.943391105718883e-06,
"loss": 0.172,
"step": 502
},
{
"epoch": 0.8809106830122592,
"grad_norm": 1.0939585065581912,
"learning_rate": 5.9298798192859434e-06,
"loss": 0.1562,
"step": 503
},
{
"epoch": 0.882661996497373,
"grad_norm": 1.3902162886272154,
"learning_rate": 5.91636149574711e-06,
"loss": 0.2069,
"step": 504
},
{
"epoch": 0.8844133099824869,
"grad_norm": 0.9650701747967442,
"learning_rate": 5.902836237405791e-06,
"loss": 0.1716,
"step": 505
},
{
"epoch": 0.8861646234676007,
"grad_norm": 1.083475666382535,
"learning_rate": 5.889304146617878e-06,
"loss": 0.1473,
"step": 506
},
{
"epoch": 0.8879159369527145,
"grad_norm": 0.832482252729812,
"learning_rate": 5.875765325790963e-06,
"loss": 0.1003,
"step": 507
},
{
"epoch": 0.8896672504378283,
"grad_norm": 1.096304873507722,
"learning_rate": 5.8622198773835725e-06,
"loss": 0.1779,
"step": 508
},
{
"epoch": 0.8914185639229422,
"grad_norm": 0.6978941340044295,
"learning_rate": 5.8486679039043895e-06,
"loss": 0.0972,
"step": 509
},
{
"epoch": 0.8931698774080561,
"grad_norm": 0.9622846696182903,
"learning_rate": 5.835109507911475e-06,
"loss": 0.1651,
"step": 510
},
{
"epoch": 0.8949211908931699,
"grad_norm": 1.2738814460850292,
"learning_rate": 5.821544792011495e-06,
"loss": 0.1897,
"step": 511
},
{
"epoch": 0.8966725043782837,
"grad_norm": 1.1576090771526317,
"learning_rate": 5.807973858858947e-06,
"loss": 0.1617,
"step": 512
},
{
"epoch": 0.8984238178633975,
"grad_norm": 1.0945933042426927,
"learning_rate": 5.794396811155372e-06,
"loss": 0.2411,
"step": 513
},
{
"epoch": 0.9001751313485113,
"grad_norm": 1.0071156666849748,
"learning_rate": 5.780813751648589e-06,
"loss": 0.145,
"step": 514
},
{
"epoch": 0.9019264448336253,
"grad_norm": 1.0608443685972735,
"learning_rate": 5.76722478313191e-06,
"loss": 0.1602,
"step": 515
},
{
"epoch": 0.9036777583187391,
"grad_norm": 0.8351677418619291,
"learning_rate": 5.753630008443371e-06,
"loss": 0.1649,
"step": 516
},
{
"epoch": 0.9054290718038529,
"grad_norm": 1.0252931582190568,
"learning_rate": 5.740029530464941e-06,
"loss": 0.1208,
"step": 517
},
{
"epoch": 0.9071803852889667,
"grad_norm": 1.0954354367499803,
"learning_rate": 5.726423452121751e-06,
"loss": 0.1731,
"step": 518
},
{
"epoch": 0.9089316987740805,
"grad_norm": 0.9910985812759849,
"learning_rate": 5.712811876381318e-06,
"loss": 0.185,
"step": 519
},
{
"epoch": 0.9106830122591943,
"grad_norm": 0.9431196895717147,
"learning_rate": 5.699194906252761e-06,
"loss": 0.203,
"step": 520
},
{
"epoch": 0.9124343257443083,
"grad_norm": 0.7613500880928905,
"learning_rate": 5.685572644786016e-06,
"loss": 0.1142,
"step": 521
},
{
"epoch": 0.9141856392294221,
"grad_norm": 0.9561738575523392,
"learning_rate": 5.671945195071075e-06,
"loss": 0.1474,
"step": 522
},
{
"epoch": 0.9159369527145359,
"grad_norm": 1.0351076685823428,
"learning_rate": 5.65831266023718e-06,
"loss": 0.1973,
"step": 523
},
{
"epoch": 0.9176882661996497,
"grad_norm": 1.2139682431989942,
"learning_rate": 5.644675143452065e-06,
"loss": 0.2251,
"step": 524
},
{
"epoch": 0.9194395796847635,
"grad_norm": 1.1432766151339988,
"learning_rate": 5.631032747921165e-06,
"loss": 0.2148,
"step": 525
},
{
"epoch": 0.9211908931698775,
"grad_norm": 0.8384192320224637,
"learning_rate": 5.617385576886829e-06,
"loss": 0.124,
"step": 526
},
{
"epoch": 0.9229422066549913,
"grad_norm": 0.7477784220288675,
"learning_rate": 5.603733733627559e-06,
"loss": 0.1244,
"step": 527
},
{
"epoch": 0.9246935201401051,
"grad_norm": 0.8136618533131901,
"learning_rate": 5.5900773214572016e-06,
"loss": 0.1652,
"step": 528
},
{
"epoch": 0.9264448336252189,
"grad_norm": 0.9923459165132333,
"learning_rate": 5.576416443724187e-06,
"loss": 0.1719,
"step": 529
},
{
"epoch": 0.9281961471103327,
"grad_norm": 1.2728780007916458,
"learning_rate": 5.562751203810742e-06,
"loss": 0.1844,
"step": 530
},
{
"epoch": 0.9299474605954466,
"grad_norm": 0.9234166515823709,
"learning_rate": 5.5490817051320964e-06,
"loss": 0.1612,
"step": 531
},
{
"epoch": 0.9316987740805605,
"grad_norm": 1.0361866511885336,
"learning_rate": 5.535408051135721e-06,
"loss": 0.1428,
"step": 532
},
{
"epoch": 0.9334500875656743,
"grad_norm": 0.9374383845356417,
"learning_rate": 5.5217303453005225e-06,
"loss": 0.1787,
"step": 533
},
{
"epoch": 0.9352014010507881,
"grad_norm": 1.0824506841717698,
"learning_rate": 5.508048691136075e-06,
"loss": 0.1846,
"step": 534
},
{
"epoch": 0.9369527145359019,
"grad_norm": 1.2681101493420375,
"learning_rate": 5.4943631921818365e-06,
"loss": 0.1857,
"step": 535
},
{
"epoch": 0.9387040280210157,
"grad_norm": 0.8870751536018538,
"learning_rate": 5.480673952006355e-06,
"loss": 0.1893,
"step": 536
},
{
"epoch": 0.9404553415061296,
"grad_norm": 0.8698308010341032,
"learning_rate": 5.466981074206493e-06,
"loss": 0.1576,
"step": 537
},
{
"epoch": 0.9422066549912435,
"grad_norm": 1.1493313359852635,
"learning_rate": 5.453284662406646e-06,
"loss": 0.1915,
"step": 538
},
{
"epoch": 0.9439579684763573,
"grad_norm": 0.9597895796258253,
"learning_rate": 5.439584820257949e-06,
"loss": 0.1799,
"step": 539
},
{
"epoch": 0.9457092819614711,
"grad_norm": 0.9439514805423415,
"learning_rate": 5.425881651437499e-06,
"loss": 0.1466,
"step": 540
},
{
"epoch": 0.9474605954465849,
"grad_norm": 0.9974103027552716,
"learning_rate": 5.412175259647567e-06,
"loss": 0.1623,
"step": 541
},
{
"epoch": 0.9492119089316988,
"grad_norm": 0.8671577413147288,
"learning_rate": 5.398465748614815e-06,
"loss": 0.1989,
"step": 542
},
{
"epoch": 0.9509632224168126,
"grad_norm": 0.8733255413441741,
"learning_rate": 5.384753222089515e-06,
"loss": 0.1228,
"step": 543
},
{
"epoch": 0.9527145359019265,
"grad_norm": 1.3146328909933664,
"learning_rate": 5.371037783844752e-06,
"loss": 0.2122,
"step": 544
},
{
"epoch": 0.9544658493870403,
"grad_norm": 1.1770390858844189,
"learning_rate": 5.357319537675655e-06,
"loss": 0.2062,
"step": 545
},
{
"epoch": 0.9562171628721541,
"grad_norm": 1.0962088909147447,
"learning_rate": 5.3435985873985926e-06,
"loss": 0.188,
"step": 546
},
{
"epoch": 0.957968476357268,
"grad_norm": 1.2179282929078772,
"learning_rate": 5.329875036850406e-06,
"loss": 0.1765,
"step": 547
},
{
"epoch": 0.9597197898423818,
"grad_norm": 1.069605096067716,
"learning_rate": 5.31614898988761e-06,
"loss": 0.1565,
"step": 548
},
{
"epoch": 0.9614711033274956,
"grad_norm": 1.3962825793212799,
"learning_rate": 5.302420550385612e-06,
"loss": 0.2066,
"step": 549
},
{
"epoch": 0.9632224168126094,
"grad_norm": 1.008584413162853,
"learning_rate": 5.28868982223793e-06,
"loss": 0.1698,
"step": 550
},
{
"epoch": 0.9649737302977233,
"grad_norm": 0.998912078130381,
"learning_rate": 5.274956909355395e-06,
"loss": 0.179,
"step": 551
},
{
"epoch": 0.9667250437828371,
"grad_norm": 0.9297277664472026,
"learning_rate": 5.261221915665375e-06,
"loss": 0.1184,
"step": 552
},
{
"epoch": 0.968476357267951,
"grad_norm": 1.185642958138451,
"learning_rate": 5.247484945110988e-06,
"loss": 0.1932,
"step": 553
},
{
"epoch": 0.9702276707530648,
"grad_norm": 1.088829726983837,
"learning_rate": 5.233746101650308e-06,
"loss": 0.2206,
"step": 554
},
{
"epoch": 0.9719789842381786,
"grad_norm": 1.0987062412828756,
"learning_rate": 5.220005489255583e-06,
"loss": 0.1554,
"step": 555
},
{
"epoch": 0.9737302977232924,
"grad_norm": 1.0323763199957168,
"learning_rate": 5.20626321191245e-06,
"loss": 0.1546,
"step": 556
},
{
"epoch": 0.9754816112084063,
"grad_norm": 0.9972604317206961,
"learning_rate": 5.192519373619145e-06,
"loss": 0.1742,
"step": 557
},
{
"epoch": 0.9772329246935202,
"grad_norm": 1.1779226698001648,
"learning_rate": 5.1787740783857164e-06,
"loss": 0.1969,
"step": 558
},
{
"epoch": 0.978984238178634,
"grad_norm": 1.1733643102354534,
"learning_rate": 5.165027430233239e-06,
"loss": 0.138,
"step": 559
},
{
"epoch": 0.9807355516637478,
"grad_norm": 1.0272420360834542,
"learning_rate": 5.151279533193027e-06,
"loss": 0.1705,
"step": 560
},
{
"epoch": 0.9824868651488616,
"grad_norm": 0.904519502078042,
"learning_rate": 5.137530491305844e-06,
"loss": 0.1255,
"step": 561
},
{
"epoch": 0.9842381786339754,
"grad_norm": 0.9629383999654443,
"learning_rate": 5.123780408621118e-06,
"loss": 0.1659,
"step": 562
},
{
"epoch": 0.9859894921190894,
"grad_norm": 1.3124846848756935,
"learning_rate": 5.110029389196155e-06,
"loss": 0.1844,
"step": 563
},
{
"epoch": 0.9877408056042032,
"grad_norm": 1.0676108653291219,
"learning_rate": 5.096277537095348e-06,
"loss": 0.2078,
"step": 564
},
{
"epoch": 0.989492119089317,
"grad_norm": 0.9382113481780886,
"learning_rate": 5.082524956389394e-06,
"loss": 0.1409,
"step": 565
},
{
"epoch": 0.9912434325744308,
"grad_norm": 0.9936394728663424,
"learning_rate": 5.0687717511545e-06,
"loss": 0.2063,
"step": 566
},
{
"epoch": 0.9929947460595446,
"grad_norm": 1.031862421429508,
"learning_rate": 5.055018025471602e-06,
"loss": 0.1595,
"step": 567
},
{
"epoch": 0.9947460595446584,
"grad_norm": 1.3217031605637353,
"learning_rate": 5.0412638834255755e-06,
"loss": 0.1276,
"step": 568
},
{
"epoch": 0.9964973730297724,
"grad_norm": 1.1933242590091766,
"learning_rate": 5.027509429104443e-06,
"loss": 0.1923,
"step": 569
},
{
"epoch": 0.9982486865148862,
"grad_norm": 0.8661254447213783,
"learning_rate": 5.013754766598599e-06,
"loss": 0.1724,
"step": 570
},
{
"epoch": 1.0,
"grad_norm": 1.1986284951562434,
"learning_rate": 5e-06,
"loss": 0.1998,
"step": 571
},
{
"epoch": 1.001751313485114,
"grad_norm": 0.7771391696807222,
"learning_rate": 4.986245233401403e-06,
"loss": 0.1012,
"step": 572
},
{
"epoch": 1.0035026269702276,
"grad_norm": 0.6737848363676279,
"learning_rate": 4.9724905708955575e-06,
"loss": 0.0784,
"step": 573
},
{
"epoch": 1.0052539404553416,
"grad_norm": 0.6454263064830977,
"learning_rate": 4.958736116574426e-06,
"loss": 0.0818,
"step": 574
},
{
"epoch": 1.0070052539404553,
"grad_norm": 0.6725211290025607,
"learning_rate": 4.9449819745284e-06,
"loss": 0.0843,
"step": 575
},
{
"epoch": 1.0087565674255692,
"grad_norm": 0.8469678132359898,
"learning_rate": 4.931228248845502e-06,
"loss": 0.1477,
"step": 576
},
{
"epoch": 1.010507880910683,
"grad_norm": 0.7442182106279001,
"learning_rate": 4.9174750436106076e-06,
"loss": 0.0892,
"step": 577
},
{
"epoch": 1.0122591943957968,
"grad_norm": 0.7468024313770749,
"learning_rate": 4.903722462904653e-06,
"loss": 0.0948,
"step": 578
},
{
"epoch": 1.0140105078809107,
"grad_norm": 0.7166080541638878,
"learning_rate": 4.889970610803845e-06,
"loss": 0.0991,
"step": 579
},
{
"epoch": 1.0157618213660244,
"grad_norm": 0.7762519026289294,
"learning_rate": 4.8762195913788825e-06,
"loss": 0.0774,
"step": 580
},
{
"epoch": 1.0175131348511384,
"grad_norm": 0.808868055025971,
"learning_rate": 4.862469508694157e-06,
"loss": 0.1099,
"step": 581
},
{
"epoch": 1.0192644483362523,
"grad_norm": 0.5819265746424057,
"learning_rate": 4.8487204668069735e-06,
"loss": 0.0695,
"step": 582
},
{
"epoch": 1.021015761821366,
"grad_norm": 0.6156072907006408,
"learning_rate": 4.834972569766762e-06,
"loss": 0.0838,
"step": 583
},
{
"epoch": 1.02276707530648,
"grad_norm": 0.7635944643048334,
"learning_rate": 4.8212259216142835e-06,
"loss": 0.104,
"step": 584
},
{
"epoch": 1.0245183887915936,
"grad_norm": 0.7637571188691638,
"learning_rate": 4.8074806263808565e-06,
"loss": 0.0708,
"step": 585
},
{
"epoch": 1.0262697022767076,
"grad_norm": 0.9551589547852135,
"learning_rate": 4.7937367880875514e-06,
"loss": 0.1281,
"step": 586
},
{
"epoch": 1.0280210157618215,
"grad_norm": 0.7293559825967365,
"learning_rate": 4.779994510744419e-06,
"loss": 0.0912,
"step": 587
},
{
"epoch": 1.0297723292469352,
"grad_norm": 0.8566789761394177,
"learning_rate": 4.766253898349694e-06,
"loss": 0.1606,
"step": 588
},
{
"epoch": 1.031523642732049,
"grad_norm": 0.6627533346656157,
"learning_rate": 4.752515054889012e-06,
"loss": 0.0718,
"step": 589
},
{
"epoch": 1.0332749562171628,
"grad_norm": 0.5815945433400204,
"learning_rate": 4.738778084334625e-06,
"loss": 0.0695,
"step": 590
},
{
"epoch": 1.0350262697022767,
"grad_norm": 0.760838011156959,
"learning_rate": 4.725043090644606e-06,
"loss": 0.0884,
"step": 591
},
{
"epoch": 1.0367775831873904,
"grad_norm": 0.7483668041512324,
"learning_rate": 4.711310177762072e-06,
"loss": 0.0669,
"step": 592
},
{
"epoch": 1.0385288966725044,
"grad_norm": 0.8799384240976829,
"learning_rate": 4.697579449614389e-06,
"loss": 0.0998,
"step": 593
},
{
"epoch": 1.0402802101576183,
"grad_norm": 0.8082415948151013,
"learning_rate": 4.683851010112391e-06,
"loss": 0.0985,
"step": 594
},
{
"epoch": 1.042031523642732,
"grad_norm": 0.6580267314761,
"learning_rate": 4.670124963149596e-06,
"loss": 0.1115,
"step": 595
},
{
"epoch": 1.043782837127846,
"grad_norm": 0.7118138502690213,
"learning_rate": 4.656401412601408e-06,
"loss": 0.0662,
"step": 596
},
{
"epoch": 1.0455341506129596,
"grad_norm": 0.7285566494410792,
"learning_rate": 4.642680462324348e-06,
"loss": 0.1036,
"step": 597
},
{
"epoch": 1.0472854640980735,
"grad_norm": 0.6952238511426204,
"learning_rate": 4.628962216155249e-06,
"loss": 0.0882,
"step": 598
},
{
"epoch": 1.0490367775831875,
"grad_norm": 0.7543264293172796,
"learning_rate": 4.615246777910485e-06,
"loss": 0.087,
"step": 599
},
{
"epoch": 1.0507880910683012,
"grad_norm": 0.6191909832818346,
"learning_rate": 4.6015342513851854e-06,
"loss": 0.0745,
"step": 600
},
{
"epoch": 1.052539404553415,
"grad_norm": 0.922433858210772,
"learning_rate": 4.587824740352435e-06,
"loss": 0.1058,
"step": 601
},
{
"epoch": 1.0542907180385288,
"grad_norm": 0.6821453050325335,
"learning_rate": 4.5741183485625044e-06,
"loss": 0.0771,
"step": 602
},
{
"epoch": 1.0560420315236427,
"grad_norm": 0.9489221869695271,
"learning_rate": 4.560415179742052e-06,
"loss": 0.0955,
"step": 603
},
{
"epoch": 1.0577933450087567,
"grad_norm": 0.7026344715382692,
"learning_rate": 4.546715337593354e-06,
"loss": 0.0819,
"step": 604
},
{
"epoch": 1.0595446584938704,
"grad_norm": 0.6671067987720858,
"learning_rate": 4.533018925793508e-06,
"loss": 0.0727,
"step": 605
},
{
"epoch": 1.0612959719789843,
"grad_norm": 0.8642821874175421,
"learning_rate": 4.519326047993647e-06,
"loss": 0.0937,
"step": 606
},
{
"epoch": 1.063047285464098,
"grad_norm": 0.8793543522695341,
"learning_rate": 4.505636807818166e-06,
"loss": 0.1301,
"step": 607
},
{
"epoch": 1.064798598949212,
"grad_norm": 0.8237519754604328,
"learning_rate": 4.491951308863926e-06,
"loss": 0.0825,
"step": 608
},
{
"epoch": 1.0665499124343258,
"grad_norm": 0.8459623510016205,
"learning_rate": 4.478269654699478e-06,
"loss": 0.0821,
"step": 609
},
{
"epoch": 1.0683012259194395,
"grad_norm": 0.8255877830028168,
"learning_rate": 4.464591948864281e-06,
"loss": 0.0842,
"step": 610
},
{
"epoch": 1.0700525394045535,
"grad_norm": 0.8718483520086847,
"learning_rate": 4.4509182948679035e-06,
"loss": 0.0821,
"step": 611
},
{
"epoch": 1.0718038528896672,
"grad_norm": 0.9897123154664441,
"learning_rate": 4.43724879618926e-06,
"loss": 0.1109,
"step": 612
},
{
"epoch": 1.073555166374781,
"grad_norm": 0.8636401435184293,
"learning_rate": 4.423583556275814e-06,
"loss": 0.0904,
"step": 613
},
{
"epoch": 1.0753064798598948,
"grad_norm": 0.8916326658281433,
"learning_rate": 4.409922678542799e-06,
"loss": 0.0695,
"step": 614
},
{
"epoch": 1.0770577933450087,
"grad_norm": 0.6527711082304235,
"learning_rate": 4.396266266372443e-06,
"loss": 0.0512,
"step": 615
},
{
"epoch": 1.0788091068301227,
"grad_norm": 0.7314392682365592,
"learning_rate": 4.382614423113171e-06,
"loss": 0.0772,
"step": 616
},
{
"epoch": 1.0805604203152364,
"grad_norm": 0.7896716564759495,
"learning_rate": 4.368967252078838e-06,
"loss": 0.0837,
"step": 617
},
{
"epoch": 1.0823117338003503,
"grad_norm": 0.8904478092266286,
"learning_rate": 4.355324856547936e-06,
"loss": 0.0984,
"step": 618
},
{
"epoch": 1.084063047285464,
"grad_norm": 0.7997582578314417,
"learning_rate": 4.341687339762822e-06,
"loss": 0.0719,
"step": 619
},
{
"epoch": 1.085814360770578,
"grad_norm": 0.9328053345880932,
"learning_rate": 4.3280548049289275e-06,
"loss": 0.1102,
"step": 620
},
{
"epoch": 1.0875656742556918,
"grad_norm": 0.7402218059098891,
"learning_rate": 4.314427355213984e-06,
"loss": 0.0882,
"step": 621
},
{
"epoch": 1.0893169877408055,
"grad_norm": 1.0242119598597839,
"learning_rate": 4.3008050937472424e-06,
"loss": 0.0971,
"step": 622
},
{
"epoch": 1.0910683012259195,
"grad_norm": 1.078920725370126,
"learning_rate": 4.2871881236186835e-06,
"loss": 0.1252,
"step": 623
},
{
"epoch": 1.0928196147110332,
"grad_norm": 0.8096017533492185,
"learning_rate": 4.273576547878252e-06,
"loss": 0.0717,
"step": 624
},
{
"epoch": 1.094570928196147,
"grad_norm": 0.7940757501651525,
"learning_rate": 4.259970469535061e-06,
"loss": 0.1207,
"step": 625
},
{
"epoch": 1.096322241681261,
"grad_norm": 0.6730235221485272,
"learning_rate": 4.24636999155663e-06,
"loss": 0.0642,
"step": 626
},
{
"epoch": 1.0980735551663747,
"grad_norm": 0.8410423617949386,
"learning_rate": 4.2327752168680904e-06,
"loss": 0.1123,
"step": 627
},
{
"epoch": 1.0998248686514887,
"grad_norm": 0.7165164918449669,
"learning_rate": 4.219186248351413e-06,
"loss": 0.1079,
"step": 628
},
{
"epoch": 1.1015761821366024,
"grad_norm": 0.9046144772499766,
"learning_rate": 4.20560318884463e-06,
"loss": 0.0889,
"step": 629
},
{
"epoch": 1.1033274956217163,
"grad_norm": 0.6944164438470994,
"learning_rate": 4.192026141141054e-06,
"loss": 0.0726,
"step": 630
},
{
"epoch": 1.1050788091068302,
"grad_norm": 0.6933343999073917,
"learning_rate": 4.178455207988504e-06,
"loss": 0.103,
"step": 631
},
{
"epoch": 1.106830122591944,
"grad_norm": 0.9114549106270846,
"learning_rate": 4.164890492088527e-06,
"loss": 0.0816,
"step": 632
},
{
"epoch": 1.1085814360770578,
"grad_norm": 0.8832236692461997,
"learning_rate": 4.151332096095613e-06,
"loss": 0.0716,
"step": 633
},
{
"epoch": 1.1103327495621715,
"grad_norm": 0.6560477009246709,
"learning_rate": 4.13778012261643e-06,
"loss": 0.0495,
"step": 634
},
{
"epoch": 1.1120840630472855,
"grad_norm": 0.7697309148882717,
"learning_rate": 4.124234674209038e-06,
"loss": 0.0784,
"step": 635
},
{
"epoch": 1.1138353765323994,
"grad_norm": 0.8584207785015194,
"learning_rate": 4.110695853382123e-06,
"loss": 0.0838,
"step": 636
},
{
"epoch": 1.115586690017513,
"grad_norm": 0.8178593101603066,
"learning_rate": 4.09716376259421e-06,
"loss": 0.0885,
"step": 637
},
{
"epoch": 1.117338003502627,
"grad_norm": 0.7493368271272891,
"learning_rate": 4.083638504252891e-06,
"loss": 0.0755,
"step": 638
},
{
"epoch": 1.1190893169877407,
"grad_norm": 0.8921855827679555,
"learning_rate": 4.070120180714059e-06,
"loss": 0.1016,
"step": 639
},
{
"epoch": 1.1208406304728546,
"grad_norm": 0.6566522468773398,
"learning_rate": 4.056608894281118e-06,
"loss": 0.0621,
"step": 640
},
{
"epoch": 1.1225919439579686,
"grad_norm": 1.0368825738027343,
"learning_rate": 4.043104747204222e-06,
"loss": 0.0964,
"step": 641
},
{
"epoch": 1.1243432574430823,
"grad_norm": 0.7545101954718236,
"learning_rate": 4.029607841679496e-06,
"loss": 0.0743,
"step": 642
},
{
"epoch": 1.1260945709281962,
"grad_norm": 0.7795780796724158,
"learning_rate": 4.016118279848259e-06,
"loss": 0.0818,
"step": 643
},
{
"epoch": 1.12784588441331,
"grad_norm": 0.8102419707778211,
"learning_rate": 4.002636163796259e-06,
"loss": 0.0601,
"step": 644
},
{
"epoch": 1.1295971978984238,
"grad_norm": 0.8912319933395433,
"learning_rate": 3.989161595552891e-06,
"loss": 0.1056,
"step": 645
},
{
"epoch": 1.1313485113835378,
"grad_norm": 0.9227762274712196,
"learning_rate": 3.975694677090436e-06,
"loss": 0.0946,
"step": 646
},
{
"epoch": 1.1330998248686515,
"grad_norm": 1.0277331150492526,
"learning_rate": 3.9622355103232805e-06,
"loss": 0.0943,
"step": 647
},
{
"epoch": 1.1348511383537654,
"grad_norm": 0.7898037783030375,
"learning_rate": 3.948784197107146e-06,
"loss": 0.0724,
"step": 648
},
{
"epoch": 1.136602451838879,
"grad_norm": 0.8906513390558273,
"learning_rate": 3.935340839238325e-06,
"loss": 0.0978,
"step": 649
},
{
"epoch": 1.138353765323993,
"grad_norm": 0.8134210787173696,
"learning_rate": 3.9219055384529e-06,
"loss": 0.0743,
"step": 650
},
{
"epoch": 1.140105078809107,
"grad_norm": 0.7115092884012872,
"learning_rate": 3.9084783964259855e-06,
"loss": 0.0492,
"step": 651
},
{
"epoch": 1.1418563922942206,
"grad_norm": 0.7227199245003441,
"learning_rate": 3.895059514770947e-06,
"loss": 0.089,
"step": 652
},
{
"epoch": 1.1436077057793346,
"grad_norm": 0.8212458686585175,
"learning_rate": 3.88164899503864e-06,
"loss": 0.0873,
"step": 653
},
{
"epoch": 1.1453590192644483,
"grad_norm": 0.6189120343911374,
"learning_rate": 3.868246938716643e-06,
"loss": 0.0519,
"step": 654
},
{
"epoch": 1.1471103327495622,
"grad_norm": 0.9135905892477679,
"learning_rate": 3.854853447228475e-06,
"loss": 0.0815,
"step": 655
},
{
"epoch": 1.1488616462346761,
"grad_norm": 0.9650411991425408,
"learning_rate": 3.841468621932851e-06,
"loss": 0.0864,
"step": 656
},
{
"epoch": 1.1506129597197898,
"grad_norm": 0.8715517114024203,
"learning_rate": 3.828092564122893e-06,
"loss": 0.0808,
"step": 657
},
{
"epoch": 1.1523642732049038,
"grad_norm": 0.7812734597619906,
"learning_rate": 3.814725375025376e-06,
"loss": 0.0681,
"step": 658
},
{
"epoch": 1.1541155866900175,
"grad_norm": 0.9464561590385424,
"learning_rate": 3.801367155799959e-06,
"loss": 0.0967,
"step": 659
},
{
"epoch": 1.1558669001751314,
"grad_norm": 0.8197419560752517,
"learning_rate": 3.788018007538419e-06,
"loss": 0.0876,
"step": 660
},
{
"epoch": 1.157618213660245,
"grad_norm": 0.7844620890716654,
"learning_rate": 3.774678031263887e-06,
"loss": 0.0949,
"step": 661
},
{
"epoch": 1.159369527145359,
"grad_norm": 0.6684997290021009,
"learning_rate": 3.7613473279300804e-06,
"loss": 0.0659,
"step": 662
},
{
"epoch": 1.161120840630473,
"grad_norm": 0.6607818899113638,
"learning_rate": 3.7480259984205426e-06,
"loss": 0.0797,
"step": 663
},
{
"epoch": 1.1628721541155866,
"grad_norm": 0.8000383670556351,
"learning_rate": 3.734714143547879e-06,
"loss": 0.0982,
"step": 664
},
{
"epoch": 1.1646234676007006,
"grad_norm": 0.7750030818236461,
"learning_rate": 3.7214118640529894e-06,
"loss": 0.0755,
"step": 665
},
{
"epoch": 1.1663747810858143,
"grad_norm": 0.7718230303441634,
"learning_rate": 3.708119260604317e-06,
"loss": 0.0775,
"step": 666
},
{
"epoch": 1.1681260945709282,
"grad_norm": 0.7606884085579109,
"learning_rate": 3.694836433797071e-06,
"loss": 0.0652,
"step": 667
},
{
"epoch": 1.1698774080560421,
"grad_norm": 0.9648178738887017,
"learning_rate": 3.681563484152477e-06,
"loss": 0.0892,
"step": 668
},
{
"epoch": 1.1716287215411558,
"grad_norm": 1.0030218510210618,
"learning_rate": 3.668300512117014e-06,
"loss": 0.0996,
"step": 669
},
{
"epoch": 1.1733800350262698,
"grad_norm": 0.602908672449069,
"learning_rate": 3.655047618061648e-06,
"loss": 0.0583,
"step": 670
},
{
"epoch": 1.1751313485113835,
"grad_norm": 0.9365604274710315,
"learning_rate": 3.6418049022810843e-06,
"loss": 0.0884,
"step": 671
},
{
"epoch": 1.1768826619964974,
"grad_norm": 0.9882368559103631,
"learning_rate": 3.6285724649929944e-06,
"loss": 0.1015,
"step": 672
},
{
"epoch": 1.178633975481611,
"grad_norm": 0.965569447935582,
"learning_rate": 3.615350406337269e-06,
"loss": 0.097,
"step": 673
},
{
"epoch": 1.180385288966725,
"grad_norm": 1.0268385342532949,
"learning_rate": 3.6021388263752566e-06,
"loss": 0.1107,
"step": 674
},
{
"epoch": 1.182136602451839,
"grad_norm": 0.9192005858295202,
"learning_rate": 3.588937825088999e-06,
"loss": 0.1,
"step": 675
},
{
"epoch": 1.1838879159369526,
"grad_norm": 0.9617400889226272,
"learning_rate": 3.5757475023804907e-06,
"loss": 0.0692,
"step": 676
},
{
"epoch": 1.1856392294220666,
"grad_norm": 0.8649057821650563,
"learning_rate": 3.562567958070905e-06,
"loss": 0.1033,
"step": 677
},
{
"epoch": 1.1873905429071803,
"grad_norm": 1.0420162692258215,
"learning_rate": 3.549399291899851e-06,
"loss": 0.099,
"step": 678
},
{
"epoch": 1.1891418563922942,
"grad_norm": 0.5439954275486397,
"learning_rate": 3.536241603524616e-06,
"loss": 0.0514,
"step": 679
},
{
"epoch": 1.1908931698774081,
"grad_norm": 0.8221945556326056,
"learning_rate": 3.5230949925194034e-06,
"loss": 0.0841,
"step": 680
},
{
"epoch": 1.1926444833625218,
"grad_norm": 1.0298523052786546,
"learning_rate": 3.5099595583745947e-06,
"loss": 0.102,
"step": 681
},
{
"epoch": 1.1943957968476357,
"grad_norm": 0.9729770963676866,
"learning_rate": 3.4968354004959804e-06,
"loss": 0.0959,
"step": 682
},
{
"epoch": 1.1961471103327495,
"grad_norm": 0.8687339216607315,
"learning_rate": 3.4837226182040184e-06,
"loss": 0.0723,
"step": 683
},
{
"epoch": 1.1978984238178634,
"grad_norm": 1.1187226087659867,
"learning_rate": 3.470621310733078e-06,
"loss": 0.1072,
"step": 684
},
{
"epoch": 1.1996497373029773,
"grad_norm": 0.8518811353036616,
"learning_rate": 3.4575315772306894e-06,
"loss": 0.1147,
"step": 685
},
{
"epoch": 1.201401050788091,
"grad_norm": 1.0529027501197052,
"learning_rate": 3.444453516756796e-06,
"loss": 0.1036,
"step": 686
},
{
"epoch": 1.203152364273205,
"grad_norm": 1.0032377659240923,
"learning_rate": 3.4313872282829998e-06,
"loss": 0.1128,
"step": 687
},
{
"epoch": 1.2049036777583186,
"grad_norm": 1.0018061327702659,
"learning_rate": 3.4183328106918177e-06,
"loss": 0.092,
"step": 688
},
{
"epoch": 1.2066549912434326,
"grad_norm": 0.8566035601149397,
"learning_rate": 3.4052903627759264e-06,
"loss": 0.0936,
"step": 689
},
{
"epoch": 1.2084063047285465,
"grad_norm": 0.9125746690092138,
"learning_rate": 3.3922599832374224e-06,
"loss": 0.0788,
"step": 690
},
{
"epoch": 1.2101576182136602,
"grad_norm": 1.0426073296986158,
"learning_rate": 3.379241770687074e-06,
"loss": 0.0799,
"step": 691
},
{
"epoch": 1.2119089316987741,
"grad_norm": 1.06100141802292,
"learning_rate": 3.3662358236435664e-06,
"loss": 0.1105,
"step": 692
},
{
"epoch": 1.2136602451838878,
"grad_norm": 0.9390043654644253,
"learning_rate": 3.353242240532769e-06,
"loss": 0.1165,
"step": 693
},
{
"epoch": 1.2154115586690017,
"grad_norm": 0.8503852226528796,
"learning_rate": 3.3402611196869764e-06,
"loss": 0.1161,
"step": 694
},
{
"epoch": 1.2171628721541157,
"grad_norm": 0.9078220049525988,
"learning_rate": 3.327292559344178e-06,
"loss": 0.0681,
"step": 695
},
{
"epoch": 1.2189141856392294,
"grad_norm": 0.7127386687186886,
"learning_rate": 3.314336657647308e-06,
"loss": 0.0752,
"step": 696
},
{
"epoch": 1.2206654991243433,
"grad_norm": 0.9358408817071951,
"learning_rate": 3.3013935126434994e-06,
"loss": 0.1019,
"step": 697
},
{
"epoch": 1.222416812609457,
"grad_norm": 0.7742420094838459,
"learning_rate": 3.288463222283349e-06,
"loss": 0.0931,
"step": 698
},
{
"epoch": 1.224168126094571,
"grad_norm": 0.941039147307977,
"learning_rate": 3.2755458844201692e-06,
"loss": 0.094,
"step": 699
},
{
"epoch": 1.2259194395796849,
"grad_norm": 0.8228135353803155,
"learning_rate": 3.262641596809254e-06,
"loss": 0.0752,
"step": 700
},
{
"epoch": 1.2276707530647986,
"grad_norm": 1.0260217425221851,
"learning_rate": 3.249750457107138e-06,
"loss": 0.1434,
"step": 701
},
{
"epoch": 1.2294220665499125,
"grad_norm": 0.8764995569070175,
"learning_rate": 3.2368725628708507e-06,
"loss": 0.0942,
"step": 702
},
{
"epoch": 1.2311733800350262,
"grad_norm": 0.8810808130600745,
"learning_rate": 3.224008011557186e-06,
"loss": 0.0773,
"step": 703
},
{
"epoch": 1.2329246935201401,
"grad_norm": 0.8339945067432337,
"learning_rate": 3.211156900521961e-06,
"loss": 0.0577,
"step": 704
},
{
"epoch": 1.234676007005254,
"grad_norm": 0.9539133650050169,
"learning_rate": 3.1983193270192787e-06,
"loss": 0.0854,
"step": 705
},
{
"epoch": 1.2364273204903677,
"grad_norm": 0.7988973560578649,
"learning_rate": 3.185495388200799e-06,
"loss": 0.0718,
"step": 706
},
{
"epoch": 1.2381786339754817,
"grad_norm": 1.074214838443568,
"learning_rate": 3.1726851811149907e-06,
"loss": 0.0927,
"step": 707
},
{
"epoch": 1.2399299474605954,
"grad_norm": 0.8023394631186075,
"learning_rate": 3.159888802706408e-06,
"loss": 0.076,
"step": 708
},
{
"epoch": 1.2416812609457093,
"grad_norm": 0.6592103260258445,
"learning_rate": 3.147106349814951e-06,
"loss": 0.0602,
"step": 709
},
{
"epoch": 1.2434325744308232,
"grad_norm": 1.1114735422528423,
"learning_rate": 3.1343379191751366e-06,
"loss": 0.0904,
"step": 710
},
{
"epoch": 1.245183887915937,
"grad_norm": 0.8722207507134669,
"learning_rate": 3.1215836074153666e-06,
"loss": 0.0553,
"step": 711
},
{
"epoch": 1.2469352014010509,
"grad_norm": 0.9701640956470772,
"learning_rate": 3.1088435110571884e-06,
"loss": 0.0951,
"step": 712
},
{
"epoch": 1.2486865148861646,
"grad_norm": 0.7785105550035741,
"learning_rate": 3.0961177265145776e-06,
"loss": 0.0744,
"step": 713
},
{
"epoch": 1.2504378283712785,
"grad_norm": 1.1865316263907835,
"learning_rate": 3.0834063500931947e-06,
"loss": 0.1155,
"step": 714
},
{
"epoch": 1.2521891418563924,
"grad_norm": 0.670146861520249,
"learning_rate": 3.0707094779896695e-06,
"loss": 0.0737,
"step": 715
},
{
"epoch": 1.253940455341506,
"grad_norm": 1.0183130513166565,
"learning_rate": 3.0580272062908605e-06,
"loss": 0.1009,
"step": 716
},
{
"epoch": 1.25569176882662,
"grad_norm": 1.3895554101906575,
"learning_rate": 3.0453596309731396e-06,
"loss": 0.1295,
"step": 717
},
{
"epoch": 1.2574430823117337,
"grad_norm": 0.8401700996571929,
"learning_rate": 3.032706847901658e-06,
"loss": 0.1052,
"step": 718
},
{
"epoch": 1.2591943957968477,
"grad_norm": 0.9557568744741772,
"learning_rate": 3.020068952829619e-06,
"loss": 0.1099,
"step": 719
},
{
"epoch": 1.2609457092819616,
"grad_norm": 0.7781143616664629,
"learning_rate": 3.0074460413975636e-06,
"loss": 0.0603,
"step": 720
},
{
"epoch": 1.2626970227670753,
"grad_norm": 0.8080335461170193,
"learning_rate": 2.9948382091326328e-06,
"loss": 0.0971,
"step": 721
},
{
"epoch": 1.2644483362521892,
"grad_norm": 0.8725117432766293,
"learning_rate": 2.98224555144786e-06,
"loss": 0.0565,
"step": 722
},
{
"epoch": 1.266199649737303,
"grad_norm": 1.0281518070875828,
"learning_rate": 2.9696681636414372e-06,
"loss": 0.1,
"step": 723
},
{
"epoch": 1.2679509632224168,
"grad_norm": 0.9107780574366331,
"learning_rate": 2.9571061408959943e-06,
"loss": 0.0812,
"step": 724
},
{
"epoch": 1.2697022767075308,
"grad_norm": 0.8277139085800029,
"learning_rate": 2.944559578277889e-06,
"loss": 0.0669,
"step": 725
},
{
"epoch": 1.2714535901926445,
"grad_norm": 0.8815637667439311,
"learning_rate": 2.932028570736474e-06,
"loss": 0.097,
"step": 726
},
{
"epoch": 1.2732049036777582,
"grad_norm": 1.1364269200108077,
"learning_rate": 2.919513213103391e-06,
"loss": 0.0706,
"step": 727
},
{
"epoch": 1.274956217162872,
"grad_norm": 0.7498765483852221,
"learning_rate": 2.9070136000918426e-06,
"loss": 0.0687,
"step": 728
},
{
"epoch": 1.276707530647986,
"grad_norm": 0.7962376754602092,
"learning_rate": 2.89452982629588e-06,
"loss": 0.0609,
"step": 729
},
{
"epoch": 1.2784588441331,
"grad_norm": 1.0565112515662811,
"learning_rate": 2.8820619861896908e-06,
"loss": 0.089,
"step": 730
},
{
"epoch": 1.2802101576182137,
"grad_norm": 1.0984844227065096,
"learning_rate": 2.8696101741268765e-06,
"loss": 0.0609,
"step": 731
},
{
"epoch": 1.2819614711033274,
"grad_norm": 1.3486910096875888,
"learning_rate": 2.8571744843397412e-06,
"loss": 0.1295,
"step": 732
},
{
"epoch": 1.2837127845884413,
"grad_norm": 0.7596179516412704,
"learning_rate": 2.844755010938586e-06,
"loss": 0.0552,
"step": 733
},
{
"epoch": 1.2854640980735552,
"grad_norm": 0.8293822539041598,
"learning_rate": 2.8323518479109824e-06,
"loss": 0.0673,
"step": 734
},
{
"epoch": 1.287215411558669,
"grad_norm": 1.0021410590514732,
"learning_rate": 2.819965089121076e-06,
"loss": 0.079,
"step": 735
},
{
"epoch": 1.2889667250437828,
"grad_norm": 0.98205909089748,
"learning_rate": 2.8075948283088637e-06,
"loss": 0.0956,
"step": 736
},
{
"epoch": 1.2907180385288965,
"grad_norm": 0.9484379433282005,
"learning_rate": 2.7952411590894914e-06,
"loss": 0.0836,
"step": 737
},
{
"epoch": 1.2924693520140105,
"grad_norm": 0.8548057872579898,
"learning_rate": 2.7829041749525455e-06,
"loss": 0.0698,
"step": 738
},
{
"epoch": 1.2942206654991244,
"grad_norm": 0.6958930931575743,
"learning_rate": 2.77058396926134e-06,
"loss": 0.0472,
"step": 739
},
{
"epoch": 1.295971978984238,
"grad_norm": 0.8590285326110315,
"learning_rate": 2.7582806352522194e-06,
"loss": 0.1035,
"step": 740
},
{
"epoch": 1.297723292469352,
"grad_norm": 0.7506949834567328,
"learning_rate": 2.7459942660338434e-06,
"loss": 0.0844,
"step": 741
},
{
"epoch": 1.2994746059544657,
"grad_norm": 0.718144378394281,
"learning_rate": 2.733724954586483e-06,
"loss": 0.072,
"step": 742
},
{
"epoch": 1.3012259194395797,
"grad_norm": 0.9554449618063786,
"learning_rate": 2.7214727937613293e-06,
"loss": 0.0738,
"step": 743
},
{
"epoch": 1.3029772329246936,
"grad_norm": 1.281934191339505,
"learning_rate": 2.709237876279772e-06,
"loss": 0.0861,
"step": 744
},
{
"epoch": 1.3047285464098073,
"grad_norm": 0.8591603075335503,
"learning_rate": 2.6970202947327156e-06,
"loss": 0.0738,
"step": 745
},
{
"epoch": 1.3064798598949212,
"grad_norm": 1.077686551423765,
"learning_rate": 2.6848201415798646e-06,
"loss": 0.1006,
"step": 746
},
{
"epoch": 1.308231173380035,
"grad_norm": 1.0140620923015204,
"learning_rate": 2.6726375091490313e-06,
"loss": 0.1179,
"step": 747
},
{
"epoch": 1.3099824868651488,
"grad_norm": 0.8465851580721724,
"learning_rate": 2.6604724896354338e-06,
"loss": 0.095,
"step": 748
},
{
"epoch": 1.3117338003502628,
"grad_norm": 0.8292233955682955,
"learning_rate": 2.648325175101004e-06,
"loss": 0.078,
"step": 749
},
{
"epoch": 1.3134851138353765,
"grad_norm": 0.9961150572832315,
"learning_rate": 2.6361956574736867e-06,
"loss": 0.1037,
"step": 750
},
{
"epoch": 1.3152364273204904,
"grad_norm": 1.0125252057907417,
"learning_rate": 2.624084028546739e-06,
"loss": 0.1302,
"step": 751
},
{
"epoch": 1.316987740805604,
"grad_norm": 1.4492932570805932,
"learning_rate": 2.6119903799780445e-06,
"loss": 0.1018,
"step": 752
},
{
"epoch": 1.318739054290718,
"grad_norm": 1.0313545636415973,
"learning_rate": 2.5999148032894116e-06,
"loss": 0.1301,
"step": 753
},
{
"epoch": 1.320490367775832,
"grad_norm": 0.7640005278295612,
"learning_rate": 2.587857389865891e-06,
"loss": 0.0705,
"step": 754
},
{
"epoch": 1.3222416812609457,
"grad_norm": 0.9241800146530195,
"learning_rate": 2.5758182309550773e-06,
"loss": 0.1024,
"step": 755
},
{
"epoch": 1.3239929947460596,
"grad_norm": 0.8476677958931723,
"learning_rate": 2.5637974176664156e-06,
"loss": 0.075,
"step": 756
},
{
"epoch": 1.3257443082311733,
"grad_norm": 0.9539049936089635,
"learning_rate": 2.5517950409705173e-06,
"loss": 0.0732,
"step": 757
},
{
"epoch": 1.3274956217162872,
"grad_norm": 0.7787415845648707,
"learning_rate": 2.539811191698469e-06,
"loss": 0.068,
"step": 758
},
{
"epoch": 1.3292469352014011,
"grad_norm": 0.6740960966163063,
"learning_rate": 2.52784596054115e-06,
"loss": 0.0807,
"step": 759
},
{
"epoch": 1.3309982486865148,
"grad_norm": 0.9272432887557794,
"learning_rate": 2.5158994380485403e-06,
"loss": 0.1073,
"step": 760
},
{
"epoch": 1.3327495621716288,
"grad_norm": 0.805162044866896,
"learning_rate": 2.5039717146290365e-06,
"loss": 0.1363,
"step": 761
},
{
"epoch": 1.3345008756567425,
"grad_norm": 0.9017129027314272,
"learning_rate": 2.4920628805487684e-06,
"loss": 0.093,
"step": 762
},
{
"epoch": 1.3362521891418564,
"grad_norm": 0.9079872727729905,
"learning_rate": 2.4801730259309136e-06,
"loss": 0.0808,
"step": 763
},
{
"epoch": 1.3380035026269703,
"grad_norm": 0.7596071384023176,
"learning_rate": 2.468302240755023e-06,
"loss": 0.0811,
"step": 764
},
{
"epoch": 1.339754816112084,
"grad_norm": 0.8763250650248721,
"learning_rate": 2.456450614856333e-06,
"loss": 0.0887,
"step": 765
},
{
"epoch": 1.341506129597198,
"grad_norm": 0.9923301577394401,
"learning_rate": 2.4446182379250843e-06,
"loss": 0.0893,
"step": 766
},
{
"epoch": 1.3432574430823117,
"grad_norm": 0.8633434639363498,
"learning_rate": 2.4328051995058482e-06,
"loss": 0.088,
"step": 767
},
{
"epoch": 1.3450087565674256,
"grad_norm": 1.0475821217828167,
"learning_rate": 2.4210115889968446e-06,
"loss": 0.0924,
"step": 768
},
{
"epoch": 1.3467600700525395,
"grad_norm": 0.9107712943754082,
"learning_rate": 2.409237495649271e-06,
"loss": 0.0728,
"step": 769
},
{
"epoch": 1.3485113835376532,
"grad_norm": 1.5257662309673086,
"learning_rate": 2.397483008566624e-06,
"loss": 0.1125,
"step": 770
},
{
"epoch": 1.3502626970227671,
"grad_norm": 0.876528919503985,
"learning_rate": 2.3857482167040215e-06,
"loss": 0.0974,
"step": 771
},
{
"epoch": 1.3520140105078808,
"grad_norm": 0.924196021469823,
"learning_rate": 2.374033208867534e-06,
"loss": 0.0915,
"step": 772
},
{
"epoch": 1.3537653239929948,
"grad_norm": 0.792737877765985,
"learning_rate": 2.3623380737135094e-06,
"loss": 0.0678,
"step": 773
},
{
"epoch": 1.3555166374781087,
"grad_norm": 0.7778836639012523,
"learning_rate": 2.3506628997479085e-06,
"loss": 0.0653,
"step": 774
},
{
"epoch": 1.3572679509632224,
"grad_norm": 0.8527989490917307,
"learning_rate": 2.339007775325629e-06,
"loss": 0.0833,
"step": 775
},
{
"epoch": 1.3590192644483363,
"grad_norm": 0.7022012295601906,
"learning_rate": 2.3273727886498372e-06,
"loss": 0.0593,
"step": 776
},
{
"epoch": 1.36077057793345,
"grad_norm": 0.957527967610434,
"learning_rate": 2.3157580277713004e-06,
"loss": 0.0669,
"step": 777
},
{
"epoch": 1.362521891418564,
"grad_norm": 1.5092316384911828,
"learning_rate": 2.304163580587724e-06,
"loss": 0.1074,
"step": 778
},
{
"epoch": 1.3642732049036779,
"grad_norm": 0.899000777013253,
"learning_rate": 2.2925895348430856e-06,
"loss": 0.0835,
"step": 779
},
{
"epoch": 1.3660245183887916,
"grad_norm": 0.790034586719747,
"learning_rate": 2.2810359781269657e-06,
"loss": 0.0719,
"step": 780
},
{
"epoch": 1.3677758318739055,
"grad_norm": 0.7907785753390943,
"learning_rate": 2.269502997873895e-06,
"loss": 0.0781,
"step": 781
},
{
"epoch": 1.3695271453590192,
"grad_norm": 0.6926955937004445,
"learning_rate": 2.2579906813626807e-06,
"loss": 0.0728,
"step": 782
},
{
"epoch": 1.3712784588441331,
"grad_norm": 0.6166696427484202,
"learning_rate": 2.246499115715751e-06,
"loss": 0.0644,
"step": 783
},
{
"epoch": 1.373029772329247,
"grad_norm": 0.7255433228039453,
"learning_rate": 2.235028387898504e-06,
"loss": 0.0979,
"step": 784
},
{
"epoch": 1.3747810858143608,
"grad_norm": 0.7617740643442561,
"learning_rate": 2.2235785847186338e-06,
"loss": 0.0924,
"step": 785
},
{
"epoch": 1.3765323992994747,
"grad_norm": 1.0217536171541641,
"learning_rate": 2.212149792825489e-06,
"loss": 0.0979,
"step": 786
},
{
"epoch": 1.3782837127845884,
"grad_norm": 0.8732538774678509,
"learning_rate": 2.2007420987094036e-06,
"loss": 0.0734,
"step": 787
},
{
"epoch": 1.3800350262697023,
"grad_norm": 1.0019292167266876,
"learning_rate": 2.189355588701051e-06,
"loss": 0.1069,
"step": 788
},
{
"epoch": 1.3817863397548162,
"grad_norm": 0.8315824634330226,
"learning_rate": 2.177990348970792e-06,
"loss": 0.0909,
"step": 789
},
{
"epoch": 1.38353765323993,
"grad_norm": 0.647602160309111,
"learning_rate": 2.1666464655280133e-06,
"loss": 0.0812,
"step": 790
},
{
"epoch": 1.3852889667250436,
"grad_norm": 0.9640769615135378,
"learning_rate": 2.1553240242204876e-06,
"loss": 0.0873,
"step": 791
},
{
"epoch": 1.3870402802101576,
"grad_norm": 0.9202817145346947,
"learning_rate": 2.1440231107337147e-06,
"loss": 0.0792,
"step": 792
},
{
"epoch": 1.3887915936952715,
"grad_norm": 0.912675153291401,
"learning_rate": 2.1327438105902763e-06,
"loss": 0.0773,
"step": 793
},
{
"epoch": 1.3905429071803854,
"grad_norm": 0.9415049365667928,
"learning_rate": 2.1214862091491966e-06,
"loss": 0.1135,
"step": 794
},
{
"epoch": 1.3922942206654991,
"grad_norm": 0.9970178640835594,
"learning_rate": 2.1102503916052797e-06,
"loss": 0.0847,
"step": 795
},
{
"epoch": 1.3940455341506128,
"grad_norm": 0.9832327330756709,
"learning_rate": 2.0990364429884828e-06,
"loss": 0.1235,
"step": 796
},
{
"epoch": 1.3957968476357268,
"grad_norm": 0.8721793805349013,
"learning_rate": 2.0878444481632597e-06,
"loss": 0.1004,
"step": 797
},
{
"epoch": 1.3975481611208407,
"grad_norm": 0.7747485377067997,
"learning_rate": 2.076674491827922e-06,
"loss": 0.069,
"step": 798
},
{
"epoch": 1.3992994746059544,
"grad_norm": 0.8456173036014264,
"learning_rate": 2.0655266585140045e-06,
"loss": 0.0754,
"step": 799
},
{
"epoch": 1.4010507880910683,
"grad_norm": 0.9391798871992534,
"learning_rate": 2.0544010325856146e-06,
"loss": 0.0969,
"step": 800
},
{
"epoch": 1.402802101576182,
"grad_norm": 0.8875537072650175,
"learning_rate": 2.043297698238805e-06,
"loss": 0.0678,
"step": 801
},
{
"epoch": 1.404553415061296,
"grad_norm": 0.7833633854106007,
"learning_rate": 2.0322167395009286e-06,
"loss": 0.0877,
"step": 802
},
{
"epoch": 1.4063047285464099,
"grad_norm": 0.9708805413159364,
"learning_rate": 2.0211582402300007e-06,
"loss": 0.0937,
"step": 803
},
{
"epoch": 1.4080560420315236,
"grad_norm": 0.8290033085639599,
"learning_rate": 2.0101222841140775e-06,
"loss": 0.0722,
"step": 804
},
{
"epoch": 1.4098073555166375,
"grad_norm": 0.7734317269111782,
"learning_rate": 1.9991089546706067e-06,
"loss": 0.0788,
"step": 805
},
{
"epoch": 1.4115586690017512,
"grad_norm": 0.8912988277705161,
"learning_rate": 1.9881183352458083e-06,
"loss": 0.0923,
"step": 806
},
{
"epoch": 1.4133099824868651,
"grad_norm": 0.8930959140982987,
"learning_rate": 1.9771505090140343e-06,
"loss": 0.0858,
"step": 807
},
{
"epoch": 1.415061295971979,
"grad_norm": 1.0486079636516412,
"learning_rate": 1.9662055589771427e-06,
"loss": 0.0848,
"step": 808
},
{
"epoch": 1.4168126094570928,
"grad_norm": 0.869268490663677,
"learning_rate": 1.955283567963876e-06,
"loss": 0.1326,
"step": 809
},
{
"epoch": 1.4185639229422067,
"grad_norm": 0.6923293806693147,
"learning_rate": 1.9443846186292204e-06,
"loss": 0.0594,
"step": 810
},
{
"epoch": 1.4203152364273204,
"grad_norm": 0.7354927224586634,
"learning_rate": 1.9335087934537956e-06,
"loss": 0.0806,
"step": 811
},
{
"epoch": 1.4220665499124343,
"grad_norm": 0.8259624368436999,
"learning_rate": 1.9226561747432188e-06,
"loss": 0.0857,
"step": 812
},
{
"epoch": 1.4238178633975482,
"grad_norm": 0.8711192744207589,
"learning_rate": 1.911826844627485e-06,
"loss": 0.0752,
"step": 813
},
{
"epoch": 1.425569176882662,
"grad_norm": 2.6069679207164667,
"learning_rate": 1.901020885060353e-06,
"loss": 0.0866,
"step": 814
},
{
"epoch": 1.4273204903677759,
"grad_norm": 0.8510608016527956,
"learning_rate": 1.8902383778187106e-06,
"loss": 0.1021,
"step": 815
},
{
"epoch": 1.4290718038528896,
"grad_norm": 0.9912516765235937,
"learning_rate": 1.8794794045019727e-06,
"loss": 0.1037,
"step": 816
},
{
"epoch": 1.4308231173380035,
"grad_norm": 1.4280563242119932,
"learning_rate": 1.8687440465314493e-06,
"loss": 0.125,
"step": 817
},
{
"epoch": 1.4325744308231174,
"grad_norm": 0.8887782023449038,
"learning_rate": 1.858032385149735e-06,
"loss": 0.0961,
"step": 818
},
{
"epoch": 1.4343257443082311,
"grad_norm": 0.8092352027580832,
"learning_rate": 1.8473445014200992e-06,
"loss": 0.0784,
"step": 819
},
{
"epoch": 1.436077057793345,
"grad_norm": 0.8508624184842186,
"learning_rate": 1.8366804762258612e-06,
"loss": 0.0993,
"step": 820
},
{
"epoch": 1.4378283712784588,
"grad_norm": 0.8476088885858866,
"learning_rate": 1.826040390269792e-06,
"loss": 0.097,
"step": 821
},
{
"epoch": 1.4395796847635727,
"grad_norm": 0.7858581848448734,
"learning_rate": 1.8154243240734904e-06,
"loss": 0.0545,
"step": 822
},
{
"epoch": 1.4413309982486866,
"grad_norm": 0.7511491075823152,
"learning_rate": 1.8048323579767796e-06,
"loss": 0.0614,
"step": 823
},
{
"epoch": 1.4430823117338003,
"grad_norm": 0.8916221736098047,
"learning_rate": 1.7942645721371043e-06,
"loss": 0.0688,
"step": 824
},
{
"epoch": 1.4448336252189142,
"grad_norm": 1.0386802641836865,
"learning_rate": 1.7837210465289129e-06,
"loss": 0.1243,
"step": 825
},
{
"epoch": 1.446584938704028,
"grad_norm": 0.8600832619931907,
"learning_rate": 1.773201860943063e-06,
"loss": 0.0591,
"step": 826
},
{
"epoch": 1.4483362521891419,
"grad_norm": 0.85335509068009,
"learning_rate": 1.7627070949862095e-06,
"loss": 0.0897,
"step": 827
},
{
"epoch": 1.4500875656742558,
"grad_norm": 1.0290314674102257,
"learning_rate": 1.7522368280802048e-06,
"loss": 0.1101,
"step": 828
},
{
"epoch": 1.4518388791593695,
"grad_norm": 0.7360509013463974,
"learning_rate": 1.7417911394615033e-06,
"loss": 0.073,
"step": 829
},
{
"epoch": 1.4535901926444834,
"grad_norm": 1.0355631885341359,
"learning_rate": 1.7313701081805506e-06,
"loss": 0.1144,
"step": 830
},
{
"epoch": 1.4553415061295971,
"grad_norm": 0.6596592368497637,
"learning_rate": 1.7209738131011977e-06,
"loss": 0.0815,
"step": 831
},
{
"epoch": 1.457092819614711,
"grad_norm": 0.8157916293097928,
"learning_rate": 1.7106023329000932e-06,
"loss": 0.0825,
"step": 832
},
{
"epoch": 1.458844133099825,
"grad_norm": 0.8170690188009203,
"learning_rate": 1.700255746066093e-06,
"loss": 0.0768,
"step": 833
},
{
"epoch": 1.4605954465849387,
"grad_norm": 0.7806828063640731,
"learning_rate": 1.6899341308996704e-06,
"loss": 0.0828,
"step": 834
},
{
"epoch": 1.4623467600700526,
"grad_norm": 0.7615625289107787,
"learning_rate": 1.6796375655123126e-06,
"loss": 0.0983,
"step": 835
},
{
"epoch": 1.4640980735551663,
"grad_norm": 1.1145859850353201,
"learning_rate": 1.6693661278259438e-06,
"loss": 0.1593,
"step": 836
},
{
"epoch": 1.4658493870402802,
"grad_norm": 0.8460031372441591,
"learning_rate": 1.659119895572322e-06,
"loss": 0.0713,
"step": 837
},
{
"epoch": 1.4676007005253942,
"grad_norm": 0.8666355376799286,
"learning_rate": 1.648898946292456e-06,
"loss": 0.0795,
"step": 838
},
{
"epoch": 1.4693520140105079,
"grad_norm": 1.098767971036189,
"learning_rate": 1.6387033573360244e-06,
"loss": 0.1291,
"step": 839
},
{
"epoch": 1.4711033274956218,
"grad_norm": 1.0895405573642913,
"learning_rate": 1.62853320586078e-06,
"loss": 0.1035,
"step": 840
},
{
"epoch": 1.4728546409807355,
"grad_norm": 1.0610577874468181,
"learning_rate": 1.6183885688319755e-06,
"loss": 0.1761,
"step": 841
},
{
"epoch": 1.4746059544658494,
"grad_norm": 0.9087423490771874,
"learning_rate": 1.6082695230217721e-06,
"loss": 0.0903,
"step": 842
},
{
"epoch": 1.4763572679509633,
"grad_norm": 1.4022223726835983,
"learning_rate": 1.5981761450086647e-06,
"loss": 0.1407,
"step": 843
},
{
"epoch": 1.478108581436077,
"grad_norm": 0.8453730182740572,
"learning_rate": 1.588108511176899e-06,
"loss": 0.0801,
"step": 844
},
{
"epoch": 1.479859894921191,
"grad_norm": 0.7650759951583646,
"learning_rate": 1.5780666977158976e-06,
"loss": 0.0898,
"step": 845
},
{
"epoch": 1.4816112084063047,
"grad_norm": 1.0534592243091558,
"learning_rate": 1.5680507806196815e-06,
"loss": 0.1065,
"step": 846
},
{
"epoch": 1.4833625218914186,
"grad_norm": 0.7581070203659848,
"learning_rate": 1.558060835686291e-06,
"loss": 0.0768,
"step": 847
},
{
"epoch": 1.4851138353765325,
"grad_norm": 0.8228217306630333,
"learning_rate": 1.548096938517215e-06,
"loss": 0.0864,
"step": 848
},
{
"epoch": 1.4868651488616462,
"grad_norm": 0.6948138142476058,
"learning_rate": 1.5381591645168214e-06,
"loss": 0.0727,
"step": 849
},
{
"epoch": 1.4886164623467601,
"grad_norm": 1.1215312769377082,
"learning_rate": 1.5282475888917837e-06,
"loss": 0.1084,
"step": 850
},
{
"epoch": 1.4903677758318739,
"grad_norm": 0.556612824562337,
"learning_rate": 1.5183622866505149e-06,
"loss": 0.0378,
"step": 851
},
{
"epoch": 1.4921190893169878,
"grad_norm": 0.8334893621906851,
"learning_rate": 1.5085033326025933e-06,
"loss": 0.1058,
"step": 852
},
{
"epoch": 1.4938704028021017,
"grad_norm": 0.8483067154096786,
"learning_rate": 1.4986708013582013e-06,
"loss": 0.0593,
"step": 853
},
{
"epoch": 1.4956217162872154,
"grad_norm": 0.7563089529959395,
"learning_rate": 1.4888647673275598e-06,
"loss": 0.0881,
"step": 854
},
{
"epoch": 1.4973730297723291,
"grad_norm": 0.9581064139110383,
"learning_rate": 1.4790853047203674e-06,
"loss": 0.1231,
"step": 855
},
{
"epoch": 1.499124343257443,
"grad_norm": 0.8646013332942565,
"learning_rate": 1.4693324875452369e-06,
"loss": 0.0962,
"step": 856
},
{
"epoch": 1.500875656742557,
"grad_norm": 0.8161715950753782,
"learning_rate": 1.4596063896091316e-06,
"loss": 0.0984,
"step": 857
},
{
"epoch": 1.5026269702276709,
"grad_norm": 0.85028135397844,
"learning_rate": 1.4499070845168112e-06,
"loss": 0.0998,
"step": 858
},
{
"epoch": 1.5043782837127846,
"grad_norm": 0.8492942866077009,
"learning_rate": 1.4402346456702737e-06,
"loss": 0.0802,
"step": 859
},
{
"epoch": 1.5061295971978983,
"grad_norm": 1.0442880468905495,
"learning_rate": 1.4305891462682004e-06,
"loss": 0.1154,
"step": 860
},
{
"epoch": 1.5078809106830122,
"grad_norm": 0.8309778737447728,
"learning_rate": 1.420970659305404e-06,
"loss": 0.1184,
"step": 861
},
{
"epoch": 1.5096322241681261,
"grad_norm": 1.0968471389762762,
"learning_rate": 1.4113792575722684e-06,
"loss": 0.0877,
"step": 862
},
{
"epoch": 1.51138353765324,
"grad_norm": 0.7388443383068837,
"learning_rate": 1.4018150136542063e-06,
"loss": 0.0431,
"step": 863
},
{
"epoch": 1.5131348511383538,
"grad_norm": 0.978572938428472,
"learning_rate": 1.3922779999311032e-06,
"loss": 0.0662,
"step": 864
},
{
"epoch": 1.5148861646234675,
"grad_norm": 0.825879975581395,
"learning_rate": 1.3827682885767778e-06,
"loss": 0.0741,
"step": 865
},
{
"epoch": 1.5166374781085814,
"grad_norm": 0.8275889522903741,
"learning_rate": 1.3732859515584306e-06,
"loss": 0.0719,
"step": 866
},
{
"epoch": 1.5183887915936953,
"grad_norm": 0.8930447963765076,
"learning_rate": 1.363831060636096e-06,
"loss": 0.101,
"step": 867
},
{
"epoch": 1.5201401050788093,
"grad_norm": 1.0626174646711952,
"learning_rate": 1.3544036873621054e-06,
"loss": 0.1285,
"step": 868
},
{
"epoch": 1.521891418563923,
"grad_norm": 0.9900839577095674,
"learning_rate": 1.345003903080541e-06,
"loss": 0.072,
"step": 869
},
{
"epoch": 1.5236427320490367,
"grad_norm": 0.9536264805723799,
"learning_rate": 1.335631778926702e-06,
"loss": 0.1401,
"step": 870
},
{
"epoch": 1.5253940455341506,
"grad_norm": 0.816870903519518,
"learning_rate": 1.3262873858265618e-06,
"loss": 0.0764,
"step": 871
},
{
"epoch": 1.5271453590192645,
"grad_norm": 1.1150610380119643,
"learning_rate": 1.316970794496229e-06,
"loss": 0.0694,
"step": 872
},
{
"epoch": 1.5288966725043784,
"grad_norm": 0.955595059805072,
"learning_rate": 1.3076820754414165e-06,
"loss": 0.0844,
"step": 873
},
{
"epoch": 1.5306479859894921,
"grad_norm": 0.8892461247283927,
"learning_rate": 1.2984212989569055e-06,
"loss": 0.0709,
"step": 874
},
{
"epoch": 1.5323992994746058,
"grad_norm": 1.0183197020080885,
"learning_rate": 1.2891885351260191e-06,
"loss": 0.0835,
"step": 875
},
{
"epoch": 1.5341506129597198,
"grad_norm": 0.7689697067022396,
"learning_rate": 1.2799838538200804e-06,
"loss": 0.0865,
"step": 876
},
{
"epoch": 1.5359019264448337,
"grad_norm": 0.9433827530845643,
"learning_rate": 1.270807324697898e-06,
"loss": 0.0831,
"step": 877
},
{
"epoch": 1.5376532399299476,
"grad_norm": 0.7106251655802119,
"learning_rate": 1.2616590172052268e-06,
"loss": 0.0772,
"step": 878
},
{
"epoch": 1.5394045534150613,
"grad_norm": 0.787230689946638,
"learning_rate": 1.252539000574246e-06,
"loss": 0.0839,
"step": 879
},
{
"epoch": 1.541155866900175,
"grad_norm": 0.7954726817869575,
"learning_rate": 1.2434473438230426e-06,
"loss": 0.0655,
"step": 880
},
{
"epoch": 1.542907180385289,
"grad_norm": 0.8902609833747344,
"learning_rate": 1.2343841157550757e-06,
"loss": 0.0812,
"step": 881
},
{
"epoch": 1.5446584938704029,
"grad_norm": 0.8517459203382189,
"learning_rate": 1.2253493849586695e-06,
"loss": 0.091,
"step": 882
},
{
"epoch": 1.5464098073555166,
"grad_norm": 0.9150985106553249,
"learning_rate": 1.2163432198064834e-06,
"loss": 0.0957,
"step": 883
},
{
"epoch": 1.5481611208406305,
"grad_norm": 1.0872606970611478,
"learning_rate": 1.207365688454999e-06,
"loss": 0.0643,
"step": 884
},
{
"epoch": 1.5499124343257442,
"grad_norm": 1.0678678238356631,
"learning_rate": 1.1984168588440075e-06,
"loss": 0.089,
"step": 885
},
{
"epoch": 1.5516637478108581,
"grad_norm": 0.8484700364176362,
"learning_rate": 1.1894967986960877e-06,
"loss": 0.089,
"step": 886
},
{
"epoch": 1.553415061295972,
"grad_norm": 0.8940833430222472,
"learning_rate": 1.1806055755161029e-06,
"loss": 0.107,
"step": 887
},
{
"epoch": 1.5551663747810858,
"grad_norm": 0.8190347787792227,
"learning_rate": 1.1717432565906817e-06,
"loss": 0.0787,
"step": 888
},
{
"epoch": 1.5569176882661997,
"grad_norm": 0.71210611816795,
"learning_rate": 1.1629099089877116e-06,
"loss": 0.0665,
"step": 889
},
{
"epoch": 1.5586690017513134,
"grad_norm": 0.9042648925525666,
"learning_rate": 1.154105599555837e-06,
"loss": 0.0886,
"step": 890
},
{
"epoch": 1.5604203152364273,
"grad_norm": 0.9505703326726503,
"learning_rate": 1.1453303949239431e-06,
"loss": 0.097,
"step": 891
},
{
"epoch": 1.5621716287215412,
"grad_norm": 1.3798512911645553,
"learning_rate": 1.1365843615006606e-06,
"loss": 0.0818,
"step": 892
},
{
"epoch": 1.563922942206655,
"grad_norm": 0.877599922223935,
"learning_rate": 1.127867565473858e-06,
"loss": 0.0697,
"step": 893
},
{
"epoch": 1.5656742556917689,
"grad_norm": 0.8465883171108056,
"learning_rate": 1.11918007281014e-06,
"loss": 0.0831,
"step": 894
},
{
"epoch": 1.5674255691768826,
"grad_norm": 0.9027728204073343,
"learning_rate": 1.1105219492543567e-06,
"loss": 0.1178,
"step": 895
},
{
"epoch": 1.5691768826619965,
"grad_norm": 0.976849713853511,
"learning_rate": 1.1018932603290927e-06,
"loss": 0.1209,
"step": 896
},
{
"epoch": 1.5709281961471104,
"grad_norm": 0.9584305854695117,
"learning_rate": 1.0932940713341843e-06,
"loss": 0.1158,
"step": 897
},
{
"epoch": 1.5726795096322241,
"grad_norm": 0.8686276598286167,
"learning_rate": 1.0847244473462165e-06,
"loss": 0.0715,
"step": 898
},
{
"epoch": 1.5744308231173378,
"grad_norm": 1.0091500961047988,
"learning_rate": 1.0761844532180322e-06,
"loss": 0.0961,
"step": 899
},
{
"epoch": 1.5761821366024518,
"grad_norm": 0.7807508240613471,
"learning_rate": 1.067674153578247e-06,
"loss": 0.0664,
"step": 900
},
{
"epoch": 1.5779334500875657,
"grad_norm": 0.8067749796026943,
"learning_rate": 1.05919361283075e-06,
"loss": 0.0636,
"step": 901
},
{
"epoch": 1.5796847635726796,
"grad_norm": 0.8759620340871858,
"learning_rate": 1.0507428951542293e-06,
"loss": 0.0753,
"step": 902
},
{
"epoch": 1.5814360770577933,
"grad_norm": 0.975397189576844,
"learning_rate": 1.042322064501673e-06,
"loss": 0.0825,
"step": 903
},
{
"epoch": 1.583187390542907,
"grad_norm": 0.8598966333634731,
"learning_rate": 1.0339311845998929e-06,
"loss": 0.0713,
"step": 904
},
{
"epoch": 1.584938704028021,
"grad_norm": 0.7086427629579961,
"learning_rate": 1.025570318949044e-06,
"loss": 0.0588,
"step": 905
},
{
"epoch": 1.5866900175131349,
"grad_norm": 0.9371811065539839,
"learning_rate": 1.0172395308221355e-06,
"loss": 0.1025,
"step": 906
},
{
"epoch": 1.5884413309982488,
"grad_norm": 0.8523909056255194,
"learning_rate": 1.008938883264563e-06,
"loss": 0.0785,
"step": 907
},
{
"epoch": 1.5901926444833625,
"grad_norm": 0.6932575906112352,
"learning_rate": 1.0006684390936206e-06,
"loss": 0.0527,
"step": 908
},
{
"epoch": 1.5919439579684762,
"grad_norm": 0.7895131639078904,
"learning_rate": 9.924282608980318e-07,
"loss": 0.0672,
"step": 909
},
{
"epoch": 1.5936952714535901,
"grad_norm": 0.9109376997806669,
"learning_rate": 9.84218411037477e-07,
"loss": 0.0695,
"step": 910
},
{
"epoch": 1.595446584938704,
"grad_norm": 1.375391449767245,
"learning_rate": 9.760389516421143e-07,
"loss": 0.1032,
"step": 911
},
{
"epoch": 1.597197898423818,
"grad_norm": 0.7645671144180791,
"learning_rate": 9.678899446121205e-07,
"loss": 0.0487,
"step": 912
},
{
"epoch": 1.5989492119089317,
"grad_norm": 0.98070657894628,
"learning_rate": 9.597714516172107e-07,
"loss": 0.1004,
"step": 913
},
{
"epoch": 1.6007005253940454,
"grad_norm": 0.8547769727295768,
"learning_rate": 9.516835340961783e-07,
"loss": 0.0743,
"step": 914
},
{
"epoch": 1.6024518388791593,
"grad_norm": 1.0122657120554246,
"learning_rate": 9.436262532564316e-07,
"loss": 0.1235,
"step": 915
},
{
"epoch": 1.6042031523642732,
"grad_norm": 1.0918909664473564,
"learning_rate": 9.355996700735242e-07,
"loss": 0.0997,
"step": 916
},
{
"epoch": 1.6059544658493872,
"grad_norm": 0.9405618536030825,
"learning_rate": 9.276038452907016e-07,
"loss": 0.0692,
"step": 917
},
{
"epoch": 1.6077057793345009,
"grad_norm": 0.9492717016651747,
"learning_rate": 9.19638839418433e-07,
"loss": 0.0828,
"step": 918
},
{
"epoch": 1.6094570928196146,
"grad_norm": 0.7689394884673381,
"learning_rate": 9.117047127339579e-07,
"loss": 0.0973,
"step": 919
},
{
"epoch": 1.6112084063047285,
"grad_norm": 0.8844949815830236,
"learning_rate": 9.038015252808335e-07,
"loss": 0.0863,
"step": 920
},
{
"epoch": 1.6129597197898424,
"grad_norm": 0.7795777404725,
"learning_rate": 8.959293368684713e-07,
"loss": 0.0707,
"step": 921
},
{
"epoch": 1.6147110332749564,
"grad_norm": 0.9595778145894932,
"learning_rate": 8.880882070716945e-07,
"loss": 0.0936,
"step": 922
},
{
"epoch": 1.61646234676007,
"grad_norm": 0.9497307647401774,
"learning_rate": 8.80278195230278e-07,
"loss": 0.0941,
"step": 923
},
{
"epoch": 1.6182136602451838,
"grad_norm": 1.1659675647916,
"learning_rate": 8.724993604485044e-07,
"loss": 0.1023,
"step": 924
},
{
"epoch": 1.6199649737302977,
"grad_norm": 1.1234743594327323,
"learning_rate": 8.647517615947193e-07,
"loss": 0.0776,
"step": 925
},
{
"epoch": 1.6217162872154116,
"grad_norm": 0.8528927864364849,
"learning_rate": 8.57035457300876e-07,
"loss": 0.0807,
"step": 926
},
{
"epoch": 1.6234676007005255,
"grad_norm": 0.83886810565477,
"learning_rate": 8.49350505962106e-07,
"loss": 0.0594,
"step": 927
},
{
"epoch": 1.6252189141856392,
"grad_norm": 0.855463864117844,
"learning_rate": 8.416969657362622e-07,
"loss": 0.0819,
"step": 928
},
{
"epoch": 1.626970227670753,
"grad_norm": 0.908767552315424,
"learning_rate": 8.340748945434879e-07,
"loss": 0.1285,
"step": 929
},
{
"epoch": 1.6287215411558669,
"grad_norm": 1.0012453661601142,
"learning_rate": 8.264843500657799e-07,
"loss": 0.0861,
"step": 930
},
{
"epoch": 1.6304728546409808,
"grad_norm": 1.0333069552134135,
"learning_rate": 8.189253897465433e-07,
"loss": 0.0753,
"step": 931
},
{
"epoch": 1.6322241681260947,
"grad_norm": 0.905280465006129,
"learning_rate": 8.113980707901653e-07,
"loss": 0.0899,
"step": 932
},
{
"epoch": 1.6339754816112084,
"grad_norm": 1.0338044714761787,
"learning_rate": 8.039024501615777e-07,
"loss": 0.0938,
"step": 933
},
{
"epoch": 1.6357267950963221,
"grad_norm": 2.359076466144793,
"learning_rate": 7.964385845858258e-07,
"loss": 0.1217,
"step": 934
},
{
"epoch": 1.637478108581436,
"grad_norm": 0.9826730256245049,
"learning_rate": 7.890065305476441e-07,
"loss": 0.1191,
"step": 935
},
{
"epoch": 1.63922942206655,
"grad_norm": 0.8747332183619412,
"learning_rate": 7.816063442910193e-07,
"loss": 0.0993,
"step": 936
},
{
"epoch": 1.640980735551664,
"grad_norm": 0.8075496453024368,
"learning_rate": 7.742380818187772e-07,
"loss": 0.071,
"step": 937
},
{
"epoch": 1.6427320490367776,
"grad_norm": 1.001355554288775,
"learning_rate": 7.669017988921474e-07,
"loss": 0.1207,
"step": 938
},
{
"epoch": 1.6444833625218913,
"grad_norm": 0.8066142991914813,
"learning_rate": 7.595975510303466e-07,
"loss": 0.0833,
"step": 939
},
{
"epoch": 1.6462346760070052,
"grad_norm": 0.845058894943196,
"learning_rate": 7.523253935101577e-07,
"loss": 0.0838,
"step": 940
},
{
"epoch": 1.6479859894921192,
"grad_norm": 0.8892397239300569,
"learning_rate": 7.45085381365514e-07,
"loss": 0.0842,
"step": 941
},
{
"epoch": 1.649737302977233,
"grad_norm": 0.7913749401745233,
"learning_rate": 7.378775693870793e-07,
"loss": 0.0656,
"step": 942
},
{
"epoch": 1.6514886164623468,
"grad_norm": 0.9412135475034102,
"learning_rate": 7.307020121218333e-07,
"loss": 0.0988,
"step": 943
},
{
"epoch": 1.6532399299474605,
"grad_norm": 0.6922791542112726,
"learning_rate": 7.235587638726599e-07,
"loss": 0.0644,
"step": 944
},
{
"epoch": 1.6549912434325744,
"grad_norm": 0.683548226075825,
"learning_rate": 7.164478786979356e-07,
"loss": 0.0507,
"step": 945
},
{
"epoch": 1.6567425569176883,
"grad_norm": 0.9731267616312939,
"learning_rate": 7.093694104111237e-07,
"loss": 0.078,
"step": 946
},
{
"epoch": 1.658493870402802,
"grad_norm": 0.9269892309271431,
"learning_rate": 7.023234125803635e-07,
"loss": 0.1005,
"step": 947
},
{
"epoch": 1.660245183887916,
"grad_norm": 0.9430923254149106,
"learning_rate": 6.953099385280632e-07,
"loss": 0.063,
"step": 948
},
{
"epoch": 1.6619964973730297,
"grad_norm": 0.954580758978347,
"learning_rate": 6.883290413305011e-07,
"loss": 0.1154,
"step": 949
},
{
"epoch": 1.6637478108581436,
"grad_norm": 0.9178349147320257,
"learning_rate": 6.813807738174199e-07,
"loss": 0.0574,
"step": 950
},
{
"epoch": 1.6654991243432575,
"grad_norm": 0.8477054090991074,
"learning_rate": 6.744651885716313e-07,
"loss": 0.0713,
"step": 951
},
{
"epoch": 1.6672504378283712,
"grad_norm": 1.033767975754156,
"learning_rate": 6.675823379286151e-07,
"loss": 0.1363,
"step": 952
},
{
"epoch": 1.6690017513134852,
"grad_norm": 0.8140614446219004,
"learning_rate": 6.607322739761219e-07,
"loss": 0.0811,
"step": 953
},
{
"epoch": 1.6707530647985989,
"grad_norm": 0.8580159228495404,
"learning_rate": 6.53915048553781e-07,
"loss": 0.0786,
"step": 954
},
{
"epoch": 1.6725043782837128,
"grad_norm": 1.0237229957514502,
"learning_rate": 6.471307132527071e-07,
"loss": 0.072,
"step": 955
},
{
"epoch": 1.6742556917688267,
"grad_norm": 1.1139363365853103,
"learning_rate": 6.40379319415112e-07,
"loss": 0.1586,
"step": 956
},
{
"epoch": 1.6760070052539404,
"grad_norm": 0.8039008743188236,
"learning_rate": 6.336609181339148e-07,
"loss": 0.074,
"step": 957
},
{
"epoch": 1.6777583187390543,
"grad_norm": 0.955504225000358,
"learning_rate": 6.269755602523531e-07,
"loss": 0.0941,
"step": 958
},
{
"epoch": 1.679509632224168,
"grad_norm": 1.1369679103050756,
"learning_rate": 6.203232963636003e-07,
"loss": 0.0953,
"step": 959
},
{
"epoch": 1.681260945709282,
"grad_norm": 0.8138093638660885,
"learning_rate": 6.137041768103819e-07,
"loss": 0.0682,
"step": 960
},
{
"epoch": 1.683012259194396,
"grad_norm": 0.9438149839513782,
"learning_rate": 6.071182516845974e-07,
"loss": 0.0759,
"step": 961
},
{
"epoch": 1.6847635726795096,
"grad_norm": 1.1428154671493533,
"learning_rate": 6.005655708269386e-07,
"loss": 0.0851,
"step": 962
},
{
"epoch": 1.6865148861646233,
"grad_norm": 0.8193942407950339,
"learning_rate": 5.9404618382651e-07,
"loss": 0.0985,
"step": 963
},
{
"epoch": 1.6882661996497372,
"grad_norm": 1.054106752133417,
"learning_rate": 5.87560140020459e-07,
"loss": 0.0707,
"step": 964
},
{
"epoch": 1.6900175131348512,
"grad_norm": 0.8940576683459753,
"learning_rate": 5.811074884935964e-07,
"loss": 0.0683,
"step": 965
},
{
"epoch": 1.691768826619965,
"grad_norm": 0.6287784415256515,
"learning_rate": 5.746882780780322e-07,
"loss": 0.0589,
"step": 966
},
{
"epoch": 1.6935201401050788,
"grad_norm": 0.9750764230003164,
"learning_rate": 5.683025573528017e-07,
"loss": 0.1097,
"step": 967
},
{
"epoch": 1.6952714535901925,
"grad_norm": 0.9218101868960946,
"learning_rate": 5.619503746434956e-07,
"loss": 0.1168,
"step": 968
},
{
"epoch": 1.6970227670753064,
"grad_norm": 1.1755520131559736,
"learning_rate": 5.55631778021899e-07,
"loss": 0.0959,
"step": 969
},
{
"epoch": 1.6987740805604203,
"grad_norm": 0.8414667003154775,
"learning_rate": 5.493468153056236e-07,
"loss": 0.0664,
"step": 970
},
{
"epoch": 1.7005253940455343,
"grad_norm": 0.94270536937066,
"learning_rate": 5.430955340577515e-07,
"loss": 0.0608,
"step": 971
},
{
"epoch": 1.702276707530648,
"grad_norm": 1.1274506094616077,
"learning_rate": 5.368779815864678e-07,
"loss": 0.1253,
"step": 972
},
{
"epoch": 1.7040280210157617,
"grad_norm": 1.0875056272885322,
"learning_rate": 5.306942049447095e-07,
"loss": 0.0803,
"step": 973
},
{
"epoch": 1.7057793345008756,
"grad_norm": 0.7586215304561557,
"learning_rate": 5.245442509298038e-07,
"loss": 0.0707,
"step": 974
},
{
"epoch": 1.7075306479859895,
"grad_norm": 0.9854236513125099,
"learning_rate": 5.184281660831158e-07,
"loss": 0.0862,
"step": 975
},
{
"epoch": 1.7092819614711035,
"grad_norm": 1.1000068183348335,
"learning_rate": 5.123459966897021e-07,
"loss": 0.0804,
"step": 976
},
{
"epoch": 1.7110332749562172,
"grad_norm": 0.9054286431304135,
"learning_rate": 5.062977887779486e-07,
"loss": 0.0605,
"step": 977
},
{
"epoch": 1.7127845884413309,
"grad_norm": 0.7861214593313163,
"learning_rate": 5.002835881192336e-07,
"loss": 0.0827,
"step": 978
},
{
"epoch": 1.7145359019264448,
"grad_norm": 0.8032715915593343,
"learning_rate": 4.943034402275754e-07,
"loss": 0.0983,
"step": 979
},
{
"epoch": 1.7162872154115587,
"grad_norm": 1.0469596242294776,
"learning_rate": 4.88357390359287e-07,
"loss": 0.0669,
"step": 980
},
{
"epoch": 1.7180385288966726,
"grad_norm": 1.2227292003086079,
"learning_rate": 4.824454835126402e-07,
"loss": 0.1081,
"step": 981
},
{
"epoch": 1.7197898423817863,
"grad_norm": 0.8383307838231276,
"learning_rate": 4.765677644275163e-07,
"loss": 0.1177,
"step": 982
},
{
"epoch": 1.7215411558669,
"grad_norm": 0.8845101264170805,
"learning_rate": 4.707242775850751e-07,
"loss": 0.0825,
"step": 983
},
{
"epoch": 1.723292469352014,
"grad_norm": 0.8738347124266663,
"learning_rate": 4.6491506720741376e-07,
"loss": 0.0767,
"step": 984
},
{
"epoch": 1.725043782837128,
"grad_norm": 0.9235780941896881,
"learning_rate": 4.591401772572313e-07,
"loss": 0.1073,
"step": 985
},
{
"epoch": 1.7267950963222418,
"grad_norm": 0.808520171929526,
"learning_rate": 4.533996514375033e-07,
"loss": 0.0888,
"step": 986
},
{
"epoch": 1.7285464098073555,
"grad_norm": 0.8040022329220664,
"learning_rate": 4.476935331911397e-07,
"loss": 0.0689,
"step": 987
},
{
"epoch": 1.7302977232924692,
"grad_norm": 1.024560760379506,
"learning_rate": 4.4202186570066753e-07,
"loss": 0.0624,
"step": 988
},
{
"epoch": 1.7320490367775832,
"grad_norm": 0.7655215543483458,
"learning_rate": 4.363846918878961e-07,
"loss": 0.0641,
"step": 989
},
{
"epoch": 1.733800350262697,
"grad_norm": 0.8347354918909909,
"learning_rate": 4.307820544135938e-07,
"loss": 0.065,
"step": 990
},
{
"epoch": 1.735551663747811,
"grad_norm": 1.0085283172237212,
"learning_rate": 4.2521399567717004e-07,
"loss": 0.0696,
"step": 991
},
{
"epoch": 1.7373029772329247,
"grad_norm": 0.9981732466737275,
"learning_rate": 4.1968055781634655e-07,
"loss": 0.0668,
"step": 992
},
{
"epoch": 1.7390542907180384,
"grad_norm": 0.6502067313094136,
"learning_rate": 4.1418178270684727e-07,
"loss": 0.067,
"step": 993
},
{
"epoch": 1.7408056042031523,
"grad_norm": 0.9620108166680914,
"learning_rate": 4.0871771196207223e-07,
"loss": 0.0865,
"step": 994
},
{
"epoch": 1.7425569176882663,
"grad_norm": 0.7915472204717325,
"learning_rate": 4.032883869327886e-07,
"loss": 0.0725,
"step": 995
},
{
"epoch": 1.7443082311733802,
"grad_norm": 1.11392133448476,
"learning_rate": 3.9789384870681904e-07,
"loss": 0.0976,
"step": 996
},
{
"epoch": 1.746059544658494,
"grad_norm": 0.8699184608686867,
"learning_rate": 3.925341381087239e-07,
"loss": 0.0631,
"step": 997
},
{
"epoch": 1.7478108581436076,
"grad_norm": 0.7718373090564703,
"learning_rate": 3.872092956995005e-07,
"loss": 0.0555,
"step": 998
},
{
"epoch": 1.7495621716287215,
"grad_norm": 1.2956166388511012,
"learning_rate": 3.81919361776269e-07,
"loss": 0.1143,
"step": 999
},
{
"epoch": 1.7513134851138354,
"grad_norm": 1.1331409176270781,
"learning_rate": 3.7666437637197127e-07,
"loss": 0.0937,
"step": 1000
},
{
"epoch": 1.7513134851138354,
"eval_loss": 0.20276139676570892,
"eval_runtime": 1.9019,
"eval_samples_per_second": 24.712,
"eval_steps_per_second": 6.31,
"step": 1000
},
{
"epoch": 1.7530647985989494,
"grad_norm": 0.7722864692389659,
"learning_rate": 3.714443792550687e-07,
"loss": 0.0783,
"step": 1001
},
{
"epoch": 1.754816112084063,
"grad_norm": 0.859006539210779,
"learning_rate": 3.6625940992923826e-07,
"loss": 0.0823,
"step": 1002
},
{
"epoch": 1.7565674255691768,
"grad_norm": 0.9250172746239839,
"learning_rate": 3.611095076330762e-07,
"loss": 0.1252,
"step": 1003
},
{
"epoch": 1.7583187390542907,
"grad_norm": 1.125672327130624,
"learning_rate": 3.559947113397988e-07,
"loss": 0.0956,
"step": 1004
},
{
"epoch": 1.7600700525394046,
"grad_norm": 1.0498908967100793,
"learning_rate": 3.509150597569483e-07,
"loss": 0.0776,
"step": 1005
},
{
"epoch": 1.7618213660245186,
"grad_norm": 0.9371798046914668,
"learning_rate": 3.458705913261029e-07,
"loss": 0.0605,
"step": 1006
},
{
"epoch": 1.7635726795096323,
"grad_norm": 0.8138873352682954,
"learning_rate": 3.4086134422257945e-07,
"loss": 0.0592,
"step": 1007
},
{
"epoch": 1.765323992994746,
"grad_norm": 0.9071537379118976,
"learning_rate": 3.3588735635515177e-07,
"loss": 0.1014,
"step": 1008
},
{
"epoch": 1.7670753064798599,
"grad_norm": 0.9437847210944791,
"learning_rate": 3.309486653657584e-07,
"loss": 0.1097,
"step": 1009
},
{
"epoch": 1.7688266199649738,
"grad_norm": 1.0469256700567728,
"learning_rate": 3.260453086292187e-07,
"loss": 0.0508,
"step": 1010
},
{
"epoch": 1.7705779334500875,
"grad_norm": 0.9028655810648165,
"learning_rate": 3.2117732325295416e-07,
"loss": 0.0708,
"step": 1011
},
{
"epoch": 1.7723292469352014,
"grad_norm": 1.0851027389495533,
"learning_rate": 3.163447460767005e-07,
"loss": 0.0761,
"step": 1012
},
{
"epoch": 1.7740805604203151,
"grad_norm": 0.9493103571999805,
"learning_rate": 3.115476136722362e-07,
"loss": 0.0996,
"step": 1013
},
{
"epoch": 1.775831873905429,
"grad_norm": 0.7586774662370825,
"learning_rate": 3.067859623431008e-07,
"loss": 0.0727,
"step": 1014
},
{
"epoch": 1.777583187390543,
"grad_norm": 0.9384004769032015,
"learning_rate": 3.0205982812431924e-07,
"loss": 0.0723,
"step": 1015
},
{
"epoch": 1.7793345008756567,
"grad_norm": 1.4973320222136197,
"learning_rate": 2.973692467821371e-07,
"loss": 0.125,
"step": 1016
},
{
"epoch": 1.7810858143607706,
"grad_norm": 0.8533671156495338,
"learning_rate": 2.927142538137384e-07,
"loss": 0.0596,
"step": 1017
},
{
"epoch": 1.7828371278458843,
"grad_norm": 0.7496043304937501,
"learning_rate": 2.880948844469872e-07,
"loss": 0.0788,
"step": 1018
},
{
"epoch": 1.7845884413309983,
"grad_norm": 1.3612555191011448,
"learning_rate": 2.8351117364015526e-07,
"loss": 0.0955,
"step": 1019
},
{
"epoch": 1.7863397548161122,
"grad_norm": 0.9630356160518716,
"learning_rate": 2.78963156081658e-07,
"loss": 0.0988,
"step": 1020
},
{
"epoch": 1.7880910683012259,
"grad_norm": 1.1121115936311063,
"learning_rate": 2.744508661897949e-07,
"loss": 0.1138,
"step": 1021
},
{
"epoch": 1.7898423817863398,
"grad_norm": 0.7339445298389599,
"learning_rate": 2.6997433811248475e-07,
"loss": 0.0927,
"step": 1022
},
{
"epoch": 1.7915936952714535,
"grad_norm": 0.8953178801892129,
"learning_rate": 2.6553360572701195e-07,
"loss": 0.1039,
"step": 1023
},
{
"epoch": 1.7933450087565674,
"grad_norm": 1.0398978963579788,
"learning_rate": 2.6112870263976686e-07,
"loss": 0.1398,
"step": 1024
},
{
"epoch": 1.7950963222416814,
"grad_norm": 1.182068666933184,
"learning_rate": 2.5675966218599136e-07,
"loss": 0.1103,
"step": 1025
},
{
"epoch": 1.796847635726795,
"grad_norm": 0.8892213526204024,
"learning_rate": 2.524265174295293e-07,
"loss": 0.0669,
"step": 1026
},
{
"epoch": 1.7985989492119088,
"grad_norm": 0.8059578982086425,
"learning_rate": 2.481293011625724e-07,
"loss": 0.0648,
"step": 1027
},
{
"epoch": 1.8003502626970227,
"grad_norm": 0.7102065798669428,
"learning_rate": 2.438680459054171e-07,
"loss": 0.0718,
"step": 1028
},
{
"epoch": 1.8021015761821366,
"grad_norm": 0.6852586354069934,
"learning_rate": 2.3964278390621374e-07,
"loss": 0.0826,
"step": 1029
},
{
"epoch": 1.8038528896672505,
"grad_norm": 0.6825735878520617,
"learning_rate": 2.3545354714072265e-07,
"loss": 0.0569,
"step": 1030
},
{
"epoch": 1.8056042031523643,
"grad_norm": 0.945132269971334,
"learning_rate": 2.3130036731207893e-07,
"loss": 0.1009,
"step": 1031
},
{
"epoch": 1.807355516637478,
"grad_norm": 0.7485096153539492,
"learning_rate": 2.2718327585054156e-07,
"loss": 0.0551,
"step": 1032
},
{
"epoch": 1.8091068301225919,
"grad_norm": 1.0031751267167337,
"learning_rate": 2.2310230391326682e-07,
"loss": 0.1056,
"step": 1033
},
{
"epoch": 1.8108581436077058,
"grad_norm": 0.7465291965071548,
"learning_rate": 2.190574823840641e-07,
"loss": 0.0799,
"step": 1034
},
{
"epoch": 1.8126094570928197,
"grad_norm": 1.0587014217350015,
"learning_rate": 2.15048841873165e-07,
"loss": 0.0851,
"step": 1035
},
{
"epoch": 1.8143607705779334,
"grad_norm": 1.1062177226269723,
"learning_rate": 2.110764127169923e-07,
"loss": 0.1016,
"step": 1036
},
{
"epoch": 1.8161120840630471,
"grad_norm": 0.8981661281109393,
"learning_rate": 2.0714022497793197e-07,
"loss": 0.0762,
"step": 1037
},
{
"epoch": 1.817863397548161,
"grad_norm": 0.9608421098019754,
"learning_rate": 2.0324030844410204e-07,
"loss": 0.1099,
"step": 1038
},
{
"epoch": 1.819614711033275,
"grad_norm": 0.9681573187969551,
"learning_rate": 1.993766926291285e-07,
"loss": 0.0735,
"step": 1039
},
{
"epoch": 1.821366024518389,
"grad_norm": 0.8273362165098569,
"learning_rate": 1.9554940677192213e-07,
"loss": 0.0981,
"step": 1040
},
{
"epoch": 1.8231173380035026,
"grad_norm": 0.8824152506254714,
"learning_rate": 1.9175847983645857e-07,
"loss": 0.1064,
"step": 1041
},
{
"epoch": 1.8248686514886163,
"grad_norm": 1.0042241694921472,
"learning_rate": 1.880039405115569e-07,
"loss": 0.0947,
"step": 1042
},
{
"epoch": 1.8266199649737302,
"grad_norm": 1.0580797717184558,
"learning_rate": 1.8428581721066486e-07,
"loss": 0.077,
"step": 1043
},
{
"epoch": 1.8283712784588442,
"grad_norm": 0.8676736423566411,
"learning_rate": 1.806041380716411e-07,
"loss": 0.0854,
"step": 1044
},
{
"epoch": 1.830122591943958,
"grad_norm": 1.0605831500963205,
"learning_rate": 1.769589309565445e-07,
"loss": 0.1308,
"step": 1045
},
{
"epoch": 1.8318739054290718,
"grad_norm": 1.14451694965405,
"learning_rate": 1.733502234514206e-07,
"loss": 0.0877,
"step": 1046
},
{
"epoch": 1.8336252189141855,
"grad_norm": 1.0185258661977419,
"learning_rate": 1.6977804286609777e-07,
"loss": 0.0884,
"step": 1047
},
{
"epoch": 1.8353765323992994,
"grad_norm": 0.9397169836007261,
"learning_rate": 1.6624241623397598e-07,
"loss": 0.1227,
"step": 1048
},
{
"epoch": 1.8371278458844134,
"grad_norm": 0.988120401495064,
"learning_rate": 1.6274337031182362e-07,
"loss": 0.0721,
"step": 1049
},
{
"epoch": 1.8388791593695273,
"grad_norm": 0.842379722481448,
"learning_rate": 1.5928093157957403e-07,
"loss": 0.0883,
"step": 1050
},
{
"epoch": 1.840630472854641,
"grad_norm": 0.7650106307160991,
"learning_rate": 1.5585512624012812e-07,
"loss": 0.0627,
"step": 1051
},
{
"epoch": 1.8423817863397547,
"grad_norm": 0.7520245228487481,
"learning_rate": 1.5246598021915304e-07,
"loss": 0.0583,
"step": 1052
},
{
"epoch": 1.8441330998248686,
"grad_norm": 0.7816067951505854,
"learning_rate": 1.4911351916488849e-07,
"loss": 0.0629,
"step": 1053
},
{
"epoch": 1.8458844133099825,
"grad_norm": 0.8206548278069278,
"learning_rate": 1.4579776844794834e-07,
"loss": 0.0629,
"step": 1054
},
{
"epoch": 1.8476357267950965,
"grad_norm": 0.8413065831699674,
"learning_rate": 1.4251875316113495e-07,
"loss": 0.0918,
"step": 1055
},
{
"epoch": 1.8493870402802102,
"grad_norm": 0.8646047693995625,
"learning_rate": 1.3927649811924182e-07,
"loss": 0.1067,
"step": 1056
},
{
"epoch": 1.8511383537653239,
"grad_norm": 1.1648039728609827,
"learning_rate": 1.3607102785887393e-07,
"loss": 0.1264,
"step": 1057
},
{
"epoch": 1.8528896672504378,
"grad_norm": 0.7719934600369042,
"learning_rate": 1.3290236663825562e-07,
"loss": 0.083,
"step": 1058
},
{
"epoch": 1.8546409807355517,
"grad_norm": 0.8950213917800804,
"learning_rate": 1.2977053843704957e-07,
"loss": 0.0847,
"step": 1059
},
{
"epoch": 1.8563922942206657,
"grad_norm": 1.0026565240342258,
"learning_rate": 1.2667556695617534e-07,
"loss": 0.1044,
"step": 1060
},
{
"epoch": 1.8581436077057794,
"grad_norm": 0.744107605196941,
"learning_rate": 1.236174756176295e-07,
"loss": 0.0721,
"step": 1061
},
{
"epoch": 1.859894921190893,
"grad_norm": 0.9509055299042167,
"learning_rate": 1.2059628756430797e-07,
"loss": 0.0818,
"step": 1062
},
{
"epoch": 1.861646234676007,
"grad_norm": 0.9112160743275268,
"learning_rate": 1.1761202565983399e-07,
"loss": 0.0645,
"step": 1063
},
{
"epoch": 1.863397548161121,
"grad_norm": 1.0272055127705473,
"learning_rate": 1.1466471248837985e-07,
"loss": 0.0748,
"step": 1064
},
{
"epoch": 1.8651488616462348,
"grad_norm": 0.8706834973807851,
"learning_rate": 1.1175437035450043e-07,
"loss": 0.0809,
"step": 1065
},
{
"epoch": 1.8669001751313485,
"grad_norm": 0.9563935615496911,
"learning_rate": 1.0888102128296052e-07,
"loss": 0.0581,
"step": 1066
},
{
"epoch": 1.8686514886164622,
"grad_norm": 0.5939777496692319,
"learning_rate": 1.0604468701857384e-07,
"loss": 0.0715,
"step": 1067
},
{
"epoch": 1.8704028021015762,
"grad_norm": 0.8243817906122596,
"learning_rate": 1.0324538902603154e-07,
"loss": 0.0795,
"step": 1068
},
{
"epoch": 1.87215411558669,
"grad_norm": 0.9622902905039842,
"learning_rate": 1.0048314848974616e-07,
"loss": 0.1115,
"step": 1069
},
{
"epoch": 1.873905429071804,
"grad_norm": 0.8979703410249232,
"learning_rate": 9.775798631368627e-08,
"loss": 0.0807,
"step": 1070
},
{
"epoch": 1.8756567425569177,
"grad_norm": 1.0203580216733263,
"learning_rate": 9.506992312122044e-08,
"loss": 0.1578,
"step": 1071
},
{
"epoch": 1.8774080560420314,
"grad_norm": 0.7923499980875016,
"learning_rate": 9.24189792549629e-08,
"loss": 0.0717,
"step": 1072
},
{
"epoch": 1.8791593695271454,
"grad_norm": 0.9641425590050212,
"learning_rate": 8.980517477661543e-08,
"loss": 0.0915,
"step": 1073
},
{
"epoch": 1.8809106830122593,
"grad_norm": 1.3669709921921882,
"learning_rate": 8.722852946682014e-08,
"loss": 0.1012,
"step": 1074
},
{
"epoch": 1.882661996497373,
"grad_norm": 0.9665788733716942,
"learning_rate": 8.468906282500577e-08,
"loss": 0.0614,
"step": 1075
},
{
"epoch": 1.884413309982487,
"grad_norm": 1.400962552491105,
"learning_rate": 8.218679406924279e-08,
"loss": 0.1118,
"step": 1076
},
{
"epoch": 1.8861646234676006,
"grad_norm": 0.9219968558833442,
"learning_rate": 7.972174213609684e-08,
"loss": 0.0779,
"step": 1077
},
{
"epoch": 1.8879159369527145,
"grad_norm": 0.8045432273483273,
"learning_rate": 7.7293925680485e-08,
"loss": 0.0767,
"step": 1078
},
{
"epoch": 1.8896672504378285,
"grad_norm": 0.9755957257233182,
"learning_rate": 7.490336307553691e-08,
"loss": 0.0859,
"step": 1079
},
{
"epoch": 1.8914185639229422,
"grad_norm": 1.0420255755068712,
"learning_rate": 7.255007241245227e-08,
"loss": 0.0811,
"step": 1080
},
{
"epoch": 1.893169877408056,
"grad_norm": 0.9551488849265928,
"learning_rate": 7.023407150036632e-08,
"loss": 0.1306,
"step": 1081
},
{
"epoch": 1.8949211908931698,
"grad_norm": 0.77813629526978,
"learning_rate": 6.795537786621564e-08,
"loss": 0.0741,
"step": 1082
},
{
"epoch": 1.8966725043782837,
"grad_norm": 0.8084153621002628,
"learning_rate": 6.571400875460154e-08,
"loss": 0.0814,
"step": 1083
},
{
"epoch": 1.8984238178633976,
"grad_norm": 0.8812494181028698,
"learning_rate": 6.350998112766626e-08,
"loss": 0.0897,
"step": 1084
},
{
"epoch": 1.9001751313485113,
"grad_norm": 0.7735066758206781,
"learning_rate": 6.1343311664957e-08,
"loss": 0.0895,
"step": 1085
},
{
"epoch": 1.9019264448336253,
"grad_norm": 0.8161894034164808,
"learning_rate": 5.92140167633054e-08,
"loss": 0.0899,
"step": 1086
},
{
"epoch": 1.903677758318739,
"grad_norm": 0.9086034533332163,
"learning_rate": 5.712211253670108e-08,
"loss": 0.0896,
"step": 1087
},
{
"epoch": 1.905429071803853,
"grad_norm": 0.75646864097532,
"learning_rate": 5.5067614816169955e-08,
"loss": 0.0766,
"step": 1088
},
{
"epoch": 1.9071803852889668,
"grad_norm": 0.731979768510242,
"learning_rate": 5.3050539149654964e-08,
"loss": 0.0686,
"step": 1089
},
{
"epoch": 1.9089316987740805,
"grad_norm": 0.8139155119262608,
"learning_rate": 5.107090080189725e-08,
"loss": 0.0801,
"step": 1090
},
{
"epoch": 1.9106830122591942,
"grad_norm": 0.9723091179497511,
"learning_rate": 4.9128714754321794e-08,
"loss": 0.0919,
"step": 1091
},
{
"epoch": 1.9124343257443082,
"grad_norm": 0.8354742100084726,
"learning_rate": 4.722399570492309e-08,
"loss": 0.0675,
"step": 1092
},
{
"epoch": 1.914185639229422,
"grad_norm": 0.8372591413758104,
"learning_rate": 4.535675806815576e-08,
"loss": 0.0645,
"step": 1093
},
{
"epoch": 1.915936952714536,
"grad_norm": 1.046373747378417,
"learning_rate": 4.352701597482245e-08,
"loss": 0.0907,
"step": 1094
},
{
"epoch": 1.9176882661996497,
"grad_norm": 0.8526193691562309,
"learning_rate": 4.173478327197e-08,
"loss": 0.084,
"step": 1095
},
{
"epoch": 1.9194395796847634,
"grad_norm": 0.8722546887931946,
"learning_rate": 3.998007352278233e-08,
"loss": 0.1041,
"step": 1096
},
{
"epoch": 1.9211908931698773,
"grad_norm": 0.9484274530627599,
"learning_rate": 3.826290000647881e-08,
"loss": 0.0926,
"step": 1097
},
{
"epoch": 1.9229422066549913,
"grad_norm": 1.064819089816097,
"learning_rate": 3.6583275718214406e-08,
"loss": 0.1145,
"step": 1098
},
{
"epoch": 1.9246935201401052,
"grad_norm": 0.6922227153177929,
"learning_rate": 3.4941213368980264e-08,
"loss": 0.0826,
"step": 1099
},
{
"epoch": 1.926444833625219,
"grad_norm": 0.9547055760475679,
"learning_rate": 3.333672538550714e-08,
"loss": 0.088,
"step": 1100
},
{
"epoch": 1.9281961471103326,
"grad_norm": 1.1501182412152315,
"learning_rate": 3.176982391017214e-08,
"loss": 0.1087,
"step": 1101
},
{
"epoch": 1.9299474605954465,
"grad_norm": 0.9179486185891369,
"learning_rate": 3.024052080090822e-08,
"loss": 0.0724,
"step": 1102
},
{
"epoch": 1.9316987740805605,
"grad_norm": 0.664000841194001,
"learning_rate": 2.874882763111153e-08,
"loss": 0.0474,
"step": 1103
},
{
"epoch": 1.9334500875656744,
"grad_norm": 0.7849197413612594,
"learning_rate": 2.7294755689555307e-08,
"loss": 0.0624,
"step": 1104
},
{
"epoch": 1.935201401050788,
"grad_norm": 0.768814756583036,
"learning_rate": 2.5878315980305548e-08,
"loss": 0.0741,
"step": 1105
},
{
"epoch": 1.9369527145359018,
"grad_norm": 1.003735722548012,
"learning_rate": 2.4499519222635493e-08,
"loss": 0.064,
"step": 1106
},
{
"epoch": 1.9387040280210157,
"grad_norm": 1.1491273999902563,
"learning_rate": 2.3158375850946268e-08,
"loss": 0.1346,
"step": 1107
},
{
"epoch": 1.9404553415061296,
"grad_norm": 0.7589932497855878,
"learning_rate": 2.1854896014686376e-08,
"loss": 0.0762,
"step": 1108
},
{
"epoch": 1.9422066549912436,
"grad_norm": 0.8331017479251108,
"learning_rate": 2.0589089578276767e-08,
"loss": 0.0484,
"step": 1109
},
{
"epoch": 1.9439579684763573,
"grad_norm": 0.9832005255060994,
"learning_rate": 1.936096612103533e-08,
"loss": 0.1037,
"step": 1110
},
{
"epoch": 1.945709281961471,
"grad_norm": 0.8225952913071677,
"learning_rate": 1.817053493710308e-08,
"loss": 0.0494,
"step": 1111
},
{
"epoch": 1.947460595446585,
"grad_norm": 0.7543806570940519,
"learning_rate": 1.7017805035375866e-08,
"loss": 0.0601,
"step": 1112
},
{
"epoch": 1.9492119089316988,
"grad_norm": 0.8293133630950005,
"learning_rate": 1.590278513943555e-08,
"loss": 0.0803,
"step": 1113
},
{
"epoch": 1.9509632224168127,
"grad_norm": 0.8768099650131057,
"learning_rate": 1.4825483687483377e-08,
"loss": 0.0659,
"step": 1114
},
{
"epoch": 1.9527145359019265,
"grad_norm": 0.9469149296349607,
"learning_rate": 1.3785908832275596e-08,
"loss": 0.0824,
"step": 1115
},
{
"epoch": 1.9544658493870402,
"grad_norm": 0.8351027145369079,
"learning_rate": 1.2784068441064611e-08,
"loss": 0.0678,
"step": 1116
},
{
"epoch": 1.956217162872154,
"grad_norm": 1.020423409119886,
"learning_rate": 1.1819970095536814e-08,
"loss": 0.1034,
"step": 1117
},
{
"epoch": 1.957968476357268,
"grad_norm": 0.787927571900196,
"learning_rate": 1.0893621091754847e-08,
"loss": 0.068,
"step": 1118
},
{
"epoch": 1.959719789842382,
"grad_norm": 0.6892271318399958,
"learning_rate": 1.0005028440104313e-08,
"loss": 0.0547,
"step": 1119
},
{
"epoch": 1.9614711033274956,
"grad_norm": 0.9205823435107259,
"learning_rate": 9.154198865239938e-09,
"loss": 0.1156,
"step": 1120
},
{
"epoch": 1.9632224168126093,
"grad_norm": 1.742536478730652,
"learning_rate": 8.341138806035043e-09,
"loss": 0.1909,
"step": 1121
},
{
"epoch": 1.9649737302977233,
"grad_norm": 1.0405677994948164,
"learning_rate": 7.565854415531037e-09,
"loss": 0.1204,
"step": 1122
},
{
"epoch": 1.9667250437828372,
"grad_norm": 0.9209713667024632,
"learning_rate": 6.8283515608924545e-09,
"loss": 0.0842,
"step": 1123
},
{
"epoch": 1.9684763572679511,
"grad_norm": 0.847022736110267,
"learning_rate": 6.128635823364204e-09,
"loss": 0.0647,
"step": 1124
},
{
"epoch": 1.9702276707530648,
"grad_norm": 0.8569688135314069,
"learning_rate": 5.466712498225501e-09,
"loss": 0.0984,
"step": 1125
},
{
"epoch": 1.9719789842381785,
"grad_norm": 0.7807927515616613,
"learning_rate": 4.8425865947515635e-09,
"loss": 0.0919,
"step": 1126
},
{
"epoch": 1.9737302977232924,
"grad_norm": 0.803314368964672,
"learning_rate": 4.256262836176972e-09,
"loss": 0.0748,
"step": 1127
},
{
"epoch": 1.9754816112084064,
"grad_norm": 0.8192070789421884,
"learning_rate": 3.7077456596584793e-09,
"loss": 0.0622,
"step": 1128
},
{
"epoch": 1.9772329246935203,
"grad_norm": 0.7402355614601918,
"learning_rate": 3.197039216241149e-09,
"loss": 0.0611,
"step": 1129
},
{
"epoch": 1.978984238178634,
"grad_norm": 0.8005803168162587,
"learning_rate": 2.7241473708283784e-09,
"loss": 0.082,
"step": 1130
},
{
"epoch": 1.9807355516637477,
"grad_norm": 0.9029956857496308,
"learning_rate": 2.2890737021513675e-09,
"loss": 0.099,
"step": 1131
},
{
"epoch": 1.9824868651488616,
"grad_norm": 0.786670113306005,
"learning_rate": 1.8918215027424746e-09,
"loss": 0.1038,
"step": 1132
},
{
"epoch": 1.9842381786339756,
"grad_norm": 0.8219187516038525,
"learning_rate": 1.532393778910235e-09,
"loss": 0.0785,
"step": 1133
},
{
"epoch": 1.9859894921190895,
"grad_norm": 0.7919831938955016,
"learning_rate": 1.2107932507177123e-09,
"loss": 0.0784,
"step": 1134
},
{
"epoch": 1.9877408056042032,
"grad_norm": 0.9217680061030283,
"learning_rate": 9.270223519586285e-10,
"loss": 0.086,
"step": 1135
},
{
"epoch": 1.989492119089317,
"grad_norm": 0.7642493509648998,
"learning_rate": 6.810832301440417e-10,
"loss": 0.0862,
"step": 1136
},
{
"epoch": 1.9912434325744308,
"grad_norm": 0.8154622682049354,
"learning_rate": 4.729777464806961e-10,
"loss": 0.0821,
"step": 1137
},
{
"epoch": 1.9929947460595447,
"grad_norm": 0.8127834975137689,
"learning_rate": 3.0270747586103045e-10,
"loss": 0.1092,
"step": 1138
},
{
"epoch": 1.9947460595446584,
"grad_norm": 1.0067550313335858,
"learning_rate": 1.702737068492999e-10,
"loss": 0.0985,
"step": 1139
},
{
"epoch": 1.9964973730297724,
"grad_norm": 0.9383181919963424,
"learning_rate": 7.567744167269464e-11,
"loss": 0.0958,
"step": 1140
},
{
"epoch": 1.998248686514886,
"grad_norm": 0.8519146037451801,
"learning_rate": 1.8919396212457865e-11,
"loss": 0.0717,
"step": 1141
},
{
"epoch": 2.0,
"grad_norm": 0.6152399598695977,
"learning_rate": 0.0,
"loss": 0.0677,
"step": 1142
},
{
"epoch": 2.0,
"step": 1142,
"total_flos": 3773592600576.0,
"train_loss": 0.13601795624086924,
"train_runtime": 863.4724,
"train_samples_per_second": 10.581,
"train_steps_per_second": 1.323
}
],
"logging_steps": 1,
"max_steps": 1142,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3773592600576.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}