redsgnaoh's picture
Upload folder using huggingface_hub
6d383f3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.262511373976342,
"eval_steps": 500,
"global_step": 2775,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045495905368516835,
"grad_norm": 10.08474414591373,
"learning_rate": 5e-06,
"loss": 0.4268,
"step": 1
},
{
"epoch": 0.0009099181073703367,
"grad_norm": 7.187634396883529,
"learning_rate": 4.999999897855645e-06,
"loss": 0.4238,
"step": 2
},
{
"epoch": 0.001364877161055505,
"grad_norm": 3.8327630883917294,
"learning_rate": 4.9999995914225884e-06,
"loss": 0.2838,
"step": 3
},
{
"epoch": 0.0018198362147406734,
"grad_norm": 4.248807424602059,
"learning_rate": 4.999999080700855e-06,
"loss": 0.236,
"step": 4
},
{
"epoch": 0.0022747952684258415,
"grad_norm": 4.089663323785212,
"learning_rate": 4.999998365690486e-06,
"loss": 0.2601,
"step": 5
},
{
"epoch": 0.00272975432211101,
"grad_norm": 3.9876649053708864,
"learning_rate": 4.999997446391542e-06,
"loss": 0.2326,
"step": 6
},
{
"epoch": 0.0031847133757961785,
"grad_norm": 2.9111466473566785,
"learning_rate": 4.999996322804095e-06,
"loss": 0.2269,
"step": 7
},
{
"epoch": 0.003639672429481347,
"grad_norm": 2.5524867538991827,
"learning_rate": 4.999994994928239e-06,
"loss": 0.2052,
"step": 8
},
{
"epoch": 0.004094631483166515,
"grad_norm": 2.5545174637937094,
"learning_rate": 4.999993462764082e-06,
"loss": 0.2696,
"step": 9
},
{
"epoch": 0.004549590536851683,
"grad_norm": 2.4518559406151006,
"learning_rate": 4.999991726311749e-06,
"loss": 0.1618,
"step": 10
},
{
"epoch": 0.005004549590536852,
"grad_norm": 3.405646599606387,
"learning_rate": 4.999989785571382e-06,
"loss": 0.2355,
"step": 11
},
{
"epoch": 0.00545950864422202,
"grad_norm": 3.4944361203618186,
"learning_rate": 4.999987640543139e-06,
"loss": 0.2585,
"step": 12
},
{
"epoch": 0.005914467697907188,
"grad_norm": 2.5057214280143674,
"learning_rate": 4.999985291227196e-06,
"loss": 0.2235,
"step": 13
},
{
"epoch": 0.006369426751592357,
"grad_norm": 3.617819326198201,
"learning_rate": 4.999982737623746e-06,
"loss": 0.3207,
"step": 14
},
{
"epoch": 0.006824385805277525,
"grad_norm": 2.953536091708363,
"learning_rate": 4.999979979732995e-06,
"loss": 0.2543,
"step": 15
},
{
"epoch": 0.007279344858962694,
"grad_norm": 2.6415876340824465,
"learning_rate": 4.999977017555171e-06,
"loss": 0.174,
"step": 16
},
{
"epoch": 0.0077343039126478615,
"grad_norm": 2.62032982183088,
"learning_rate": 4.999973851090514e-06,
"loss": 0.2526,
"step": 17
},
{
"epoch": 0.00818926296633303,
"grad_norm": 2.208495589846344,
"learning_rate": 4.999970480339284e-06,
"loss": 0.2381,
"step": 18
},
{
"epoch": 0.008644222020018199,
"grad_norm": 4.827328107147866,
"learning_rate": 4.9999669053017564e-06,
"loss": 0.2259,
"step": 19
},
{
"epoch": 0.009099181073703366,
"grad_norm": 3.293711347019613,
"learning_rate": 4.9999631259782235e-06,
"loss": 0.1889,
"step": 20
},
{
"epoch": 0.009554140127388535,
"grad_norm": 2.9449987435140708,
"learning_rate": 4.999959142368993e-06,
"loss": 0.1916,
"step": 21
},
{
"epoch": 0.010009099181073703,
"grad_norm": 2.4684804441032533,
"learning_rate": 4.999954954474391e-06,
"loss": 0.2267,
"step": 22
},
{
"epoch": 0.010464058234758872,
"grad_norm": 2.420072565048825,
"learning_rate": 4.9999505622947594e-06,
"loss": 0.1781,
"step": 23
},
{
"epoch": 0.01091901728844404,
"grad_norm": 3.190045330917334,
"learning_rate": 4.999945965830458e-06,
"loss": 0.204,
"step": 24
},
{
"epoch": 0.011373976342129208,
"grad_norm": 3.144753224980832,
"learning_rate": 4.999941165081863e-06,
"loss": 0.1837,
"step": 25
},
{
"epoch": 0.011828935395814377,
"grad_norm": 2.2772166419161026,
"learning_rate": 4.999936160049364e-06,
"loss": 0.203,
"step": 26
},
{
"epoch": 0.012283894449499545,
"grad_norm": 2.842182064416549,
"learning_rate": 4.999930950733373e-06,
"loss": 0.2594,
"step": 27
},
{
"epoch": 0.012738853503184714,
"grad_norm": 2.689259909233601,
"learning_rate": 4.999925537134312e-06,
"loss": 0.1829,
"step": 28
},
{
"epoch": 0.013193812556869881,
"grad_norm": 2.6543387078431233,
"learning_rate": 4.9999199192526286e-06,
"loss": 0.209,
"step": 29
},
{
"epoch": 0.01364877161055505,
"grad_norm": 2.660710953873218,
"learning_rate": 4.9999140970887775e-06,
"loss": 0.2084,
"step": 30
},
{
"epoch": 0.014103730664240218,
"grad_norm": 3.1124474906382065,
"learning_rate": 4.999908070643236e-06,
"loss": 0.2088,
"step": 31
},
{
"epoch": 0.014558689717925387,
"grad_norm": 2.750714892828661,
"learning_rate": 4.999901839916495e-06,
"loss": 0.1738,
"step": 32
},
{
"epoch": 0.015013648771610554,
"grad_norm": 2.6053321715737314,
"learning_rate": 4.999895404909067e-06,
"loss": 0.1723,
"step": 33
},
{
"epoch": 0.015468607825295723,
"grad_norm": 2.8576481166567587,
"learning_rate": 4.999888765621476e-06,
"loss": 0.1729,
"step": 34
},
{
"epoch": 0.01592356687898089,
"grad_norm": 2.773654545068012,
"learning_rate": 4.999881922054264e-06,
"loss": 0.1453,
"step": 35
},
{
"epoch": 0.01637852593266606,
"grad_norm": 2.037109443657936,
"learning_rate": 4.999874874207991e-06,
"loss": 0.1197,
"step": 36
},
{
"epoch": 0.01683348498635123,
"grad_norm": 2.6994551736744268,
"learning_rate": 4.999867622083232e-06,
"loss": 0.2238,
"step": 37
},
{
"epoch": 0.017288444040036398,
"grad_norm": 2.634969731102202,
"learning_rate": 4.99986016568058e-06,
"loss": 0.2118,
"step": 38
},
{
"epoch": 0.017743403093721567,
"grad_norm": 2.955393409573457,
"learning_rate": 4.999852505000646e-06,
"loss": 0.2215,
"step": 39
},
{
"epoch": 0.018198362147406732,
"grad_norm": 2.0111122791563285,
"learning_rate": 4.999844640044053e-06,
"loss": 0.1216,
"step": 40
},
{
"epoch": 0.0186533212010919,
"grad_norm": 2.7660608350268077,
"learning_rate": 4.999836570811445e-06,
"loss": 0.1948,
"step": 41
},
{
"epoch": 0.01910828025477707,
"grad_norm": 2.581238704515564,
"learning_rate": 4.999828297303483e-06,
"loss": 0.2053,
"step": 42
},
{
"epoch": 0.019563239308462238,
"grad_norm": 2.921825171868496,
"learning_rate": 4.9998198195208405e-06,
"loss": 0.2124,
"step": 43
},
{
"epoch": 0.020018198362147407,
"grad_norm": 2.5257433259743145,
"learning_rate": 4.999811137464212e-06,
"loss": 0.1754,
"step": 44
},
{
"epoch": 0.020473157415832575,
"grad_norm": 2.4051206013490947,
"learning_rate": 4.999802251134307e-06,
"loss": 0.2384,
"step": 45
},
{
"epoch": 0.020928116469517744,
"grad_norm": 2.824019582183984,
"learning_rate": 4.99979316053185e-06,
"loss": 0.1845,
"step": 46
},
{
"epoch": 0.021383075523202913,
"grad_norm": 2.4758052686748395,
"learning_rate": 4.999783865657585e-06,
"loss": 0.2639,
"step": 47
},
{
"epoch": 0.02183803457688808,
"grad_norm": 3.3028306393170053,
"learning_rate": 4.999774366512272e-06,
"loss": 0.221,
"step": 48
},
{
"epoch": 0.022292993630573247,
"grad_norm": 3.108709580219038,
"learning_rate": 4.9997646630966865e-06,
"loss": 0.2205,
"step": 49
},
{
"epoch": 0.022747952684258416,
"grad_norm": 2.076369424843288,
"learning_rate": 4.999754755411621e-06,
"loss": 0.1336,
"step": 50
},
{
"epoch": 0.023202911737943584,
"grad_norm": 2.7444959299225715,
"learning_rate": 4.9997446434578865e-06,
"loss": 0.1836,
"step": 51
},
{
"epoch": 0.023657870791628753,
"grad_norm": 3.2836031890921418,
"learning_rate": 4.999734327236307e-06,
"loss": 0.1877,
"step": 52
},
{
"epoch": 0.024112829845313922,
"grad_norm": 1.951056721435438,
"learning_rate": 4.999723806747728e-06,
"loss": 0.1151,
"step": 53
},
{
"epoch": 0.02456778889899909,
"grad_norm": 2.6138639966442203,
"learning_rate": 4.99971308199301e-06,
"loss": 0.1363,
"step": 54
},
{
"epoch": 0.02502274795268426,
"grad_norm": 2.444124379430723,
"learning_rate": 4.999702152973025e-06,
"loss": 0.1482,
"step": 55
},
{
"epoch": 0.025477707006369428,
"grad_norm": 2.4597235759126987,
"learning_rate": 4.9996910196886694e-06,
"loss": 0.133,
"step": 56
},
{
"epoch": 0.025932666060054597,
"grad_norm": 2.6784146485916343,
"learning_rate": 4.999679682140852e-06,
"loss": 0.1174,
"step": 57
},
{
"epoch": 0.026387625113739762,
"grad_norm": 2.7424790633709564,
"learning_rate": 4.999668140330499e-06,
"loss": 0.252,
"step": 58
},
{
"epoch": 0.02684258416742493,
"grad_norm": 3.348265074283292,
"learning_rate": 4.999656394258555e-06,
"loss": 0.1925,
"step": 59
},
{
"epoch": 0.0272975432211101,
"grad_norm": 2.1154638113016193,
"learning_rate": 4.999644443925978e-06,
"loss": 0.1836,
"step": 60
},
{
"epoch": 0.027752502274795268,
"grad_norm": 2.4179191653959484,
"learning_rate": 4.999632289333746e-06,
"loss": 0.153,
"step": 61
},
{
"epoch": 0.028207461328480437,
"grad_norm": 3.9087207564649495,
"learning_rate": 4.999619930482852e-06,
"loss": 0.17,
"step": 62
},
{
"epoch": 0.028662420382165606,
"grad_norm": 3.9984836138839994,
"learning_rate": 4.999607367374304e-06,
"loss": 0.2311,
"step": 63
},
{
"epoch": 0.029117379435850774,
"grad_norm": 3.296600637312694,
"learning_rate": 4.999594600009131e-06,
"loss": 0.1665,
"step": 64
},
{
"epoch": 0.029572338489535943,
"grad_norm": 3.086306216989983,
"learning_rate": 4.999581628388375e-06,
"loss": 0.212,
"step": 65
},
{
"epoch": 0.03002729754322111,
"grad_norm": 2.48917207768275,
"learning_rate": 4.999568452513097e-06,
"loss": 0.236,
"step": 66
},
{
"epoch": 0.030482256596906277,
"grad_norm": 2.42340749830043,
"learning_rate": 4.9995550723843726e-06,
"loss": 0.1917,
"step": 67
},
{
"epoch": 0.030937215650591446,
"grad_norm": 3.0972614391682396,
"learning_rate": 4.999541488003295e-06,
"loss": 0.1765,
"step": 68
},
{
"epoch": 0.03139217470427662,
"grad_norm": 2.3696589048498193,
"learning_rate": 4.999527699370975e-06,
"loss": 0.1814,
"step": 69
},
{
"epoch": 0.03184713375796178,
"grad_norm": 2.875746597678631,
"learning_rate": 4.99951370648854e-06,
"loss": 0.1878,
"step": 70
},
{
"epoch": 0.03230209281164695,
"grad_norm": 2.4253311315699606,
"learning_rate": 4.999499509357132e-06,
"loss": 0.15,
"step": 71
},
{
"epoch": 0.03275705186533212,
"grad_norm": 2.766432808739805,
"learning_rate": 4.999485107977912e-06,
"loss": 0.1889,
"step": 72
},
{
"epoch": 0.033212010919017286,
"grad_norm": 2.625328870617005,
"learning_rate": 4.999470502352057e-06,
"loss": 0.1719,
"step": 73
},
{
"epoch": 0.03366696997270246,
"grad_norm": 2.982643055808138,
"learning_rate": 4.999455692480759e-06,
"loss": 0.2113,
"step": 74
},
{
"epoch": 0.034121929026387623,
"grad_norm": 2.242621960634031,
"learning_rate": 4.999440678365229e-06,
"loss": 0.1721,
"step": 75
},
{
"epoch": 0.034576888080072796,
"grad_norm": 2.4926186894362976,
"learning_rate": 4.999425460006695e-06,
"loss": 0.173,
"step": 76
},
{
"epoch": 0.03503184713375796,
"grad_norm": 2.3671699591796305,
"learning_rate": 4.9994100374063995e-06,
"loss": 0.1687,
"step": 77
},
{
"epoch": 0.03548680618744313,
"grad_norm": 3.4429608280507216,
"learning_rate": 4.9993944105656035e-06,
"loss": 0.2649,
"step": 78
},
{
"epoch": 0.0359417652411283,
"grad_norm": 2.0807531109765987,
"learning_rate": 4.999378579485582e-06,
"loss": 0.1476,
"step": 79
},
{
"epoch": 0.036396724294813464,
"grad_norm": 2.5883097677868334,
"learning_rate": 4.999362544167632e-06,
"loss": 0.162,
"step": 80
},
{
"epoch": 0.036851683348498636,
"grad_norm": 1.9494729618347428,
"learning_rate": 4.99934630461306e-06,
"loss": 0.1869,
"step": 81
},
{
"epoch": 0.0373066424021838,
"grad_norm": 3.2846426885249205,
"learning_rate": 4.999329860823197e-06,
"loss": 0.203,
"step": 82
},
{
"epoch": 0.03776160145586897,
"grad_norm": 2.6587615060855616,
"learning_rate": 4.999313212799383e-06,
"loss": 0.1773,
"step": 83
},
{
"epoch": 0.03821656050955414,
"grad_norm": 3.210244688238914,
"learning_rate": 4.99929636054298e-06,
"loss": 0.2184,
"step": 84
},
{
"epoch": 0.03867151956323931,
"grad_norm": 2.2958732125888224,
"learning_rate": 4.999279304055366e-06,
"loss": 0.2084,
"step": 85
},
{
"epoch": 0.039126478616924476,
"grad_norm": 2.3139948703024857,
"learning_rate": 4.999262043337933e-06,
"loss": 0.1973,
"step": 86
},
{
"epoch": 0.03958143767060965,
"grad_norm": 2.6677501256903002,
"learning_rate": 4.999244578392094e-06,
"loss": 0.1808,
"step": 87
},
{
"epoch": 0.040036396724294813,
"grad_norm": 2.1844571391295524,
"learning_rate": 4.9992269092192736e-06,
"loss": 0.1761,
"step": 88
},
{
"epoch": 0.04049135577797998,
"grad_norm": 2.4616623603088947,
"learning_rate": 4.9992090358209166e-06,
"loss": 0.1731,
"step": 89
},
{
"epoch": 0.04094631483166515,
"grad_norm": 2.337094817685032,
"learning_rate": 4.9991909581984835e-06,
"loss": 0.1714,
"step": 90
},
{
"epoch": 0.041401273885350316,
"grad_norm": 2.769205118473802,
"learning_rate": 4.999172676353451e-06,
"loss": 0.1286,
"step": 91
},
{
"epoch": 0.04185623293903549,
"grad_norm": 1.993822184781022,
"learning_rate": 4.999154190287314e-06,
"loss": 0.1722,
"step": 92
},
{
"epoch": 0.042311191992720654,
"grad_norm": 2.4020441009943716,
"learning_rate": 4.999135500001583e-06,
"loss": 0.2235,
"step": 93
},
{
"epoch": 0.042766151046405826,
"grad_norm": 2.0794454896454013,
"learning_rate": 4.9991166054977844e-06,
"loss": 0.1424,
"step": 94
},
{
"epoch": 0.04322111010009099,
"grad_norm": 2.5362620116303636,
"learning_rate": 4.999097506777463e-06,
"loss": 0.1878,
"step": 95
},
{
"epoch": 0.04367606915377616,
"grad_norm": 2.3575608544869393,
"learning_rate": 4.999078203842179e-06,
"loss": 0.2241,
"step": 96
},
{
"epoch": 0.04413102820746133,
"grad_norm": 2.0445052328297217,
"learning_rate": 4.999058696693511e-06,
"loss": 0.1196,
"step": 97
},
{
"epoch": 0.044585987261146494,
"grad_norm": 2.7989157148193615,
"learning_rate": 4.99903898533305e-06,
"loss": 0.186,
"step": 98
},
{
"epoch": 0.045040946314831666,
"grad_norm": 2.6048410678209177,
"learning_rate": 4.99901906976241e-06,
"loss": 0.1675,
"step": 99
},
{
"epoch": 0.04549590536851683,
"grad_norm": 2.232255651321915,
"learning_rate": 4.998998949983217e-06,
"loss": 0.1379,
"step": 100
},
{
"epoch": 0.045950864422202004,
"grad_norm": 2.8190134265237203,
"learning_rate": 4.998978625997115e-06,
"loss": 0.2079,
"step": 101
},
{
"epoch": 0.04640582347588717,
"grad_norm": 2.3706098438086003,
"learning_rate": 4.998958097805765e-06,
"loss": 0.141,
"step": 102
},
{
"epoch": 0.04686078252957234,
"grad_norm": 2.44520778150716,
"learning_rate": 4.9989373654108445e-06,
"loss": 0.164,
"step": 103
},
{
"epoch": 0.047315741583257506,
"grad_norm": 3.5342837078815115,
"learning_rate": 4.9989164288140465e-06,
"loss": 0.1548,
"step": 104
},
{
"epoch": 0.04777070063694268,
"grad_norm": 2.0458160494053836,
"learning_rate": 4.998895288017085e-06,
"loss": 0.179,
"step": 105
},
{
"epoch": 0.048225659690627844,
"grad_norm": 2.205598400099282,
"learning_rate": 4.998873943021684e-06,
"loss": 0.1614,
"step": 106
},
{
"epoch": 0.04868061874431301,
"grad_norm": 2.511554629528065,
"learning_rate": 4.998852393829589e-06,
"loss": 0.1659,
"step": 107
},
{
"epoch": 0.04913557779799818,
"grad_norm": 3.219796004043862,
"learning_rate": 4.9988306404425625e-06,
"loss": 0.2276,
"step": 108
},
{
"epoch": 0.049590536851683346,
"grad_norm": 1.752131198173806,
"learning_rate": 4.99880868286238e-06,
"loss": 0.1742,
"step": 109
},
{
"epoch": 0.05004549590536852,
"grad_norm": 3.361908404370123,
"learning_rate": 4.998786521090836e-06,
"loss": 0.1724,
"step": 110
},
{
"epoch": 0.050500454959053684,
"grad_norm": 2.360660279895669,
"learning_rate": 4.9987641551297426e-06,
"loss": 0.1999,
"step": 111
},
{
"epoch": 0.050955414012738856,
"grad_norm": 2.307324595436377,
"learning_rate": 4.998741584980926e-06,
"loss": 0.2101,
"step": 112
},
{
"epoch": 0.05141037306642402,
"grad_norm": 2.6034298836542247,
"learning_rate": 4.9987188106462314e-06,
"loss": 0.167,
"step": 113
},
{
"epoch": 0.051865332120109194,
"grad_norm": 1.5842459657245014,
"learning_rate": 4.99869583212752e-06,
"loss": 0.1538,
"step": 114
},
{
"epoch": 0.05232029117379436,
"grad_norm": 2.627805184680893,
"learning_rate": 4.9986726494266694e-06,
"loss": 0.2522,
"step": 115
},
{
"epoch": 0.052775250227479524,
"grad_norm": 2.5410809044474907,
"learning_rate": 4.998649262545574e-06,
"loss": 0.1776,
"step": 116
},
{
"epoch": 0.053230209281164696,
"grad_norm": 2.076630177156468,
"learning_rate": 4.998625671486144e-06,
"loss": 0.1828,
"step": 117
},
{
"epoch": 0.05368516833484986,
"grad_norm": 2.5484627386038343,
"learning_rate": 4.998601876250308e-06,
"loss": 0.1781,
"step": 118
},
{
"epoch": 0.054140127388535034,
"grad_norm": 2.0245969343413983,
"learning_rate": 4.998577876840011e-06,
"loss": 0.1157,
"step": 119
},
{
"epoch": 0.0545950864422202,
"grad_norm": 2.1240696181789143,
"learning_rate": 4.9985536732572124e-06,
"loss": 0.2097,
"step": 120
},
{
"epoch": 0.05505004549590537,
"grad_norm": 2.4280518543324776,
"learning_rate": 4.998529265503891e-06,
"loss": 0.1631,
"step": 121
},
{
"epoch": 0.055505004549590536,
"grad_norm": 2.203499108228096,
"learning_rate": 4.9985046535820416e-06,
"loss": 0.2094,
"step": 122
},
{
"epoch": 0.05595996360327571,
"grad_norm": 1.7616968616285278,
"learning_rate": 4.998479837493675e-06,
"loss": 0.1265,
"step": 123
},
{
"epoch": 0.056414922656960874,
"grad_norm": 2.790115396130319,
"learning_rate": 4.9984548172408195e-06,
"loss": 0.162,
"step": 124
},
{
"epoch": 0.05686988171064604,
"grad_norm": 2.7234581680187087,
"learning_rate": 4.998429592825519e-06,
"loss": 0.1901,
"step": 125
},
{
"epoch": 0.05732484076433121,
"grad_norm": 2.7369239231742375,
"learning_rate": 4.998404164249835e-06,
"loss": 0.2289,
"step": 126
},
{
"epoch": 0.05777979981801638,
"grad_norm": 2.145081624481222,
"learning_rate": 4.998378531515845e-06,
"loss": 0.1267,
"step": 127
},
{
"epoch": 0.05823475887170155,
"grad_norm": 3.4112888898442586,
"learning_rate": 4.998352694625645e-06,
"loss": 0.1536,
"step": 128
},
{
"epoch": 0.058689717925386714,
"grad_norm": 1.8616422473229426,
"learning_rate": 4.998326653581343e-06,
"loss": 0.1342,
"step": 129
},
{
"epoch": 0.059144676979071886,
"grad_norm": 2.107533644057457,
"learning_rate": 4.998300408385072e-06,
"loss": 0.1774,
"step": 130
},
{
"epoch": 0.05959963603275705,
"grad_norm": 3.079768243729869,
"learning_rate": 4.998273959038972e-06,
"loss": 0.228,
"step": 131
},
{
"epoch": 0.06005459508644222,
"grad_norm": 1.7403897659478,
"learning_rate": 4.998247305545207e-06,
"loss": 0.1257,
"step": 132
},
{
"epoch": 0.06050955414012739,
"grad_norm": 1.663929944748691,
"learning_rate": 4.998220447905953e-06,
"loss": 0.1857,
"step": 133
},
{
"epoch": 0.060964513193812554,
"grad_norm": 2.604082553460826,
"learning_rate": 4.998193386123408e-06,
"loss": 0.1724,
"step": 134
},
{
"epoch": 0.061419472247497726,
"grad_norm": 2.662434521006077,
"learning_rate": 4.99816612019978e-06,
"loss": 0.1858,
"step": 135
},
{
"epoch": 0.06187443130118289,
"grad_norm": 2.747586314783755,
"learning_rate": 4.998138650137298e-06,
"loss": 0.1764,
"step": 136
},
{
"epoch": 0.062329390354868064,
"grad_norm": 2.299433423879838,
"learning_rate": 4.998110975938208e-06,
"loss": 0.2321,
"step": 137
},
{
"epoch": 0.06278434940855324,
"grad_norm": 2.527715242455789,
"learning_rate": 4.998083097604769e-06,
"loss": 0.2159,
"step": 138
},
{
"epoch": 0.0632393084622384,
"grad_norm": 2.5218619075726285,
"learning_rate": 4.998055015139261e-06,
"loss": 0.1608,
"step": 139
},
{
"epoch": 0.06369426751592357,
"grad_norm": 3.0047644164754495,
"learning_rate": 4.998026728543979e-06,
"loss": 0.2065,
"step": 140
},
{
"epoch": 0.06414922656960874,
"grad_norm": 2.178572369709547,
"learning_rate": 4.997998237821233e-06,
"loss": 0.1865,
"step": 141
},
{
"epoch": 0.0646041856232939,
"grad_norm": 1.5759272732327654,
"learning_rate": 4.997969542973352e-06,
"loss": 0.141,
"step": 142
},
{
"epoch": 0.06505914467697907,
"grad_norm": 2.0811820514545554,
"learning_rate": 4.997940644002681e-06,
"loss": 0.1676,
"step": 143
},
{
"epoch": 0.06551410373066424,
"grad_norm": 3.4671123551644403,
"learning_rate": 4.997911540911581e-06,
"loss": 0.2163,
"step": 144
},
{
"epoch": 0.06596906278434941,
"grad_norm": 2.2842746412883312,
"learning_rate": 4.99788223370243e-06,
"loss": 0.1677,
"step": 145
},
{
"epoch": 0.06642402183803457,
"grad_norm": 2.3367815299616734,
"learning_rate": 4.9978527223776245e-06,
"loss": 0.1811,
"step": 146
},
{
"epoch": 0.06687898089171974,
"grad_norm": 2.088943555321838,
"learning_rate": 4.9978230069395735e-06,
"loss": 0.1627,
"step": 147
},
{
"epoch": 0.06733393994540492,
"grad_norm": 2.5972570174963474,
"learning_rate": 4.9977930873907065e-06,
"loss": 0.1415,
"step": 148
},
{
"epoch": 0.06778889899909009,
"grad_norm": 2.3401595363726595,
"learning_rate": 4.997762963733468e-06,
"loss": 0.148,
"step": 149
},
{
"epoch": 0.06824385805277525,
"grad_norm": 2.894021920414895,
"learning_rate": 4.997732635970321e-06,
"loss": 0.2262,
"step": 150
},
{
"epoch": 0.06869881710646042,
"grad_norm": 1.7373422038949267,
"learning_rate": 4.9977021041037425e-06,
"loss": 0.1697,
"step": 151
},
{
"epoch": 0.06915377616014559,
"grad_norm": 2.5175987385537697,
"learning_rate": 4.9976713681362265e-06,
"loss": 0.2353,
"step": 152
},
{
"epoch": 0.06960873521383075,
"grad_norm": 2.4396682297474563,
"learning_rate": 4.997640428070286e-06,
"loss": 0.2143,
"step": 153
},
{
"epoch": 0.07006369426751592,
"grad_norm": 2.2947939267715087,
"learning_rate": 4.99760928390845e-06,
"loss": 0.1369,
"step": 154
},
{
"epoch": 0.0705186533212011,
"grad_norm": 2.4758802729165326,
"learning_rate": 4.997577935653262e-06,
"loss": 0.1498,
"step": 155
},
{
"epoch": 0.07097361237488627,
"grad_norm": 2.283530414912182,
"learning_rate": 4.9975463833072835e-06,
"loss": 0.1558,
"step": 156
},
{
"epoch": 0.07142857142857142,
"grad_norm": 1.785546461501872,
"learning_rate": 4.997514626873093e-06,
"loss": 0.1548,
"step": 157
},
{
"epoch": 0.0718835304822566,
"grad_norm": 2.5778925367686645,
"learning_rate": 4.997482666353287e-06,
"loss": 0.1568,
"step": 158
},
{
"epoch": 0.07233848953594177,
"grad_norm": 2.14376664899083,
"learning_rate": 4.997450501750476e-06,
"loss": 0.169,
"step": 159
},
{
"epoch": 0.07279344858962693,
"grad_norm": 1.7889496418860382,
"learning_rate": 4.997418133067288e-06,
"loss": 0.1178,
"step": 160
},
{
"epoch": 0.0732484076433121,
"grad_norm": 2.734023407734539,
"learning_rate": 4.997385560306368e-06,
"loss": 0.2024,
"step": 161
},
{
"epoch": 0.07370336669699727,
"grad_norm": 2.438529690680932,
"learning_rate": 4.997352783470379e-06,
"loss": 0.1877,
"step": 162
},
{
"epoch": 0.07415832575068244,
"grad_norm": 2.358353345441234,
"learning_rate": 4.997319802561997e-06,
"loss": 0.1349,
"step": 163
},
{
"epoch": 0.0746132848043676,
"grad_norm": 2.1448042331352677,
"learning_rate": 4.9972866175839196e-06,
"loss": 0.1268,
"step": 164
},
{
"epoch": 0.07506824385805277,
"grad_norm": 2.279102892849676,
"learning_rate": 4.9972532285388575e-06,
"loss": 0.1799,
"step": 165
},
{
"epoch": 0.07552320291173795,
"grad_norm": 2.5140889210625543,
"learning_rate": 4.997219635429538e-06,
"loss": 0.1876,
"step": 166
},
{
"epoch": 0.07597816196542312,
"grad_norm": 2.6687467871063664,
"learning_rate": 4.997185838258709e-06,
"loss": 0.1787,
"step": 167
},
{
"epoch": 0.07643312101910828,
"grad_norm": 3.3415050416363354,
"learning_rate": 4.997151837029129e-06,
"loss": 0.1799,
"step": 168
},
{
"epoch": 0.07688808007279345,
"grad_norm": 1.9269629920973084,
"learning_rate": 4.997117631743579e-06,
"loss": 0.1397,
"step": 169
},
{
"epoch": 0.07734303912647862,
"grad_norm": 3.00621227688512,
"learning_rate": 4.997083222404852e-06,
"loss": 0.1967,
"step": 170
},
{
"epoch": 0.07779799818016378,
"grad_norm": 2.2615169475731327,
"learning_rate": 4.997048609015762e-06,
"loss": 0.1288,
"step": 171
},
{
"epoch": 0.07825295723384895,
"grad_norm": 2.4342779650863724,
"learning_rate": 4.997013791579136e-06,
"loss": 0.186,
"step": 172
},
{
"epoch": 0.07870791628753412,
"grad_norm": 2.4576007392784542,
"learning_rate": 4.996978770097819e-06,
"loss": 0.1577,
"step": 173
},
{
"epoch": 0.0791628753412193,
"grad_norm": 2.4106466164039766,
"learning_rate": 4.996943544574673e-06,
"loss": 0.1886,
"step": 174
},
{
"epoch": 0.07961783439490445,
"grad_norm": 2.5961861603572225,
"learning_rate": 4.996908115012576e-06,
"loss": 0.1621,
"step": 175
},
{
"epoch": 0.08007279344858963,
"grad_norm": 2.833499016976519,
"learning_rate": 4.996872481414425e-06,
"loss": 0.1818,
"step": 176
},
{
"epoch": 0.0805277525022748,
"grad_norm": 3.5757833649912834,
"learning_rate": 4.9968366437831305e-06,
"loss": 0.2517,
"step": 177
},
{
"epoch": 0.08098271155595996,
"grad_norm": 1.5552303076468192,
"learning_rate": 4.99680060212162e-06,
"loss": 0.1245,
"step": 178
},
{
"epoch": 0.08143767060964513,
"grad_norm": 2.2202920086611213,
"learning_rate": 4.996764356432841e-06,
"loss": 0.2174,
"step": 179
},
{
"epoch": 0.0818926296633303,
"grad_norm": 2.1293059669722196,
"learning_rate": 4.996727906719754e-06,
"loss": 0.1605,
"step": 180
},
{
"epoch": 0.08234758871701547,
"grad_norm": 2.212380091830394,
"learning_rate": 4.9966912529853365e-06,
"loss": 0.125,
"step": 181
},
{
"epoch": 0.08280254777070063,
"grad_norm": 2.1098748731042507,
"learning_rate": 4.996654395232585e-06,
"loss": 0.17,
"step": 182
},
{
"epoch": 0.0832575068243858,
"grad_norm": 2.3315908475718183,
"learning_rate": 4.996617333464512e-06,
"loss": 0.1678,
"step": 183
},
{
"epoch": 0.08371246587807098,
"grad_norm": 2.100678357161413,
"learning_rate": 4.996580067684145e-06,
"loss": 0.1512,
"step": 184
},
{
"epoch": 0.08416742493175614,
"grad_norm": 1.6542642571071706,
"learning_rate": 4.996542597894528e-06,
"loss": 0.1875,
"step": 185
},
{
"epoch": 0.08462238398544131,
"grad_norm": 1.500567296289452,
"learning_rate": 4.996504924098726e-06,
"loss": 0.1579,
"step": 186
},
{
"epoch": 0.08507734303912648,
"grad_norm": 1.5859042172394868,
"learning_rate": 4.9964670462998145e-06,
"loss": 0.146,
"step": 187
},
{
"epoch": 0.08553230209281165,
"grad_norm": 1.7178165607526288,
"learning_rate": 4.99642896450089e-06,
"loss": 0.2372,
"step": 188
},
{
"epoch": 0.08598726114649681,
"grad_norm": 2.8492778772061484,
"learning_rate": 4.9963906787050656e-06,
"loss": 0.2504,
"step": 189
},
{
"epoch": 0.08644222020018198,
"grad_norm": 1.9406179967433874,
"learning_rate": 4.996352188915467e-06,
"loss": 0.1733,
"step": 190
},
{
"epoch": 0.08689717925386715,
"grad_norm": 2.811015878830941,
"learning_rate": 4.996313495135242e-06,
"loss": 0.2133,
"step": 191
},
{
"epoch": 0.08735213830755233,
"grad_norm": 2.222839682156962,
"learning_rate": 4.9962745973675505e-06,
"loss": 0.2113,
"step": 192
},
{
"epoch": 0.08780709736123748,
"grad_norm": 2.6159522481523343,
"learning_rate": 4.996235495615572e-06,
"loss": 0.1622,
"step": 193
},
{
"epoch": 0.08826205641492266,
"grad_norm": 2.3708185697184847,
"learning_rate": 4.996196189882503e-06,
"loss": 0.1685,
"step": 194
},
{
"epoch": 0.08871701546860783,
"grad_norm": 3.228308382699869,
"learning_rate": 4.996156680171552e-06,
"loss": 0.2332,
"step": 195
},
{
"epoch": 0.08917197452229299,
"grad_norm": 2.351705904801359,
"learning_rate": 4.996116966485951e-06,
"loss": 0.1816,
"step": 196
},
{
"epoch": 0.08962693357597816,
"grad_norm": 2.320092450855665,
"learning_rate": 4.996077048828944e-06,
"loss": 0.2321,
"step": 197
},
{
"epoch": 0.09008189262966333,
"grad_norm": 1.960036016410063,
"learning_rate": 4.996036927203793e-06,
"loss": 0.1745,
"step": 198
},
{
"epoch": 0.0905368516833485,
"grad_norm": 2.3679323522005573,
"learning_rate": 4.995996601613775e-06,
"loss": 0.1927,
"step": 199
},
{
"epoch": 0.09099181073703366,
"grad_norm": 2.1775512973195723,
"learning_rate": 4.9959560720621875e-06,
"loss": 0.1576,
"step": 200
},
{
"epoch": 0.09144676979071883,
"grad_norm": 2.286317354363178,
"learning_rate": 4.995915338552341e-06,
"loss": 0.2184,
"step": 201
},
{
"epoch": 0.09190172884440401,
"grad_norm": 2.0945800180559275,
"learning_rate": 4.995874401087565e-06,
"loss": 0.1572,
"step": 202
},
{
"epoch": 0.09235668789808917,
"grad_norm": 2.741714725855865,
"learning_rate": 4.9958332596712035e-06,
"loss": 0.2087,
"step": 203
},
{
"epoch": 0.09281164695177434,
"grad_norm": 3.0871074584367864,
"learning_rate": 4.99579191430662e-06,
"loss": 0.1968,
"step": 204
},
{
"epoch": 0.09326660600545951,
"grad_norm": 1.9723075192584005,
"learning_rate": 4.995750364997192e-06,
"loss": 0.1507,
"step": 205
},
{
"epoch": 0.09372156505914468,
"grad_norm": 1.8988997770559113,
"learning_rate": 4.995708611746314e-06,
"loss": 0.1288,
"step": 206
},
{
"epoch": 0.09417652411282984,
"grad_norm": 2.420700916830186,
"learning_rate": 4.995666654557399e-06,
"loss": 0.1988,
"step": 207
},
{
"epoch": 0.09463148316651501,
"grad_norm": 2.370720479747693,
"learning_rate": 4.995624493433876e-06,
"loss": 0.2215,
"step": 208
},
{
"epoch": 0.09508644222020018,
"grad_norm": 2.2764445558307607,
"learning_rate": 4.995582128379189e-06,
"loss": 0.1984,
"step": 209
},
{
"epoch": 0.09554140127388536,
"grad_norm": 2.382102062046725,
"learning_rate": 4.9955395593968e-06,
"loss": 0.2535,
"step": 210
},
{
"epoch": 0.09599636032757052,
"grad_norm": 2.833827673252778,
"learning_rate": 4.99549678649019e-06,
"loss": 0.1998,
"step": 211
},
{
"epoch": 0.09645131938125569,
"grad_norm": 2.486472694935685,
"learning_rate": 4.99545380966285e-06,
"loss": 0.2118,
"step": 212
},
{
"epoch": 0.09690627843494086,
"grad_norm": 3.0088319794179883,
"learning_rate": 4.995410628918294e-06,
"loss": 0.1584,
"step": 213
},
{
"epoch": 0.09736123748862602,
"grad_norm": 1.975326638907469,
"learning_rate": 4.995367244260052e-06,
"loss": 0.1832,
"step": 214
},
{
"epoch": 0.09781619654231119,
"grad_norm": 1.9912128526989044,
"learning_rate": 4.995323655691667e-06,
"loss": 0.1346,
"step": 215
},
{
"epoch": 0.09827115559599636,
"grad_norm": 2.603090937917312,
"learning_rate": 4.995279863216702e-06,
"loss": 0.2124,
"step": 216
},
{
"epoch": 0.09872611464968153,
"grad_norm": 2.053886430988171,
"learning_rate": 4.995235866838735e-06,
"loss": 0.1567,
"step": 217
},
{
"epoch": 0.09918107370336669,
"grad_norm": 2.039621450617981,
"learning_rate": 4.995191666561361e-06,
"loss": 0.1694,
"step": 218
},
{
"epoch": 0.09963603275705187,
"grad_norm": 2.0601930905500394,
"learning_rate": 4.995147262388192e-06,
"loss": 0.1264,
"step": 219
},
{
"epoch": 0.10009099181073704,
"grad_norm": 3.3199244613439802,
"learning_rate": 4.995102654322858e-06,
"loss": 0.2204,
"step": 220
},
{
"epoch": 0.1005459508644222,
"grad_norm": 2.1212806825874906,
"learning_rate": 4.995057842369002e-06,
"loss": 0.1122,
"step": 221
},
{
"epoch": 0.10100090991810737,
"grad_norm": 2.157454599738766,
"learning_rate": 4.995012826530287e-06,
"loss": 0.1977,
"step": 222
},
{
"epoch": 0.10145586897179254,
"grad_norm": 1.9698536511203952,
"learning_rate": 4.99496760681039e-06,
"loss": 0.1934,
"step": 223
},
{
"epoch": 0.10191082802547771,
"grad_norm": 1.9533190562259675,
"learning_rate": 4.994922183213009e-06,
"loss": 0.1686,
"step": 224
},
{
"epoch": 0.10236578707916287,
"grad_norm": 1.8311151598660917,
"learning_rate": 4.9948765557418535e-06,
"loss": 0.1376,
"step": 225
},
{
"epoch": 0.10282074613284804,
"grad_norm": 2.6814547442935766,
"learning_rate": 4.994830724400653e-06,
"loss": 0.2536,
"step": 226
},
{
"epoch": 0.10327570518653321,
"grad_norm": 2.065521179879655,
"learning_rate": 4.994784689193151e-06,
"loss": 0.1594,
"step": 227
},
{
"epoch": 0.10373066424021839,
"grad_norm": 2.082741947039302,
"learning_rate": 4.994738450123111e-06,
"loss": 0.1792,
"step": 228
},
{
"epoch": 0.10418562329390355,
"grad_norm": 2.1268040832192896,
"learning_rate": 4.994692007194312e-06,
"loss": 0.1746,
"step": 229
},
{
"epoch": 0.10464058234758872,
"grad_norm": 1.6028966765046104,
"learning_rate": 4.994645360410547e-06,
"loss": 0.1442,
"step": 230
},
{
"epoch": 0.10509554140127389,
"grad_norm": 2.051519186273431,
"learning_rate": 4.99459850977563e-06,
"loss": 0.1501,
"step": 231
},
{
"epoch": 0.10555050045495905,
"grad_norm": 2.0348997381654774,
"learning_rate": 4.994551455293388e-06,
"loss": 0.1544,
"step": 232
},
{
"epoch": 0.10600545950864422,
"grad_norm": 2.1087346651931758,
"learning_rate": 4.9945041969676654e-06,
"loss": 0.1768,
"step": 233
},
{
"epoch": 0.10646041856232939,
"grad_norm": 2.2918772612100704,
"learning_rate": 4.994456734802325e-06,
"loss": 0.1361,
"step": 234
},
{
"epoch": 0.10691537761601456,
"grad_norm": 1.6027315868889764,
"learning_rate": 4.994409068801247e-06,
"loss": 0.1905,
"step": 235
},
{
"epoch": 0.10737033666969972,
"grad_norm": 1.3896946472755238,
"learning_rate": 4.994361198968323e-06,
"loss": 0.1282,
"step": 236
},
{
"epoch": 0.1078252957233849,
"grad_norm": 2.8336860099519687,
"learning_rate": 4.994313125307466e-06,
"loss": 0.1795,
"step": 237
},
{
"epoch": 0.10828025477707007,
"grad_norm": 2.3591551410924034,
"learning_rate": 4.994264847822605e-06,
"loss": 0.2012,
"step": 238
},
{
"epoch": 0.10873521383075523,
"grad_norm": 1.963795078441063,
"learning_rate": 4.994216366517684e-06,
"loss": 0.122,
"step": 239
},
{
"epoch": 0.1091901728844404,
"grad_norm": 2.2161995153888356,
"learning_rate": 4.994167681396667e-06,
"loss": 0.2013,
"step": 240
},
{
"epoch": 0.10964513193812557,
"grad_norm": 2.116594401017286,
"learning_rate": 4.994118792463529e-06,
"loss": 0.1678,
"step": 241
},
{
"epoch": 0.11010009099181074,
"grad_norm": 2.004374732998671,
"learning_rate": 4.994069699722267e-06,
"loss": 0.1937,
"step": 242
},
{
"epoch": 0.1105550500454959,
"grad_norm": 1.8488901498313728,
"learning_rate": 4.994020403176893e-06,
"loss": 0.1668,
"step": 243
},
{
"epoch": 0.11101000909918107,
"grad_norm": 1.9972157818131948,
"learning_rate": 4.9939709028314345e-06,
"loss": 0.1589,
"step": 244
},
{
"epoch": 0.11146496815286625,
"grad_norm": 2.748474268313726,
"learning_rate": 4.993921198689935e-06,
"loss": 0.1244,
"step": 245
},
{
"epoch": 0.11191992720655142,
"grad_norm": 2.2905102593877893,
"learning_rate": 4.993871290756459e-06,
"loss": 0.1828,
"step": 246
},
{
"epoch": 0.11237488626023658,
"grad_norm": 2.4243824405880825,
"learning_rate": 4.9938211790350835e-06,
"loss": 0.1534,
"step": 247
},
{
"epoch": 0.11282984531392175,
"grad_norm": 2.7563047154810767,
"learning_rate": 4.993770863529902e-06,
"loss": 0.2186,
"step": 248
},
{
"epoch": 0.11328480436760692,
"grad_norm": 2.0782876036120044,
"learning_rate": 4.993720344245029e-06,
"loss": 0.1519,
"step": 249
},
{
"epoch": 0.11373976342129208,
"grad_norm": 2.1737696697985065,
"learning_rate": 4.99366962118459e-06,
"loss": 0.1705,
"step": 250
},
{
"epoch": 0.11419472247497725,
"grad_norm": 2.117835290775163,
"learning_rate": 4.99361869435273e-06,
"loss": 0.1279,
"step": 251
},
{
"epoch": 0.11464968152866242,
"grad_norm": 2.2816195263684906,
"learning_rate": 4.993567563753613e-06,
"loss": 0.1498,
"step": 252
},
{
"epoch": 0.1151046405823476,
"grad_norm": 2.303960194203604,
"learning_rate": 4.993516229391414e-06,
"loss": 0.1505,
"step": 253
},
{
"epoch": 0.11555959963603275,
"grad_norm": 2.932533158282557,
"learning_rate": 4.993464691270331e-06,
"loss": 0.1672,
"step": 254
},
{
"epoch": 0.11601455868971793,
"grad_norm": 2.050977411803408,
"learning_rate": 4.993412949394572e-06,
"loss": 0.1511,
"step": 255
},
{
"epoch": 0.1164695177434031,
"grad_norm": 1.9367899744301398,
"learning_rate": 4.993361003768369e-06,
"loss": 0.1203,
"step": 256
},
{
"epoch": 0.11692447679708826,
"grad_norm": 2.3417493914717027,
"learning_rate": 4.993308854395963e-06,
"loss": 0.1782,
"step": 257
},
{
"epoch": 0.11737943585077343,
"grad_norm": 2.2791020802299498,
"learning_rate": 4.993256501281618e-06,
"loss": 0.1643,
"step": 258
},
{
"epoch": 0.1178343949044586,
"grad_norm": 2.051233293233244,
"learning_rate": 4.993203944429611e-06,
"loss": 0.1761,
"step": 259
},
{
"epoch": 0.11828935395814377,
"grad_norm": 2.554462221777923,
"learning_rate": 4.993151183844236e-06,
"loss": 0.1654,
"step": 260
},
{
"epoch": 0.11874431301182893,
"grad_norm": 1.8796649091666686,
"learning_rate": 4.9930982195298065e-06,
"loss": 0.1826,
"step": 261
},
{
"epoch": 0.1191992720655141,
"grad_norm": 2.1843940505934336,
"learning_rate": 4.9930450514906484e-06,
"loss": 0.1755,
"step": 262
},
{
"epoch": 0.11965423111919928,
"grad_norm": 2.600288448730721,
"learning_rate": 4.9929916797311075e-06,
"loss": 0.1758,
"step": 263
},
{
"epoch": 0.12010919017288443,
"grad_norm": 2.0789865508427714,
"learning_rate": 4.992938104255545e-06,
"loss": 0.1571,
"step": 264
},
{
"epoch": 0.1205641492265696,
"grad_norm": 2.6999799828889546,
"learning_rate": 4.992884325068339e-06,
"loss": 0.2177,
"step": 265
},
{
"epoch": 0.12101910828025478,
"grad_norm": 2.1928099848185756,
"learning_rate": 4.992830342173882e-06,
"loss": 0.1831,
"step": 266
},
{
"epoch": 0.12147406733393995,
"grad_norm": 1.6337451712782205,
"learning_rate": 4.992776155576589e-06,
"loss": 0.1506,
"step": 267
},
{
"epoch": 0.12192902638762511,
"grad_norm": 1.2235042033062622,
"learning_rate": 4.992721765280884e-06,
"loss": 0.1214,
"step": 268
},
{
"epoch": 0.12238398544131028,
"grad_norm": 2.8845660466122873,
"learning_rate": 4.992667171291215e-06,
"loss": 0.2148,
"step": 269
},
{
"epoch": 0.12283894449499545,
"grad_norm": 2.7398139900638476,
"learning_rate": 4.992612373612042e-06,
"loss": 0.1661,
"step": 270
},
{
"epoch": 0.12329390354868063,
"grad_norm": 3.738889974273454,
"learning_rate": 4.99255737224784e-06,
"loss": 0.2297,
"step": 271
},
{
"epoch": 0.12374886260236578,
"grad_norm": 1.5329721181759282,
"learning_rate": 4.9925021672031075e-06,
"loss": 0.1486,
"step": 272
},
{
"epoch": 0.12420382165605096,
"grad_norm": 2.3823467276559875,
"learning_rate": 4.992446758482353e-06,
"loss": 0.1552,
"step": 273
},
{
"epoch": 0.12465878070973613,
"grad_norm": 2.1454290127697924,
"learning_rate": 4.992391146090106e-06,
"loss": 0.1736,
"step": 274
},
{
"epoch": 0.1251137397634213,
"grad_norm": 1.4949223744659494,
"learning_rate": 4.99233533003091e-06,
"loss": 0.1373,
"step": 275
},
{
"epoch": 0.12556869881710647,
"grad_norm": 1.5553413773794396,
"learning_rate": 4.992279310309326e-06,
"loss": 0.1835,
"step": 276
},
{
"epoch": 0.12602365787079162,
"grad_norm": 2.969806225573073,
"learning_rate": 4.9922230869299316e-06,
"loss": 0.2793,
"step": 277
},
{
"epoch": 0.1264786169244768,
"grad_norm": 2.3168611268442763,
"learning_rate": 4.992166659897321e-06,
"loss": 0.1922,
"step": 278
},
{
"epoch": 0.12693357597816196,
"grad_norm": 2.3995795142770455,
"learning_rate": 4.992110029216106e-06,
"loss": 0.1955,
"step": 279
},
{
"epoch": 0.12738853503184713,
"grad_norm": 1.6975631974230885,
"learning_rate": 4.992053194890914e-06,
"loss": 0.1112,
"step": 280
},
{
"epoch": 0.1278434940855323,
"grad_norm": 2.087297197910066,
"learning_rate": 4.991996156926388e-06,
"loss": 0.1333,
"step": 281
},
{
"epoch": 0.12829845313921748,
"grad_norm": 2.6326611217122475,
"learning_rate": 4.9919389153271904e-06,
"loss": 0.2017,
"step": 282
},
{
"epoch": 0.12875341219290265,
"grad_norm": 1.4167548054089978,
"learning_rate": 4.991881470097998e-06,
"loss": 0.2074,
"step": 283
},
{
"epoch": 0.1292083712465878,
"grad_norm": 2.325650637419427,
"learning_rate": 4.991823821243505e-06,
"loss": 0.1777,
"step": 284
},
{
"epoch": 0.12966333030027297,
"grad_norm": 2.7279251785825,
"learning_rate": 4.991765968768422e-06,
"loss": 0.1801,
"step": 285
},
{
"epoch": 0.13011828935395814,
"grad_norm": 2.9061020144564087,
"learning_rate": 4.991707912677477e-06,
"loss": 0.1702,
"step": 286
},
{
"epoch": 0.1305732484076433,
"grad_norm": 1.8358268112205725,
"learning_rate": 4.991649652975414e-06,
"loss": 0.1433,
"step": 287
},
{
"epoch": 0.13102820746132848,
"grad_norm": 2.5332736723438636,
"learning_rate": 4.991591189666994e-06,
"loss": 0.2469,
"step": 288
},
{
"epoch": 0.13148316651501366,
"grad_norm": 2.1606263891645527,
"learning_rate": 4.991532522756993e-06,
"loss": 0.18,
"step": 289
},
{
"epoch": 0.13193812556869883,
"grad_norm": 1.995831189895407,
"learning_rate": 4.991473652250207e-06,
"loss": 0.1577,
"step": 290
},
{
"epoch": 0.13239308462238397,
"grad_norm": 2.4955613558163754,
"learning_rate": 4.991414578151445e-06,
"loss": 0.1544,
"step": 291
},
{
"epoch": 0.13284804367606914,
"grad_norm": 2.2942486381281326,
"learning_rate": 4.991355300465535e-06,
"loss": 0.1794,
"step": 292
},
{
"epoch": 0.13330300272975432,
"grad_norm": 2.6074492667183486,
"learning_rate": 4.99129581919732e-06,
"loss": 0.2319,
"step": 293
},
{
"epoch": 0.1337579617834395,
"grad_norm": 2.563328131279355,
"learning_rate": 4.9912361343516616e-06,
"loss": 0.1498,
"step": 294
},
{
"epoch": 0.13421292083712466,
"grad_norm": 2.2818975551142535,
"learning_rate": 4.991176245933437e-06,
"loss": 0.1996,
"step": 295
},
{
"epoch": 0.13466787989080983,
"grad_norm": 2.3084476659986874,
"learning_rate": 4.9911161539475385e-06,
"loss": 0.1837,
"step": 296
},
{
"epoch": 0.135122838944495,
"grad_norm": 2.271697592195805,
"learning_rate": 4.991055858398879e-06,
"loss": 0.1839,
"step": 297
},
{
"epoch": 0.13557779799818018,
"grad_norm": 2.7071752536725993,
"learning_rate": 4.990995359292384e-06,
"loss": 0.2051,
"step": 298
},
{
"epoch": 0.13603275705186532,
"grad_norm": 2.1654433443615444,
"learning_rate": 4.990934656632997e-06,
"loss": 0.1845,
"step": 299
},
{
"epoch": 0.1364877161055505,
"grad_norm": 2.56820477539861,
"learning_rate": 4.990873750425679e-06,
"loss": 0.1987,
"step": 300
},
{
"epoch": 0.13694267515923567,
"grad_norm": 1.8972328280195017,
"learning_rate": 4.990812640675406e-06,
"loss": 0.1352,
"step": 301
},
{
"epoch": 0.13739763421292084,
"grad_norm": 2.160948607003053,
"learning_rate": 4.990751327387174e-06,
"loss": 0.1788,
"step": 302
},
{
"epoch": 0.137852593266606,
"grad_norm": 2.2034240871386026,
"learning_rate": 4.99068981056599e-06,
"loss": 0.14,
"step": 303
},
{
"epoch": 0.13830755232029118,
"grad_norm": 2.273981179049363,
"learning_rate": 4.990628090216885e-06,
"loss": 0.1914,
"step": 304
},
{
"epoch": 0.13876251137397635,
"grad_norm": 2.0189718711860096,
"learning_rate": 4.990566166344898e-06,
"loss": 0.1455,
"step": 305
},
{
"epoch": 0.1392174704276615,
"grad_norm": 2.596979330537977,
"learning_rate": 4.990504038955092e-06,
"loss": 0.1503,
"step": 306
},
{
"epoch": 0.13967242948134667,
"grad_norm": 2.694293011033057,
"learning_rate": 4.990441708052542e-06,
"loss": 0.1582,
"step": 307
},
{
"epoch": 0.14012738853503184,
"grad_norm": 2.00968932243832,
"learning_rate": 4.9903791736423435e-06,
"loss": 0.1531,
"step": 308
},
{
"epoch": 0.14058234758871702,
"grad_norm": 1.7247039385783955,
"learning_rate": 4.9903164357296044e-06,
"loss": 0.1258,
"step": 309
},
{
"epoch": 0.1410373066424022,
"grad_norm": 1.4795211673422664,
"learning_rate": 4.990253494319453e-06,
"loss": 0.1918,
"step": 310
},
{
"epoch": 0.14149226569608736,
"grad_norm": 2.4289846785611573,
"learning_rate": 4.990190349417032e-06,
"loss": 0.264,
"step": 311
},
{
"epoch": 0.14194722474977253,
"grad_norm": 2.1742573666821245,
"learning_rate": 4.990127001027501e-06,
"loss": 0.1382,
"step": 312
},
{
"epoch": 0.14240218380345768,
"grad_norm": 2.051388070470128,
"learning_rate": 4.990063449156037e-06,
"loss": 0.234,
"step": 313
},
{
"epoch": 0.14285714285714285,
"grad_norm": 2.3613735603207435,
"learning_rate": 4.989999693807832e-06,
"loss": 0.1963,
"step": 314
},
{
"epoch": 0.14331210191082802,
"grad_norm": 3.162328527546947,
"learning_rate": 4.989935734988098e-06,
"loss": 0.1913,
"step": 315
},
{
"epoch": 0.1437670609645132,
"grad_norm": 2.8669333432356967,
"learning_rate": 4.98987157270206e-06,
"loss": 0.15,
"step": 316
},
{
"epoch": 0.14422202001819837,
"grad_norm": 2.383827835780797,
"learning_rate": 4.989807206954961e-06,
"loss": 0.2103,
"step": 317
},
{
"epoch": 0.14467697907188354,
"grad_norm": 1.6341024017470744,
"learning_rate": 4.9897426377520605e-06,
"loss": 0.1393,
"step": 318
},
{
"epoch": 0.1451319381255687,
"grad_norm": 2.146073254076934,
"learning_rate": 4.989677865098636e-06,
"loss": 0.1836,
"step": 319
},
{
"epoch": 0.14558689717925385,
"grad_norm": 1.6889700199846902,
"learning_rate": 4.989612888999978e-06,
"loss": 0.1257,
"step": 320
},
{
"epoch": 0.14604185623293903,
"grad_norm": 1.6032091805420865,
"learning_rate": 4.9895477094614e-06,
"loss": 0.1578,
"step": 321
},
{
"epoch": 0.1464968152866242,
"grad_norm": 1.8161786006418608,
"learning_rate": 4.989482326488225e-06,
"loss": 0.1492,
"step": 322
},
{
"epoch": 0.14695177434030937,
"grad_norm": 1.9978970628488169,
"learning_rate": 4.989416740085796e-06,
"loss": 0.1637,
"step": 323
},
{
"epoch": 0.14740673339399454,
"grad_norm": 2.7066161025891335,
"learning_rate": 4.9893509502594735e-06,
"loss": 0.1963,
"step": 324
},
{
"epoch": 0.14786169244767972,
"grad_norm": 2.420242793982077,
"learning_rate": 4.9892849570146335e-06,
"loss": 0.1877,
"step": 325
},
{
"epoch": 0.1483166515013649,
"grad_norm": 2.153067326288121,
"learning_rate": 4.989218760356668e-06,
"loss": 0.1635,
"step": 326
},
{
"epoch": 0.14877161055505003,
"grad_norm": 2.0543349130585216,
"learning_rate": 4.989152360290987e-06,
"loss": 0.1744,
"step": 327
},
{
"epoch": 0.1492265696087352,
"grad_norm": 2.1211312409383716,
"learning_rate": 4.989085756823015e-06,
"loss": 0.2104,
"step": 328
},
{
"epoch": 0.14968152866242038,
"grad_norm": 1.9888085672791085,
"learning_rate": 4.989018949958197e-06,
"loss": 0.1876,
"step": 329
},
{
"epoch": 0.15013648771610555,
"grad_norm": 1.7510207885281333,
"learning_rate": 4.98895193970199e-06,
"loss": 0.1251,
"step": 330
},
{
"epoch": 0.15059144676979072,
"grad_norm": 2.132384994640236,
"learning_rate": 4.9888847260598705e-06,
"loss": 0.154,
"step": 331
},
{
"epoch": 0.1510464058234759,
"grad_norm": 2.323691709571053,
"learning_rate": 4.98881730903733e-06,
"loss": 0.1599,
"step": 332
},
{
"epoch": 0.15150136487716107,
"grad_norm": 1.7667120167873211,
"learning_rate": 4.98874968863988e-06,
"loss": 0.1706,
"step": 333
},
{
"epoch": 0.15195632393084624,
"grad_norm": 2.2465388060545424,
"learning_rate": 4.988681864873044e-06,
"loss": 0.152,
"step": 334
},
{
"epoch": 0.15241128298453138,
"grad_norm": 2.150731238347554,
"learning_rate": 4.988613837742364e-06,
"loss": 0.1784,
"step": 335
},
{
"epoch": 0.15286624203821655,
"grad_norm": 2.6552266788081913,
"learning_rate": 4.9885456072534015e-06,
"loss": 0.1692,
"step": 336
},
{
"epoch": 0.15332120109190173,
"grad_norm": 2.6431963904654867,
"learning_rate": 4.988477173411728e-06,
"loss": 0.2313,
"step": 337
},
{
"epoch": 0.1537761601455869,
"grad_norm": 1.6862589720746106,
"learning_rate": 4.988408536222939e-06,
"loss": 0.1569,
"step": 338
},
{
"epoch": 0.15423111919927207,
"grad_norm": 2.4287850849792343,
"learning_rate": 4.9883396956926416e-06,
"loss": 0.2077,
"step": 339
},
{
"epoch": 0.15468607825295724,
"grad_norm": 2.1310532776354556,
"learning_rate": 4.988270651826462e-06,
"loss": 0.1603,
"step": 340
},
{
"epoch": 0.15514103730664242,
"grad_norm": 2.426464258613891,
"learning_rate": 4.988201404630041e-06,
"loss": 0.1804,
"step": 341
},
{
"epoch": 0.15559599636032756,
"grad_norm": 2.2461225244692966,
"learning_rate": 4.988131954109038e-06,
"loss": 0.1749,
"step": 342
},
{
"epoch": 0.15605095541401273,
"grad_norm": 1.7543756867291544,
"learning_rate": 4.988062300269128e-06,
"loss": 0.2141,
"step": 343
},
{
"epoch": 0.1565059144676979,
"grad_norm": 1.8842714079345257,
"learning_rate": 4.987992443116003e-06,
"loss": 0.1509,
"step": 344
},
{
"epoch": 0.15696087352138308,
"grad_norm": 2.5046760683256917,
"learning_rate": 4.987922382655372e-06,
"loss": 0.1555,
"step": 345
},
{
"epoch": 0.15741583257506825,
"grad_norm": 2.3171833195987186,
"learning_rate": 4.987852118892958e-06,
"loss": 0.259,
"step": 346
},
{
"epoch": 0.15787079162875342,
"grad_norm": 1.7971407845013883,
"learning_rate": 4.987781651834503e-06,
"loss": 0.2111,
"step": 347
},
{
"epoch": 0.1583257506824386,
"grad_norm": 2.229282526599637,
"learning_rate": 4.987710981485768e-06,
"loss": 0.1639,
"step": 348
},
{
"epoch": 0.15878070973612374,
"grad_norm": 2.090625191317677,
"learning_rate": 4.987640107852525e-06,
"loss": 0.2123,
"step": 349
},
{
"epoch": 0.1592356687898089,
"grad_norm": 2.117001720390773,
"learning_rate": 4.987569030940567e-06,
"loss": 0.1762,
"step": 350
},
{
"epoch": 0.15969062784349408,
"grad_norm": 1.7897158962626623,
"learning_rate": 4.987497750755702e-06,
"loss": 0.0935,
"step": 351
},
{
"epoch": 0.16014558689717925,
"grad_norm": 2.0946360877045906,
"learning_rate": 4.987426267303753e-06,
"loss": 0.2049,
"step": 352
},
{
"epoch": 0.16060054595086443,
"grad_norm": 2.07614941330386,
"learning_rate": 4.987354580590563e-06,
"loss": 0.1858,
"step": 353
},
{
"epoch": 0.1610555050045496,
"grad_norm": 1.6797770286484157,
"learning_rate": 4.987282690621991e-06,
"loss": 0.1652,
"step": 354
},
{
"epoch": 0.16151046405823477,
"grad_norm": 1.6413851962480772,
"learning_rate": 4.987210597403907e-06,
"loss": 0.156,
"step": 355
},
{
"epoch": 0.16196542311191992,
"grad_norm": 2.5143144976994285,
"learning_rate": 4.987138300942208e-06,
"loss": 0.1804,
"step": 356
},
{
"epoch": 0.1624203821656051,
"grad_norm": 2.128297430906798,
"learning_rate": 4.987065801242798e-06,
"loss": 0.1634,
"step": 357
},
{
"epoch": 0.16287534121929026,
"grad_norm": 2.039358127433988,
"learning_rate": 4.986993098311601e-06,
"loss": 0.172,
"step": 358
},
{
"epoch": 0.16333030027297543,
"grad_norm": 2.2470477292441906,
"learning_rate": 4.986920192154561e-06,
"loss": 0.1419,
"step": 359
},
{
"epoch": 0.1637852593266606,
"grad_norm": 1.8708576936226033,
"learning_rate": 4.986847082777632e-06,
"loss": 0.165,
"step": 360
},
{
"epoch": 0.16424021838034578,
"grad_norm": 2.2426713628374406,
"learning_rate": 4.986773770186791e-06,
"loss": 0.2113,
"step": 361
},
{
"epoch": 0.16469517743403095,
"grad_norm": 2.1231842278965716,
"learning_rate": 4.986700254388027e-06,
"loss": 0.2583,
"step": 362
},
{
"epoch": 0.1651501364877161,
"grad_norm": 1.9962414368551604,
"learning_rate": 4.986626535387349e-06,
"loss": 0.2146,
"step": 363
},
{
"epoch": 0.16560509554140126,
"grad_norm": 2.7738560722941656,
"learning_rate": 4.9865526131907795e-06,
"loss": 0.1913,
"step": 364
},
{
"epoch": 0.16606005459508644,
"grad_norm": 1.8910905183030835,
"learning_rate": 4.9864784878043595e-06,
"loss": 0.2243,
"step": 365
},
{
"epoch": 0.1665150136487716,
"grad_norm": 2.943803252646498,
"learning_rate": 4.986404159234146e-06,
"loss": 0.2169,
"step": 366
},
{
"epoch": 0.16696997270245678,
"grad_norm": 2.067283855325497,
"learning_rate": 4.986329627486213e-06,
"loss": 0.1392,
"step": 367
},
{
"epoch": 0.16742493175614195,
"grad_norm": 1.7900649282380081,
"learning_rate": 4.986254892566652e-06,
"loss": 0.1929,
"step": 368
},
{
"epoch": 0.16787989080982713,
"grad_norm": 2.05364008592912,
"learning_rate": 4.9861799544815684e-06,
"loss": 0.1539,
"step": 369
},
{
"epoch": 0.16833484986351227,
"grad_norm": 1.8722252354131819,
"learning_rate": 4.986104813237086e-06,
"loss": 0.1584,
"step": 370
},
{
"epoch": 0.16878980891719744,
"grad_norm": 2.127812745723865,
"learning_rate": 4.986029468839346e-06,
"loss": 0.1618,
"step": 371
},
{
"epoch": 0.16924476797088261,
"grad_norm": 2.4926065420888643,
"learning_rate": 4.985953921294505e-06,
"loss": 0.2601,
"step": 372
},
{
"epoch": 0.1696997270245678,
"grad_norm": 2.973425717527041,
"learning_rate": 4.985878170608736e-06,
"loss": 0.1919,
"step": 373
},
{
"epoch": 0.17015468607825296,
"grad_norm": 2.1354583522718604,
"learning_rate": 4.985802216788228e-06,
"loss": 0.1904,
"step": 374
},
{
"epoch": 0.17060964513193813,
"grad_norm": 2.4618549416407634,
"learning_rate": 4.98572605983919e-06,
"loss": 0.2137,
"step": 375
},
{
"epoch": 0.1710646041856233,
"grad_norm": 1.3365138469487268,
"learning_rate": 4.985649699767842e-06,
"loss": 0.1069,
"step": 376
},
{
"epoch": 0.17151956323930848,
"grad_norm": 1.9602605162416638,
"learning_rate": 4.985573136580427e-06,
"loss": 0.1723,
"step": 377
},
{
"epoch": 0.17197452229299362,
"grad_norm": 1.6915428216688142,
"learning_rate": 4.9854963702832e-06,
"loss": 0.1673,
"step": 378
},
{
"epoch": 0.1724294813466788,
"grad_norm": 2.0131015516091875,
"learning_rate": 4.985419400882433e-06,
"loss": 0.2159,
"step": 379
},
{
"epoch": 0.17288444040036396,
"grad_norm": 1.8436996177818286,
"learning_rate": 4.985342228384418e-06,
"loss": 0.1777,
"step": 380
},
{
"epoch": 0.17333939945404914,
"grad_norm": 3.2955423815059257,
"learning_rate": 4.985264852795459e-06,
"loss": 0.2759,
"step": 381
},
{
"epoch": 0.1737943585077343,
"grad_norm": 2.386347589584829,
"learning_rate": 4.98518727412188e-06,
"loss": 0.1958,
"step": 382
},
{
"epoch": 0.17424931756141948,
"grad_norm": 2.5771465793014294,
"learning_rate": 4.98510949237002e-06,
"loss": 0.1861,
"step": 383
},
{
"epoch": 0.17470427661510465,
"grad_norm": 2.420697255730561,
"learning_rate": 4.985031507546234e-06,
"loss": 0.1538,
"step": 384
},
{
"epoch": 0.1751592356687898,
"grad_norm": 2.6016330527075895,
"learning_rate": 4.984953319656896e-06,
"loss": 0.1981,
"step": 385
},
{
"epoch": 0.17561419472247497,
"grad_norm": 2.671850671096213,
"learning_rate": 4.984874928708395e-06,
"loss": 0.1802,
"step": 386
},
{
"epoch": 0.17606915377616014,
"grad_norm": 2.329893515854394,
"learning_rate": 4.984796334707136e-06,
"loss": 0.1916,
"step": 387
},
{
"epoch": 0.17652411282984531,
"grad_norm": 2.900381887848387,
"learning_rate": 4.984717537659542e-06,
"loss": 0.1851,
"step": 388
},
{
"epoch": 0.1769790718835305,
"grad_norm": 2.8920348384518295,
"learning_rate": 4.984638537572052e-06,
"loss": 0.1614,
"step": 389
},
{
"epoch": 0.17743403093721566,
"grad_norm": 1.7590905699687769,
"learning_rate": 4.984559334451121e-06,
"loss": 0.1182,
"step": 390
},
{
"epoch": 0.17788898999090083,
"grad_norm": 1.992998204932115,
"learning_rate": 4.984479928303221e-06,
"loss": 0.1097,
"step": 391
},
{
"epoch": 0.17834394904458598,
"grad_norm": 1.7032225308271054,
"learning_rate": 4.984400319134841e-06,
"loss": 0.1166,
"step": 392
},
{
"epoch": 0.17879890809827115,
"grad_norm": 2.170562253873519,
"learning_rate": 4.984320506952487e-06,
"loss": 0.2253,
"step": 393
},
{
"epoch": 0.17925386715195632,
"grad_norm": 2.237592089222373,
"learning_rate": 4.9842404917626796e-06,
"loss": 0.1949,
"step": 394
},
{
"epoch": 0.1797088262056415,
"grad_norm": 2.0106916989450587,
"learning_rate": 4.984160273571959e-06,
"loss": 0.1681,
"step": 395
},
{
"epoch": 0.18016378525932666,
"grad_norm": 1.5887484417784243,
"learning_rate": 4.9840798523868785e-06,
"loss": 0.1987,
"step": 396
},
{
"epoch": 0.18061874431301184,
"grad_norm": 2.1863186231198677,
"learning_rate": 4.983999228214011e-06,
"loss": 0.1688,
"step": 397
},
{
"epoch": 0.181073703366697,
"grad_norm": 1.73818173181658,
"learning_rate": 4.983918401059943e-06,
"loss": 0.1667,
"step": 398
},
{
"epoch": 0.18152866242038215,
"grad_norm": 2.507383020515962,
"learning_rate": 4.983837370931282e-06,
"loss": 0.1969,
"step": 399
},
{
"epoch": 0.18198362147406733,
"grad_norm": 2.0632014051403793,
"learning_rate": 4.983756137834647e-06,
"loss": 0.183,
"step": 400
},
{
"epoch": 0.1824385805277525,
"grad_norm": 2.830188740520148,
"learning_rate": 4.9836747017766765e-06,
"loss": 0.2093,
"step": 401
},
{
"epoch": 0.18289353958143767,
"grad_norm": 2.5110616036547038,
"learning_rate": 4.983593062764027e-06,
"loss": 0.2322,
"step": 402
},
{
"epoch": 0.18334849863512284,
"grad_norm": 3.686743248745681,
"learning_rate": 4.983511220803367e-06,
"loss": 0.2445,
"step": 403
},
{
"epoch": 0.18380345768880801,
"grad_norm": 1.679708381839253,
"learning_rate": 4.983429175901386e-06,
"loss": 0.1796,
"step": 404
},
{
"epoch": 0.1842584167424932,
"grad_norm": 2.1827593155516722,
"learning_rate": 4.983346928064788e-06,
"loss": 0.1674,
"step": 405
},
{
"epoch": 0.18471337579617833,
"grad_norm": 1.60561536399989,
"learning_rate": 4.9832644773002935e-06,
"loss": 0.1696,
"step": 406
},
{
"epoch": 0.1851683348498635,
"grad_norm": 2.3818871014331418,
"learning_rate": 4.98318182361464e-06,
"loss": 0.231,
"step": 407
},
{
"epoch": 0.18562329390354868,
"grad_norm": 2.466498074147868,
"learning_rate": 4.9830989670145825e-06,
"loss": 0.2363,
"step": 408
},
{
"epoch": 0.18607825295723385,
"grad_norm": 2.3360214493938485,
"learning_rate": 4.9830159075068905e-06,
"loss": 0.2211,
"step": 409
},
{
"epoch": 0.18653321201091902,
"grad_norm": 1.8065829881072444,
"learning_rate": 4.9829326450983514e-06,
"loss": 0.1743,
"step": 410
},
{
"epoch": 0.1869881710646042,
"grad_norm": 2.69540573324766,
"learning_rate": 4.98284917979577e-06,
"loss": 0.1876,
"step": 411
},
{
"epoch": 0.18744313011828936,
"grad_norm": 1.8906354216406325,
"learning_rate": 4.9827655116059656e-06,
"loss": 0.1592,
"step": 412
},
{
"epoch": 0.18789808917197454,
"grad_norm": 1.743151148777257,
"learning_rate": 4.9826816405357755e-06,
"loss": 0.1746,
"step": 413
},
{
"epoch": 0.18835304822565968,
"grad_norm": 1.5963849264202556,
"learning_rate": 4.982597566592054e-06,
"loss": 0.1244,
"step": 414
},
{
"epoch": 0.18880800727934485,
"grad_norm": 2.7157092244830205,
"learning_rate": 4.982513289781671e-06,
"loss": 0.2332,
"step": 415
},
{
"epoch": 0.18926296633303002,
"grad_norm": 1.9931400703765212,
"learning_rate": 4.982428810111512e-06,
"loss": 0.2113,
"step": 416
},
{
"epoch": 0.1897179253867152,
"grad_norm": 1.3604077425808516,
"learning_rate": 4.9823441275884814e-06,
"loss": 0.1305,
"step": 417
},
{
"epoch": 0.19017288444040037,
"grad_norm": 2.2607598123619517,
"learning_rate": 4.982259242219499e-06,
"loss": 0.1723,
"step": 418
},
{
"epoch": 0.19062784349408554,
"grad_norm": 1.867118589561207,
"learning_rate": 4.9821741540115006e-06,
"loss": 0.1355,
"step": 419
},
{
"epoch": 0.1910828025477707,
"grad_norm": 2.11150758750875,
"learning_rate": 4.982088862971441e-06,
"loss": 0.2181,
"step": 420
},
{
"epoch": 0.19153776160145586,
"grad_norm": 2.922634212063935,
"learning_rate": 4.982003369106287e-06,
"loss": 0.1935,
"step": 421
},
{
"epoch": 0.19199272065514103,
"grad_norm": 1.8213621057521336,
"learning_rate": 4.981917672423028e-06,
"loss": 0.1159,
"step": 422
},
{
"epoch": 0.1924476797088262,
"grad_norm": 1.9973203363112062,
"learning_rate": 4.981831772928664e-06,
"loss": 0.1644,
"step": 423
},
{
"epoch": 0.19290263876251137,
"grad_norm": 1.6435298569620178,
"learning_rate": 4.981745670630216e-06,
"loss": 0.1676,
"step": 424
},
{
"epoch": 0.19335759781619655,
"grad_norm": 1.7090737346215599,
"learning_rate": 4.981659365534718e-06,
"loss": 0.1947,
"step": 425
},
{
"epoch": 0.19381255686988172,
"grad_norm": 2.8644071628055365,
"learning_rate": 4.981572857649225e-06,
"loss": 0.2412,
"step": 426
},
{
"epoch": 0.1942675159235669,
"grad_norm": 1.5071870677678134,
"learning_rate": 4.981486146980804e-06,
"loss": 0.1247,
"step": 427
},
{
"epoch": 0.19472247497725204,
"grad_norm": 2.5523639597283436,
"learning_rate": 4.9813992335365415e-06,
"loss": 0.1636,
"step": 428
},
{
"epoch": 0.1951774340309372,
"grad_norm": 1.6766352791010617,
"learning_rate": 4.98131211732354e-06,
"loss": 0.1659,
"step": 429
},
{
"epoch": 0.19563239308462238,
"grad_norm": 2.6626571731411985,
"learning_rate": 4.981224798348917e-06,
"loss": 0.1777,
"step": 430
},
{
"epoch": 0.19608735213830755,
"grad_norm": 1.7748484056177547,
"learning_rate": 4.981137276619809e-06,
"loss": 0.2038,
"step": 431
},
{
"epoch": 0.19654231119199272,
"grad_norm": 1.6726970249923665,
"learning_rate": 4.9810495521433675e-06,
"loss": 0.167,
"step": 432
},
{
"epoch": 0.1969972702456779,
"grad_norm": 2.3836088959731407,
"learning_rate": 4.9809616249267616e-06,
"loss": 0.1967,
"step": 433
},
{
"epoch": 0.19745222929936307,
"grad_norm": 1.9478244630239012,
"learning_rate": 4.980873494977174e-06,
"loss": 0.2259,
"step": 434
},
{
"epoch": 0.1979071883530482,
"grad_norm": 2.601912538074716,
"learning_rate": 4.98078516230181e-06,
"loss": 0.196,
"step": 435
},
{
"epoch": 0.19836214740673339,
"grad_norm": 1.8252963162031037,
"learning_rate": 4.980696626907884e-06,
"loss": 0.1551,
"step": 436
},
{
"epoch": 0.19881710646041856,
"grad_norm": 1.7882792458437706,
"learning_rate": 4.980607888802633e-06,
"loss": 0.1547,
"step": 437
},
{
"epoch": 0.19927206551410373,
"grad_norm": 1.8674433444840757,
"learning_rate": 4.980518947993307e-06,
"loss": 0.1625,
"step": 438
},
{
"epoch": 0.1997270245677889,
"grad_norm": 2.050135562104488,
"learning_rate": 4.980429804487176e-06,
"loss": 0.1706,
"step": 439
},
{
"epoch": 0.20018198362147407,
"grad_norm": 3.040028729336044,
"learning_rate": 4.980340458291521e-06,
"loss": 0.2235,
"step": 440
},
{
"epoch": 0.20063694267515925,
"grad_norm": 1.755025572252995,
"learning_rate": 4.980250909413646e-06,
"loss": 0.1451,
"step": 441
},
{
"epoch": 0.2010919017288444,
"grad_norm": 2.636610646301175,
"learning_rate": 4.980161157860867e-06,
"loss": 0.1869,
"step": 442
},
{
"epoch": 0.20154686078252956,
"grad_norm": 2.5942914069340715,
"learning_rate": 4.980071203640519e-06,
"loss": 0.1633,
"step": 443
},
{
"epoch": 0.20200181983621474,
"grad_norm": 1.5184266230548011,
"learning_rate": 4.979981046759952e-06,
"loss": 0.1441,
"step": 444
},
{
"epoch": 0.2024567788898999,
"grad_norm": 1.8681142182661066,
"learning_rate": 4.979890687226533e-06,
"loss": 0.1596,
"step": 445
},
{
"epoch": 0.20291173794358508,
"grad_norm": 2.48564323404002,
"learning_rate": 4.979800125047647e-06,
"loss": 0.1481,
"step": 446
},
{
"epoch": 0.20336669699727025,
"grad_norm": 2.3390506413519514,
"learning_rate": 4.979709360230692e-06,
"loss": 0.1889,
"step": 447
},
{
"epoch": 0.20382165605095542,
"grad_norm": 2.017468095007692,
"learning_rate": 4.979618392783087e-06,
"loss": 0.1417,
"step": 448
},
{
"epoch": 0.20427661510464057,
"grad_norm": 1.729598330112352,
"learning_rate": 4.979527222712266e-06,
"loss": 0.142,
"step": 449
},
{
"epoch": 0.20473157415832574,
"grad_norm": 2.1368144580931747,
"learning_rate": 4.9794358500256765e-06,
"loss": 0.1636,
"step": 450
},
{
"epoch": 0.2051865332120109,
"grad_norm": 1.9994448136168699,
"learning_rate": 4.979344274730786e-06,
"loss": 0.1604,
"step": 451
},
{
"epoch": 0.20564149226569609,
"grad_norm": 3.428795563882251,
"learning_rate": 4.979252496835079e-06,
"loss": 0.2394,
"step": 452
},
{
"epoch": 0.20609645131938126,
"grad_norm": 2.6996852974810768,
"learning_rate": 4.979160516346054e-06,
"loss": 0.2375,
"step": 453
},
{
"epoch": 0.20655141037306643,
"grad_norm": 1.9797680166732188,
"learning_rate": 4.979068333271227e-06,
"loss": 0.1842,
"step": 454
},
{
"epoch": 0.2070063694267516,
"grad_norm": 3.003957390141276,
"learning_rate": 4.978975947618131e-06,
"loss": 0.193,
"step": 455
},
{
"epoch": 0.20746132848043677,
"grad_norm": 2.00845771414247,
"learning_rate": 4.978883359394316e-06,
"loss": 0.198,
"step": 456
},
{
"epoch": 0.20791628753412192,
"grad_norm": 2.0203437551682186,
"learning_rate": 4.978790568607347e-06,
"loss": 0.1643,
"step": 457
},
{
"epoch": 0.2083712465878071,
"grad_norm": 2.112746362210305,
"learning_rate": 4.9786975752648076e-06,
"loss": 0.2327,
"step": 458
},
{
"epoch": 0.20882620564149226,
"grad_norm": 1.9220582393008747,
"learning_rate": 4.978604379374295e-06,
"loss": 0.1549,
"step": 459
},
{
"epoch": 0.20928116469517744,
"grad_norm": 2.1402572457657545,
"learning_rate": 4.978510980943427e-06,
"loss": 0.139,
"step": 460
},
{
"epoch": 0.2097361237488626,
"grad_norm": 2.4018554173698914,
"learning_rate": 4.978417379979834e-06,
"loss": 0.2455,
"step": 461
},
{
"epoch": 0.21019108280254778,
"grad_norm": 1.951258020011642,
"learning_rate": 4.978323576491165e-06,
"loss": 0.1552,
"step": 462
},
{
"epoch": 0.21064604185623295,
"grad_norm": 2.1010768496853323,
"learning_rate": 4.978229570485085e-06,
"loss": 0.2383,
"step": 463
},
{
"epoch": 0.2111010009099181,
"grad_norm": 1.5821441832613072,
"learning_rate": 4.978135361969276e-06,
"loss": 0.1851,
"step": 464
},
{
"epoch": 0.21155595996360327,
"grad_norm": 1.6009355908322205,
"learning_rate": 4.9780409509514375e-06,
"loss": 0.175,
"step": 465
},
{
"epoch": 0.21201091901728844,
"grad_norm": 1.8650365534886528,
"learning_rate": 4.977946337439282e-06,
"loss": 0.2302,
"step": 466
},
{
"epoch": 0.2124658780709736,
"grad_norm": 1.6321720020750403,
"learning_rate": 4.9778515214405436e-06,
"loss": 0.1919,
"step": 467
},
{
"epoch": 0.21292083712465878,
"grad_norm": 1.5102194582450883,
"learning_rate": 4.977756502962967e-06,
"loss": 0.1206,
"step": 468
},
{
"epoch": 0.21337579617834396,
"grad_norm": 2.069100224324352,
"learning_rate": 4.97766128201432e-06,
"loss": 0.1429,
"step": 469
},
{
"epoch": 0.21383075523202913,
"grad_norm": 1.8931152672148568,
"learning_rate": 4.977565858602381e-06,
"loss": 0.1634,
"step": 470
},
{
"epoch": 0.21428571428571427,
"grad_norm": 1.9388931474803874,
"learning_rate": 4.977470232734949e-06,
"loss": 0.1138,
"step": 471
},
{
"epoch": 0.21474067333939945,
"grad_norm": 2.52659442383892,
"learning_rate": 4.977374404419838e-06,
"loss": 0.2011,
"step": 472
},
{
"epoch": 0.21519563239308462,
"grad_norm": 1.9831728669000206,
"learning_rate": 4.977278373664877e-06,
"loss": 0.1475,
"step": 473
},
{
"epoch": 0.2156505914467698,
"grad_norm": 1.8342304339485977,
"learning_rate": 4.977182140477916e-06,
"loss": 0.1801,
"step": 474
},
{
"epoch": 0.21610555050045496,
"grad_norm": 1.9321185937866436,
"learning_rate": 4.977085704866817e-06,
"loss": 0.1787,
"step": 475
},
{
"epoch": 0.21656050955414013,
"grad_norm": 1.8230541452731504,
"learning_rate": 4.97698906683946e-06,
"loss": 0.202,
"step": 476
},
{
"epoch": 0.2170154686078253,
"grad_norm": 2.4982489548908062,
"learning_rate": 4.9768922264037435e-06,
"loss": 0.2283,
"step": 477
},
{
"epoch": 0.21747042766151045,
"grad_norm": 2.134742327126813,
"learning_rate": 4.976795183567579e-06,
"loss": 0.1544,
"step": 478
},
{
"epoch": 0.21792538671519562,
"grad_norm": 2.9581764452635184,
"learning_rate": 4.976697938338898e-06,
"loss": 0.1674,
"step": 479
},
{
"epoch": 0.2183803457688808,
"grad_norm": 1.712602080023381,
"learning_rate": 4.976600490725645e-06,
"loss": 0.1568,
"step": 480
},
{
"epoch": 0.21883530482256597,
"grad_norm": 1.7418610812844693,
"learning_rate": 4.976502840735785e-06,
"loss": 0.1945,
"step": 481
},
{
"epoch": 0.21929026387625114,
"grad_norm": 2.138071978494717,
"learning_rate": 4.976404988377297e-06,
"loss": 0.1512,
"step": 482
},
{
"epoch": 0.2197452229299363,
"grad_norm": 2.346885929916554,
"learning_rate": 4.976306933658176e-06,
"loss": 0.2262,
"step": 483
},
{
"epoch": 0.22020018198362148,
"grad_norm": 2.020074510485992,
"learning_rate": 4.976208676586435e-06,
"loss": 0.2141,
"step": 484
},
{
"epoch": 0.22065514103730663,
"grad_norm": 1.8763221281396283,
"learning_rate": 4.976110217170104e-06,
"loss": 0.1491,
"step": 485
},
{
"epoch": 0.2211101000909918,
"grad_norm": 2.235721601006219,
"learning_rate": 4.976011555417228e-06,
"loss": 0.2058,
"step": 486
},
{
"epoch": 0.22156505914467697,
"grad_norm": 1.315034818762656,
"learning_rate": 4.975912691335869e-06,
"loss": 0.1244,
"step": 487
},
{
"epoch": 0.22202001819836215,
"grad_norm": 2.1199398350029757,
"learning_rate": 4.975813624934106e-06,
"loss": 0.1412,
"step": 488
},
{
"epoch": 0.22247497725204732,
"grad_norm": 1.8709221572870474,
"learning_rate": 4.975714356220035e-06,
"loss": 0.1527,
"step": 489
},
{
"epoch": 0.2229299363057325,
"grad_norm": 2.2421419230230657,
"learning_rate": 4.975614885201766e-06,
"loss": 0.1608,
"step": 490
},
{
"epoch": 0.22338489535941766,
"grad_norm": 2.3078261939110454,
"learning_rate": 4.975515211887429e-06,
"loss": 0.1465,
"step": 491
},
{
"epoch": 0.22383985441310283,
"grad_norm": 1.5895485837834087,
"learning_rate": 4.9754153362851684e-06,
"loss": 0.1197,
"step": 492
},
{
"epoch": 0.22429481346678798,
"grad_norm": 1.7459488111256227,
"learning_rate": 4.975315258403145e-06,
"loss": 0.1528,
"step": 493
},
{
"epoch": 0.22474977252047315,
"grad_norm": 1.7723162295113712,
"learning_rate": 4.975214978249537e-06,
"loss": 0.192,
"step": 494
},
{
"epoch": 0.22520473157415832,
"grad_norm": 2.1669137038937905,
"learning_rate": 4.975114495832539e-06,
"loss": 0.2359,
"step": 495
},
{
"epoch": 0.2256596906278435,
"grad_norm": 2.0603228355359535,
"learning_rate": 4.975013811160362e-06,
"loss": 0.1745,
"step": 496
},
{
"epoch": 0.22611464968152867,
"grad_norm": 2.043894775326392,
"learning_rate": 4.974912924241233e-06,
"loss": 0.1624,
"step": 497
},
{
"epoch": 0.22656960873521384,
"grad_norm": 1.6841728525009554,
"learning_rate": 4.974811835083397e-06,
"loss": 0.2189,
"step": 498
},
{
"epoch": 0.227024567788899,
"grad_norm": 2.6366675854172335,
"learning_rate": 4.974710543695114e-06,
"loss": 0.2328,
"step": 499
},
{
"epoch": 0.22747952684258416,
"grad_norm": 2.4052804548672304,
"learning_rate": 4.974609050084661e-06,
"loss": 0.1886,
"step": 500
},
{
"epoch": 0.22793448589626933,
"grad_norm": 2.0535117318370633,
"learning_rate": 4.974507354260332e-06,
"loss": 0.2303,
"step": 501
},
{
"epoch": 0.2283894449499545,
"grad_norm": 2.0269029978513555,
"learning_rate": 4.974405456230436e-06,
"loss": 0.1671,
"step": 502
},
{
"epoch": 0.22884440400363967,
"grad_norm": 2.7642802872985293,
"learning_rate": 4.974303356003301e-06,
"loss": 0.1344,
"step": 503
},
{
"epoch": 0.22929936305732485,
"grad_norm": 1.7887955204908959,
"learning_rate": 4.974201053587268e-06,
"loss": 0.1681,
"step": 504
},
{
"epoch": 0.22975432211101002,
"grad_norm": 1.9742201804444028,
"learning_rate": 4.9740985489907005e-06,
"loss": 0.138,
"step": 505
},
{
"epoch": 0.2302092811646952,
"grad_norm": 2.166941374479256,
"learning_rate": 4.973995842221971e-06,
"loss": 0.1868,
"step": 506
},
{
"epoch": 0.23066424021838033,
"grad_norm": 2.225119335059734,
"learning_rate": 4.973892933289476e-06,
"loss": 0.1567,
"step": 507
},
{
"epoch": 0.2311191992720655,
"grad_norm": 1.8892762650773542,
"learning_rate": 4.97378982220162e-06,
"loss": 0.1488,
"step": 508
},
{
"epoch": 0.23157415832575068,
"grad_norm": 1.8158100523332013,
"learning_rate": 4.973686508966832e-06,
"loss": 0.1301,
"step": 509
},
{
"epoch": 0.23202911737943585,
"grad_norm": 2.0245407202628836,
"learning_rate": 4.973582993593554e-06,
"loss": 0.1695,
"step": 510
},
{
"epoch": 0.23248407643312102,
"grad_norm": 2.7034498126253674,
"learning_rate": 4.973479276090244e-06,
"loss": 0.1737,
"step": 511
},
{
"epoch": 0.2329390354868062,
"grad_norm": 2.065622568041038,
"learning_rate": 4.973375356465378e-06,
"loss": 0.149,
"step": 512
},
{
"epoch": 0.23339399454049137,
"grad_norm": 1.9812676900095911,
"learning_rate": 4.973271234727447e-06,
"loss": 0.173,
"step": 513
},
{
"epoch": 0.2338489535941765,
"grad_norm": 1.5726806580344541,
"learning_rate": 4.97316691088496e-06,
"loss": 0.1254,
"step": 514
},
{
"epoch": 0.23430391264786168,
"grad_norm": 2.191785122658953,
"learning_rate": 4.973062384946442e-06,
"loss": 0.2233,
"step": 515
},
{
"epoch": 0.23475887170154686,
"grad_norm": 1.035062440323858,
"learning_rate": 4.9729576569204345e-06,
"loss": 0.1013,
"step": 516
},
{
"epoch": 0.23521383075523203,
"grad_norm": 1.6618268618936451,
"learning_rate": 4.972852726815495e-06,
"loss": 0.1611,
"step": 517
},
{
"epoch": 0.2356687898089172,
"grad_norm": 1.3381515796606562,
"learning_rate": 4.972747594640197e-06,
"loss": 0.1669,
"step": 518
},
{
"epoch": 0.23612374886260237,
"grad_norm": 2.0887228759944327,
"learning_rate": 4.9726422604031335e-06,
"loss": 0.1718,
"step": 519
},
{
"epoch": 0.23657870791628755,
"grad_norm": 1.424194176219749,
"learning_rate": 4.97253672411291e-06,
"loss": 0.1771,
"step": 520
},
{
"epoch": 0.2370336669699727,
"grad_norm": 1.5373795467776654,
"learning_rate": 4.972430985778152e-06,
"loss": 0.1118,
"step": 521
},
{
"epoch": 0.23748862602365786,
"grad_norm": 2.6972031210443506,
"learning_rate": 4.972325045407499e-06,
"loss": 0.1702,
"step": 522
},
{
"epoch": 0.23794358507734303,
"grad_norm": 3.1350549460340957,
"learning_rate": 4.972218903009608e-06,
"loss": 0.2161,
"step": 523
},
{
"epoch": 0.2383985441310282,
"grad_norm": 2.1422204131037628,
"learning_rate": 4.972112558593153e-06,
"loss": 0.1902,
"step": 524
},
{
"epoch": 0.23885350318471338,
"grad_norm": 2.041726060026698,
"learning_rate": 4.972006012166823e-06,
"loss": 0.2079,
"step": 525
},
{
"epoch": 0.23930846223839855,
"grad_norm": 1.7346734861898188,
"learning_rate": 4.971899263739326e-06,
"loss": 0.1394,
"step": 526
},
{
"epoch": 0.23976342129208372,
"grad_norm": 1.959916622945104,
"learning_rate": 4.971792313319384e-06,
"loss": 0.1901,
"step": 527
},
{
"epoch": 0.24021838034576887,
"grad_norm": 1.6780700319385458,
"learning_rate": 4.971685160915737e-06,
"loss": 0.1623,
"step": 528
},
{
"epoch": 0.24067333939945404,
"grad_norm": 2.08830134651656,
"learning_rate": 4.971577806537139e-06,
"loss": 0.1607,
"step": 529
},
{
"epoch": 0.2411282984531392,
"grad_norm": 2.205231289993063,
"learning_rate": 4.971470250192366e-06,
"loss": 0.1851,
"step": 530
},
{
"epoch": 0.24158325750682438,
"grad_norm": 2.911292420170041,
"learning_rate": 4.9713624918902045e-06,
"loss": 0.2235,
"step": 531
},
{
"epoch": 0.24203821656050956,
"grad_norm": 2.1164751998531344,
"learning_rate": 4.971254531639461e-06,
"loss": 0.1556,
"step": 532
},
{
"epoch": 0.24249317561419473,
"grad_norm": 2.740398833115599,
"learning_rate": 4.971146369448957e-06,
"loss": 0.206,
"step": 533
},
{
"epoch": 0.2429481346678799,
"grad_norm": 1.797962382814168,
"learning_rate": 4.971038005327532e-06,
"loss": 0.161,
"step": 534
},
{
"epoch": 0.24340309372156507,
"grad_norm": 1.995555524717142,
"learning_rate": 4.970929439284039e-06,
"loss": 0.1808,
"step": 535
},
{
"epoch": 0.24385805277525022,
"grad_norm": 2.1172122131281927,
"learning_rate": 4.970820671327351e-06,
"loss": 0.189,
"step": 536
},
{
"epoch": 0.2443130118289354,
"grad_norm": 1.8090573461125563,
"learning_rate": 4.9707117014663565e-06,
"loss": 0.1522,
"step": 537
},
{
"epoch": 0.24476797088262056,
"grad_norm": 1.8419040286839186,
"learning_rate": 4.97060252970996e-06,
"loss": 0.2046,
"step": 538
},
{
"epoch": 0.24522292993630573,
"grad_norm": 2.268977185876009,
"learning_rate": 4.970493156067081e-06,
"loss": 0.2247,
"step": 539
},
{
"epoch": 0.2456778889899909,
"grad_norm": 2.1193932268543314,
"learning_rate": 4.970383580546658e-06,
"loss": 0.159,
"step": 540
},
{
"epoch": 0.24613284804367608,
"grad_norm": 2.173218123449192,
"learning_rate": 4.970273803157645e-06,
"loss": 0.1851,
"step": 541
},
{
"epoch": 0.24658780709736125,
"grad_norm": 1.9062873437813912,
"learning_rate": 4.970163823909013e-06,
"loss": 0.1431,
"step": 542
},
{
"epoch": 0.2470427661510464,
"grad_norm": 2.2598849919184936,
"learning_rate": 4.970053642809748e-06,
"loss": 0.1831,
"step": 543
},
{
"epoch": 0.24749772520473157,
"grad_norm": 2.181038873894579,
"learning_rate": 4.969943259868853e-06,
"loss": 0.1924,
"step": 544
},
{
"epoch": 0.24795268425841674,
"grad_norm": 1.8247639377537164,
"learning_rate": 4.969832675095351e-06,
"loss": 0.151,
"step": 545
},
{
"epoch": 0.2484076433121019,
"grad_norm": 1.9978374370947616,
"learning_rate": 4.969721888498275e-06,
"loss": 0.2343,
"step": 546
},
{
"epoch": 0.24886260236578708,
"grad_norm": 2.0040249698932953,
"learning_rate": 4.96961090008668e-06,
"loss": 0.144,
"step": 547
},
{
"epoch": 0.24931756141947226,
"grad_norm": 1.58491785029609,
"learning_rate": 4.969499709869635e-06,
"loss": 0.2297,
"step": 548
},
{
"epoch": 0.24977252047315743,
"grad_norm": 1.9099928105281807,
"learning_rate": 4.969388317856225e-06,
"loss": 0.1643,
"step": 549
},
{
"epoch": 0.2502274795268426,
"grad_norm": 2.506622826362881,
"learning_rate": 4.969276724055554e-06,
"loss": 0.2302,
"step": 550
},
{
"epoch": 0.25068243858052774,
"grad_norm": 1.886779327578952,
"learning_rate": 4.969164928476741e-06,
"loss": 0.1305,
"step": 551
},
{
"epoch": 0.25113739763421294,
"grad_norm": 2.193853436337964,
"learning_rate": 4.969052931128919e-06,
"loss": 0.1942,
"step": 552
},
{
"epoch": 0.2515923566878981,
"grad_norm": 1.696380624819296,
"learning_rate": 4.968940732021243e-06,
"loss": 0.1757,
"step": 553
},
{
"epoch": 0.25204731574158323,
"grad_norm": 1.9308212907452063,
"learning_rate": 4.9688283311628795e-06,
"loss": 0.1953,
"step": 554
},
{
"epoch": 0.25250227479526843,
"grad_norm": 2.2015952320833927,
"learning_rate": 4.968715728563014e-06,
"loss": 0.2188,
"step": 555
},
{
"epoch": 0.2529572338489536,
"grad_norm": 1.8518723960249535,
"learning_rate": 4.968602924230847e-06,
"loss": 0.1439,
"step": 556
},
{
"epoch": 0.2534121929026388,
"grad_norm": 3.211322079508386,
"learning_rate": 4.968489918175598e-06,
"loss": 0.1758,
"step": 557
},
{
"epoch": 0.2538671519563239,
"grad_norm": 2.949982147696011,
"learning_rate": 4.9683767104065014e-06,
"loss": 0.1802,
"step": 558
},
{
"epoch": 0.2543221110100091,
"grad_norm": 2.2092600896288697,
"learning_rate": 4.968263300932806e-06,
"loss": 0.1898,
"step": 559
},
{
"epoch": 0.25477707006369427,
"grad_norm": 1.7931135921014567,
"learning_rate": 4.968149689763781e-06,
"loss": 0.1544,
"step": 560
},
{
"epoch": 0.2552320291173794,
"grad_norm": 1.7030840422806155,
"learning_rate": 4.968035876908708e-06,
"loss": 0.1639,
"step": 561
},
{
"epoch": 0.2556869881710646,
"grad_norm": 1.8718848217622976,
"learning_rate": 4.967921862376889e-06,
"loss": 0.2434,
"step": 562
},
{
"epoch": 0.25614194722474976,
"grad_norm": 2.2371670340279235,
"learning_rate": 4.9678076461776415e-06,
"loss": 0.2335,
"step": 563
},
{
"epoch": 0.25659690627843496,
"grad_norm": 1.8393455682211606,
"learning_rate": 4.9676932283202965e-06,
"loss": 0.1499,
"step": 564
},
{
"epoch": 0.2570518653321201,
"grad_norm": 2.4142531578801387,
"learning_rate": 4.967578608814205e-06,
"loss": 0.1949,
"step": 565
},
{
"epoch": 0.2575068243858053,
"grad_norm": 2.0642965255799735,
"learning_rate": 4.9674637876687345e-06,
"loss": 0.1858,
"step": 566
},
{
"epoch": 0.25796178343949044,
"grad_norm": 1.2532956879082058,
"learning_rate": 4.967348764893265e-06,
"loss": 0.1256,
"step": 567
},
{
"epoch": 0.2584167424931756,
"grad_norm": 2.1919850476807574,
"learning_rate": 4.967233540497197e-06,
"loss": 0.1554,
"step": 568
},
{
"epoch": 0.2588717015468608,
"grad_norm": 2.1554599148015186,
"learning_rate": 4.967118114489946e-06,
"loss": 0.2131,
"step": 569
},
{
"epoch": 0.25932666060054593,
"grad_norm": 1.7423629235975449,
"learning_rate": 4.967002486880944e-06,
"loss": 0.1488,
"step": 570
},
{
"epoch": 0.25978161965423113,
"grad_norm": 2.7181048243188,
"learning_rate": 4.966886657679641e-06,
"loss": 0.2501,
"step": 571
},
{
"epoch": 0.2602365787079163,
"grad_norm": 1.6717232797306434,
"learning_rate": 4.966770626895499e-06,
"loss": 0.1664,
"step": 572
},
{
"epoch": 0.2606915377616015,
"grad_norm": 2.1767030645167162,
"learning_rate": 4.966654394538002e-06,
"loss": 0.1921,
"step": 573
},
{
"epoch": 0.2611464968152866,
"grad_norm": 1.2471699088039891,
"learning_rate": 4.966537960616646e-06,
"loss": 0.0848,
"step": 574
},
{
"epoch": 0.26160145586897177,
"grad_norm": 2.0523431055962402,
"learning_rate": 4.9664213251409486e-06,
"loss": 0.2032,
"step": 575
},
{
"epoch": 0.26205641492265697,
"grad_norm": 1.9891959124678449,
"learning_rate": 4.9663044881204375e-06,
"loss": 0.1962,
"step": 576
},
{
"epoch": 0.2625113739763421,
"grad_norm": 2.0537761706631947,
"learning_rate": 4.9661874495646615e-06,
"loss": 0.1484,
"step": 577
},
{
"epoch": 0.2629663330300273,
"grad_norm": 1.7414302230897167,
"learning_rate": 4.9660702094831845e-06,
"loss": 0.1959,
"step": 578
},
{
"epoch": 0.26342129208371245,
"grad_norm": 2.975109707839724,
"learning_rate": 4.965952767885587e-06,
"loss": 0.215,
"step": 579
},
{
"epoch": 0.26387625113739765,
"grad_norm": 3.1187687651037126,
"learning_rate": 4.965835124781465e-06,
"loss": 0.2326,
"step": 580
},
{
"epoch": 0.2643312101910828,
"grad_norm": 1.7844067959067744,
"learning_rate": 4.965717280180432e-06,
"loss": 0.1616,
"step": 581
},
{
"epoch": 0.26478616924476794,
"grad_norm": 1.981807539010698,
"learning_rate": 4.965599234092118e-06,
"loss": 0.1275,
"step": 582
},
{
"epoch": 0.26524112829845314,
"grad_norm": 2.3418573353915964,
"learning_rate": 4.96548098652617e-06,
"loss": 0.2029,
"step": 583
},
{
"epoch": 0.2656960873521383,
"grad_norm": 1.9501727944201128,
"learning_rate": 4.965362537492249e-06,
"loss": 0.1839,
"step": 584
},
{
"epoch": 0.2661510464058235,
"grad_norm": 1.735679302563917,
"learning_rate": 4.9652438870000356e-06,
"loss": 0.185,
"step": 585
},
{
"epoch": 0.26660600545950863,
"grad_norm": 1.3821743738209817,
"learning_rate": 4.965125035059224e-06,
"loss": 0.117,
"step": 586
},
{
"epoch": 0.26706096451319383,
"grad_norm": 2.0524973617804196,
"learning_rate": 4.965005981679527e-06,
"loss": 0.1563,
"step": 587
},
{
"epoch": 0.267515923566879,
"grad_norm": 2.2596791906895395,
"learning_rate": 4.964886726870673e-06,
"loss": 0.2165,
"step": 588
},
{
"epoch": 0.2679708826205642,
"grad_norm": 1.890432704603994,
"learning_rate": 4.964767270642407e-06,
"loss": 0.1884,
"step": 589
},
{
"epoch": 0.2684258416742493,
"grad_norm": 1.6149961858038402,
"learning_rate": 4.964647613004491e-06,
"loss": 0.1353,
"step": 590
},
{
"epoch": 0.26888080072793447,
"grad_norm": 1.7116103543510561,
"learning_rate": 4.964527753966702e-06,
"loss": 0.1403,
"step": 591
},
{
"epoch": 0.26933575978161967,
"grad_norm": 2.400216438390535,
"learning_rate": 4.964407693538834e-06,
"loss": 0.1712,
"step": 592
},
{
"epoch": 0.2697907188353048,
"grad_norm": 2.3569276822171012,
"learning_rate": 4.9642874317307e-06,
"loss": 0.2541,
"step": 593
},
{
"epoch": 0.27024567788899,
"grad_norm": 1.3583233690609127,
"learning_rate": 4.964166968552124e-06,
"loss": 0.1881,
"step": 594
},
{
"epoch": 0.27070063694267515,
"grad_norm": 2.041956563972623,
"learning_rate": 4.9640463040129525e-06,
"loss": 0.2013,
"step": 595
},
{
"epoch": 0.27115559599636035,
"grad_norm": 2.1339742915351083,
"learning_rate": 4.963925438123044e-06,
"loss": 0.1486,
"step": 596
},
{
"epoch": 0.2716105550500455,
"grad_norm": 2.3589739110244947,
"learning_rate": 4.963804370892276e-06,
"loss": 0.1671,
"step": 597
},
{
"epoch": 0.27206551410373064,
"grad_norm": 2.041024711316621,
"learning_rate": 4.9636831023305405e-06,
"loss": 0.1773,
"step": 598
},
{
"epoch": 0.27252047315741584,
"grad_norm": 1.6966086145560721,
"learning_rate": 4.963561632447748e-06,
"loss": 0.1536,
"step": 599
},
{
"epoch": 0.272975432211101,
"grad_norm": 1.7956646862639238,
"learning_rate": 4.9634399612538255e-06,
"loss": 0.1665,
"step": 600
},
{
"epoch": 0.2734303912647862,
"grad_norm": 2.4039450245635816,
"learning_rate": 4.963318088758714e-06,
"loss": 0.186,
"step": 601
},
{
"epoch": 0.27388535031847133,
"grad_norm": 2.573374996121704,
"learning_rate": 4.963196014972371e-06,
"loss": 0.181,
"step": 602
},
{
"epoch": 0.27434030937215653,
"grad_norm": 2.3031446562333158,
"learning_rate": 4.963073739904775e-06,
"loss": 0.1896,
"step": 603
},
{
"epoch": 0.2747952684258417,
"grad_norm": 2.9296704327439533,
"learning_rate": 4.962951263565915e-06,
"loss": 0.2168,
"step": 604
},
{
"epoch": 0.2752502274795268,
"grad_norm": 2.3617995527569557,
"learning_rate": 4.962828585965801e-06,
"loss": 0.1815,
"step": 605
},
{
"epoch": 0.275705186533212,
"grad_norm": 2.1546354601106956,
"learning_rate": 4.962705707114457e-06,
"loss": 0.1658,
"step": 606
},
{
"epoch": 0.27616014558689717,
"grad_norm": 1.9872717123396686,
"learning_rate": 4.962582627021923e-06,
"loss": 0.1885,
"step": 607
},
{
"epoch": 0.27661510464058237,
"grad_norm": 2.3902452238732077,
"learning_rate": 4.962459345698258e-06,
"loss": 0.1934,
"step": 608
},
{
"epoch": 0.2770700636942675,
"grad_norm": 2.6613012891469334,
"learning_rate": 4.962335863153537e-06,
"loss": 0.2002,
"step": 609
},
{
"epoch": 0.2775250227479527,
"grad_norm": 1.5351443788779375,
"learning_rate": 4.962212179397847e-06,
"loss": 0.1524,
"step": 610
},
{
"epoch": 0.27797998180163785,
"grad_norm": 1.8149311646504362,
"learning_rate": 4.962088294441299e-06,
"loss": 0.1091,
"step": 611
},
{
"epoch": 0.278434940855323,
"grad_norm": 1.6923849341814876,
"learning_rate": 4.9619642082940135e-06,
"loss": 0.2258,
"step": 612
},
{
"epoch": 0.2788898999090082,
"grad_norm": 2.300540388195039,
"learning_rate": 4.9618399209661305e-06,
"loss": 0.1544,
"step": 613
},
{
"epoch": 0.27934485896269334,
"grad_norm": 2.2841254960366375,
"learning_rate": 4.961715432467807e-06,
"loss": 0.1537,
"step": 614
},
{
"epoch": 0.27979981801637854,
"grad_norm": 2.1565671846973764,
"learning_rate": 4.961590742809216e-06,
"loss": 0.1818,
"step": 615
},
{
"epoch": 0.2802547770700637,
"grad_norm": 1.4848634903553593,
"learning_rate": 4.961465852000545e-06,
"loss": 0.1379,
"step": 616
},
{
"epoch": 0.2807097361237489,
"grad_norm": 2.886386939634882,
"learning_rate": 4.961340760052001e-06,
"loss": 0.2137,
"step": 617
},
{
"epoch": 0.28116469517743403,
"grad_norm": 2.12342498143493,
"learning_rate": 4.961215466973806e-06,
"loss": 0.1609,
"step": 618
},
{
"epoch": 0.2816196542311192,
"grad_norm": 1.6272561537794945,
"learning_rate": 4.961089972776197e-06,
"loss": 0.1704,
"step": 619
},
{
"epoch": 0.2820746132848044,
"grad_norm": 2.177134514236334,
"learning_rate": 4.9609642774694285e-06,
"loss": 0.1844,
"step": 620
},
{
"epoch": 0.2825295723384895,
"grad_norm": 2.0060823387879396,
"learning_rate": 4.960838381063774e-06,
"loss": 0.1639,
"step": 621
},
{
"epoch": 0.2829845313921747,
"grad_norm": 2.0396430448753047,
"learning_rate": 4.960712283569521e-06,
"loss": 0.1832,
"step": 622
},
{
"epoch": 0.28343949044585987,
"grad_norm": 2.1577816713540345,
"learning_rate": 4.960585984996971e-06,
"loss": 0.1795,
"step": 623
},
{
"epoch": 0.28389444949954507,
"grad_norm": 2.1362683979802997,
"learning_rate": 4.960459485356447e-06,
"loss": 0.2442,
"step": 624
},
{
"epoch": 0.2843494085532302,
"grad_norm": 1.7854499328292173,
"learning_rate": 4.960332784658285e-06,
"loss": 0.1461,
"step": 625
},
{
"epoch": 0.28480436760691535,
"grad_norm": 2.1713858060672218,
"learning_rate": 4.960205882912839e-06,
"loss": 0.1743,
"step": 626
},
{
"epoch": 0.28525932666060055,
"grad_norm": 2.143444693552156,
"learning_rate": 4.9600787801304785e-06,
"loss": 0.2084,
"step": 627
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.8522682250986475,
"learning_rate": 4.959951476321589e-06,
"loss": 0.1946,
"step": 628
},
{
"epoch": 0.2861692447679709,
"grad_norm": 1.5982375639062243,
"learning_rate": 4.959823971496575e-06,
"loss": 0.1772,
"step": 629
},
{
"epoch": 0.28662420382165604,
"grad_norm": 1.8898991951503732,
"learning_rate": 4.959696265665853e-06,
"loss": 0.1804,
"step": 630
},
{
"epoch": 0.28707916287534124,
"grad_norm": 1.9040168480447408,
"learning_rate": 4.959568358839862e-06,
"loss": 0.1258,
"step": 631
},
{
"epoch": 0.2875341219290264,
"grad_norm": 1.8463510477056075,
"learning_rate": 4.95944025102905e-06,
"loss": 0.1414,
"step": 632
},
{
"epoch": 0.28798908098271153,
"grad_norm": 2.3179780847953055,
"learning_rate": 4.959311942243888e-06,
"loss": 0.2031,
"step": 633
},
{
"epoch": 0.28844404003639673,
"grad_norm": 1.724174452868963,
"learning_rate": 4.95918343249486e-06,
"loss": 0.1377,
"step": 634
},
{
"epoch": 0.2888989990900819,
"grad_norm": 1.7281757474887716,
"learning_rate": 4.959054721792469e-06,
"loss": 0.2074,
"step": 635
},
{
"epoch": 0.2893539581437671,
"grad_norm": 1.749321520269807,
"learning_rate": 4.958925810147231e-06,
"loss": 0.104,
"step": 636
},
{
"epoch": 0.2898089171974522,
"grad_norm": 1.8727315308914843,
"learning_rate": 4.958796697569679e-06,
"loss": 0.1325,
"step": 637
},
{
"epoch": 0.2902638762511374,
"grad_norm": 2.800322102970211,
"learning_rate": 4.958667384070365e-06,
"loss": 0.1583,
"step": 638
},
{
"epoch": 0.29071883530482256,
"grad_norm": 1.7822844611072806,
"learning_rate": 4.958537869659855e-06,
"loss": 0.2057,
"step": 639
},
{
"epoch": 0.2911737943585077,
"grad_norm": 2.745456907200946,
"learning_rate": 4.958408154348734e-06,
"loss": 0.1605,
"step": 640
},
{
"epoch": 0.2916287534121929,
"grad_norm": 2.233718920040976,
"learning_rate": 4.9582782381476e-06,
"loss": 0.1996,
"step": 641
},
{
"epoch": 0.29208371246587805,
"grad_norm": 2.2702620107271567,
"learning_rate": 4.958148121067071e-06,
"loss": 0.2927,
"step": 642
},
{
"epoch": 0.29253867151956325,
"grad_norm": 2.150177934476292,
"learning_rate": 4.9580178031177775e-06,
"loss": 0.1949,
"step": 643
},
{
"epoch": 0.2929936305732484,
"grad_norm": 1.4333466510228,
"learning_rate": 4.9578872843103694e-06,
"loss": 0.1481,
"step": 644
},
{
"epoch": 0.2934485896269336,
"grad_norm": 1.8148623461294702,
"learning_rate": 4.957756564655513e-06,
"loss": 0.1736,
"step": 645
},
{
"epoch": 0.29390354868061874,
"grad_norm": 1.8574102016300988,
"learning_rate": 4.957625644163888e-06,
"loss": 0.1893,
"step": 646
},
{
"epoch": 0.2943585077343039,
"grad_norm": 2.0598318825039694,
"learning_rate": 4.957494522846194e-06,
"loss": 0.1511,
"step": 647
},
{
"epoch": 0.2948134667879891,
"grad_norm": 1.8631745332908447,
"learning_rate": 4.957363200713146e-06,
"loss": 0.2403,
"step": 648
},
{
"epoch": 0.29526842584167423,
"grad_norm": 1.934970676847201,
"learning_rate": 4.957231677775475e-06,
"loss": 0.1782,
"step": 649
},
{
"epoch": 0.29572338489535943,
"grad_norm": 2.162311103918465,
"learning_rate": 4.957099954043928e-06,
"loss": 0.1894,
"step": 650
},
{
"epoch": 0.2961783439490446,
"grad_norm": 1.3750044807559711,
"learning_rate": 4.956968029529269e-06,
"loss": 0.1948,
"step": 651
},
{
"epoch": 0.2966333030027298,
"grad_norm": 1.7571318861097756,
"learning_rate": 4.956835904242277e-06,
"loss": 0.1715,
"step": 652
},
{
"epoch": 0.2970882620564149,
"grad_norm": 1.964585802559125,
"learning_rate": 4.9567035781937516e-06,
"loss": 0.1103,
"step": 653
},
{
"epoch": 0.29754322111010006,
"grad_norm": 1.9039563589608381,
"learning_rate": 4.9565710513945024e-06,
"loss": 0.1668,
"step": 654
},
{
"epoch": 0.29799818016378526,
"grad_norm": 1.837562224402912,
"learning_rate": 4.956438323855362e-06,
"loss": 0.129,
"step": 655
},
{
"epoch": 0.2984531392174704,
"grad_norm": 1.7630804326653742,
"learning_rate": 4.956305395587174e-06,
"loss": 0.1906,
"step": 656
},
{
"epoch": 0.2989080982711556,
"grad_norm": 1.910058814511253,
"learning_rate": 4.956172266600802e-06,
"loss": 0.124,
"step": 657
},
{
"epoch": 0.29936305732484075,
"grad_norm": 2.2105167684195757,
"learning_rate": 4.956038936907125e-06,
"loss": 0.1593,
"step": 658
},
{
"epoch": 0.29981801637852595,
"grad_norm": 2.253935685217962,
"learning_rate": 4.955905406517036e-06,
"loss": 0.1581,
"step": 659
},
{
"epoch": 0.3002729754322111,
"grad_norm": 2.5313373580598424,
"learning_rate": 4.95577167544145e-06,
"loss": 0.1813,
"step": 660
},
{
"epoch": 0.30072793448589624,
"grad_norm": 2.406722714489674,
"learning_rate": 4.955637743691291e-06,
"loss": 0.1633,
"step": 661
},
{
"epoch": 0.30118289353958144,
"grad_norm": 2.4238606966439487,
"learning_rate": 4.955503611277506e-06,
"loss": 0.1917,
"step": 662
},
{
"epoch": 0.3016378525932666,
"grad_norm": 2.6124988273739893,
"learning_rate": 4.955369278211055e-06,
"loss": 0.2094,
"step": 663
},
{
"epoch": 0.3020928116469518,
"grad_norm": 2.976761995472576,
"learning_rate": 4.955234744502914e-06,
"loss": 0.1909,
"step": 664
},
{
"epoch": 0.30254777070063693,
"grad_norm": 2.0362637594213053,
"learning_rate": 4.955100010164079e-06,
"loss": 0.1968,
"step": 665
},
{
"epoch": 0.30300272975432213,
"grad_norm": 1.8717270849356715,
"learning_rate": 4.954965075205557e-06,
"loss": 0.1612,
"step": 666
},
{
"epoch": 0.3034576888080073,
"grad_norm": 2.4021794148968953,
"learning_rate": 4.9548299396383755e-06,
"loss": 0.2181,
"step": 667
},
{
"epoch": 0.3039126478616925,
"grad_norm": 2.1388957119580367,
"learning_rate": 4.954694603473578e-06,
"loss": 0.1692,
"step": 668
},
{
"epoch": 0.3043676069153776,
"grad_norm": 2.1096028848377855,
"learning_rate": 4.954559066722222e-06,
"loss": 0.204,
"step": 669
},
{
"epoch": 0.30482256596906276,
"grad_norm": 1.9629095047383018,
"learning_rate": 4.954423329395385e-06,
"loss": 0.1997,
"step": 670
},
{
"epoch": 0.30527752502274796,
"grad_norm": 1.9442418917085225,
"learning_rate": 4.954287391504156e-06,
"loss": 0.1944,
"step": 671
},
{
"epoch": 0.3057324840764331,
"grad_norm": 2.229272182184504,
"learning_rate": 4.9541512530596455e-06,
"loss": 0.2029,
"step": 672
},
{
"epoch": 0.3061874431301183,
"grad_norm": 2.080623617831735,
"learning_rate": 4.954014914072978e-06,
"loss": 0.1881,
"step": 673
},
{
"epoch": 0.30664240218380345,
"grad_norm": 1.3909729404333016,
"learning_rate": 4.9538783745552934e-06,
"loss": 0.1446,
"step": 674
},
{
"epoch": 0.30709736123748865,
"grad_norm": 2.5204656795127303,
"learning_rate": 4.95374163451775e-06,
"loss": 0.2251,
"step": 675
},
{
"epoch": 0.3075523202911738,
"grad_norm": 2.8855471273631585,
"learning_rate": 4.953604693971521e-06,
"loss": 0.1832,
"step": 676
},
{
"epoch": 0.30800727934485894,
"grad_norm": 2.415452060739297,
"learning_rate": 4.953467552927798e-06,
"loss": 0.188,
"step": 677
},
{
"epoch": 0.30846223839854414,
"grad_norm": 3.3704774970598215,
"learning_rate": 4.9533302113977845e-06,
"loss": 0.2644,
"step": 678
},
{
"epoch": 0.3089171974522293,
"grad_norm": 3.0964762790397233,
"learning_rate": 4.9531926693927055e-06,
"loss": 0.1891,
"step": 679
},
{
"epoch": 0.3093721565059145,
"grad_norm": 2.3617921935041646,
"learning_rate": 4.953054926923801e-06,
"loss": 0.1791,
"step": 680
},
{
"epoch": 0.30982711555959963,
"grad_norm": 2.1015907363587836,
"learning_rate": 4.952916984002325e-06,
"loss": 0.154,
"step": 681
},
{
"epoch": 0.31028207461328483,
"grad_norm": 2.5909443467360944,
"learning_rate": 4.95277884063955e-06,
"loss": 0.1758,
"step": 682
},
{
"epoch": 0.31073703366697,
"grad_norm": 1.9161503782177982,
"learning_rate": 4.952640496846766e-06,
"loss": 0.1883,
"step": 683
},
{
"epoch": 0.3111919927206551,
"grad_norm": 2.2723462143890187,
"learning_rate": 4.952501952635276e-06,
"loss": 0.1813,
"step": 684
},
{
"epoch": 0.3116469517743403,
"grad_norm": 1.5779544920569608,
"learning_rate": 4.952363208016402e-06,
"loss": 0.183,
"step": 685
},
{
"epoch": 0.31210191082802546,
"grad_norm": 2.3768180703064834,
"learning_rate": 4.952224263001482e-06,
"loss": 0.139,
"step": 686
},
{
"epoch": 0.31255686988171066,
"grad_norm": 1.7932474239157794,
"learning_rate": 4.952085117601868e-06,
"loss": 0.1698,
"step": 687
},
{
"epoch": 0.3130118289353958,
"grad_norm": 2.1109045834120157,
"learning_rate": 4.951945771828933e-06,
"loss": 0.2482,
"step": 688
},
{
"epoch": 0.313466787989081,
"grad_norm": 1.6399625432585407,
"learning_rate": 4.951806225694061e-06,
"loss": 0.1809,
"step": 689
},
{
"epoch": 0.31392174704276615,
"grad_norm": 2.610023079079643,
"learning_rate": 4.951666479208658e-06,
"loss": 0.1964,
"step": 690
},
{
"epoch": 0.3143767060964513,
"grad_norm": 2.574945774612913,
"learning_rate": 4.951526532384141e-06,
"loss": 0.1827,
"step": 691
},
{
"epoch": 0.3148316651501365,
"grad_norm": 1.8594925752682625,
"learning_rate": 4.951386385231946e-06,
"loss": 0.1674,
"step": 692
},
{
"epoch": 0.31528662420382164,
"grad_norm": 1.6516261883969883,
"learning_rate": 4.951246037763528e-06,
"loss": 0.1342,
"step": 693
},
{
"epoch": 0.31574158325750684,
"grad_norm": 1.8608275979712807,
"learning_rate": 4.9511054899903524e-06,
"loss": 0.1657,
"step": 694
},
{
"epoch": 0.316196542311192,
"grad_norm": 2.3555359764575545,
"learning_rate": 4.950964741923905e-06,
"loss": 0.2022,
"step": 695
},
{
"epoch": 0.3166515013648772,
"grad_norm": 1.782390866267192,
"learning_rate": 4.950823793575688e-06,
"loss": 0.1517,
"step": 696
},
{
"epoch": 0.31710646041856233,
"grad_norm": 2.001725151610439,
"learning_rate": 4.950682644957218e-06,
"loss": 0.1745,
"step": 697
},
{
"epoch": 0.3175614194722475,
"grad_norm": 2.6801559375906585,
"learning_rate": 4.9505412960800295e-06,
"loss": 0.2196,
"step": 698
},
{
"epoch": 0.3180163785259327,
"grad_norm": 2.0435969601142583,
"learning_rate": 4.950399746955673e-06,
"loss": 0.1823,
"step": 699
},
{
"epoch": 0.3184713375796178,
"grad_norm": 3.135001392998494,
"learning_rate": 4.950257997595716e-06,
"loss": 0.1932,
"step": 700
},
{
"epoch": 0.318926296633303,
"grad_norm": 2.3774677479838484,
"learning_rate": 4.950116048011739e-06,
"loss": 0.1905,
"step": 701
},
{
"epoch": 0.31938125568698816,
"grad_norm": 1.8516165333723722,
"learning_rate": 4.949973898215344e-06,
"loss": 0.1503,
"step": 702
},
{
"epoch": 0.31983621474067336,
"grad_norm": 2.343561651154435,
"learning_rate": 4.949831548218146e-06,
"loss": 0.1441,
"step": 703
},
{
"epoch": 0.3202911737943585,
"grad_norm": 1.8104402427163653,
"learning_rate": 4.949688998031777e-06,
"loss": 0.1558,
"step": 704
},
{
"epoch": 0.32074613284804365,
"grad_norm": 2.144991489680201,
"learning_rate": 4.949546247667886e-06,
"loss": 0.1305,
"step": 705
},
{
"epoch": 0.32120109190172885,
"grad_norm": 1.8279214675219737,
"learning_rate": 4.949403297138137e-06,
"loss": 0.1336,
"step": 706
},
{
"epoch": 0.321656050955414,
"grad_norm": 2.3674168986503767,
"learning_rate": 4.949260146454212e-06,
"loss": 0.1764,
"step": 707
},
{
"epoch": 0.3221110100090992,
"grad_norm": 1.6483989227538907,
"learning_rate": 4.94911679562781e-06,
"loss": 0.159,
"step": 708
},
{
"epoch": 0.32256596906278434,
"grad_norm": 2.038187279529794,
"learning_rate": 4.948973244670643e-06,
"loss": 0.1485,
"step": 709
},
{
"epoch": 0.32302092811646954,
"grad_norm": 2.41476196989692,
"learning_rate": 4.948829493594441e-06,
"loss": 0.2091,
"step": 710
},
{
"epoch": 0.3234758871701547,
"grad_norm": 2.222757795496577,
"learning_rate": 4.9486855424109524e-06,
"loss": 0.1503,
"step": 711
},
{
"epoch": 0.32393084622383983,
"grad_norm": 1.850862512986181,
"learning_rate": 4.948541391131939e-06,
"loss": 0.1505,
"step": 712
},
{
"epoch": 0.32438580527752503,
"grad_norm": 2.3940666777003137,
"learning_rate": 4.948397039769181e-06,
"loss": 0.1578,
"step": 713
},
{
"epoch": 0.3248407643312102,
"grad_norm": 2.0487809609035113,
"learning_rate": 4.948252488334474e-06,
"loss": 0.1327,
"step": 714
},
{
"epoch": 0.3252957233848954,
"grad_norm": 1.4541195656219779,
"learning_rate": 4.948107736839629e-06,
"loss": 0.1994,
"step": 715
},
{
"epoch": 0.3257506824385805,
"grad_norm": 1.6302160419859526,
"learning_rate": 4.947962785296476e-06,
"loss": 0.1665,
"step": 716
},
{
"epoch": 0.3262056414922657,
"grad_norm": 2.761516841692211,
"learning_rate": 4.9478176337168594e-06,
"loss": 0.1622,
"step": 717
},
{
"epoch": 0.32666060054595086,
"grad_norm": 2.2365611293446865,
"learning_rate": 4.9476722821126386e-06,
"loss": 0.2191,
"step": 718
},
{
"epoch": 0.327115559599636,
"grad_norm": 2.267629869433733,
"learning_rate": 4.9475267304956945e-06,
"loss": 0.1608,
"step": 719
},
{
"epoch": 0.3275705186533212,
"grad_norm": 2.8370903035030812,
"learning_rate": 4.947380978877917e-06,
"loss": 0.2059,
"step": 720
},
{
"epoch": 0.32802547770700635,
"grad_norm": 1.7629045012494435,
"learning_rate": 4.947235027271219e-06,
"loss": 0.1644,
"step": 721
},
{
"epoch": 0.32848043676069155,
"grad_norm": 1.7514209523720954,
"learning_rate": 4.9470888756875265e-06,
"loss": 0.1443,
"step": 722
},
{
"epoch": 0.3289353958143767,
"grad_norm": 1.996409560436198,
"learning_rate": 4.946942524138782e-06,
"loss": 0.1589,
"step": 723
},
{
"epoch": 0.3293903548680619,
"grad_norm": 1.9499597954033492,
"learning_rate": 4.946795972636944e-06,
"loss": 0.1856,
"step": 724
},
{
"epoch": 0.32984531392174704,
"grad_norm": 1.6935756093459424,
"learning_rate": 4.94664922119399e-06,
"loss": 0.1866,
"step": 725
},
{
"epoch": 0.3303002729754322,
"grad_norm": 2.2750870343308818,
"learning_rate": 4.94650226982191e-06,
"loss": 0.1894,
"step": 726
},
{
"epoch": 0.3307552320291174,
"grad_norm": 1.7773678651655342,
"learning_rate": 4.9463551185327115e-06,
"loss": 0.2623,
"step": 727
},
{
"epoch": 0.33121019108280253,
"grad_norm": 2.3870710697996302,
"learning_rate": 4.946207767338422e-06,
"loss": 0.1708,
"step": 728
},
{
"epoch": 0.33166515013648773,
"grad_norm": 1.8969974183881673,
"learning_rate": 4.9460602162510805e-06,
"loss": 0.1758,
"step": 729
},
{
"epoch": 0.3321201091901729,
"grad_norm": 1.9352911073022974,
"learning_rate": 4.945912465282744e-06,
"loss": 0.1199,
"step": 730
},
{
"epoch": 0.3325750682438581,
"grad_norm": 1.8878423547131853,
"learning_rate": 4.945764514445487e-06,
"loss": 0.2117,
"step": 731
},
{
"epoch": 0.3330300272975432,
"grad_norm": 2.575730274178936,
"learning_rate": 4.9456163637513986e-06,
"loss": 0.2044,
"step": 732
},
{
"epoch": 0.33348498635122836,
"grad_norm": 2.7338168638267066,
"learning_rate": 4.945468013212585e-06,
"loss": 0.2238,
"step": 733
},
{
"epoch": 0.33393994540491356,
"grad_norm": 2.1060940314978702,
"learning_rate": 4.945319462841169e-06,
"loss": 0.1727,
"step": 734
},
{
"epoch": 0.3343949044585987,
"grad_norm": 1.8942361555213085,
"learning_rate": 4.94517071264929e-06,
"loss": 0.2168,
"step": 735
},
{
"epoch": 0.3348498635122839,
"grad_norm": 2.455108985215525,
"learning_rate": 4.945021762649102e-06,
"loss": 0.1525,
"step": 736
},
{
"epoch": 0.33530482256596905,
"grad_norm": 1.8066289984722876,
"learning_rate": 4.9448726128527776e-06,
"loss": 0.2014,
"step": 737
},
{
"epoch": 0.33575978161965425,
"grad_norm": 2.142750327891088,
"learning_rate": 4.944723263272504e-06,
"loss": 0.2155,
"step": 738
},
{
"epoch": 0.3362147406733394,
"grad_norm": 2.0611633591265814,
"learning_rate": 4.944573713920485e-06,
"loss": 0.19,
"step": 739
},
{
"epoch": 0.33666969972702454,
"grad_norm": 1.5473212219148849,
"learning_rate": 4.944423964808943e-06,
"loss": 0.1829,
"step": 740
},
{
"epoch": 0.33712465878070974,
"grad_norm": 1.7792548638263834,
"learning_rate": 4.944274015950113e-06,
"loss": 0.1563,
"step": 741
},
{
"epoch": 0.3375796178343949,
"grad_norm": 2.27825782486859,
"learning_rate": 4.944123867356249e-06,
"loss": 0.1462,
"step": 742
},
{
"epoch": 0.3380345768880801,
"grad_norm": 2.544197436295867,
"learning_rate": 4.943973519039619e-06,
"loss": 0.23,
"step": 743
},
{
"epoch": 0.33848953594176523,
"grad_norm": 2.1742558484011836,
"learning_rate": 4.943822971012511e-06,
"loss": 0.1382,
"step": 744
},
{
"epoch": 0.33894449499545043,
"grad_norm": 1.986842417086239,
"learning_rate": 4.943672223287226e-06,
"loss": 0.1751,
"step": 745
},
{
"epoch": 0.3393994540491356,
"grad_norm": 2.0458092345288144,
"learning_rate": 4.9435212758760815e-06,
"loss": 0.2008,
"step": 746
},
{
"epoch": 0.3398544131028208,
"grad_norm": 1.3986293648043162,
"learning_rate": 4.943370128791413e-06,
"loss": 0.1209,
"step": 747
},
{
"epoch": 0.3403093721565059,
"grad_norm": 1.7739101505934052,
"learning_rate": 4.943218782045574e-06,
"loss": 0.1651,
"step": 748
},
{
"epoch": 0.34076433121019106,
"grad_norm": 2.0878587765611867,
"learning_rate": 4.943067235650927e-06,
"loss": 0.1705,
"step": 749
},
{
"epoch": 0.34121929026387626,
"grad_norm": 1.7446405914839491,
"learning_rate": 4.942915489619859e-06,
"loss": 0.1604,
"step": 750
},
{
"epoch": 0.3416742493175614,
"grad_norm": 2.165396057344333,
"learning_rate": 4.9427635439647704e-06,
"loss": 0.1923,
"step": 751
},
{
"epoch": 0.3421292083712466,
"grad_norm": 1.7166625815039147,
"learning_rate": 4.942611398698075e-06,
"loss": 0.145,
"step": 752
},
{
"epoch": 0.34258416742493175,
"grad_norm": 1.3978926196223211,
"learning_rate": 4.942459053832208e-06,
"loss": 0.1246,
"step": 753
},
{
"epoch": 0.34303912647861695,
"grad_norm": 1.5203589407780953,
"learning_rate": 4.942306509379617e-06,
"loss": 0.1472,
"step": 754
},
{
"epoch": 0.3434940855323021,
"grad_norm": 1.6513608457469287,
"learning_rate": 4.942153765352767e-06,
"loss": 0.1408,
"step": 755
},
{
"epoch": 0.34394904458598724,
"grad_norm": 1.8035254782552455,
"learning_rate": 4.94200082176414e-06,
"loss": 0.1474,
"step": 756
},
{
"epoch": 0.34440400363967244,
"grad_norm": 2.1335404521767414,
"learning_rate": 4.941847678626234e-06,
"loss": 0.1755,
"step": 757
},
{
"epoch": 0.3448589626933576,
"grad_norm": 1.9408426816261404,
"learning_rate": 4.941694335951563e-06,
"loss": 0.2154,
"step": 758
},
{
"epoch": 0.3453139217470428,
"grad_norm": 1.749049542240047,
"learning_rate": 4.9415407937526575e-06,
"loss": 0.1482,
"step": 759
},
{
"epoch": 0.34576888080072793,
"grad_norm": 2.2747218478213598,
"learning_rate": 4.9413870520420635e-06,
"loss": 0.2213,
"step": 760
},
{
"epoch": 0.34622383985441313,
"grad_norm": 1.9679998520100659,
"learning_rate": 4.941233110832346e-06,
"loss": 0.1482,
"step": 761
},
{
"epoch": 0.3466787989080983,
"grad_norm": 2.7634133318079135,
"learning_rate": 4.941078970136082e-06,
"loss": 0.1649,
"step": 762
},
{
"epoch": 0.3471337579617834,
"grad_norm": 1.4323163769051608,
"learning_rate": 4.940924629965869e-06,
"loss": 0.152,
"step": 763
},
{
"epoch": 0.3475887170154686,
"grad_norm": 2.269381697045094,
"learning_rate": 4.940770090334319e-06,
"loss": 0.1446,
"step": 764
},
{
"epoch": 0.34804367606915376,
"grad_norm": 1.8723783038369444,
"learning_rate": 4.940615351254059e-06,
"loss": 0.1142,
"step": 765
},
{
"epoch": 0.34849863512283896,
"grad_norm": 1.8076648915776874,
"learning_rate": 4.940460412737734e-06,
"loss": 0.1944,
"step": 766
},
{
"epoch": 0.3489535941765241,
"grad_norm": 2.080159914413928,
"learning_rate": 4.940305274798005e-06,
"loss": 0.1582,
"step": 767
},
{
"epoch": 0.3494085532302093,
"grad_norm": 2.330746693235809,
"learning_rate": 4.940149937447549e-06,
"loss": 0.2007,
"step": 768
},
{
"epoch": 0.34986351228389445,
"grad_norm": 1.536246049438816,
"learning_rate": 4.939994400699061e-06,
"loss": 0.1408,
"step": 769
},
{
"epoch": 0.3503184713375796,
"grad_norm": 2.2894795215614994,
"learning_rate": 4.939838664565248e-06,
"loss": 0.1837,
"step": 770
},
{
"epoch": 0.3507734303912648,
"grad_norm": 1.6850122967374852,
"learning_rate": 4.939682729058839e-06,
"loss": 0.1289,
"step": 771
},
{
"epoch": 0.35122838944494994,
"grad_norm": 1.446339812351698,
"learning_rate": 4.939526594192574e-06,
"loss": 0.1329,
"step": 772
},
{
"epoch": 0.35168334849863514,
"grad_norm": 1.776973239663882,
"learning_rate": 4.939370259979213e-06,
"loss": 0.1178,
"step": 773
},
{
"epoch": 0.3521383075523203,
"grad_norm": 2.818513132709455,
"learning_rate": 4.9392137264315295e-06,
"loss": 0.2364,
"step": 774
},
{
"epoch": 0.3525932666060055,
"grad_norm": 1.6041796316256967,
"learning_rate": 4.939056993562316e-06,
"loss": 0.2278,
"step": 775
},
{
"epoch": 0.35304822565969063,
"grad_norm": 2.268295214561187,
"learning_rate": 4.9389000613843805e-06,
"loss": 0.1499,
"step": 776
},
{
"epoch": 0.3535031847133758,
"grad_norm": 2.531973358561036,
"learning_rate": 4.938742929910546e-06,
"loss": 0.1743,
"step": 777
},
{
"epoch": 0.353958143767061,
"grad_norm": 1.3677758044070074,
"learning_rate": 4.938585599153652e-06,
"loss": 0.1351,
"step": 778
},
{
"epoch": 0.3544131028207461,
"grad_norm": 2.4047975606277947,
"learning_rate": 4.938428069126555e-06,
"loss": 0.1951,
"step": 779
},
{
"epoch": 0.3548680618744313,
"grad_norm": 1.6598587480853697,
"learning_rate": 4.9382703398421285e-06,
"loss": 0.1602,
"step": 780
},
{
"epoch": 0.35532302092811646,
"grad_norm": 2.501614606596268,
"learning_rate": 4.938112411313261e-06,
"loss": 0.193,
"step": 781
},
{
"epoch": 0.35577797998180166,
"grad_norm": 1.7808472248973335,
"learning_rate": 4.937954283552858e-06,
"loss": 0.1322,
"step": 782
},
{
"epoch": 0.3562329390354868,
"grad_norm": 2.397821173092958,
"learning_rate": 4.93779595657384e-06,
"loss": 0.1819,
"step": 783
},
{
"epoch": 0.35668789808917195,
"grad_norm": 2.0407668064122495,
"learning_rate": 4.937637430389145e-06,
"loss": 0.1722,
"step": 784
},
{
"epoch": 0.35714285714285715,
"grad_norm": 1.9792096843409923,
"learning_rate": 4.937478705011729e-06,
"loss": 0.1349,
"step": 785
},
{
"epoch": 0.3575978161965423,
"grad_norm": 1.5581979975977567,
"learning_rate": 4.937319780454559e-06,
"loss": 0.1891,
"step": 786
},
{
"epoch": 0.3580527752502275,
"grad_norm": 1.3563862115066228,
"learning_rate": 4.937160656730625e-06,
"loss": 0.1622,
"step": 787
},
{
"epoch": 0.35850773430391264,
"grad_norm": 1.7874560137459294,
"learning_rate": 4.9370013338529274e-06,
"loss": 0.1606,
"step": 788
},
{
"epoch": 0.35896269335759784,
"grad_norm": 1.695354030268494,
"learning_rate": 4.936841811834486e-06,
"loss": 0.1725,
"step": 789
},
{
"epoch": 0.359417652411283,
"grad_norm": 1.5018417297722055,
"learning_rate": 4.936682090688337e-06,
"loss": 0.1568,
"step": 790
},
{
"epoch": 0.35987261146496813,
"grad_norm": 2.3008919876499276,
"learning_rate": 4.936522170427531e-06,
"loss": 0.1607,
"step": 791
},
{
"epoch": 0.36032757051865333,
"grad_norm": 2.145424436631978,
"learning_rate": 4.936362051065136e-06,
"loss": 0.136,
"step": 792
},
{
"epoch": 0.3607825295723385,
"grad_norm": 2.023227990902717,
"learning_rate": 4.936201732614238e-06,
"loss": 0.1568,
"step": 793
},
{
"epoch": 0.3612374886260237,
"grad_norm": 1.8119576330565363,
"learning_rate": 4.9360412150879355e-06,
"loss": 0.1291,
"step": 794
},
{
"epoch": 0.3616924476797088,
"grad_norm": 1.904733745689391,
"learning_rate": 4.935880498499346e-06,
"loss": 0.1262,
"step": 795
},
{
"epoch": 0.362147406733394,
"grad_norm": 2.1050139123506235,
"learning_rate": 4.935719582861604e-06,
"loss": 0.2027,
"step": 796
},
{
"epoch": 0.36260236578707916,
"grad_norm": 1.5866289163873395,
"learning_rate": 4.935558468187855e-06,
"loss": 0.1713,
"step": 797
},
{
"epoch": 0.3630573248407643,
"grad_norm": 2.266843952674795,
"learning_rate": 4.935397154491268e-06,
"loss": 0.1881,
"step": 798
},
{
"epoch": 0.3635122838944495,
"grad_norm": 1.9774458028018125,
"learning_rate": 4.935235641785023e-06,
"loss": 0.1837,
"step": 799
},
{
"epoch": 0.36396724294813465,
"grad_norm": 2.1853087729094796,
"learning_rate": 4.935073930082319e-06,
"loss": 0.176,
"step": 800
},
{
"epoch": 0.36442220200181985,
"grad_norm": 2.525766342273085,
"learning_rate": 4.93491201939637e-06,
"loss": 0.2015,
"step": 801
},
{
"epoch": 0.364877161055505,
"grad_norm": 2.5055378214905843,
"learning_rate": 4.934749909740408e-06,
"loss": 0.1961,
"step": 802
},
{
"epoch": 0.3653321201091902,
"grad_norm": 2.0645024314881035,
"learning_rate": 4.934587601127677e-06,
"loss": 0.1644,
"step": 803
},
{
"epoch": 0.36578707916287534,
"grad_norm": 2.0158906472533373,
"learning_rate": 4.934425093571442e-06,
"loss": 0.1911,
"step": 804
},
{
"epoch": 0.3662420382165605,
"grad_norm": 2.284162498710454,
"learning_rate": 4.934262387084984e-06,
"loss": 0.2008,
"step": 805
},
{
"epoch": 0.3666969972702457,
"grad_norm": 2.0973583334570547,
"learning_rate": 4.934099481681595e-06,
"loss": 0.1557,
"step": 806
},
{
"epoch": 0.36715195632393083,
"grad_norm": 2.2021201797945356,
"learning_rate": 4.933936377374589e-06,
"loss": 0.1524,
"step": 807
},
{
"epoch": 0.36760691537761603,
"grad_norm": 1.806808877742582,
"learning_rate": 4.933773074177293e-06,
"loss": 0.1738,
"step": 808
},
{
"epoch": 0.3680618744313012,
"grad_norm": 2.2345290767527386,
"learning_rate": 4.933609572103053e-06,
"loss": 0.1442,
"step": 809
},
{
"epoch": 0.3685168334849864,
"grad_norm": 1.9706491037079354,
"learning_rate": 4.933445871165229e-06,
"loss": 0.2354,
"step": 810
},
{
"epoch": 0.3689717925386715,
"grad_norm": 2.404773980417632,
"learning_rate": 4.933281971377197e-06,
"loss": 0.1719,
"step": 811
},
{
"epoch": 0.36942675159235666,
"grad_norm": 1.720683846457796,
"learning_rate": 4.933117872752352e-06,
"loss": 0.1914,
"step": 812
},
{
"epoch": 0.36988171064604186,
"grad_norm": 2.532410934524191,
"learning_rate": 4.932953575304102e-06,
"loss": 0.2144,
"step": 813
},
{
"epoch": 0.370336669699727,
"grad_norm": 5.0403677379252425,
"learning_rate": 4.932789079045873e-06,
"loss": 0.2595,
"step": 814
},
{
"epoch": 0.3707916287534122,
"grad_norm": 2.0578633523076437,
"learning_rate": 4.932624383991106e-06,
"loss": 0.1739,
"step": 815
},
{
"epoch": 0.37124658780709735,
"grad_norm": 1.9986709520957122,
"learning_rate": 4.9324594901532605e-06,
"loss": 0.1838,
"step": 816
},
{
"epoch": 0.37170154686078255,
"grad_norm": 1.7217394600458333,
"learning_rate": 4.93229439754581e-06,
"loss": 0.1579,
"step": 817
},
{
"epoch": 0.3721565059144677,
"grad_norm": 2.1321573080305813,
"learning_rate": 4.932129106182246e-06,
"loss": 0.1926,
"step": 818
},
{
"epoch": 0.37261146496815284,
"grad_norm": 2.793277438622436,
"learning_rate": 4.931963616076075e-06,
"loss": 0.2136,
"step": 819
},
{
"epoch": 0.37306642402183804,
"grad_norm": 1.7394149868487567,
"learning_rate": 4.93179792724082e-06,
"loss": 0.128,
"step": 820
},
{
"epoch": 0.3735213830755232,
"grad_norm": 1.82657006763275,
"learning_rate": 4.9316320396900195e-06,
"loss": 0.17,
"step": 821
},
{
"epoch": 0.3739763421292084,
"grad_norm": 1.823894210494748,
"learning_rate": 4.9314659534372305e-06,
"loss": 0.1981,
"step": 822
},
{
"epoch": 0.37443130118289353,
"grad_norm": 2.3539272175568775,
"learning_rate": 4.931299668496024e-06,
"loss": 0.1576,
"step": 823
},
{
"epoch": 0.37488626023657873,
"grad_norm": 2.5070798015414666,
"learning_rate": 4.931133184879988e-06,
"loss": 0.1886,
"step": 824
},
{
"epoch": 0.37534121929026387,
"grad_norm": 2.008848059538202,
"learning_rate": 4.930966502602727e-06,
"loss": 0.1605,
"step": 825
},
{
"epoch": 0.37579617834394907,
"grad_norm": 2.407030934613122,
"learning_rate": 4.930799621677862e-06,
"loss": 0.1802,
"step": 826
},
{
"epoch": 0.3762511373976342,
"grad_norm": 1.8420833153352183,
"learning_rate": 4.93063254211903e-06,
"loss": 0.1641,
"step": 827
},
{
"epoch": 0.37670609645131936,
"grad_norm": 2.159279850858488,
"learning_rate": 4.930465263939882e-06,
"loss": 0.1669,
"step": 828
},
{
"epoch": 0.37716105550500456,
"grad_norm": 2.5834296367648477,
"learning_rate": 4.9302977871540894e-06,
"loss": 0.2047,
"step": 829
},
{
"epoch": 0.3776160145586897,
"grad_norm": 1.6914011736844907,
"learning_rate": 4.930130111775336e-06,
"loss": 0.1153,
"step": 830
},
{
"epoch": 0.3780709736123749,
"grad_norm": 1.9730937065718759,
"learning_rate": 4.9299622378173245e-06,
"loss": 0.1321,
"step": 831
},
{
"epoch": 0.37852593266606005,
"grad_norm": 4.287590804185868,
"learning_rate": 4.929794165293773e-06,
"loss": 0.2942,
"step": 832
},
{
"epoch": 0.37898089171974525,
"grad_norm": 1.3332137290220585,
"learning_rate": 4.9296258942184145e-06,
"loss": 0.1089,
"step": 833
},
{
"epoch": 0.3794358507734304,
"grad_norm": 1.5962116643063975,
"learning_rate": 4.929457424605e-06,
"loss": 0.1214,
"step": 834
},
{
"epoch": 0.37989080982711554,
"grad_norm": 2.022957256898634,
"learning_rate": 4.929288756467296e-06,
"loss": 0.1853,
"step": 835
},
{
"epoch": 0.38034576888080074,
"grad_norm": 2.4282570688213863,
"learning_rate": 4.929119889819086e-06,
"loss": 0.1873,
"step": 836
},
{
"epoch": 0.3808007279344859,
"grad_norm": 2.9395172179458,
"learning_rate": 4.928950824674169e-06,
"loss": 0.2634,
"step": 837
},
{
"epoch": 0.3812556869881711,
"grad_norm": 2.2127049960140335,
"learning_rate": 4.928781561046359e-06,
"loss": 0.1663,
"step": 838
},
{
"epoch": 0.3817106460418562,
"grad_norm": 2.536562760970294,
"learning_rate": 4.928612098949488e-06,
"loss": 0.2011,
"step": 839
},
{
"epoch": 0.3821656050955414,
"grad_norm": 2.1855699037821514,
"learning_rate": 4.9284424383974026e-06,
"loss": 0.1794,
"step": 840
},
{
"epoch": 0.38262056414922657,
"grad_norm": 1.6332101956979397,
"learning_rate": 4.928272579403969e-06,
"loss": 0.1279,
"step": 841
},
{
"epoch": 0.3830755232029117,
"grad_norm": 1.5663751127122882,
"learning_rate": 4.928102521983067e-06,
"loss": 0.1985,
"step": 842
},
{
"epoch": 0.3835304822565969,
"grad_norm": 2.4747913159024195,
"learning_rate": 4.9279322661485906e-06,
"loss": 0.244,
"step": 843
},
{
"epoch": 0.38398544131028206,
"grad_norm": 1.9419499604147055,
"learning_rate": 4.927761811914455e-06,
"loss": 0.1996,
"step": 844
},
{
"epoch": 0.38444040036396726,
"grad_norm": 1.8790570447198083,
"learning_rate": 4.927591159294587e-06,
"loss": 0.1746,
"step": 845
},
{
"epoch": 0.3848953594176524,
"grad_norm": 3.2586686346278992,
"learning_rate": 4.927420308302933e-06,
"loss": 0.2099,
"step": 846
},
{
"epoch": 0.3853503184713376,
"grad_norm": 1.8912381154957223,
"learning_rate": 4.927249258953454e-06,
"loss": 0.2159,
"step": 847
},
{
"epoch": 0.38580527752502275,
"grad_norm": 2.5636879906209242,
"learning_rate": 4.927078011260126e-06,
"loss": 0.2142,
"step": 848
},
{
"epoch": 0.3862602365787079,
"grad_norm": 2.2557014215101794,
"learning_rate": 4.926906565236943e-06,
"loss": 0.2158,
"step": 849
},
{
"epoch": 0.3867151956323931,
"grad_norm": 2.0433651062149076,
"learning_rate": 4.926734920897916e-06,
"loss": 0.1564,
"step": 850
},
{
"epoch": 0.38717015468607824,
"grad_norm": 1.1448398063326757,
"learning_rate": 4.926563078257071e-06,
"loss": 0.1274,
"step": 851
},
{
"epoch": 0.38762511373976344,
"grad_norm": 1.5601081736798879,
"learning_rate": 4.926391037328448e-06,
"loss": 0.1742,
"step": 852
},
{
"epoch": 0.3880800727934486,
"grad_norm": 1.735106713842307,
"learning_rate": 4.926218798126108e-06,
"loss": 0.17,
"step": 853
},
{
"epoch": 0.3885350318471338,
"grad_norm": 1.8524828246659681,
"learning_rate": 4.926046360664124e-06,
"loss": 0.1359,
"step": 854
},
{
"epoch": 0.3889899909008189,
"grad_norm": 1.8327900649742344,
"learning_rate": 4.925873724956588e-06,
"loss": 0.1276,
"step": 855
},
{
"epoch": 0.38944494995450407,
"grad_norm": 1.7997603613849789,
"learning_rate": 4.9257008910176065e-06,
"loss": 0.236,
"step": 856
},
{
"epoch": 0.38989990900818927,
"grad_norm": 2.1973741478380893,
"learning_rate": 4.925527858861302e-06,
"loss": 0.1935,
"step": 857
},
{
"epoch": 0.3903548680618744,
"grad_norm": 2.086365440068575,
"learning_rate": 4.925354628501814e-06,
"loss": 0.1652,
"step": 858
},
{
"epoch": 0.3908098271155596,
"grad_norm": 1.8116013889379734,
"learning_rate": 4.925181199953299e-06,
"loss": 0.1612,
"step": 859
},
{
"epoch": 0.39126478616924476,
"grad_norm": 1.9247913507109833,
"learning_rate": 4.9250075732299285e-06,
"loss": 0.1588,
"step": 860
},
{
"epoch": 0.39171974522292996,
"grad_norm": 2.514293103428901,
"learning_rate": 4.92483374834589e-06,
"loss": 0.19,
"step": 861
},
{
"epoch": 0.3921747042766151,
"grad_norm": 2.0316288184050024,
"learning_rate": 4.9246597253153884e-06,
"loss": 0.1831,
"step": 862
},
{
"epoch": 0.39262966333030025,
"grad_norm": 1.5754846974100747,
"learning_rate": 4.924485504152644e-06,
"loss": 0.1466,
"step": 863
},
{
"epoch": 0.39308462238398545,
"grad_norm": 2.1731555902481685,
"learning_rate": 4.924311084871892e-06,
"loss": 0.1937,
"step": 864
},
{
"epoch": 0.3935395814376706,
"grad_norm": 1.5966819404904389,
"learning_rate": 4.924136467487387e-06,
"loss": 0.1251,
"step": 865
},
{
"epoch": 0.3939945404913558,
"grad_norm": 1.8663994781934827,
"learning_rate": 4.923961652013397e-06,
"loss": 0.1523,
"step": 866
},
{
"epoch": 0.39444949954504094,
"grad_norm": 2.1002789601399257,
"learning_rate": 4.923786638464207e-06,
"loss": 0.2129,
"step": 867
},
{
"epoch": 0.39490445859872614,
"grad_norm": 2.081418128383539,
"learning_rate": 4.9236114268541196e-06,
"loss": 0.1437,
"step": 868
},
{
"epoch": 0.3953594176524113,
"grad_norm": 2.447658119106072,
"learning_rate": 4.923436017197451e-06,
"loss": 0.201,
"step": 869
},
{
"epoch": 0.3958143767060964,
"grad_norm": 1.7750379508150516,
"learning_rate": 4.923260409508535e-06,
"loss": 0.1282,
"step": 870
},
{
"epoch": 0.3962693357597816,
"grad_norm": 1.6418670453366244,
"learning_rate": 4.9230846038017214e-06,
"loss": 0.2087,
"step": 871
},
{
"epoch": 0.39672429481346677,
"grad_norm": 1.7770417360691049,
"learning_rate": 4.922908600091378e-06,
"loss": 0.1372,
"step": 872
},
{
"epoch": 0.39717925386715197,
"grad_norm": 1.690148135895664,
"learning_rate": 4.9227323983918835e-06,
"loss": 0.1855,
"step": 873
},
{
"epoch": 0.3976342129208371,
"grad_norm": 1.5404851420453596,
"learning_rate": 4.922555998717639e-06,
"loss": 0.1398,
"step": 874
},
{
"epoch": 0.3980891719745223,
"grad_norm": 2.1706268320484328,
"learning_rate": 4.922379401083058e-06,
"loss": 0.1486,
"step": 875
},
{
"epoch": 0.39854413102820746,
"grad_norm": 3.0077672507475786,
"learning_rate": 4.922202605502573e-06,
"loss": 0.2077,
"step": 876
},
{
"epoch": 0.3989990900818926,
"grad_norm": 1.5486893349846256,
"learning_rate": 4.922025611990629e-06,
"loss": 0.1604,
"step": 877
},
{
"epoch": 0.3994540491355778,
"grad_norm": 1.8667533652947603,
"learning_rate": 4.92184842056169e-06,
"loss": 0.1722,
"step": 878
},
{
"epoch": 0.39990900818926295,
"grad_norm": 2.289002791626951,
"learning_rate": 4.921671031230235e-06,
"loss": 0.1647,
"step": 879
},
{
"epoch": 0.40036396724294815,
"grad_norm": 1.8286186193347604,
"learning_rate": 4.921493444010759e-06,
"loss": 0.1773,
"step": 880
},
{
"epoch": 0.4008189262966333,
"grad_norm": 1.8147441438330003,
"learning_rate": 4.921315658917774e-06,
"loss": 0.1711,
"step": 881
},
{
"epoch": 0.4012738853503185,
"grad_norm": 2.00913911322474,
"learning_rate": 4.921137675965809e-06,
"loss": 0.1263,
"step": 882
},
{
"epoch": 0.40172884440400364,
"grad_norm": 1.3862791101345426,
"learning_rate": 4.920959495169406e-06,
"loss": 0.1465,
"step": 883
},
{
"epoch": 0.4021838034576888,
"grad_norm": 2.4187567327639234,
"learning_rate": 4.920781116543126e-06,
"loss": 0.2198,
"step": 884
},
{
"epoch": 0.402638762511374,
"grad_norm": 1.6465776945830464,
"learning_rate": 4.920602540101546e-06,
"loss": 0.1309,
"step": 885
},
{
"epoch": 0.4030937215650591,
"grad_norm": 2.6312019573375682,
"learning_rate": 4.920423765859257e-06,
"loss": 0.1948,
"step": 886
},
{
"epoch": 0.4035486806187443,
"grad_norm": 1.9940911601496167,
"learning_rate": 4.920244793830869e-06,
"loss": 0.1657,
"step": 887
},
{
"epoch": 0.40400363967242947,
"grad_norm": 1.9526243984241491,
"learning_rate": 4.920065624031006e-06,
"loss": 0.1616,
"step": 888
},
{
"epoch": 0.40445859872611467,
"grad_norm": 1.5338098697837441,
"learning_rate": 4.919886256474309e-06,
"loss": 0.1512,
"step": 889
},
{
"epoch": 0.4049135577797998,
"grad_norm": 2.0468687722376773,
"learning_rate": 4.919706691175435e-06,
"loss": 0.1701,
"step": 890
},
{
"epoch": 0.40536851683348496,
"grad_norm": 2.200436787943407,
"learning_rate": 4.919526928149058e-06,
"loss": 0.2293,
"step": 891
},
{
"epoch": 0.40582347588717016,
"grad_norm": 1.8050882174330405,
"learning_rate": 4.919346967409867e-06,
"loss": 0.1602,
"step": 892
},
{
"epoch": 0.4062784349408553,
"grad_norm": 1.7135594043707498,
"learning_rate": 4.919166808972567e-06,
"loss": 0.2064,
"step": 893
},
{
"epoch": 0.4067333939945405,
"grad_norm": 2.612056409341394,
"learning_rate": 4.918986452851881e-06,
"loss": 0.1668,
"step": 894
},
{
"epoch": 0.40718835304822565,
"grad_norm": 2.016673285347467,
"learning_rate": 4.918805899062545e-06,
"loss": 0.1925,
"step": 895
},
{
"epoch": 0.40764331210191085,
"grad_norm": 1.4000022926360023,
"learning_rate": 4.9186251476193146e-06,
"loss": 0.1592,
"step": 896
},
{
"epoch": 0.408098271155596,
"grad_norm": 1.60492731991447,
"learning_rate": 4.918444198536959e-06,
"loss": 0.1731,
"step": 897
},
{
"epoch": 0.40855323020928114,
"grad_norm": 1.673902690478855,
"learning_rate": 4.918263051830267e-06,
"loss": 0.1228,
"step": 898
},
{
"epoch": 0.40900818926296634,
"grad_norm": 2.6755237129572484,
"learning_rate": 4.918081707514037e-06,
"loss": 0.1409,
"step": 899
},
{
"epoch": 0.4094631483166515,
"grad_norm": 1.9078463274657658,
"learning_rate": 4.917900165603091e-06,
"loss": 0.1276,
"step": 900
},
{
"epoch": 0.4099181073703367,
"grad_norm": 2.234681815409533,
"learning_rate": 4.9177184261122624e-06,
"loss": 0.1652,
"step": 901
},
{
"epoch": 0.4103730664240218,
"grad_norm": 2.839831167960225,
"learning_rate": 4.917536489056402e-06,
"loss": 0.1798,
"step": 902
},
{
"epoch": 0.410828025477707,
"grad_norm": 2.010867770048541,
"learning_rate": 4.9173543544503775e-06,
"loss": 0.1154,
"step": 903
},
{
"epoch": 0.41128298453139217,
"grad_norm": 2.08218098114304,
"learning_rate": 4.917172022309072e-06,
"loss": 0.1455,
"step": 904
},
{
"epoch": 0.41173794358507737,
"grad_norm": 1.7302162150410665,
"learning_rate": 4.916989492647385e-06,
"loss": 0.1193,
"step": 905
},
{
"epoch": 0.4121929026387625,
"grad_norm": 1.5485580925696725,
"learning_rate": 4.916806765480231e-06,
"loss": 0.0922,
"step": 906
},
{
"epoch": 0.41264786169244766,
"grad_norm": 1.7263185607767098,
"learning_rate": 4.9166238408225416e-06,
"loss": 0.2167,
"step": 907
},
{
"epoch": 0.41310282074613286,
"grad_norm": 1.9178320379998328,
"learning_rate": 4.916440718689267e-06,
"loss": 0.1554,
"step": 908
},
{
"epoch": 0.413557779799818,
"grad_norm": 1.8197306174687815,
"learning_rate": 4.916257399095369e-06,
"loss": 0.1474,
"step": 909
},
{
"epoch": 0.4140127388535032,
"grad_norm": 1.7449499320119561,
"learning_rate": 4.916073882055827e-06,
"loss": 0.1327,
"step": 910
},
{
"epoch": 0.41446769790718835,
"grad_norm": 2.4422880124371646,
"learning_rate": 4.91589016758564e-06,
"loss": 0.1937,
"step": 911
},
{
"epoch": 0.41492265696087355,
"grad_norm": 1.6511138034689814,
"learning_rate": 4.915706255699817e-06,
"loss": 0.1363,
"step": 912
},
{
"epoch": 0.4153776160145587,
"grad_norm": 2.143275165439444,
"learning_rate": 4.915522146413389e-06,
"loss": 0.2735,
"step": 913
},
{
"epoch": 0.41583257506824384,
"grad_norm": 1.924782534095729,
"learning_rate": 4.9153378397413985e-06,
"loss": 0.1751,
"step": 914
},
{
"epoch": 0.41628753412192904,
"grad_norm": 1.951438348175618,
"learning_rate": 4.915153335698908e-06,
"loss": 0.1619,
"step": 915
},
{
"epoch": 0.4167424931756142,
"grad_norm": 2.2127088657857548,
"learning_rate": 4.914968634300994e-06,
"loss": 0.2147,
"step": 916
},
{
"epoch": 0.4171974522292994,
"grad_norm": 1.6061838128612729,
"learning_rate": 4.914783735562748e-06,
"loss": 0.1499,
"step": 917
},
{
"epoch": 0.4176524112829845,
"grad_norm": 1.4285312675375041,
"learning_rate": 4.914598639499281e-06,
"loss": 0.1583,
"step": 918
},
{
"epoch": 0.4181073703366697,
"grad_norm": 1.6360040253886021,
"learning_rate": 4.914413346125717e-06,
"loss": 0.1066,
"step": 919
},
{
"epoch": 0.41856232939035487,
"grad_norm": 2.343895109900456,
"learning_rate": 4.914227855457199e-06,
"loss": 0.1823,
"step": 920
},
{
"epoch": 0.41901728844404,
"grad_norm": 2.318188728357057,
"learning_rate": 4.914042167508881e-06,
"loss": 0.1437,
"step": 921
},
{
"epoch": 0.4194722474977252,
"grad_norm": 2.3202387804341336,
"learning_rate": 4.9138562822959416e-06,
"loss": 0.1589,
"step": 922
},
{
"epoch": 0.41992720655141036,
"grad_norm": 2.608072279082345,
"learning_rate": 4.913670199833566e-06,
"loss": 0.1851,
"step": 923
},
{
"epoch": 0.42038216560509556,
"grad_norm": 2.181253773511138,
"learning_rate": 4.913483920136961e-06,
"loss": 0.1756,
"step": 924
},
{
"epoch": 0.4208371246587807,
"grad_norm": 2.211521150780038,
"learning_rate": 4.91329744322135e-06,
"loss": 0.1732,
"step": 925
},
{
"epoch": 0.4212920837124659,
"grad_norm": 1.812598878243348,
"learning_rate": 4.913110769101971e-06,
"loss": 0.166,
"step": 926
},
{
"epoch": 0.42174704276615105,
"grad_norm": 2.205776388483361,
"learning_rate": 4.912923897794077e-06,
"loss": 0.1771,
"step": 927
},
{
"epoch": 0.4222020018198362,
"grad_norm": 1.423655928174165,
"learning_rate": 4.912736829312938e-06,
"loss": 0.1489,
"step": 928
},
{
"epoch": 0.4226569608735214,
"grad_norm": 1.831805612119293,
"learning_rate": 4.912549563673842e-06,
"loss": 0.168,
"step": 929
},
{
"epoch": 0.42311191992720654,
"grad_norm": 1.4699738850406474,
"learning_rate": 4.912362100892091e-06,
"loss": 0.1674,
"step": 930
},
{
"epoch": 0.42356687898089174,
"grad_norm": 1.9047547244636083,
"learning_rate": 4.912174440983002e-06,
"loss": 0.1639,
"step": 931
},
{
"epoch": 0.4240218380345769,
"grad_norm": 2.0520314286066843,
"learning_rate": 4.911986583961912e-06,
"loss": 0.2138,
"step": 932
},
{
"epoch": 0.4244767970882621,
"grad_norm": 2.5542601480975278,
"learning_rate": 4.91179852984417e-06,
"loss": 0.2276,
"step": 933
},
{
"epoch": 0.4249317561419472,
"grad_norm": 1.5302053494447614,
"learning_rate": 4.911610278645144e-06,
"loss": 0.1489,
"step": 934
},
{
"epoch": 0.42538671519563237,
"grad_norm": 1.7414787617118297,
"learning_rate": 4.911421830380217e-06,
"loss": 0.1182,
"step": 935
},
{
"epoch": 0.42584167424931757,
"grad_norm": 1.7429292851594573,
"learning_rate": 4.911233185064788e-06,
"loss": 0.2064,
"step": 936
},
{
"epoch": 0.4262966333030027,
"grad_norm": 2.3105951171285968,
"learning_rate": 4.911044342714272e-06,
"loss": 0.1405,
"step": 937
},
{
"epoch": 0.4267515923566879,
"grad_norm": 1.779382452074537,
"learning_rate": 4.9108553033440995e-06,
"loss": 0.1291,
"step": 938
},
{
"epoch": 0.42720655141037306,
"grad_norm": 1.7957846625134024,
"learning_rate": 4.91066606696972e-06,
"loss": 0.1647,
"step": 939
},
{
"epoch": 0.42766151046405826,
"grad_norm": 2.3261521372348057,
"learning_rate": 4.910476633606597e-06,
"loss": 0.1927,
"step": 940
},
{
"epoch": 0.4281164695177434,
"grad_norm": 1.9153006743556071,
"learning_rate": 4.9102870032702075e-06,
"loss": 0.1584,
"step": 941
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.8033286854373174,
"learning_rate": 4.910097175976049e-06,
"loss": 0.1825,
"step": 942
},
{
"epoch": 0.42902638762511375,
"grad_norm": 2.8388348591880597,
"learning_rate": 4.909907151739634e-06,
"loss": 0.2113,
"step": 943
},
{
"epoch": 0.4294813466787989,
"grad_norm": 2.6244899475003813,
"learning_rate": 4.909716930576489e-06,
"loss": 0.1704,
"step": 944
},
{
"epoch": 0.4299363057324841,
"grad_norm": 2.112585064442849,
"learning_rate": 4.909526512502158e-06,
"loss": 0.1589,
"step": 945
},
{
"epoch": 0.43039126478616924,
"grad_norm": 2.289068427554651,
"learning_rate": 4.9093358975322025e-06,
"loss": 0.1714,
"step": 946
},
{
"epoch": 0.43084622383985444,
"grad_norm": 2.5327374827374065,
"learning_rate": 4.909145085682198e-06,
"loss": 0.2278,
"step": 947
},
{
"epoch": 0.4313011828935396,
"grad_norm": 2.1519696726150315,
"learning_rate": 4.908954076967737e-06,
"loss": 0.1561,
"step": 948
},
{
"epoch": 0.4317561419472247,
"grad_norm": 2.3965497202736485,
"learning_rate": 4.908762871404427e-06,
"loss": 0.2721,
"step": 949
},
{
"epoch": 0.4322111010009099,
"grad_norm": 1.7303730946554432,
"learning_rate": 4.908571469007893e-06,
"loss": 0.1886,
"step": 950
},
{
"epoch": 0.43266606005459507,
"grad_norm": 1.867974683826286,
"learning_rate": 4.908379869793776e-06,
"loss": 0.1621,
"step": 951
},
{
"epoch": 0.43312101910828027,
"grad_norm": 2.0573077802321134,
"learning_rate": 4.908188073777732e-06,
"loss": 0.1897,
"step": 952
},
{
"epoch": 0.4335759781619654,
"grad_norm": 1.4532292026282405,
"learning_rate": 4.9079960809754334e-06,
"loss": 0.1729,
"step": 953
},
{
"epoch": 0.4340309372156506,
"grad_norm": 1.962539816890548,
"learning_rate": 4.90780389140257e-06,
"loss": 0.1301,
"step": 954
},
{
"epoch": 0.43448589626933576,
"grad_norm": 2.4468234331381677,
"learning_rate": 4.907611505074846e-06,
"loss": 0.1709,
"step": 955
},
{
"epoch": 0.4349408553230209,
"grad_norm": 2.666497869750462,
"learning_rate": 4.907418922007983e-06,
"loss": 0.1628,
"step": 956
},
{
"epoch": 0.4353958143767061,
"grad_norm": 2.2137035827801226,
"learning_rate": 4.907226142217717e-06,
"loss": 0.1353,
"step": 957
},
{
"epoch": 0.43585077343039125,
"grad_norm": 2.572062185697332,
"learning_rate": 4.9070331657198015e-06,
"loss": 0.1745,
"step": 958
},
{
"epoch": 0.43630573248407645,
"grad_norm": 2.405655176153194,
"learning_rate": 4.906839992530006e-06,
"loss": 0.2171,
"step": 959
},
{
"epoch": 0.4367606915377616,
"grad_norm": 1.836795075502022,
"learning_rate": 4.906646622664115e-06,
"loss": 0.168,
"step": 960
},
{
"epoch": 0.4372156505914468,
"grad_norm": 2.166035033805183,
"learning_rate": 4.906453056137931e-06,
"loss": 0.1223,
"step": 961
},
{
"epoch": 0.43767060964513194,
"grad_norm": 2.072717194766617,
"learning_rate": 4.90625929296727e-06,
"loss": 0.2248,
"step": 962
},
{
"epoch": 0.4381255686988171,
"grad_norm": 1.8024426189806846,
"learning_rate": 4.9060653331679665e-06,
"loss": 0.1956,
"step": 963
},
{
"epoch": 0.4385805277525023,
"grad_norm": 1.8368071839220441,
"learning_rate": 4.90587117675587e-06,
"loss": 0.1601,
"step": 964
},
{
"epoch": 0.4390354868061874,
"grad_norm": 1.6602305730067044,
"learning_rate": 4.905676823746846e-06,
"loss": 0.1433,
"step": 965
},
{
"epoch": 0.4394904458598726,
"grad_norm": 1.2991365263950634,
"learning_rate": 4.9054822741567745e-06,
"loss": 0.1361,
"step": 966
},
{
"epoch": 0.43994540491355777,
"grad_norm": 2.1130749414647463,
"learning_rate": 4.905287528001555e-06,
"loss": 0.145,
"step": 967
},
{
"epoch": 0.44040036396724297,
"grad_norm": 1.8646843859502422,
"learning_rate": 4.905092585297102e-06,
"loss": 0.1685,
"step": 968
},
{
"epoch": 0.4408553230209281,
"grad_norm": 2.1749982799245693,
"learning_rate": 4.904897446059344e-06,
"loss": 0.1621,
"step": 969
},
{
"epoch": 0.44131028207461326,
"grad_norm": 2.334038135097662,
"learning_rate": 4.9047021103042255e-06,
"loss": 0.1486,
"step": 970
},
{
"epoch": 0.44176524112829846,
"grad_norm": 2.600358800525879,
"learning_rate": 4.904506578047712e-06,
"loss": 0.1603,
"step": 971
},
{
"epoch": 0.4422202001819836,
"grad_norm": 2.0684781990731436,
"learning_rate": 4.9043108493057785e-06,
"loss": 0.1708,
"step": 972
},
{
"epoch": 0.4426751592356688,
"grad_norm": 2.0255722402037852,
"learning_rate": 4.904114924094421e-06,
"loss": 0.1608,
"step": 973
},
{
"epoch": 0.44313011828935395,
"grad_norm": 2.1489494601016434,
"learning_rate": 4.903918802429648e-06,
"loss": 0.1829,
"step": 974
},
{
"epoch": 0.44358507734303915,
"grad_norm": 1.787442464619014,
"learning_rate": 4.9037224843274875e-06,
"loss": 0.2043,
"step": 975
},
{
"epoch": 0.4440400363967243,
"grad_norm": 2.343300114421743,
"learning_rate": 4.903525969803979e-06,
"loss": 0.2699,
"step": 976
},
{
"epoch": 0.44449499545040944,
"grad_norm": 1.865479334461903,
"learning_rate": 4.903329258875184e-06,
"loss": 0.1195,
"step": 977
},
{
"epoch": 0.44494995450409464,
"grad_norm": 1.9494468159837486,
"learning_rate": 4.903132351557175e-06,
"loss": 0.1465,
"step": 978
},
{
"epoch": 0.4454049135577798,
"grad_norm": 2.502406890844037,
"learning_rate": 4.902935247866043e-06,
"loss": 0.1378,
"step": 979
},
{
"epoch": 0.445859872611465,
"grad_norm": 2.036041143606274,
"learning_rate": 4.9027379478178935e-06,
"loss": 0.1483,
"step": 980
},
{
"epoch": 0.4463148316651501,
"grad_norm": 1.3077265314607576,
"learning_rate": 4.90254045142885e-06,
"loss": 0.0969,
"step": 981
},
{
"epoch": 0.4467697907188353,
"grad_norm": 2.0861883133616828,
"learning_rate": 4.90234275871505e-06,
"loss": 0.1392,
"step": 982
},
{
"epoch": 0.44722474977252047,
"grad_norm": 2.5093809608609274,
"learning_rate": 4.9021448696926486e-06,
"loss": 0.1743,
"step": 983
},
{
"epoch": 0.44767970882620567,
"grad_norm": 1.575875578739379,
"learning_rate": 4.901946784377816e-06,
"loss": 0.176,
"step": 984
},
{
"epoch": 0.4481346678798908,
"grad_norm": 1.5356501213932587,
"learning_rate": 4.90174850278674e-06,
"loss": 0.1484,
"step": 985
},
{
"epoch": 0.44858962693357596,
"grad_norm": 1.823863525681817,
"learning_rate": 4.901550024935623e-06,
"loss": 0.1854,
"step": 986
},
{
"epoch": 0.44904458598726116,
"grad_norm": 1.3758352509840184,
"learning_rate": 4.901351350840683e-06,
"loss": 0.1349,
"step": 987
},
{
"epoch": 0.4494995450409463,
"grad_norm": 2.0693941858838762,
"learning_rate": 4.901152480518155e-06,
"loss": 0.1663,
"step": 988
},
{
"epoch": 0.4499545040946315,
"grad_norm": 1.8873877263165615,
"learning_rate": 4.900953413984289e-06,
"loss": 0.1692,
"step": 989
},
{
"epoch": 0.45040946314831665,
"grad_norm": 1.4776284855591897,
"learning_rate": 4.900754151255353e-06,
"loss": 0.1971,
"step": 990
},
{
"epoch": 0.45086442220200185,
"grad_norm": 2.5191235263020912,
"learning_rate": 4.9005546923476305e-06,
"loss": 0.1998,
"step": 991
},
{
"epoch": 0.451319381255687,
"grad_norm": 1.8842919796768522,
"learning_rate": 4.9003550372774185e-06,
"loss": 0.1399,
"step": 992
},
{
"epoch": 0.45177434030937214,
"grad_norm": 2.063855552138974,
"learning_rate": 4.900155186061033e-06,
"loss": 0.1526,
"step": 993
},
{
"epoch": 0.45222929936305734,
"grad_norm": 2.367561517511786,
"learning_rate": 4.8999551387148045e-06,
"loss": 0.1599,
"step": 994
},
{
"epoch": 0.4526842584167425,
"grad_norm": 1.898969473092516,
"learning_rate": 4.89975489525508e-06,
"loss": 0.1902,
"step": 995
},
{
"epoch": 0.4531392174704277,
"grad_norm": 1.8129578397632808,
"learning_rate": 4.899554455698223e-06,
"loss": 0.1693,
"step": 996
},
{
"epoch": 0.4535941765241128,
"grad_norm": 1.8699568695488074,
"learning_rate": 4.899353820060612e-06,
"loss": 0.1581,
"step": 997
},
{
"epoch": 0.454049135577798,
"grad_norm": 1.7239980533667612,
"learning_rate": 4.899152988358643e-06,
"loss": 0.2098,
"step": 998
},
{
"epoch": 0.45450409463148317,
"grad_norm": 1.8097885043847937,
"learning_rate": 4.898951960608725e-06,
"loss": 0.1715,
"step": 999
},
{
"epoch": 0.4549590536851683,
"grad_norm": 1.8523553420273773,
"learning_rate": 4.8987507368272865e-06,
"loss": 0.16,
"step": 1000
},
{
"epoch": 0.4554140127388535,
"grad_norm": 2.0000127792736904,
"learning_rate": 4.898549317030772e-06,
"loss": 0.1632,
"step": 1001
},
{
"epoch": 0.45586897179253866,
"grad_norm": 1.862660132529776,
"learning_rate": 4.898347701235637e-06,
"loss": 0.1465,
"step": 1002
},
{
"epoch": 0.45632393084622386,
"grad_norm": 1.7361264176719555,
"learning_rate": 4.89814588945836e-06,
"loss": 0.1869,
"step": 1003
},
{
"epoch": 0.456778889899909,
"grad_norm": 1.978059539176156,
"learning_rate": 4.89794388171543e-06,
"loss": 0.1659,
"step": 1004
},
{
"epoch": 0.4572338489535942,
"grad_norm": 2.2207578653400906,
"learning_rate": 4.897741678023356e-06,
"loss": 0.1939,
"step": 1005
},
{
"epoch": 0.45768880800727935,
"grad_norm": 2.103052599683253,
"learning_rate": 4.897539278398659e-06,
"loss": 0.1812,
"step": 1006
},
{
"epoch": 0.4581437670609645,
"grad_norm": 2.546107708354434,
"learning_rate": 4.8973366828578804e-06,
"loss": 0.2054,
"step": 1007
},
{
"epoch": 0.4585987261146497,
"grad_norm": 1.9562513052044435,
"learning_rate": 4.897133891417574e-06,
"loss": 0.1693,
"step": 1008
},
{
"epoch": 0.45905368516833484,
"grad_norm": 2.5635809078172103,
"learning_rate": 4.896930904094311e-06,
"loss": 0.1689,
"step": 1009
},
{
"epoch": 0.45950864422202004,
"grad_norm": 2.401849938137445,
"learning_rate": 4.896727720904679e-06,
"loss": 0.1731,
"step": 1010
},
{
"epoch": 0.4599636032757052,
"grad_norm": 1.3521913269323886,
"learning_rate": 4.896524341865282e-06,
"loss": 0.0961,
"step": 1011
},
{
"epoch": 0.4604185623293904,
"grad_norm": 1.773432887084503,
"learning_rate": 4.896320766992737e-06,
"loss": 0.1875,
"step": 1012
},
{
"epoch": 0.4608735213830755,
"grad_norm": 1.7325101393470637,
"learning_rate": 4.896116996303682e-06,
"loss": 0.1534,
"step": 1013
},
{
"epoch": 0.46132848043676067,
"grad_norm": 1.8711913127871913,
"learning_rate": 4.895913029814766e-06,
"loss": 0.1476,
"step": 1014
},
{
"epoch": 0.46178343949044587,
"grad_norm": 1.98409281551755,
"learning_rate": 4.895708867542658e-06,
"loss": 0.2099,
"step": 1015
},
{
"epoch": 0.462238398544131,
"grad_norm": 1.835471556122073,
"learning_rate": 4.895504509504039e-06,
"loss": 0.141,
"step": 1016
},
{
"epoch": 0.4626933575978162,
"grad_norm": 1.7126193650485422,
"learning_rate": 4.89529995571561e-06,
"loss": 0.1569,
"step": 1017
},
{
"epoch": 0.46314831665150136,
"grad_norm": 1.5756476134085153,
"learning_rate": 4.895095206194086e-06,
"loss": 0.1599,
"step": 1018
},
{
"epoch": 0.46360327570518656,
"grad_norm": 1.6305833927339777,
"learning_rate": 4.894890260956198e-06,
"loss": 0.1266,
"step": 1019
},
{
"epoch": 0.4640582347588717,
"grad_norm": 2.8915138386415107,
"learning_rate": 4.8946851200186925e-06,
"loss": 0.1983,
"step": 1020
},
{
"epoch": 0.46451319381255685,
"grad_norm": 2.2750148686402873,
"learning_rate": 4.894479783398334e-06,
"loss": 0.2161,
"step": 1021
},
{
"epoch": 0.46496815286624205,
"grad_norm": 1.901328095270706,
"learning_rate": 4.8942742511119004e-06,
"loss": 0.2033,
"step": 1022
},
{
"epoch": 0.4654231119199272,
"grad_norm": 3.2947250275495747,
"learning_rate": 4.894068523176187e-06,
"loss": 0.258,
"step": 1023
},
{
"epoch": 0.4658780709736124,
"grad_norm": 1.9323682134416058,
"learning_rate": 4.8938625996080056e-06,
"loss": 0.1788,
"step": 1024
},
{
"epoch": 0.46633303002729753,
"grad_norm": 1.6185621650651296,
"learning_rate": 4.893656480424184e-06,
"loss": 0.1651,
"step": 1025
},
{
"epoch": 0.46678798908098273,
"grad_norm": 2.2508459323489,
"learning_rate": 4.893450165641564e-06,
"loss": 0.1558,
"step": 1026
},
{
"epoch": 0.4672429481346679,
"grad_norm": 1.213648480980067,
"learning_rate": 4.893243655277005e-06,
"loss": 0.1507,
"step": 1027
},
{
"epoch": 0.467697907188353,
"grad_norm": 2.1023746671368513,
"learning_rate": 4.893036949347383e-06,
"loss": 0.1721,
"step": 1028
},
{
"epoch": 0.4681528662420382,
"grad_norm": 2.278948598326534,
"learning_rate": 4.892830047869588e-06,
"loss": 0.1884,
"step": 1029
},
{
"epoch": 0.46860782529572337,
"grad_norm": 2.9197430954616683,
"learning_rate": 4.892622950860527e-06,
"loss": 0.1741,
"step": 1030
},
{
"epoch": 0.46906278434940857,
"grad_norm": 1.2879852782085728,
"learning_rate": 4.892415658337123e-06,
"loss": 0.1342,
"step": 1031
},
{
"epoch": 0.4695177434030937,
"grad_norm": 2.0909020173909973,
"learning_rate": 4.892208170316317e-06,
"loss": 0.1907,
"step": 1032
},
{
"epoch": 0.4699727024567789,
"grad_norm": 2.0408884938878957,
"learning_rate": 4.892000486815062e-06,
"loss": 0.1553,
"step": 1033
},
{
"epoch": 0.47042766151046406,
"grad_norm": 1.8109063186030263,
"learning_rate": 4.891792607850328e-06,
"loss": 0.154,
"step": 1034
},
{
"epoch": 0.4708826205641492,
"grad_norm": 2.2630304525012126,
"learning_rate": 4.891584533439104e-06,
"loss": 0.2079,
"step": 1035
},
{
"epoch": 0.4713375796178344,
"grad_norm": 1.539332632871382,
"learning_rate": 4.891376263598393e-06,
"loss": 0.1432,
"step": 1036
},
{
"epoch": 0.47179253867151955,
"grad_norm": 1.7957849792578133,
"learning_rate": 4.891167798345213e-06,
"loss": 0.1511,
"step": 1037
},
{
"epoch": 0.47224749772520475,
"grad_norm": 2.741729093401805,
"learning_rate": 4.890959137696598e-06,
"loss": 0.2263,
"step": 1038
},
{
"epoch": 0.4727024567788899,
"grad_norm": 1.7348700401664916,
"learning_rate": 4.890750281669601e-06,
"loss": 0.1298,
"step": 1039
},
{
"epoch": 0.4731574158325751,
"grad_norm": 1.7001320886150055,
"learning_rate": 4.890541230281287e-06,
"loss": 0.1168,
"step": 1040
},
{
"epoch": 0.47361237488626023,
"grad_norm": 1.8500860192841622,
"learning_rate": 4.8903319835487385e-06,
"loss": 0.1644,
"step": 1041
},
{
"epoch": 0.4740673339399454,
"grad_norm": 1.8582195164199888,
"learning_rate": 4.890122541489056e-06,
"loss": 0.2426,
"step": 1042
},
{
"epoch": 0.4745222929936306,
"grad_norm": 1.2923171102528221,
"learning_rate": 4.889912904119353e-06,
"loss": 0.165,
"step": 1043
},
{
"epoch": 0.4749772520473157,
"grad_norm": 2.2842684826182778,
"learning_rate": 4.88970307145676e-06,
"loss": 0.1853,
"step": 1044
},
{
"epoch": 0.4754322111010009,
"grad_norm": 1.8277244050495731,
"learning_rate": 4.889493043518423e-06,
"loss": 0.2139,
"step": 1045
},
{
"epoch": 0.47588717015468607,
"grad_norm": 2.021142913969995,
"learning_rate": 4.889282820321506e-06,
"loss": 0.1312,
"step": 1046
},
{
"epoch": 0.47634212920837127,
"grad_norm": 1.8896361944599618,
"learning_rate": 4.889072401883187e-06,
"loss": 0.224,
"step": 1047
},
{
"epoch": 0.4767970882620564,
"grad_norm": 1.552692831396847,
"learning_rate": 4.88886178822066e-06,
"loss": 0.1772,
"step": 1048
},
{
"epoch": 0.47725204731574156,
"grad_norm": 1.8340975280187983,
"learning_rate": 4.888650979351136e-06,
"loss": 0.1702,
"step": 1049
},
{
"epoch": 0.47770700636942676,
"grad_norm": 2.1830448534590547,
"learning_rate": 4.888439975291841e-06,
"loss": 0.2436,
"step": 1050
},
{
"epoch": 0.4781619654231119,
"grad_norm": 1.6348401870707816,
"learning_rate": 4.888228776060017e-06,
"loss": 0.1926,
"step": 1051
},
{
"epoch": 0.4786169244767971,
"grad_norm": 1.7078513906709398,
"learning_rate": 4.888017381672923e-06,
"loss": 0.1601,
"step": 1052
},
{
"epoch": 0.47907188353048225,
"grad_norm": 2.240745720528745,
"learning_rate": 4.887805792147832e-06,
"loss": 0.1766,
"step": 1053
},
{
"epoch": 0.47952684258416745,
"grad_norm": 2.428487112277442,
"learning_rate": 4.887594007502036e-06,
"loss": 0.1789,
"step": 1054
},
{
"epoch": 0.4799818016378526,
"grad_norm": 2.1865518873285645,
"learning_rate": 4.887382027752838e-06,
"loss": 0.199,
"step": 1055
},
{
"epoch": 0.48043676069153773,
"grad_norm": 1.898629261883439,
"learning_rate": 4.8871698529175636e-06,
"loss": 0.1665,
"step": 1056
},
{
"epoch": 0.48089171974522293,
"grad_norm": 1.7954561311174488,
"learning_rate": 4.886957483013549e-06,
"loss": 0.1812,
"step": 1057
},
{
"epoch": 0.4813466787989081,
"grad_norm": 1.8221114015246185,
"learning_rate": 4.886744918058149e-06,
"loss": 0.2063,
"step": 1058
},
{
"epoch": 0.4818016378525933,
"grad_norm": 2.7770081846232544,
"learning_rate": 4.886532158068732e-06,
"loss": 0.2088,
"step": 1059
},
{
"epoch": 0.4822565969062784,
"grad_norm": 2.1115268373643477,
"learning_rate": 4.886319203062683e-06,
"loss": 0.1444,
"step": 1060
},
{
"epoch": 0.4827115559599636,
"grad_norm": 2.071172717908372,
"learning_rate": 4.886106053057408e-06,
"loss": 0.1661,
"step": 1061
},
{
"epoch": 0.48316651501364877,
"grad_norm": 2.2607152479196104,
"learning_rate": 4.88589270807032e-06,
"loss": 0.1859,
"step": 1062
},
{
"epoch": 0.48362147406733397,
"grad_norm": 1.692360966817902,
"learning_rate": 4.885679168118855e-06,
"loss": 0.1864,
"step": 1063
},
{
"epoch": 0.4840764331210191,
"grad_norm": 1.7710659763891554,
"learning_rate": 4.8854654332204635e-06,
"loss": 0.1646,
"step": 1064
},
{
"epoch": 0.48453139217470426,
"grad_norm": 1.9598218562809384,
"learning_rate": 4.885251503392607e-06,
"loss": 0.1402,
"step": 1065
},
{
"epoch": 0.48498635122838946,
"grad_norm": 2.1204508830789988,
"learning_rate": 4.885037378652771e-06,
"loss": 0.1891,
"step": 1066
},
{
"epoch": 0.4854413102820746,
"grad_norm": 2.3589815655452653,
"learning_rate": 4.884823059018451e-06,
"loss": 0.1555,
"step": 1067
},
{
"epoch": 0.4858962693357598,
"grad_norm": 2.5392202747520245,
"learning_rate": 4.88460854450716e-06,
"loss": 0.192,
"step": 1068
},
{
"epoch": 0.48635122838944495,
"grad_norm": 2.3012454866986833,
"learning_rate": 4.884393835136427e-06,
"loss": 0.2073,
"step": 1069
},
{
"epoch": 0.48680618744313015,
"grad_norm": 1.7363057272250617,
"learning_rate": 4.884178930923799e-06,
"loss": 0.1909,
"step": 1070
},
{
"epoch": 0.4872611464968153,
"grad_norm": 2.5682234171797638,
"learning_rate": 4.883963831886834e-06,
"loss": 0.2505,
"step": 1071
},
{
"epoch": 0.48771610555050043,
"grad_norm": 2.1085560059563435,
"learning_rate": 4.8837485380431115e-06,
"loss": 0.1713,
"step": 1072
},
{
"epoch": 0.48817106460418563,
"grad_norm": 1.8882533184752026,
"learning_rate": 4.883533049410223e-06,
"loss": 0.1602,
"step": 1073
},
{
"epoch": 0.4886260236578708,
"grad_norm": 2.9321383026683985,
"learning_rate": 4.8833173660057785e-06,
"loss": 0.2554,
"step": 1074
},
{
"epoch": 0.489080982711556,
"grad_norm": 2.531195131930091,
"learning_rate": 4.8831014878474004e-06,
"loss": 0.1797,
"step": 1075
},
{
"epoch": 0.4895359417652411,
"grad_norm": 1.9044052944051695,
"learning_rate": 4.882885414952732e-06,
"loss": 0.1738,
"step": 1076
},
{
"epoch": 0.4899909008189263,
"grad_norm": 1.8646399638677997,
"learning_rate": 4.882669147339428e-06,
"loss": 0.123,
"step": 1077
},
{
"epoch": 0.49044585987261147,
"grad_norm": 1.6244921355768605,
"learning_rate": 4.882452685025161e-06,
"loss": 0.1207,
"step": 1078
},
{
"epoch": 0.4909008189262966,
"grad_norm": 2.6418064094824625,
"learning_rate": 4.88223602802762e-06,
"loss": 0.1651,
"step": 1079
},
{
"epoch": 0.4913557779799818,
"grad_norm": 1.7280688771591737,
"learning_rate": 4.882019176364509e-06,
"loss": 0.1654,
"step": 1080
},
{
"epoch": 0.49181073703366696,
"grad_norm": 1.688117545561323,
"learning_rate": 4.881802130053548e-06,
"loss": 0.1779,
"step": 1081
},
{
"epoch": 0.49226569608735216,
"grad_norm": 1.9343631314892762,
"learning_rate": 4.881584889112473e-06,
"loss": 0.1378,
"step": 1082
},
{
"epoch": 0.4927206551410373,
"grad_norm": 2.0445775883054194,
"learning_rate": 4.881367453559036e-06,
"loss": 0.1945,
"step": 1083
},
{
"epoch": 0.4931756141947225,
"grad_norm": 2.0708720739438835,
"learning_rate": 4.881149823411005e-06,
"loss": 0.155,
"step": 1084
},
{
"epoch": 0.49363057324840764,
"grad_norm": 1.8016295656127952,
"learning_rate": 4.880931998686162e-06,
"loss": 0.1374,
"step": 1085
},
{
"epoch": 0.4940855323020928,
"grad_norm": 1.8010911071848295,
"learning_rate": 4.880713979402311e-06,
"loss": 0.2764,
"step": 1086
},
{
"epoch": 0.494540491355778,
"grad_norm": 2.2201593715577945,
"learning_rate": 4.880495765577263e-06,
"loss": 0.1785,
"step": 1087
},
{
"epoch": 0.49499545040946313,
"grad_norm": 2.5150440926935183,
"learning_rate": 4.880277357228852e-06,
"loss": 0.1415,
"step": 1088
},
{
"epoch": 0.49545040946314833,
"grad_norm": 1.4882801876169178,
"learning_rate": 4.880058754374923e-06,
"loss": 0.1528,
"step": 1089
},
{
"epoch": 0.4959053685168335,
"grad_norm": 1.9307317316728292,
"learning_rate": 4.879839957033343e-06,
"loss": 0.1661,
"step": 1090
},
{
"epoch": 0.4963603275705187,
"grad_norm": 1.6645987589280862,
"learning_rate": 4.879620965221987e-06,
"loss": 0.1058,
"step": 1091
},
{
"epoch": 0.4968152866242038,
"grad_norm": 1.1436431770468727,
"learning_rate": 4.879401778958755e-06,
"loss": 0.0867,
"step": 1092
},
{
"epoch": 0.49727024567788897,
"grad_norm": 2.072303030104995,
"learning_rate": 4.8791823982615525e-06,
"loss": 0.1454,
"step": 1093
},
{
"epoch": 0.49772520473157417,
"grad_norm": 1.4026343543836923,
"learning_rate": 4.878962823148308e-06,
"loss": 0.1176,
"step": 1094
},
{
"epoch": 0.4981801637852593,
"grad_norm": 2.4971931111745795,
"learning_rate": 4.878743053636968e-06,
"loss": 0.2058,
"step": 1095
},
{
"epoch": 0.4986351228389445,
"grad_norm": 1.9096703970153857,
"learning_rate": 4.878523089745485e-06,
"loss": 0.2389,
"step": 1096
},
{
"epoch": 0.49909008189262966,
"grad_norm": 1.7150797344948416,
"learning_rate": 4.878302931491837e-06,
"loss": 0.1408,
"step": 1097
},
{
"epoch": 0.49954504094631486,
"grad_norm": 1.8467538410779647,
"learning_rate": 4.8780825788940145e-06,
"loss": 0.1212,
"step": 1098
},
{
"epoch": 0.5,
"grad_norm": 1.6653067013861202,
"learning_rate": 4.877862031970023e-06,
"loss": 0.165,
"step": 1099
},
{
"epoch": 0.5004549590536852,
"grad_norm": 2.7780746831303866,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.1959,
"step": 1100
},
{
"epoch": 0.5009099181073703,
"grad_norm": 2.735253037622924,
"learning_rate": 4.877420355215637e-06,
"loss": 0.1853,
"step": 1101
},
{
"epoch": 0.5013648771610555,
"grad_norm": 2.2801170489693474,
"learning_rate": 4.877199225421334e-06,
"loss": 0.223,
"step": 1102
},
{
"epoch": 0.5018198362147407,
"grad_norm": 1.5121408545649673,
"learning_rate": 4.8769779013730454e-06,
"loss": 0.1766,
"step": 1103
},
{
"epoch": 0.5022747952684259,
"grad_norm": 1.2736560565952975,
"learning_rate": 4.876756383088858e-06,
"loss": 0.1147,
"step": 1104
},
{
"epoch": 0.502729754322111,
"grad_norm": 2.234019869097899,
"learning_rate": 4.876534670586872e-06,
"loss": 0.1353,
"step": 1105
},
{
"epoch": 0.5031847133757962,
"grad_norm": 2.231499299533909,
"learning_rate": 4.8763127638852045e-06,
"loss": 0.1542,
"step": 1106
},
{
"epoch": 0.5036396724294814,
"grad_norm": 1.8302676611966564,
"learning_rate": 4.87609066300199e-06,
"loss": 0.1494,
"step": 1107
},
{
"epoch": 0.5040946314831665,
"grad_norm": 2.4154877250923157,
"learning_rate": 4.875868367955376e-06,
"loss": 0.1937,
"step": 1108
},
{
"epoch": 0.5045495905368517,
"grad_norm": 2.752908247632549,
"learning_rate": 4.87564587876353e-06,
"loss": 0.2127,
"step": 1109
},
{
"epoch": 0.5050045495905369,
"grad_norm": 1.718053996922888,
"learning_rate": 4.87542319544463e-06,
"loss": 0.1702,
"step": 1110
},
{
"epoch": 0.5054595086442221,
"grad_norm": 1.5702103077744012,
"learning_rate": 4.875200318016873e-06,
"loss": 0.1566,
"step": 1111
},
{
"epoch": 0.5059144676979072,
"grad_norm": 2.0381911393844825,
"learning_rate": 4.8749772464984736e-06,
"loss": 0.2017,
"step": 1112
},
{
"epoch": 0.5063694267515924,
"grad_norm": 1.8176309130216741,
"learning_rate": 4.874753980907658e-06,
"loss": 0.1864,
"step": 1113
},
{
"epoch": 0.5068243858052776,
"grad_norm": 2.1308929915187753,
"learning_rate": 4.8745305212626714e-06,
"loss": 0.1726,
"step": 1114
},
{
"epoch": 0.5072793448589626,
"grad_norm": 1.8139775978694637,
"learning_rate": 4.874306867581775e-06,
"loss": 0.1761,
"step": 1115
},
{
"epoch": 0.5077343039126478,
"grad_norm": 1.7183373875600083,
"learning_rate": 4.874083019883242e-06,
"loss": 0.1333,
"step": 1116
},
{
"epoch": 0.508189262966333,
"grad_norm": 1.8665339958095688,
"learning_rate": 4.873858978185367e-06,
"loss": 0.1932,
"step": 1117
},
{
"epoch": 0.5086442220200182,
"grad_norm": 2.352764145779797,
"learning_rate": 4.8736347425064565e-06,
"loss": 0.2031,
"step": 1118
},
{
"epoch": 0.5090991810737033,
"grad_norm": 2.678329346866304,
"learning_rate": 4.873410312864833e-06,
"loss": 0.214,
"step": 1119
},
{
"epoch": 0.5095541401273885,
"grad_norm": 2.350844354697721,
"learning_rate": 4.8731856892788384e-06,
"loss": 0.2144,
"step": 1120
},
{
"epoch": 0.5100090991810737,
"grad_norm": 1.9729175722269603,
"learning_rate": 4.872960871766826e-06,
"loss": 0.2081,
"step": 1121
},
{
"epoch": 0.5104640582347588,
"grad_norm": 1.4433063314456696,
"learning_rate": 4.8727358603471675e-06,
"loss": 0.1703,
"step": 1122
},
{
"epoch": 0.510919017288444,
"grad_norm": 2.5283375453779704,
"learning_rate": 4.872510655038249e-06,
"loss": 0.1536,
"step": 1123
},
{
"epoch": 0.5113739763421292,
"grad_norm": 1.3858205152408392,
"learning_rate": 4.872285255858476e-06,
"loss": 0.1458,
"step": 1124
},
{
"epoch": 0.5118289353958144,
"grad_norm": 2.0487135879281024,
"learning_rate": 4.872059662826263e-06,
"loss": 0.1661,
"step": 1125
},
{
"epoch": 0.5122838944494995,
"grad_norm": 1.9472322837633822,
"learning_rate": 4.8718338759600465e-06,
"loss": 0.1786,
"step": 1126
},
{
"epoch": 0.5127388535031847,
"grad_norm": 1.6310032173739817,
"learning_rate": 4.871607895278278e-06,
"loss": 0.1626,
"step": 1127
},
{
"epoch": 0.5131938125568699,
"grad_norm": 1.985456014356635,
"learning_rate": 4.871381720799421e-06,
"loss": 0.1702,
"step": 1128
},
{
"epoch": 0.513648771610555,
"grad_norm": 2.402370971493488,
"learning_rate": 4.8711553525419595e-06,
"loss": 0.194,
"step": 1129
},
{
"epoch": 0.5141037306642402,
"grad_norm": 1.5072231886887353,
"learning_rate": 4.87092879052439e-06,
"loss": 0.1573,
"step": 1130
},
{
"epoch": 0.5145586897179254,
"grad_norm": 1.449165092170168,
"learning_rate": 4.8707020347652275e-06,
"loss": 0.1246,
"step": 1131
},
{
"epoch": 0.5150136487716106,
"grad_norm": 1.8608730682993475,
"learning_rate": 4.870475085283001e-06,
"loss": 0.1831,
"step": 1132
},
{
"epoch": 0.5154686078252957,
"grad_norm": 1.9183857631670505,
"learning_rate": 4.870247942096254e-06,
"loss": 0.1638,
"step": 1133
},
{
"epoch": 0.5159235668789809,
"grad_norm": 1.6563135044982633,
"learning_rate": 4.870020605223551e-06,
"loss": 0.1421,
"step": 1134
},
{
"epoch": 0.5163785259326661,
"grad_norm": 1.7995838783709266,
"learning_rate": 4.869793074683466e-06,
"loss": 0.147,
"step": 1135
},
{
"epoch": 0.5168334849863512,
"grad_norm": 1.9548719996118153,
"learning_rate": 4.8695653504945925e-06,
"loss": 0.1575,
"step": 1136
},
{
"epoch": 0.5172884440400364,
"grad_norm": 1.7522375644775081,
"learning_rate": 4.8693374326755405e-06,
"loss": 0.1495,
"step": 1137
},
{
"epoch": 0.5177434030937216,
"grad_norm": 1.3062327753186558,
"learning_rate": 4.869109321244932e-06,
"loss": 0.116,
"step": 1138
},
{
"epoch": 0.5181983621474068,
"grad_norm": 1.8868647769132803,
"learning_rate": 4.86888101622141e-06,
"loss": 0.1794,
"step": 1139
},
{
"epoch": 0.5186533212010919,
"grad_norm": 1.8158313749710562,
"learning_rate": 4.868652517623629e-06,
"loss": 0.1391,
"step": 1140
},
{
"epoch": 0.5191082802547771,
"grad_norm": 1.8111217984491637,
"learning_rate": 4.86842382547026e-06,
"loss": 0.1494,
"step": 1141
},
{
"epoch": 0.5195632393084623,
"grad_norm": 2.8090775733585835,
"learning_rate": 4.868194939779992e-06,
"loss": 0.1896,
"step": 1142
},
{
"epoch": 0.5200181983621474,
"grad_norm": 1.9497550190765165,
"learning_rate": 4.867965860571529e-06,
"loss": 0.1552,
"step": 1143
},
{
"epoch": 0.5204731574158326,
"grad_norm": 1.990627059765444,
"learning_rate": 4.867736587863589e-06,
"loss": 0.2094,
"step": 1144
},
{
"epoch": 0.5209281164695178,
"grad_norm": 2.247771495871837,
"learning_rate": 4.867507121674907e-06,
"loss": 0.2391,
"step": 1145
},
{
"epoch": 0.521383075523203,
"grad_norm": 2.120187054464733,
"learning_rate": 4.867277462024235e-06,
"loss": 0.1775,
"step": 1146
},
{
"epoch": 0.521838034576888,
"grad_norm": 1.7774801845384391,
"learning_rate": 4.8670476089303395e-06,
"loss": 0.2129,
"step": 1147
},
{
"epoch": 0.5222929936305732,
"grad_norm": 1.5308807746672268,
"learning_rate": 4.866817562412003e-06,
"loss": 0.2109,
"step": 1148
},
{
"epoch": 0.5227479526842584,
"grad_norm": 1.219763540490379,
"learning_rate": 4.866587322488024e-06,
"loss": 0.1529,
"step": 1149
},
{
"epoch": 0.5232029117379435,
"grad_norm": 1.63359106412129,
"learning_rate": 4.866356889177216e-06,
"loss": 0.1663,
"step": 1150
},
{
"epoch": 0.5236578707916287,
"grad_norm": 1.4722036099751108,
"learning_rate": 4.866126262498409e-06,
"loss": 0.1727,
"step": 1151
},
{
"epoch": 0.5241128298453139,
"grad_norm": 2.4915301409486172,
"learning_rate": 4.865895442470449e-06,
"loss": 0.1966,
"step": 1152
},
{
"epoch": 0.5245677888989991,
"grad_norm": 1.5523199921916622,
"learning_rate": 4.865664429112199e-06,
"loss": 0.1451,
"step": 1153
},
{
"epoch": 0.5250227479526842,
"grad_norm": 2.0323520596343627,
"learning_rate": 4.8654332224425345e-06,
"loss": 0.1504,
"step": 1154
},
{
"epoch": 0.5254777070063694,
"grad_norm": 2.4530093356672094,
"learning_rate": 4.865201822480349e-06,
"loss": 0.1872,
"step": 1155
},
{
"epoch": 0.5259326660600546,
"grad_norm": 1.3735368464159743,
"learning_rate": 4.864970229244552e-06,
"loss": 0.111,
"step": 1156
},
{
"epoch": 0.5263876251137397,
"grad_norm": 1.824736780190326,
"learning_rate": 4.864738442754068e-06,
"loss": 0.135,
"step": 1157
},
{
"epoch": 0.5268425841674249,
"grad_norm": 1.9990020682765113,
"learning_rate": 4.864506463027837e-06,
"loss": 0.1745,
"step": 1158
},
{
"epoch": 0.5272975432211101,
"grad_norm": 1.4799527599994446,
"learning_rate": 4.864274290084816e-06,
"loss": 0.167,
"step": 1159
},
{
"epoch": 0.5277525022747953,
"grad_norm": 2.4687809077301295,
"learning_rate": 4.864041923943978e-06,
"loss": 0.1732,
"step": 1160
},
{
"epoch": 0.5282074613284804,
"grad_norm": 2.641693873435684,
"learning_rate": 4.863809364624309e-06,
"loss": 0.2128,
"step": 1161
},
{
"epoch": 0.5286624203821656,
"grad_norm": 1.5492373372050023,
"learning_rate": 4.863576612144814e-06,
"loss": 0.1592,
"step": 1162
},
{
"epoch": 0.5291173794358508,
"grad_norm": 2.3572852462486313,
"learning_rate": 4.863343666524512e-06,
"loss": 0.2061,
"step": 1163
},
{
"epoch": 0.5295723384895359,
"grad_norm": 1.8838845870200471,
"learning_rate": 4.863110527782437e-06,
"loss": 0.1798,
"step": 1164
},
{
"epoch": 0.5300272975432211,
"grad_norm": 2.304263001470561,
"learning_rate": 4.8628771959376435e-06,
"loss": 0.1556,
"step": 1165
},
{
"epoch": 0.5304822565969063,
"grad_norm": 1.7674923531547297,
"learning_rate": 4.862643671009195e-06,
"loss": 0.1333,
"step": 1166
},
{
"epoch": 0.5309372156505915,
"grad_norm": 1.393097189340672,
"learning_rate": 4.862409953016175e-06,
"loss": 0.155,
"step": 1167
},
{
"epoch": 0.5313921747042766,
"grad_norm": 1.74325807786759,
"learning_rate": 4.862176041977683e-06,
"loss": 0.1656,
"step": 1168
},
{
"epoch": 0.5318471337579618,
"grad_norm": 1.572029172895186,
"learning_rate": 4.861941937912832e-06,
"loss": 0.131,
"step": 1169
},
{
"epoch": 0.532302092811647,
"grad_norm": 2.008491262720168,
"learning_rate": 4.861707640840752e-06,
"loss": 0.1548,
"step": 1170
},
{
"epoch": 0.5327570518653321,
"grad_norm": 1.482082349852649,
"learning_rate": 4.861473150780589e-06,
"loss": 0.1628,
"step": 1171
},
{
"epoch": 0.5332120109190173,
"grad_norm": 1.6791945913602067,
"learning_rate": 4.8612384677515054e-06,
"loss": 0.1785,
"step": 1172
},
{
"epoch": 0.5336669699727025,
"grad_norm": 2.20432127668894,
"learning_rate": 4.861003591772677e-06,
"loss": 0.1716,
"step": 1173
},
{
"epoch": 0.5341219290263877,
"grad_norm": 1.9304068948412856,
"learning_rate": 4.860768522863297e-06,
"loss": 0.1503,
"step": 1174
},
{
"epoch": 0.5345768880800728,
"grad_norm": 1.5238718585240933,
"learning_rate": 4.860533261042574e-06,
"loss": 0.1539,
"step": 1175
},
{
"epoch": 0.535031847133758,
"grad_norm": 1.2432245247502896,
"learning_rate": 4.8602978063297336e-06,
"loss": 0.1721,
"step": 1176
},
{
"epoch": 0.5354868061874432,
"grad_norm": 1.8986627233525826,
"learning_rate": 4.8600621587440155e-06,
"loss": 0.1717,
"step": 1177
},
{
"epoch": 0.5359417652411284,
"grad_norm": 1.6746020896303164,
"learning_rate": 4.859826318304676e-06,
"loss": 0.198,
"step": 1178
},
{
"epoch": 0.5363967242948134,
"grad_norm": 1.0811516795291998,
"learning_rate": 4.859590285030986e-06,
"loss": 0.1441,
"step": 1179
},
{
"epoch": 0.5368516833484986,
"grad_norm": 1.3182569447840091,
"learning_rate": 4.859354058942234e-06,
"loss": 0.143,
"step": 1180
},
{
"epoch": 0.5373066424021838,
"grad_norm": 1.5442971076277365,
"learning_rate": 4.859117640057723e-06,
"loss": 0.1708,
"step": 1181
},
{
"epoch": 0.5377616014558689,
"grad_norm": 2.2346125174953744,
"learning_rate": 4.858881028396773e-06,
"loss": 0.2581,
"step": 1182
},
{
"epoch": 0.5382165605095541,
"grad_norm": 2.16866059231189,
"learning_rate": 4.8586442239787165e-06,
"loss": 0.1566,
"step": 1183
},
{
"epoch": 0.5386715195632393,
"grad_norm": 2.2940342617095357,
"learning_rate": 4.858407226822906e-06,
"loss": 0.2362,
"step": 1184
},
{
"epoch": 0.5391264786169245,
"grad_norm": 1.722886466945642,
"learning_rate": 4.858170036948707e-06,
"loss": 0.1643,
"step": 1185
},
{
"epoch": 0.5395814376706096,
"grad_norm": 1.8036922634291395,
"learning_rate": 4.857932654375503e-06,
"loss": 0.1399,
"step": 1186
},
{
"epoch": 0.5400363967242948,
"grad_norm": 2.4595201733911995,
"learning_rate": 4.857695079122691e-06,
"loss": 0.2806,
"step": 1187
},
{
"epoch": 0.54049135577798,
"grad_norm": 1.5611995597597812,
"learning_rate": 4.857457311209683e-06,
"loss": 0.1436,
"step": 1188
},
{
"epoch": 0.5409463148316651,
"grad_norm": 2.155441619580459,
"learning_rate": 4.857219350655911e-06,
"loss": 0.1502,
"step": 1189
},
{
"epoch": 0.5414012738853503,
"grad_norm": 1.7590257643884393,
"learning_rate": 4.856981197480818e-06,
"loss": 0.1832,
"step": 1190
},
{
"epoch": 0.5418562329390355,
"grad_norm": 1.5219476359124613,
"learning_rate": 4.856742851703866e-06,
"loss": 0.1478,
"step": 1191
},
{
"epoch": 0.5423111919927207,
"grad_norm": 1.9739406001713575,
"learning_rate": 4.856504313344531e-06,
"loss": 0.2435,
"step": 1192
},
{
"epoch": 0.5427661510464058,
"grad_norm": 2.084318032784521,
"learning_rate": 4.8562655824223055e-06,
"loss": 0.1409,
"step": 1193
},
{
"epoch": 0.543221110100091,
"grad_norm": 1.1509311969673588,
"learning_rate": 4.856026658956697e-06,
"loss": 0.1281,
"step": 1194
},
{
"epoch": 0.5436760691537762,
"grad_norm": 1.14005541818581,
"learning_rate": 4.8557875429672295e-06,
"loss": 0.1438,
"step": 1195
},
{
"epoch": 0.5441310282074613,
"grad_norm": 1.6453379692427774,
"learning_rate": 4.855548234473444e-06,
"loss": 0.1898,
"step": 1196
},
{
"epoch": 0.5445859872611465,
"grad_norm": 3.715053618797708,
"learning_rate": 4.8553087334948935e-06,
"loss": 0.1884,
"step": 1197
},
{
"epoch": 0.5450409463148317,
"grad_norm": 1.9604960579417277,
"learning_rate": 4.855069040051149e-06,
"loss": 0.1668,
"step": 1198
},
{
"epoch": 0.5454959053685169,
"grad_norm": 2.008712099431151,
"learning_rate": 4.854829154161799e-06,
"loss": 0.2458,
"step": 1199
},
{
"epoch": 0.545950864422202,
"grad_norm": 1.670617885602165,
"learning_rate": 4.854589075846445e-06,
"loss": 0.195,
"step": 1200
},
{
"epoch": 0.5464058234758872,
"grad_norm": 1.3262735122543114,
"learning_rate": 4.854348805124704e-06,
"loss": 0.1564,
"step": 1201
},
{
"epoch": 0.5468607825295724,
"grad_norm": 1.9039774091054742,
"learning_rate": 4.85410834201621e-06,
"loss": 0.1379,
"step": 1202
},
{
"epoch": 0.5473157415832575,
"grad_norm": 2.3929812156260364,
"learning_rate": 4.8538676865406155e-06,
"loss": 0.2412,
"step": 1203
},
{
"epoch": 0.5477707006369427,
"grad_norm": 1.551384727017807,
"learning_rate": 4.853626838717582e-06,
"loss": 0.117,
"step": 1204
},
{
"epoch": 0.5482256596906279,
"grad_norm": 5.414582318339853,
"learning_rate": 4.853385798566793e-06,
"loss": 0.1437,
"step": 1205
},
{
"epoch": 0.5486806187443131,
"grad_norm": 1.6881825100786558,
"learning_rate": 4.8531445661079444e-06,
"loss": 0.1232,
"step": 1206
},
{
"epoch": 0.5491355777979982,
"grad_norm": 1.6096306897948298,
"learning_rate": 4.852903141360749e-06,
"loss": 0.161,
"step": 1207
},
{
"epoch": 0.5495905368516834,
"grad_norm": 1.7692527628598336,
"learning_rate": 4.852661524344933e-06,
"loss": 0.1217,
"step": 1208
},
{
"epoch": 0.5500454959053686,
"grad_norm": 2.162642212991987,
"learning_rate": 4.852419715080244e-06,
"loss": 0.1986,
"step": 1209
},
{
"epoch": 0.5505004549590536,
"grad_norm": 1.4975052036096086,
"learning_rate": 4.852177713586437e-06,
"loss": 0.1435,
"step": 1210
},
{
"epoch": 0.5509554140127388,
"grad_norm": 1.5907183445636404,
"learning_rate": 4.85193551988329e-06,
"loss": 0.1642,
"step": 1211
},
{
"epoch": 0.551410373066424,
"grad_norm": 1.9999573598736464,
"learning_rate": 4.851693133990594e-06,
"loss": 0.1807,
"step": 1212
},
{
"epoch": 0.5518653321201092,
"grad_norm": 2.294525710441773,
"learning_rate": 4.851450555928155e-06,
"loss": 0.1624,
"step": 1213
},
{
"epoch": 0.5523202911737943,
"grad_norm": 2.233884971616304,
"learning_rate": 4.851207785715797e-06,
"loss": 0.2324,
"step": 1214
},
{
"epoch": 0.5527752502274795,
"grad_norm": 2.0057194457772924,
"learning_rate": 4.850964823373355e-06,
"loss": 0.2105,
"step": 1215
},
{
"epoch": 0.5532302092811647,
"grad_norm": 1.9893088992044121,
"learning_rate": 4.850721668920685e-06,
"loss": 0.1784,
"step": 1216
},
{
"epoch": 0.5536851683348498,
"grad_norm": 1.811776169286512,
"learning_rate": 4.850478322377657e-06,
"loss": 0.1716,
"step": 1217
},
{
"epoch": 0.554140127388535,
"grad_norm": 2.4345407872833134,
"learning_rate": 4.8502347837641536e-06,
"loss": 0.2649,
"step": 1218
},
{
"epoch": 0.5545950864422202,
"grad_norm": 1.4197781095132433,
"learning_rate": 4.8499910531000776e-06,
"loss": 0.1473,
"step": 1219
},
{
"epoch": 0.5550500454959054,
"grad_norm": 2.7980447769263637,
"learning_rate": 4.849747130405346e-06,
"loss": 0.2153,
"step": 1220
},
{
"epoch": 0.5555050045495905,
"grad_norm": 1.6352047446815658,
"learning_rate": 4.849503015699889e-06,
"loss": 0.1485,
"step": 1221
},
{
"epoch": 0.5559599636032757,
"grad_norm": 2.1831084601819066,
"learning_rate": 4.849258709003657e-06,
"loss": 0.1818,
"step": 1222
},
{
"epoch": 0.5564149226569609,
"grad_norm": 1.541290763289794,
"learning_rate": 4.849014210336612e-06,
"loss": 0.1947,
"step": 1223
},
{
"epoch": 0.556869881710646,
"grad_norm": 2.2775888091930723,
"learning_rate": 4.848769519718734e-06,
"loss": 0.2152,
"step": 1224
},
{
"epoch": 0.5573248407643312,
"grad_norm": 2.473887631559974,
"learning_rate": 4.848524637170018e-06,
"loss": 0.1588,
"step": 1225
},
{
"epoch": 0.5577797998180164,
"grad_norm": 1.7255823206927379,
"learning_rate": 4.848279562710474e-06,
"loss": 0.2174,
"step": 1226
},
{
"epoch": 0.5582347588717016,
"grad_norm": 1.8250707498997563,
"learning_rate": 4.848034296360129e-06,
"loss": 0.1404,
"step": 1227
},
{
"epoch": 0.5586897179253867,
"grad_norm": 1.3973858443687242,
"learning_rate": 4.847788838139025e-06,
"loss": 0.1598,
"step": 1228
},
{
"epoch": 0.5591446769790719,
"grad_norm": 1.6880241463364833,
"learning_rate": 4.847543188067219e-06,
"loss": 0.1361,
"step": 1229
},
{
"epoch": 0.5595996360327571,
"grad_norm": 1.6583472347876314,
"learning_rate": 4.847297346164786e-06,
"loss": 0.1681,
"step": 1230
},
{
"epoch": 0.5600545950864422,
"grad_norm": 1.5526904315702266,
"learning_rate": 4.8470513124518134e-06,
"loss": 0.1704,
"step": 1231
},
{
"epoch": 0.5605095541401274,
"grad_norm": 2.9080178304839333,
"learning_rate": 4.8468050869484075e-06,
"loss": 0.2189,
"step": 1232
},
{
"epoch": 0.5609645131938126,
"grad_norm": 2.272625265359496,
"learning_rate": 4.846558669674688e-06,
"loss": 0.1796,
"step": 1233
},
{
"epoch": 0.5614194722474978,
"grad_norm": 2.1487306294232997,
"learning_rate": 4.8463120606507904e-06,
"loss": 0.1853,
"step": 1234
},
{
"epoch": 0.5618744313011829,
"grad_norm": 2.013831962718606,
"learning_rate": 4.846065259896867e-06,
"loss": 0.1844,
"step": 1235
},
{
"epoch": 0.5623293903548681,
"grad_norm": 1.8287089471640992,
"learning_rate": 4.845818267433086e-06,
"loss": 0.1784,
"step": 1236
},
{
"epoch": 0.5627843494085533,
"grad_norm": 1.800058629818333,
"learning_rate": 4.845571083279629e-06,
"loss": 0.1552,
"step": 1237
},
{
"epoch": 0.5632393084622384,
"grad_norm": 1.2446217689129786,
"learning_rate": 4.845323707456696e-06,
"loss": 0.1685,
"step": 1238
},
{
"epoch": 0.5636942675159236,
"grad_norm": 2.6424245053307787,
"learning_rate": 4.845076139984502e-06,
"loss": 0.2754,
"step": 1239
},
{
"epoch": 0.5641492265696088,
"grad_norm": 1.9189782085118383,
"learning_rate": 4.844828380883274e-06,
"loss": 0.1686,
"step": 1240
},
{
"epoch": 0.564604185623294,
"grad_norm": 1.2992527617302185,
"learning_rate": 4.844580430173261e-06,
"loss": 0.1576,
"step": 1241
},
{
"epoch": 0.565059144676979,
"grad_norm": 1.771767593474412,
"learning_rate": 4.8443322878747236e-06,
"loss": 0.1201,
"step": 1242
},
{
"epoch": 0.5655141037306642,
"grad_norm": 1.3113844210494432,
"learning_rate": 4.844083954007938e-06,
"loss": 0.1933,
"step": 1243
},
{
"epoch": 0.5659690627843494,
"grad_norm": 1.780274550683715,
"learning_rate": 4.843835428593198e-06,
"loss": 0.2449,
"step": 1244
},
{
"epoch": 0.5664240218380345,
"grad_norm": 2.0286348942605734,
"learning_rate": 4.84358671165081e-06,
"loss": 0.2206,
"step": 1245
},
{
"epoch": 0.5668789808917197,
"grad_norm": 1.9183674174882497,
"learning_rate": 4.843337803201102e-06,
"loss": 0.1932,
"step": 1246
},
{
"epoch": 0.5673339399454049,
"grad_norm": 1.8589987750417598,
"learning_rate": 4.8430887032644094e-06,
"loss": 0.2063,
"step": 1247
},
{
"epoch": 0.5677888989990901,
"grad_norm": 1.8997293354336255,
"learning_rate": 4.842839411861089e-06,
"loss": 0.15,
"step": 1248
},
{
"epoch": 0.5682438580527752,
"grad_norm": 1.5956283554174595,
"learning_rate": 4.842589929011513e-06,
"loss": 0.1249,
"step": 1249
},
{
"epoch": 0.5686988171064604,
"grad_norm": 1.7264729567079007,
"learning_rate": 4.8423402547360665e-06,
"loss": 0.1731,
"step": 1250
},
{
"epoch": 0.5691537761601456,
"grad_norm": 1.9220135807111425,
"learning_rate": 4.842090389055153e-06,
"loss": 0.1143,
"step": 1251
},
{
"epoch": 0.5696087352138307,
"grad_norm": 1.7921638992770812,
"learning_rate": 4.841840331989189e-06,
"loss": 0.1976,
"step": 1252
},
{
"epoch": 0.5700636942675159,
"grad_norm": 2.000993623816501,
"learning_rate": 4.841590083558608e-06,
"loss": 0.1768,
"step": 1253
},
{
"epoch": 0.5705186533212011,
"grad_norm": 2.4830094815396304,
"learning_rate": 4.841339643783861e-06,
"loss": 0.2043,
"step": 1254
},
{
"epoch": 0.5709736123748863,
"grad_norm": 1.5989796561168585,
"learning_rate": 4.841089012685412e-06,
"loss": 0.1778,
"step": 1255
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.8137268898691017,
"learning_rate": 4.840838190283741e-06,
"loss": 0.1692,
"step": 1256
},
{
"epoch": 0.5718835304822566,
"grad_norm": 1.7559519711217326,
"learning_rate": 4.8405871765993435e-06,
"loss": 0.0939,
"step": 1257
},
{
"epoch": 0.5723384895359418,
"grad_norm": 1.7192722836354088,
"learning_rate": 4.840335971652732e-06,
"loss": 0.1255,
"step": 1258
},
{
"epoch": 0.5727934485896269,
"grad_norm": 1.9835814338763256,
"learning_rate": 4.840084575464434e-06,
"loss": 0.1945,
"step": 1259
},
{
"epoch": 0.5732484076433121,
"grad_norm": 1.8517843659588205,
"learning_rate": 4.839832988054992e-06,
"loss": 0.187,
"step": 1260
},
{
"epoch": 0.5737033666969973,
"grad_norm": 1.8951856802928044,
"learning_rate": 4.839581209444966e-06,
"loss": 0.1196,
"step": 1261
},
{
"epoch": 0.5741583257506825,
"grad_norm": 2.3401876182004386,
"learning_rate": 4.839329239654927e-06,
"loss": 0.2252,
"step": 1262
},
{
"epoch": 0.5746132848043676,
"grad_norm": 2.1924333176646145,
"learning_rate": 4.839077078705468e-06,
"loss": 0.137,
"step": 1263
},
{
"epoch": 0.5750682438580528,
"grad_norm": 1.6673068426763284,
"learning_rate": 4.838824726617194e-06,
"loss": 0.157,
"step": 1264
},
{
"epoch": 0.575523202911738,
"grad_norm": 1.7250800520215972,
"learning_rate": 4.838572183410725e-06,
"loss": 0.1808,
"step": 1265
},
{
"epoch": 0.5759781619654231,
"grad_norm": 1.6457142786345031,
"learning_rate": 4.838319449106697e-06,
"loss": 0.1635,
"step": 1266
},
{
"epoch": 0.5764331210191083,
"grad_norm": 1.5575525689618337,
"learning_rate": 4.838066523725764e-06,
"loss": 0.1127,
"step": 1267
},
{
"epoch": 0.5768880800727935,
"grad_norm": 2.5767156490698833,
"learning_rate": 4.837813407288594e-06,
"loss": 0.1798,
"step": 1268
},
{
"epoch": 0.5773430391264787,
"grad_norm": 1.9108956938528818,
"learning_rate": 4.837560099815869e-06,
"loss": 0.202,
"step": 1269
},
{
"epoch": 0.5777979981801638,
"grad_norm": 1.956778308687979,
"learning_rate": 4.837306601328289e-06,
"loss": 0.1806,
"step": 1270
},
{
"epoch": 0.578252957233849,
"grad_norm": 1.775478489276246,
"learning_rate": 4.837052911846569e-06,
"loss": 0.1695,
"step": 1271
},
{
"epoch": 0.5787079162875342,
"grad_norm": 1.787242091669647,
"learning_rate": 4.836799031391439e-06,
"loss": 0.1745,
"step": 1272
},
{
"epoch": 0.5791628753412192,
"grad_norm": 1.0591727928255608,
"learning_rate": 4.836544959983645e-06,
"loss": 0.1343,
"step": 1273
},
{
"epoch": 0.5796178343949044,
"grad_norm": 1.5740206900027498,
"learning_rate": 4.8362906976439485e-06,
"loss": 0.1635,
"step": 1274
},
{
"epoch": 0.5800727934485896,
"grad_norm": 1.5937545527814416,
"learning_rate": 4.836036244393127e-06,
"loss": 0.1581,
"step": 1275
},
{
"epoch": 0.5805277525022748,
"grad_norm": 1.813708807716678,
"learning_rate": 4.835781600251973e-06,
"loss": 0.2269,
"step": 1276
},
{
"epoch": 0.5809827115559599,
"grad_norm": 2.0796570235313836,
"learning_rate": 4.835526765241295e-06,
"loss": 0.1924,
"step": 1277
},
{
"epoch": 0.5814376706096451,
"grad_norm": 1.6083810261665097,
"learning_rate": 4.835271739381917e-06,
"loss": 0.1541,
"step": 1278
},
{
"epoch": 0.5818926296633303,
"grad_norm": 1.365537997124497,
"learning_rate": 4.835016522694678e-06,
"loss": 0.136,
"step": 1279
},
{
"epoch": 0.5823475887170154,
"grad_norm": 1.8893838729814614,
"learning_rate": 4.834761115200434e-06,
"loss": 0.2207,
"step": 1280
},
{
"epoch": 0.5828025477707006,
"grad_norm": 1.4870021241117473,
"learning_rate": 4.834505516920055e-06,
"loss": 0.1879,
"step": 1281
},
{
"epoch": 0.5832575068243858,
"grad_norm": 1.4165326048713465,
"learning_rate": 4.834249727874428e-06,
"loss": 0.1263,
"step": 1282
},
{
"epoch": 0.583712465878071,
"grad_norm": 1.8197657860371343,
"learning_rate": 4.833993748084455e-06,
"loss": 0.1727,
"step": 1283
},
{
"epoch": 0.5841674249317561,
"grad_norm": 1.715508493394312,
"learning_rate": 4.833737577571052e-06,
"loss": 0.1497,
"step": 1284
},
{
"epoch": 0.5846223839854413,
"grad_norm": 2.0061239985491555,
"learning_rate": 4.833481216355153e-06,
"loss": 0.1646,
"step": 1285
},
{
"epoch": 0.5850773430391265,
"grad_norm": 4.355130184989222,
"learning_rate": 4.833224664457709e-06,
"loss": 0.2076,
"step": 1286
},
{
"epoch": 0.5855323020928116,
"grad_norm": 3.155573393148588,
"learning_rate": 4.83296792189968e-06,
"loss": 0.2413,
"step": 1287
},
{
"epoch": 0.5859872611464968,
"grad_norm": 1.656224319251134,
"learning_rate": 4.83271098870205e-06,
"loss": 0.1237,
"step": 1288
},
{
"epoch": 0.586442220200182,
"grad_norm": 1.5671815338330013,
"learning_rate": 4.832453864885811e-06,
"loss": 0.1461,
"step": 1289
},
{
"epoch": 0.5868971792538672,
"grad_norm": 1.4490558461440097,
"learning_rate": 4.832196550471976e-06,
"loss": 0.1719,
"step": 1290
},
{
"epoch": 0.5873521383075523,
"grad_norm": 1.4391467760040138,
"learning_rate": 4.831939045481571e-06,
"loss": 0.1598,
"step": 1291
},
{
"epoch": 0.5878070973612375,
"grad_norm": 1.853086020668375,
"learning_rate": 4.8316813499356375e-06,
"loss": 0.1654,
"step": 1292
},
{
"epoch": 0.5882620564149227,
"grad_norm": 1.6999807809193854,
"learning_rate": 4.831423463855235e-06,
"loss": 0.1516,
"step": 1293
},
{
"epoch": 0.5887170154686078,
"grad_norm": 2.070573438132845,
"learning_rate": 4.8311653872614345e-06,
"loss": 0.1161,
"step": 1294
},
{
"epoch": 0.589171974522293,
"grad_norm": 1.6686744603097172,
"learning_rate": 4.830907120175327e-06,
"loss": 0.1584,
"step": 1295
},
{
"epoch": 0.5896269335759782,
"grad_norm": 2.089342697132724,
"learning_rate": 4.830648662618015e-06,
"loss": 0.2365,
"step": 1296
},
{
"epoch": 0.5900818926296634,
"grad_norm": 1.5894012047277333,
"learning_rate": 4.83039001461062e-06,
"loss": 0.1097,
"step": 1297
},
{
"epoch": 0.5905368516833485,
"grad_norm": 1.8782696857030252,
"learning_rate": 4.830131176174276e-06,
"loss": 0.151,
"step": 1298
},
{
"epoch": 0.5909918107370337,
"grad_norm": 1.958971362169023,
"learning_rate": 4.829872147330136e-06,
"loss": 0.1841,
"step": 1299
},
{
"epoch": 0.5914467697907189,
"grad_norm": 1.3360501731813752,
"learning_rate": 4.829612928099366e-06,
"loss": 0.1457,
"step": 1300
},
{
"epoch": 0.591901728844404,
"grad_norm": 1.638219511935524,
"learning_rate": 4.829353518503147e-06,
"loss": 0.1583,
"step": 1301
},
{
"epoch": 0.5923566878980892,
"grad_norm": 2.0096056545692025,
"learning_rate": 4.829093918562678e-06,
"loss": 0.1491,
"step": 1302
},
{
"epoch": 0.5928116469517744,
"grad_norm": 1.7893380227892468,
"learning_rate": 4.828834128299173e-06,
"loss": 0.1592,
"step": 1303
},
{
"epoch": 0.5932666060054596,
"grad_norm": 1.5760903095424181,
"learning_rate": 4.828574147733859e-06,
"loss": 0.1646,
"step": 1304
},
{
"epoch": 0.5937215650591446,
"grad_norm": 1.6385972545017617,
"learning_rate": 4.828313976887982e-06,
"loss": 0.1228,
"step": 1305
},
{
"epoch": 0.5941765241128298,
"grad_norm": 1.7350084151113443,
"learning_rate": 4.8280536157828e-06,
"loss": 0.1532,
"step": 1306
},
{
"epoch": 0.594631483166515,
"grad_norm": 2.1711615974874223,
"learning_rate": 4.827793064439592e-06,
"loss": 0.1551,
"step": 1307
},
{
"epoch": 0.5950864422202001,
"grad_norm": 2.5688116012952125,
"learning_rate": 4.8275323228796455e-06,
"loss": 0.18,
"step": 1308
},
{
"epoch": 0.5955414012738853,
"grad_norm": 1.534845536955317,
"learning_rate": 4.8272713911242695e-06,
"loss": 0.121,
"step": 1309
},
{
"epoch": 0.5959963603275705,
"grad_norm": 1.9028349069881882,
"learning_rate": 4.827010269194785e-06,
"loss": 0.1228,
"step": 1310
},
{
"epoch": 0.5964513193812557,
"grad_norm": 2.1051164199599,
"learning_rate": 4.8267489571125295e-06,
"loss": 0.1465,
"step": 1311
},
{
"epoch": 0.5969062784349408,
"grad_norm": 2.9999435749849073,
"learning_rate": 4.826487454898857e-06,
"loss": 0.2635,
"step": 1312
},
{
"epoch": 0.597361237488626,
"grad_norm": 1.880715290875366,
"learning_rate": 4.826225762575136e-06,
"loss": 0.194,
"step": 1313
},
{
"epoch": 0.5978161965423112,
"grad_norm": 1.6843651365954362,
"learning_rate": 4.825963880162752e-06,
"loss": 0.1792,
"step": 1314
},
{
"epoch": 0.5982711555959963,
"grad_norm": 1.606704753365435,
"learning_rate": 4.825701807683102e-06,
"loss": 0.1399,
"step": 1315
},
{
"epoch": 0.5987261146496815,
"grad_norm": 1.8783582719750365,
"learning_rate": 4.825439545157603e-06,
"loss": 0.1743,
"step": 1316
},
{
"epoch": 0.5991810737033667,
"grad_norm": 2.39834669557369,
"learning_rate": 4.825177092607687e-06,
"loss": 0.2576,
"step": 1317
},
{
"epoch": 0.5996360327570519,
"grad_norm": 1.5809346444064956,
"learning_rate": 4.8249144500547995e-06,
"loss": 0.1266,
"step": 1318
},
{
"epoch": 0.600090991810737,
"grad_norm": 1.6731917139944308,
"learning_rate": 4.824651617520402e-06,
"loss": 0.1722,
"step": 1319
},
{
"epoch": 0.6005459508644222,
"grad_norm": 1.9934684665371283,
"learning_rate": 4.824388595025972e-06,
"loss": 0.1863,
"step": 1320
},
{
"epoch": 0.6010009099181074,
"grad_norm": 1.7396149145777957,
"learning_rate": 4.824125382593003e-06,
"loss": 0.1582,
"step": 1321
},
{
"epoch": 0.6014558689717925,
"grad_norm": 1.7746494679795604,
"learning_rate": 4.823861980243003e-06,
"loss": 0.1485,
"step": 1322
},
{
"epoch": 0.6019108280254777,
"grad_norm": 1.8309083669399964,
"learning_rate": 4.823598387997497e-06,
"loss": 0.1495,
"step": 1323
},
{
"epoch": 0.6023657870791629,
"grad_norm": 1.9534496331991582,
"learning_rate": 4.823334605878024e-06,
"loss": 0.1462,
"step": 1324
},
{
"epoch": 0.6028207461328481,
"grad_norm": 2.1011605763315138,
"learning_rate": 4.82307063390614e-06,
"loss": 0.1853,
"step": 1325
},
{
"epoch": 0.6032757051865332,
"grad_norm": 2.5503968401256465,
"learning_rate": 4.822806472103413e-06,
"loss": 0.2297,
"step": 1326
},
{
"epoch": 0.6037306642402184,
"grad_norm": 1.4853028085158964,
"learning_rate": 4.822542120491431e-06,
"loss": 0.1692,
"step": 1327
},
{
"epoch": 0.6041856232939036,
"grad_norm": 1.5826380640650177,
"learning_rate": 4.822277579091796e-06,
"loss": 0.1845,
"step": 1328
},
{
"epoch": 0.6046405823475887,
"grad_norm": 1.7941875470339128,
"learning_rate": 4.822012847926125e-06,
"loss": 0.1723,
"step": 1329
},
{
"epoch": 0.6050955414012739,
"grad_norm": 1.6317178871077942,
"learning_rate": 4.821747927016049e-06,
"loss": 0.1309,
"step": 1330
},
{
"epoch": 0.6055505004549591,
"grad_norm": 1.5814757694833934,
"learning_rate": 4.821482816383219e-06,
"loss": 0.1565,
"step": 1331
},
{
"epoch": 0.6060054595086443,
"grad_norm": 1.5304957435111453,
"learning_rate": 4.821217516049296e-06,
"loss": 0.1373,
"step": 1332
},
{
"epoch": 0.6064604185623294,
"grad_norm": 1.5147254102931988,
"learning_rate": 4.82095202603596e-06,
"loss": 0.1431,
"step": 1333
},
{
"epoch": 0.6069153776160146,
"grad_norm": 1.5663436015338144,
"learning_rate": 4.820686346364906e-06,
"loss": 0.156,
"step": 1334
},
{
"epoch": 0.6073703366696998,
"grad_norm": 2.053796214560493,
"learning_rate": 4.820420477057843e-06,
"loss": 0.1874,
"step": 1335
},
{
"epoch": 0.607825295723385,
"grad_norm": 1.95739593906374,
"learning_rate": 4.820154418136498e-06,
"loss": 0.1526,
"step": 1336
},
{
"epoch": 0.60828025477707,
"grad_norm": 1.8483495445052411,
"learning_rate": 4.819888169622612e-06,
"loss": 0.2036,
"step": 1337
},
{
"epoch": 0.6087352138307552,
"grad_norm": 1.9503495611822523,
"learning_rate": 4.819621731537942e-06,
"loss": 0.2066,
"step": 1338
},
{
"epoch": 0.6091901728844404,
"grad_norm": 1.5722530391175293,
"learning_rate": 4.819355103904259e-06,
"loss": 0.1419,
"step": 1339
},
{
"epoch": 0.6096451319381255,
"grad_norm": 1.8367765104613556,
"learning_rate": 4.81908828674335e-06,
"loss": 0.1775,
"step": 1340
},
{
"epoch": 0.6101000909918107,
"grad_norm": 1.6359632675531957,
"learning_rate": 4.81882128007702e-06,
"loss": 0.1361,
"step": 1341
},
{
"epoch": 0.6105550500454959,
"grad_norm": 1.6249604007945537,
"learning_rate": 4.818554083927086e-06,
"loss": 0.1501,
"step": 1342
},
{
"epoch": 0.6110100090991811,
"grad_norm": 1.507444149214357,
"learning_rate": 4.818286698315383e-06,
"loss": 0.1318,
"step": 1343
},
{
"epoch": 0.6114649681528662,
"grad_norm": 1.714948580415853,
"learning_rate": 4.818019123263761e-06,
"loss": 0.1576,
"step": 1344
},
{
"epoch": 0.6119199272065514,
"grad_norm": 1.6310779918465994,
"learning_rate": 4.817751358794084e-06,
"loss": 0.1505,
"step": 1345
},
{
"epoch": 0.6123748862602366,
"grad_norm": 1.9516095925204497,
"learning_rate": 4.8174834049282325e-06,
"loss": 0.1513,
"step": 1346
},
{
"epoch": 0.6128298453139217,
"grad_norm": 1.6535718997078614,
"learning_rate": 4.817215261688104e-06,
"loss": 0.1509,
"step": 1347
},
{
"epoch": 0.6132848043676069,
"grad_norm": 1.7050249250163263,
"learning_rate": 4.816946929095607e-06,
"loss": 0.143,
"step": 1348
},
{
"epoch": 0.6137397634212921,
"grad_norm": 1.9555072177299098,
"learning_rate": 4.816678407172671e-06,
"loss": 0.1702,
"step": 1349
},
{
"epoch": 0.6141947224749773,
"grad_norm": 1.6603270300616475,
"learning_rate": 4.816409695941238e-06,
"loss": 0.1525,
"step": 1350
},
{
"epoch": 0.6146496815286624,
"grad_norm": 2.052319098264881,
"learning_rate": 4.816140795423265e-06,
"loss": 0.1553,
"step": 1351
},
{
"epoch": 0.6151046405823476,
"grad_norm": 2.316846556963137,
"learning_rate": 4.8158717056407255e-06,
"loss": 0.2204,
"step": 1352
},
{
"epoch": 0.6155595996360328,
"grad_norm": 2.268897705180763,
"learning_rate": 4.815602426615609e-06,
"loss": 0.172,
"step": 1353
},
{
"epoch": 0.6160145586897179,
"grad_norm": 2.277033042904883,
"learning_rate": 4.815332958369919e-06,
"loss": 0.1952,
"step": 1354
},
{
"epoch": 0.6164695177434031,
"grad_norm": 2.203261308039804,
"learning_rate": 4.815063300925677e-06,
"loss": 0.1778,
"step": 1355
},
{
"epoch": 0.6169244767970883,
"grad_norm": 1.5542993423497844,
"learning_rate": 4.814793454304915e-06,
"loss": 0.1831,
"step": 1356
},
{
"epoch": 0.6173794358507735,
"grad_norm": 1.3687836885728237,
"learning_rate": 4.814523418529686e-06,
"loss": 0.1438,
"step": 1357
},
{
"epoch": 0.6178343949044586,
"grad_norm": 1.803336916930759,
"learning_rate": 4.814253193622056e-06,
"loss": 0.1426,
"step": 1358
},
{
"epoch": 0.6182893539581438,
"grad_norm": 1.521636702652137,
"learning_rate": 4.813982779604106e-06,
"loss": 0.1214,
"step": 1359
},
{
"epoch": 0.618744313011829,
"grad_norm": 1.5404670484043497,
"learning_rate": 4.813712176497933e-06,
"loss": 0.1366,
"step": 1360
},
{
"epoch": 0.6191992720655141,
"grad_norm": 1.689965450022471,
"learning_rate": 4.813441384325649e-06,
"loss": 0.1346,
"step": 1361
},
{
"epoch": 0.6196542311191993,
"grad_norm": 1.7814370136900919,
"learning_rate": 4.813170403109383e-06,
"loss": 0.1444,
"step": 1362
},
{
"epoch": 0.6201091901728845,
"grad_norm": 1.72215098605925,
"learning_rate": 4.8128992328712774e-06,
"loss": 0.1127,
"step": 1363
},
{
"epoch": 0.6205641492265697,
"grad_norm": 1.504745997390183,
"learning_rate": 4.812627873633492e-06,
"loss": 0.149,
"step": 1364
},
{
"epoch": 0.6210191082802548,
"grad_norm": 1.6905090686600799,
"learning_rate": 4.8123563254182e-06,
"loss": 0.1457,
"step": 1365
},
{
"epoch": 0.62147406733394,
"grad_norm": 2.1753494024731683,
"learning_rate": 4.8120845882475924e-06,
"loss": 0.1995,
"step": 1366
},
{
"epoch": 0.6219290263876252,
"grad_norm": 2.793385404562888,
"learning_rate": 4.8118126621438734e-06,
"loss": 0.2318,
"step": 1367
},
{
"epoch": 0.6223839854413102,
"grad_norm": 1.7667899225260022,
"learning_rate": 4.811540547129263e-06,
"loss": 0.2251,
"step": 1368
},
{
"epoch": 0.6228389444949954,
"grad_norm": 2.1679573967859787,
"learning_rate": 4.811268243225999e-06,
"loss": 0.1784,
"step": 1369
},
{
"epoch": 0.6232939035486806,
"grad_norm": 2.4497161330069424,
"learning_rate": 4.810995750456331e-06,
"loss": 0.1795,
"step": 1370
},
{
"epoch": 0.6237488626023658,
"grad_norm": 1.9632525184445888,
"learning_rate": 4.810723068842526e-06,
"loss": 0.1757,
"step": 1371
},
{
"epoch": 0.6242038216560509,
"grad_norm": 1.78757490589868,
"learning_rate": 4.810450198406867e-06,
"loss": 0.1994,
"step": 1372
},
{
"epoch": 0.6246587807097361,
"grad_norm": 2.5975000715086907,
"learning_rate": 4.810177139171653e-06,
"loss": 0.2177,
"step": 1373
},
{
"epoch": 0.6251137397634213,
"grad_norm": 1.8861807982376269,
"learning_rate": 4.809903891159195e-06,
"loss": 0.1318,
"step": 1374
},
{
"epoch": 0.6255686988171064,
"grad_norm": 2.474014583254649,
"learning_rate": 4.809630454391822e-06,
"loss": 0.165,
"step": 1375
},
{
"epoch": 0.6260236578707916,
"grad_norm": 1.8420180765220768,
"learning_rate": 4.80935682889188e-06,
"loss": 0.1997,
"step": 1376
},
{
"epoch": 0.6264786169244768,
"grad_norm": 1.8871529282732857,
"learning_rate": 4.809083014681726e-06,
"loss": 0.239,
"step": 1377
},
{
"epoch": 0.626933575978162,
"grad_norm": 1.776688876661572,
"learning_rate": 4.808809011783735e-06,
"loss": 0.1876,
"step": 1378
},
{
"epoch": 0.6273885350318471,
"grad_norm": 1.806661163792066,
"learning_rate": 4.808534820220299e-06,
"loss": 0.148,
"step": 1379
},
{
"epoch": 0.6278434940855323,
"grad_norm": 2.0820743721382007,
"learning_rate": 4.8082604400138226e-06,
"loss": 0.2015,
"step": 1380
},
{
"epoch": 0.6282984531392175,
"grad_norm": 1.5614420996583043,
"learning_rate": 4.807985871186726e-06,
"loss": 0.1277,
"step": 1381
},
{
"epoch": 0.6287534121929026,
"grad_norm": 1.5694923603817514,
"learning_rate": 4.8077111137614484e-06,
"loss": 0.1345,
"step": 1382
},
{
"epoch": 0.6292083712465878,
"grad_norm": 2.2610976098352116,
"learning_rate": 4.8074361677604394e-06,
"loss": 0.1732,
"step": 1383
},
{
"epoch": 0.629663330300273,
"grad_norm": 2.0760282221755704,
"learning_rate": 4.807161033206168e-06,
"loss": 0.1936,
"step": 1384
},
{
"epoch": 0.6301182893539582,
"grad_norm": 1.805894786082926,
"learning_rate": 4.806885710121114e-06,
"loss": 0.1536,
"step": 1385
},
{
"epoch": 0.6305732484076433,
"grad_norm": 1.9667669224198192,
"learning_rate": 4.806610198527779e-06,
"loss": 0.1729,
"step": 1386
},
{
"epoch": 0.6310282074613285,
"grad_norm": 1.9797354051419906,
"learning_rate": 4.8063344984486755e-06,
"loss": 0.2071,
"step": 1387
},
{
"epoch": 0.6314831665150137,
"grad_norm": 2.5955252782084224,
"learning_rate": 4.806058609906331e-06,
"loss": 0.181,
"step": 1388
},
{
"epoch": 0.6319381255686988,
"grad_norm": 2.707367730234045,
"learning_rate": 4.805782532923292e-06,
"loss": 0.2299,
"step": 1389
},
{
"epoch": 0.632393084622384,
"grad_norm": 1.579292631208614,
"learning_rate": 4.805506267522116e-06,
"loss": 0.2235,
"step": 1390
},
{
"epoch": 0.6328480436760692,
"grad_norm": 3.332380342069127,
"learning_rate": 4.80522981372538e-06,
"loss": 0.2485,
"step": 1391
},
{
"epoch": 0.6333030027297544,
"grad_norm": 1.597247684736274,
"learning_rate": 4.804953171555674e-06,
"loss": 0.1511,
"step": 1392
},
{
"epoch": 0.6337579617834395,
"grad_norm": 2.050037449702685,
"learning_rate": 4.8046763410356046e-06,
"loss": 0.1732,
"step": 1393
},
{
"epoch": 0.6342129208371247,
"grad_norm": 1.6703199484658815,
"learning_rate": 4.804399322187791e-06,
"loss": 0.1832,
"step": 1394
},
{
"epoch": 0.6346678798908099,
"grad_norm": 2.4171080690553155,
"learning_rate": 4.8041221150348725e-06,
"loss": 0.2519,
"step": 1395
},
{
"epoch": 0.635122838944495,
"grad_norm": 1.7415236452607812,
"learning_rate": 4.8038447195995e-06,
"loss": 0.1942,
"step": 1396
},
{
"epoch": 0.6355777979981801,
"grad_norm": 2.0585293521798,
"learning_rate": 4.80356713590434e-06,
"loss": 0.1806,
"step": 1397
},
{
"epoch": 0.6360327570518653,
"grad_norm": 1.6543360161164664,
"learning_rate": 4.803289363972078e-06,
"loss": 0.1953,
"step": 1398
},
{
"epoch": 0.6364877161055505,
"grad_norm": 1.952726003661859,
"learning_rate": 4.8030114038254094e-06,
"loss": 0.164,
"step": 1399
},
{
"epoch": 0.6369426751592356,
"grad_norm": 1.6177022530921434,
"learning_rate": 4.80273325548705e-06,
"loss": 0.1798,
"step": 1400
},
{
"epoch": 0.6373976342129208,
"grad_norm": 1.9292090840839082,
"learning_rate": 4.802454918979728e-06,
"loss": 0.1652,
"step": 1401
},
{
"epoch": 0.637852593266606,
"grad_norm": 1.9210595574243916,
"learning_rate": 4.802176394326187e-06,
"loss": 0.2007,
"step": 1402
},
{
"epoch": 0.6383075523202911,
"grad_norm": 1.464054312422107,
"learning_rate": 4.801897681549188e-06,
"loss": 0.129,
"step": 1403
},
{
"epoch": 0.6387625113739763,
"grad_norm": 1.9150864430756966,
"learning_rate": 4.801618780671506e-06,
"loss": 0.1634,
"step": 1404
},
{
"epoch": 0.6392174704276615,
"grad_norm": 1.4873483060535149,
"learning_rate": 4.801339691715932e-06,
"loss": 0.1463,
"step": 1405
},
{
"epoch": 0.6396724294813467,
"grad_norm": 2.3690804594133623,
"learning_rate": 4.8010604147052695e-06,
"loss": 0.1606,
"step": 1406
},
{
"epoch": 0.6401273885350318,
"grad_norm": 2.3100068394442497,
"learning_rate": 4.800780949662343e-06,
"loss": 0.1904,
"step": 1407
},
{
"epoch": 0.640582347588717,
"grad_norm": 1.5363867596702172,
"learning_rate": 4.800501296609986e-06,
"loss": 0.1053,
"step": 1408
},
{
"epoch": 0.6410373066424022,
"grad_norm": 1.606538550331431,
"learning_rate": 4.800221455571053e-06,
"loss": 0.1397,
"step": 1409
},
{
"epoch": 0.6414922656960873,
"grad_norm": 1.611596105149799,
"learning_rate": 4.7999414265684105e-06,
"loss": 0.1303,
"step": 1410
},
{
"epoch": 0.6419472247497725,
"grad_norm": 1.6262064168900117,
"learning_rate": 4.79966120962494e-06,
"loss": 0.1564,
"step": 1411
},
{
"epoch": 0.6424021838034577,
"grad_norm": 2.015359106142208,
"learning_rate": 4.799380804763542e-06,
"loss": 0.1619,
"step": 1412
},
{
"epoch": 0.6428571428571429,
"grad_norm": 2.0480276409863465,
"learning_rate": 4.799100212007128e-06,
"loss": 0.1711,
"step": 1413
},
{
"epoch": 0.643312101910828,
"grad_norm": 1.9220142745677993,
"learning_rate": 4.7988194313786275e-06,
"loss": 0.1496,
"step": 1414
},
{
"epoch": 0.6437670609645132,
"grad_norm": 1.5592119110073082,
"learning_rate": 4.798538462900984e-06,
"loss": 0.1563,
"step": 1415
},
{
"epoch": 0.6442220200181984,
"grad_norm": 2.7928579618942764,
"learning_rate": 4.798257306597157e-06,
"loss": 0.2031,
"step": 1416
},
{
"epoch": 0.6446769790718835,
"grad_norm": 1.579272373938799,
"learning_rate": 4.797975962490122e-06,
"loss": 0.1501,
"step": 1417
},
{
"epoch": 0.6451319381255687,
"grad_norm": 1.5556034741269746,
"learning_rate": 4.797694430602869e-06,
"loss": 0.1125,
"step": 1418
},
{
"epoch": 0.6455868971792539,
"grad_norm": 2.4067503053827273,
"learning_rate": 4.797412710958405e-06,
"loss": 0.2154,
"step": 1419
},
{
"epoch": 0.6460418562329391,
"grad_norm": 2.143935212981359,
"learning_rate": 4.797130803579747e-06,
"loss": 0.1694,
"step": 1420
},
{
"epoch": 0.6464968152866242,
"grad_norm": 2.6240019391696667,
"learning_rate": 4.796848708489935e-06,
"loss": 0.2811,
"step": 1421
},
{
"epoch": 0.6469517743403094,
"grad_norm": 1.5174877651602559,
"learning_rate": 4.796566425712018e-06,
"loss": 0.1435,
"step": 1422
},
{
"epoch": 0.6474067333939946,
"grad_norm": 1.6834754436981423,
"learning_rate": 4.796283955269065e-06,
"loss": 0.1816,
"step": 1423
},
{
"epoch": 0.6478616924476797,
"grad_norm": 1.5804322468618368,
"learning_rate": 4.796001297184156e-06,
"loss": 0.1471,
"step": 1424
},
{
"epoch": 0.6483166515013649,
"grad_norm": 1.8327883828431184,
"learning_rate": 4.79571845148039e-06,
"loss": 0.2011,
"step": 1425
},
{
"epoch": 0.6487716105550501,
"grad_norm": 1.4039853389905468,
"learning_rate": 4.795435418180879e-06,
"loss": 0.1074,
"step": 1426
},
{
"epoch": 0.6492265696087353,
"grad_norm": 1.664983557085843,
"learning_rate": 4.795152197308753e-06,
"loss": 0.148,
"step": 1427
},
{
"epoch": 0.6496815286624203,
"grad_norm": 1.6844695222093484,
"learning_rate": 4.794868788887154e-06,
"loss": 0.1207,
"step": 1428
},
{
"epoch": 0.6501364877161055,
"grad_norm": 1.3430612047901953,
"learning_rate": 4.79458519293924e-06,
"loss": 0.1437,
"step": 1429
},
{
"epoch": 0.6505914467697907,
"grad_norm": 1.6637985127807216,
"learning_rate": 4.794301409488187e-06,
"loss": 0.1478,
"step": 1430
},
{
"epoch": 0.6510464058234758,
"grad_norm": 1.385729637043462,
"learning_rate": 4.7940174385571835e-06,
"loss": 0.1627,
"step": 1431
},
{
"epoch": 0.651501364877161,
"grad_norm": 2.0471057598981632,
"learning_rate": 4.793733280169435e-06,
"loss": 0.2172,
"step": 1432
},
{
"epoch": 0.6519563239308462,
"grad_norm": 2.804939948704313,
"learning_rate": 4.7934489343481614e-06,
"loss": 0.2366,
"step": 1433
},
{
"epoch": 0.6524112829845314,
"grad_norm": 2.1472377723290568,
"learning_rate": 4.7931644011165975e-06,
"loss": 0.1418,
"step": 1434
},
{
"epoch": 0.6528662420382165,
"grad_norm": 1.9918480481257164,
"learning_rate": 4.792879680497995e-06,
"loss": 0.186,
"step": 1435
},
{
"epoch": 0.6533212010919017,
"grad_norm": 2.5064644756915655,
"learning_rate": 4.79259477251562e-06,
"loss": 0.2048,
"step": 1436
},
{
"epoch": 0.6537761601455869,
"grad_norm": 2.3512727211776263,
"learning_rate": 4.792309677192753e-06,
"loss": 0.2052,
"step": 1437
},
{
"epoch": 0.654231119199272,
"grad_norm": 1.9202855097301381,
"learning_rate": 4.79202439455269e-06,
"loss": 0.1458,
"step": 1438
},
{
"epoch": 0.6546860782529572,
"grad_norm": 1.4271813740118833,
"learning_rate": 4.791738924618745e-06,
"loss": 0.1211,
"step": 1439
},
{
"epoch": 0.6551410373066424,
"grad_norm": 2.032712581115854,
"learning_rate": 4.791453267414245e-06,
"loss": 0.1836,
"step": 1440
},
{
"epoch": 0.6555959963603276,
"grad_norm": 1.858326597247768,
"learning_rate": 4.7911674229625316e-06,
"loss": 0.1539,
"step": 1441
},
{
"epoch": 0.6560509554140127,
"grad_norm": 1.9149985878919944,
"learning_rate": 4.790881391286963e-06,
"loss": 0.1492,
"step": 1442
},
{
"epoch": 0.6565059144676979,
"grad_norm": 2.224611827457958,
"learning_rate": 4.790595172410914e-06,
"loss": 0.1771,
"step": 1443
},
{
"epoch": 0.6569608735213831,
"grad_norm": 2.2710831934815423,
"learning_rate": 4.79030876635777e-06,
"loss": 0.1816,
"step": 1444
},
{
"epoch": 0.6574158325750682,
"grad_norm": 1.686396567912197,
"learning_rate": 4.790022173150938e-06,
"loss": 0.1715,
"step": 1445
},
{
"epoch": 0.6578707916287534,
"grad_norm": 1.6844379519791872,
"learning_rate": 4.789735392813835e-06,
"loss": 0.1612,
"step": 1446
},
{
"epoch": 0.6583257506824386,
"grad_norm": 1.9308684762069341,
"learning_rate": 4.789448425369896e-06,
"loss": 0.1943,
"step": 1447
},
{
"epoch": 0.6587807097361238,
"grad_norm": 1.7813876642605184,
"learning_rate": 4.789161270842571e-06,
"loss": 0.133,
"step": 1448
},
{
"epoch": 0.6592356687898089,
"grad_norm": 1.7016656003147437,
"learning_rate": 4.7888739292553235e-06,
"loss": 0.1787,
"step": 1449
},
{
"epoch": 0.6596906278434941,
"grad_norm": 1.788996418731665,
"learning_rate": 4.788586400631636e-06,
"loss": 0.2144,
"step": 1450
},
{
"epoch": 0.6601455868971793,
"grad_norm": 1.1868611743252886,
"learning_rate": 4.788298684995003e-06,
"loss": 0.1411,
"step": 1451
},
{
"epoch": 0.6606005459508644,
"grad_norm": 1.3784782394299329,
"learning_rate": 4.7880107823689355e-06,
"loss": 0.1394,
"step": 1452
},
{
"epoch": 0.6610555050045496,
"grad_norm": 2.38570648853941,
"learning_rate": 4.787722692776958e-06,
"loss": 0.2177,
"step": 1453
},
{
"epoch": 0.6615104640582348,
"grad_norm": 1.885827372966156,
"learning_rate": 4.787434416242615e-06,
"loss": 0.1932,
"step": 1454
},
{
"epoch": 0.66196542311192,
"grad_norm": 2.0741165529803305,
"learning_rate": 4.787145952789461e-06,
"loss": 0.1916,
"step": 1455
},
{
"epoch": 0.6624203821656051,
"grad_norm": 2.2824023726624216,
"learning_rate": 4.786857302441069e-06,
"loss": 0.154,
"step": 1456
},
{
"epoch": 0.6628753412192903,
"grad_norm": 1.9364048955005693,
"learning_rate": 4.786568465221025e-06,
"loss": 0.1456,
"step": 1457
},
{
"epoch": 0.6633303002729755,
"grad_norm": 2.085706626351343,
"learning_rate": 4.7862794411529315e-06,
"loss": 0.2085,
"step": 1458
},
{
"epoch": 0.6637852593266605,
"grad_norm": 1.614288560024189,
"learning_rate": 4.7859902302604075e-06,
"loss": 0.174,
"step": 1459
},
{
"epoch": 0.6642402183803457,
"grad_norm": 2.5891987139037305,
"learning_rate": 4.785700832567085e-06,
"loss": 0.2207,
"step": 1460
},
{
"epoch": 0.664695177434031,
"grad_norm": 1.60390922794205,
"learning_rate": 4.785411248096613e-06,
"loss": 0.1694,
"step": 1461
},
{
"epoch": 0.6651501364877161,
"grad_norm": 1.9008758556011767,
"learning_rate": 4.785121476872654e-06,
"loss": 0.1917,
"step": 1462
},
{
"epoch": 0.6656050955414012,
"grad_norm": 1.8830534414569509,
"learning_rate": 4.784831518918888e-06,
"loss": 0.1738,
"step": 1463
},
{
"epoch": 0.6660600545950864,
"grad_norm": 1.7207750442706227,
"learning_rate": 4.784541374259008e-06,
"loss": 0.15,
"step": 1464
},
{
"epoch": 0.6665150136487716,
"grad_norm": 1.875368507153303,
"learning_rate": 4.7842510429167244e-06,
"loss": 0.1785,
"step": 1465
},
{
"epoch": 0.6669699727024567,
"grad_norm": 1.423039570984651,
"learning_rate": 4.783960524915761e-06,
"loss": 0.1618,
"step": 1466
},
{
"epoch": 0.6674249317561419,
"grad_norm": 3.369804205318982,
"learning_rate": 4.783669820279858e-06,
"loss": 0.2151,
"step": 1467
},
{
"epoch": 0.6678798908098271,
"grad_norm": 1.7236530224714224,
"learning_rate": 4.783378929032769e-06,
"loss": 0.1449,
"step": 1468
},
{
"epoch": 0.6683348498635123,
"grad_norm": 1.897670469007501,
"learning_rate": 4.783087851198267e-06,
"loss": 0.1565,
"step": 1469
},
{
"epoch": 0.6687898089171974,
"grad_norm": 2.120484944530229,
"learning_rate": 4.7827965868001356e-06,
"loss": 0.146,
"step": 1470
},
{
"epoch": 0.6692447679708826,
"grad_norm": 1.5164080428619426,
"learning_rate": 4.782505135862176e-06,
"loss": 0.1948,
"step": 1471
},
{
"epoch": 0.6696997270245678,
"grad_norm": 1.7069357913374903,
"learning_rate": 4.782213498408205e-06,
"loss": 0.1592,
"step": 1472
},
{
"epoch": 0.6701546860782529,
"grad_norm": 1.809748302750509,
"learning_rate": 4.781921674462053e-06,
"loss": 0.1314,
"step": 1473
},
{
"epoch": 0.6706096451319381,
"grad_norm": 2.1951569204558927,
"learning_rate": 4.781629664047566e-06,
"loss": 0.1845,
"step": 1474
},
{
"epoch": 0.6710646041856233,
"grad_norm": 1.3071594737849044,
"learning_rate": 4.781337467188607e-06,
"loss": 0.1436,
"step": 1475
},
{
"epoch": 0.6715195632393085,
"grad_norm": 1.945295439800649,
"learning_rate": 4.781045083909053e-06,
"loss": 0.1855,
"step": 1476
},
{
"epoch": 0.6719745222929936,
"grad_norm": 2.1383665971380053,
"learning_rate": 4.780752514232796e-06,
"loss": 0.1746,
"step": 1477
},
{
"epoch": 0.6724294813466788,
"grad_norm": 1.9493775213300697,
"learning_rate": 4.780459758183743e-06,
"loss": 0.136,
"step": 1478
},
{
"epoch": 0.672884440400364,
"grad_norm": 1.5588501717449852,
"learning_rate": 4.780166815785817e-06,
"loss": 0.1564,
"step": 1479
},
{
"epoch": 0.6733393994540491,
"grad_norm": 1.9111191141451183,
"learning_rate": 4.7798736870629554e-06,
"loss": 0.1722,
"step": 1480
},
{
"epoch": 0.6737943585077343,
"grad_norm": 1.7396374086258946,
"learning_rate": 4.779580372039113e-06,
"loss": 0.1569,
"step": 1481
},
{
"epoch": 0.6742493175614195,
"grad_norm": 2.2814229407003563,
"learning_rate": 4.779286870738256e-06,
"loss": 0.1576,
"step": 1482
},
{
"epoch": 0.6747042766151047,
"grad_norm": 2.543619017373989,
"learning_rate": 4.778993183184371e-06,
"loss": 0.1743,
"step": 1483
},
{
"epoch": 0.6751592356687898,
"grad_norm": 2.003249738108025,
"learning_rate": 4.778699309401453e-06,
"loss": 0.2083,
"step": 1484
},
{
"epoch": 0.675614194722475,
"grad_norm": 1.7140899951572492,
"learning_rate": 4.7784052494135195e-06,
"loss": 0.1649,
"step": 1485
},
{
"epoch": 0.6760691537761602,
"grad_norm": 1.6177440846188005,
"learning_rate": 4.778111003244596e-06,
"loss": 0.1706,
"step": 1486
},
{
"epoch": 0.6765241128298453,
"grad_norm": 1.3540158476274282,
"learning_rate": 4.777816570918731e-06,
"loss": 0.1474,
"step": 1487
},
{
"epoch": 0.6769790718835305,
"grad_norm": 1.8863006900369008,
"learning_rate": 4.777521952459982e-06,
"loss": 0.1995,
"step": 1488
},
{
"epoch": 0.6774340309372157,
"grad_norm": 2.2667108941921073,
"learning_rate": 4.777227147892424e-06,
"loss": 0.1855,
"step": 1489
},
{
"epoch": 0.6778889899909009,
"grad_norm": 1.9407891934102777,
"learning_rate": 4.776932157240147e-06,
"loss": 0.1503,
"step": 1490
},
{
"epoch": 0.678343949044586,
"grad_norm": 2.102459646475576,
"learning_rate": 4.776636980527257e-06,
"loss": 0.1388,
"step": 1491
},
{
"epoch": 0.6787989080982711,
"grad_norm": 2.08408986696494,
"learning_rate": 4.776341617777874e-06,
"loss": 0.1933,
"step": 1492
},
{
"epoch": 0.6792538671519563,
"grad_norm": 1.5090681867773854,
"learning_rate": 4.776046069016135e-06,
"loss": 0.1617,
"step": 1493
},
{
"epoch": 0.6797088262056415,
"grad_norm": 2.463007954699752,
"learning_rate": 4.775750334266188e-06,
"loss": 0.2267,
"step": 1494
},
{
"epoch": 0.6801637852593266,
"grad_norm": 1.0819737688059052,
"learning_rate": 4.775454413552202e-06,
"loss": 0.1047,
"step": 1495
},
{
"epoch": 0.6806187443130118,
"grad_norm": 2.180583587749644,
"learning_rate": 4.775158306898358e-06,
"loss": 0.1147,
"step": 1496
},
{
"epoch": 0.681073703366697,
"grad_norm": 1.4888818210097596,
"learning_rate": 4.774862014328849e-06,
"loss": 0.1531,
"step": 1497
},
{
"epoch": 0.6815286624203821,
"grad_norm": 1.4821796970713637,
"learning_rate": 4.774565535867892e-06,
"loss": 0.163,
"step": 1498
},
{
"epoch": 0.6819836214740673,
"grad_norm": 1.9349751384396998,
"learning_rate": 4.77426887153971e-06,
"loss": 0.1602,
"step": 1499
},
{
"epoch": 0.6824385805277525,
"grad_norm": 2.068635944499767,
"learning_rate": 4.773972021368546e-06,
"loss": 0.1934,
"step": 1500
},
{
"epoch": 0.6828935395814377,
"grad_norm": 1.9557854149934149,
"learning_rate": 4.773674985378658e-06,
"loss": 0.2143,
"step": 1501
},
{
"epoch": 0.6833484986351228,
"grad_norm": 2.6563423898144936,
"learning_rate": 4.773377763594319e-06,
"loss": 0.1837,
"step": 1502
},
{
"epoch": 0.683803457688808,
"grad_norm": 2.4819107124862856,
"learning_rate": 4.773080356039814e-06,
"loss": 0.1975,
"step": 1503
},
{
"epoch": 0.6842584167424932,
"grad_norm": 1.7036233463379575,
"learning_rate": 4.772782762739448e-06,
"loss": 0.1848,
"step": 1504
},
{
"epoch": 0.6847133757961783,
"grad_norm": 1.9141994818014876,
"learning_rate": 4.772484983717539e-06,
"loss": 0.2006,
"step": 1505
},
{
"epoch": 0.6851683348498635,
"grad_norm": 2.4521735191952114,
"learning_rate": 4.77218701899842e-06,
"loss": 0.2101,
"step": 1506
},
{
"epoch": 0.6856232939035487,
"grad_norm": 2.0961682322351174,
"learning_rate": 4.771888868606438e-06,
"loss": 0.2065,
"step": 1507
},
{
"epoch": 0.6860782529572339,
"grad_norm": 1.6218330474990592,
"learning_rate": 4.771590532565957e-06,
"loss": 0.1255,
"step": 1508
},
{
"epoch": 0.686533212010919,
"grad_norm": 1.9721609486698313,
"learning_rate": 4.771292010901357e-06,
"loss": 0.1303,
"step": 1509
},
{
"epoch": 0.6869881710646042,
"grad_norm": 2.121063258188487,
"learning_rate": 4.77099330363703e-06,
"loss": 0.149,
"step": 1510
},
{
"epoch": 0.6874431301182894,
"grad_norm": 1.4516172378682393,
"learning_rate": 4.770694410797387e-06,
"loss": 0.1318,
"step": 1511
},
{
"epoch": 0.6878980891719745,
"grad_norm": 1.6701384225121902,
"learning_rate": 4.770395332406851e-06,
"loss": 0.1459,
"step": 1512
},
{
"epoch": 0.6883530482256597,
"grad_norm": 1.6796065018549693,
"learning_rate": 4.770096068489861e-06,
"loss": 0.1599,
"step": 1513
},
{
"epoch": 0.6888080072793449,
"grad_norm": 1.235533430237688,
"learning_rate": 4.769796619070872e-06,
"loss": 0.1519,
"step": 1514
},
{
"epoch": 0.6892629663330301,
"grad_norm": 1.3347747968404207,
"learning_rate": 4.769496984174353e-06,
"loss": 0.1064,
"step": 1515
},
{
"epoch": 0.6897179253867152,
"grad_norm": 1.5781140890537728,
"learning_rate": 4.769197163824791e-06,
"loss": 0.1435,
"step": 1516
},
{
"epoch": 0.6901728844404004,
"grad_norm": 2.213137403753888,
"learning_rate": 4.768897158046683e-06,
"loss": 0.1866,
"step": 1517
},
{
"epoch": 0.6906278434940856,
"grad_norm": 1.5778012312077723,
"learning_rate": 4.768596966864546e-06,
"loss": 0.1604,
"step": 1518
},
{
"epoch": 0.6910828025477707,
"grad_norm": 1.652969574663111,
"learning_rate": 4.76829659030291e-06,
"loss": 0.1869,
"step": 1519
},
{
"epoch": 0.6915377616014559,
"grad_norm": 1.5361209471256771,
"learning_rate": 4.767996028386319e-06,
"loss": 0.1457,
"step": 1520
},
{
"epoch": 0.6919927206551411,
"grad_norm": 2.936222163725796,
"learning_rate": 4.767695281139336e-06,
"loss": 0.1881,
"step": 1521
},
{
"epoch": 0.6924476797088263,
"grad_norm": 2.3134771803324905,
"learning_rate": 4.767394348586535e-06,
"loss": 0.1599,
"step": 1522
},
{
"epoch": 0.6929026387625113,
"grad_norm": 2.4498437084815428,
"learning_rate": 4.767093230752507e-06,
"loss": 0.2138,
"step": 1523
},
{
"epoch": 0.6933575978161965,
"grad_norm": 1.5332362659492962,
"learning_rate": 4.766791927661859e-06,
"loss": 0.151,
"step": 1524
},
{
"epoch": 0.6938125568698817,
"grad_norm": 1.7915535564744174,
"learning_rate": 4.766490439339211e-06,
"loss": 0.1318,
"step": 1525
},
{
"epoch": 0.6942675159235668,
"grad_norm": 1.6447847233863087,
"learning_rate": 4.7661887658092e-06,
"loss": 0.162,
"step": 1526
},
{
"epoch": 0.694722474977252,
"grad_norm": 2.9781233092582866,
"learning_rate": 4.765886907096477e-06,
"loss": 0.2619,
"step": 1527
},
{
"epoch": 0.6951774340309372,
"grad_norm": 1.7140676149721272,
"learning_rate": 4.7655848632257084e-06,
"loss": 0.1425,
"step": 1528
},
{
"epoch": 0.6956323930846224,
"grad_norm": 2.4534906180849116,
"learning_rate": 4.7652826342215764e-06,
"loss": 0.236,
"step": 1529
},
{
"epoch": 0.6960873521383075,
"grad_norm": 1.6478858265647598,
"learning_rate": 4.764980220108777e-06,
"loss": 0.1955,
"step": 1530
},
{
"epoch": 0.6965423111919927,
"grad_norm": 2.306316562409567,
"learning_rate": 4.764677620912022e-06,
"loss": 0.2079,
"step": 1531
},
{
"epoch": 0.6969972702456779,
"grad_norm": 1.644994735808915,
"learning_rate": 4.764374836656041e-06,
"loss": 0.1442,
"step": 1532
},
{
"epoch": 0.697452229299363,
"grad_norm": 1.4036507182888944,
"learning_rate": 4.764071867365571e-06,
"loss": 0.1638,
"step": 1533
},
{
"epoch": 0.6979071883530482,
"grad_norm": 1.5164218367626467,
"learning_rate": 4.763768713065375e-06,
"loss": 0.156,
"step": 1534
},
{
"epoch": 0.6983621474067334,
"grad_norm": 1.7701773803690557,
"learning_rate": 4.763465373780223e-06,
"loss": 0.1145,
"step": 1535
},
{
"epoch": 0.6988171064604186,
"grad_norm": 2.076859289782232,
"learning_rate": 4.763161849534902e-06,
"loss": 0.1561,
"step": 1536
},
{
"epoch": 0.6992720655141037,
"grad_norm": 1.6167208008101008,
"learning_rate": 4.762858140354214e-06,
"loss": 0.1621,
"step": 1537
},
{
"epoch": 0.6997270245677889,
"grad_norm": 1.4746209465407152,
"learning_rate": 4.7625542462629785e-06,
"loss": 0.1768,
"step": 1538
},
{
"epoch": 0.7001819836214741,
"grad_norm": 1.4200002114989836,
"learning_rate": 4.762250167286027e-06,
"loss": 0.0995,
"step": 1539
},
{
"epoch": 0.7006369426751592,
"grad_norm": 2.080064440715621,
"learning_rate": 4.761945903448209e-06,
"loss": 0.2274,
"step": 1540
},
{
"epoch": 0.7010919017288444,
"grad_norm": 1.346792584477521,
"learning_rate": 4.761641454774386e-06,
"loss": 0.1219,
"step": 1541
},
{
"epoch": 0.7015468607825296,
"grad_norm": 2.36691492405669,
"learning_rate": 4.761336821289436e-06,
"loss": 0.2965,
"step": 1542
},
{
"epoch": 0.7020018198362148,
"grad_norm": 1.773901757295841,
"learning_rate": 4.761032003018254e-06,
"loss": 0.163,
"step": 1543
},
{
"epoch": 0.7024567788898999,
"grad_norm": 1.6774939072873407,
"learning_rate": 4.760726999985748e-06,
"loss": 0.1315,
"step": 1544
},
{
"epoch": 0.7029117379435851,
"grad_norm": 1.6552217973496692,
"learning_rate": 4.7604218122168406e-06,
"loss": 0.1298,
"step": 1545
},
{
"epoch": 0.7033666969972703,
"grad_norm": 1.91830208867601,
"learning_rate": 4.760116439736471e-06,
"loss": 0.2525,
"step": 1546
},
{
"epoch": 0.7038216560509554,
"grad_norm": 1.564874376143588,
"learning_rate": 4.759810882569591e-06,
"loss": 0.1863,
"step": 1547
},
{
"epoch": 0.7042766151046406,
"grad_norm": 1.4864041422513101,
"learning_rate": 4.759505140741172e-06,
"loss": 0.1063,
"step": 1548
},
{
"epoch": 0.7047315741583258,
"grad_norm": 2.549801333631036,
"learning_rate": 4.759199214276196e-06,
"loss": 0.2505,
"step": 1549
},
{
"epoch": 0.705186533212011,
"grad_norm": 1.5401594920414479,
"learning_rate": 4.758893103199665e-06,
"loss": 0.1624,
"step": 1550
},
{
"epoch": 0.7056414922656961,
"grad_norm": 1.6343764429957106,
"learning_rate": 4.758586807536588e-06,
"loss": 0.1545,
"step": 1551
},
{
"epoch": 0.7060964513193813,
"grad_norm": 1.6039711645022867,
"learning_rate": 4.758280327311998e-06,
"loss": 0.1134,
"step": 1552
},
{
"epoch": 0.7065514103730665,
"grad_norm": 2.2883990951010063,
"learning_rate": 4.757973662550938e-06,
"loss": 0.1899,
"step": 1553
},
{
"epoch": 0.7070063694267515,
"grad_norm": 1.7249554511478242,
"learning_rate": 4.757666813278466e-06,
"loss": 0.1725,
"step": 1554
},
{
"epoch": 0.7074613284804367,
"grad_norm": 2.041262841608907,
"learning_rate": 4.757359779519659e-06,
"loss": 0.2481,
"step": 1555
},
{
"epoch": 0.707916287534122,
"grad_norm": 1.7815243564082959,
"learning_rate": 4.757052561299604e-06,
"loss": 0.2166,
"step": 1556
},
{
"epoch": 0.7083712465878071,
"grad_norm": 1.5514238648411727,
"learning_rate": 4.756745158643407e-06,
"loss": 0.224,
"step": 1557
},
{
"epoch": 0.7088262056414922,
"grad_norm": 1.8608039671832461,
"learning_rate": 4.7564375715761865e-06,
"loss": 0.2223,
"step": 1558
},
{
"epoch": 0.7092811646951774,
"grad_norm": 1.6157629653628103,
"learning_rate": 4.756129800123078e-06,
"loss": 0.1293,
"step": 1559
},
{
"epoch": 0.7097361237488626,
"grad_norm": 1.4596213449886457,
"learning_rate": 4.755821844309232e-06,
"loss": 0.1805,
"step": 1560
},
{
"epoch": 0.7101910828025477,
"grad_norm": 1.7295068196827752,
"learning_rate": 4.75551370415981e-06,
"loss": 0.1599,
"step": 1561
},
{
"epoch": 0.7106460418562329,
"grad_norm": 2.0606393433385612,
"learning_rate": 4.755205379699996e-06,
"loss": 0.1941,
"step": 1562
},
{
"epoch": 0.7111010009099181,
"grad_norm": 2.0979325727754294,
"learning_rate": 4.75489687095498e-06,
"loss": 0.1913,
"step": 1563
},
{
"epoch": 0.7115559599636033,
"grad_norm": 2.2303398669678076,
"learning_rate": 4.754588177949977e-06,
"loss": 0.1478,
"step": 1564
},
{
"epoch": 0.7120109190172884,
"grad_norm": 2.093261606281437,
"learning_rate": 4.7542793007102086e-06,
"loss": 0.1815,
"step": 1565
},
{
"epoch": 0.7124658780709736,
"grad_norm": 1.4472751266274675,
"learning_rate": 4.7539702392609165e-06,
"loss": 0.1697,
"step": 1566
},
{
"epoch": 0.7129208371246588,
"grad_norm": 2.0281126718428077,
"learning_rate": 4.753660993627356e-06,
"loss": 0.0948,
"step": 1567
},
{
"epoch": 0.7133757961783439,
"grad_norm": 1.5189147438423232,
"learning_rate": 4.753351563834795e-06,
"loss": 0.1727,
"step": 1568
},
{
"epoch": 0.7138307552320291,
"grad_norm": 1.7409543127807352,
"learning_rate": 4.753041949908521e-06,
"loss": 0.1642,
"step": 1569
},
{
"epoch": 0.7142857142857143,
"grad_norm": 2.194503112395564,
"learning_rate": 4.752732151873834e-06,
"loss": 0.2196,
"step": 1570
},
{
"epoch": 0.7147406733393995,
"grad_norm": 1.697163266188786,
"learning_rate": 4.752422169756048e-06,
"loss": 0.1672,
"step": 1571
},
{
"epoch": 0.7151956323930846,
"grad_norm": 1.8134253244717562,
"learning_rate": 4.752112003580495e-06,
"loss": 0.1603,
"step": 1572
},
{
"epoch": 0.7156505914467698,
"grad_norm": 2.3783985389961915,
"learning_rate": 4.751801653372518e-06,
"loss": 0.1731,
"step": 1573
},
{
"epoch": 0.716105550500455,
"grad_norm": 2.5039159852054795,
"learning_rate": 4.751491119157481e-06,
"loss": 0.1865,
"step": 1574
},
{
"epoch": 0.7165605095541401,
"grad_norm": 1.619599621691377,
"learning_rate": 4.751180400960756e-06,
"loss": 0.1746,
"step": 1575
},
{
"epoch": 0.7170154686078253,
"grad_norm": 1.65152231646464,
"learning_rate": 4.7508694988077355e-06,
"loss": 0.1515,
"step": 1576
},
{
"epoch": 0.7174704276615105,
"grad_norm": 2.465040491157821,
"learning_rate": 4.750558412723824e-06,
"loss": 0.1966,
"step": 1577
},
{
"epoch": 0.7179253867151957,
"grad_norm": 2.2789812780893364,
"learning_rate": 4.750247142734442e-06,
"loss": 0.1599,
"step": 1578
},
{
"epoch": 0.7183803457688808,
"grad_norm": 1.7581577660091943,
"learning_rate": 4.749935688865026e-06,
"loss": 0.141,
"step": 1579
},
{
"epoch": 0.718835304822566,
"grad_norm": 2.1794165158833914,
"learning_rate": 4.749624051141026e-06,
"loss": 0.1088,
"step": 1580
},
{
"epoch": 0.7192902638762512,
"grad_norm": 1.443223743964179,
"learning_rate": 4.7493122295879076e-06,
"loss": 0.1189,
"step": 1581
},
{
"epoch": 0.7197452229299363,
"grad_norm": 2.35745890496679,
"learning_rate": 4.7490002242311525e-06,
"loss": 0.2129,
"step": 1582
},
{
"epoch": 0.7202001819836215,
"grad_norm": 1.5523835122804504,
"learning_rate": 4.748688035096255e-06,
"loss": 0.2081,
"step": 1583
},
{
"epoch": 0.7206551410373067,
"grad_norm": 2.4968010568360692,
"learning_rate": 4.748375662208726e-06,
"loss": 0.1759,
"step": 1584
},
{
"epoch": 0.7211101000909919,
"grad_norm": 1.9165363158958804,
"learning_rate": 4.748063105594092e-06,
"loss": 0.2267,
"step": 1585
},
{
"epoch": 0.721565059144677,
"grad_norm": 1.7864622532435137,
"learning_rate": 4.747750365277892e-06,
"loss": 0.1648,
"step": 1586
},
{
"epoch": 0.7220200181983621,
"grad_norm": 1.8532777769110087,
"learning_rate": 4.747437441285684e-06,
"loss": 0.1501,
"step": 1587
},
{
"epoch": 0.7224749772520473,
"grad_norm": 1.7539173333380942,
"learning_rate": 4.747124333643038e-06,
"loss": 0.1883,
"step": 1588
},
{
"epoch": 0.7229299363057324,
"grad_norm": 1.7153189766040051,
"learning_rate": 4.746811042375538e-06,
"loss": 0.1308,
"step": 1589
},
{
"epoch": 0.7233848953594176,
"grad_norm": 1.5162583630812903,
"learning_rate": 4.746497567508787e-06,
"loss": 0.1571,
"step": 1590
},
{
"epoch": 0.7238398544131028,
"grad_norm": 1.5546810521185177,
"learning_rate": 4.7461839090684e-06,
"loss": 0.1694,
"step": 1591
},
{
"epoch": 0.724294813466788,
"grad_norm": 2.0021940033485404,
"learning_rate": 4.745870067080007e-06,
"loss": 0.171,
"step": 1592
},
{
"epoch": 0.7247497725204731,
"grad_norm": 2.221217513727709,
"learning_rate": 4.7455560415692545e-06,
"loss": 0.231,
"step": 1593
},
{
"epoch": 0.7252047315741583,
"grad_norm": 2.222153805045267,
"learning_rate": 4.745241832561803e-06,
"loss": 0.1446,
"step": 1594
},
{
"epoch": 0.7256596906278435,
"grad_norm": 1.784667663061202,
"learning_rate": 4.744927440083329e-06,
"loss": 0.1646,
"step": 1595
},
{
"epoch": 0.7261146496815286,
"grad_norm": 1.7626687045318659,
"learning_rate": 4.744612864159522e-06,
"loss": 0.1685,
"step": 1596
},
{
"epoch": 0.7265696087352138,
"grad_norm": 1.9909235520315078,
"learning_rate": 4.7442981048160895e-06,
"loss": 0.1854,
"step": 1597
},
{
"epoch": 0.727024567788899,
"grad_norm": 2.4131359111724464,
"learning_rate": 4.74398316207875e-06,
"loss": 0.1784,
"step": 1598
},
{
"epoch": 0.7274795268425842,
"grad_norm": 2.3390737079991215,
"learning_rate": 4.74366803597324e-06,
"loss": 0.28,
"step": 1599
},
{
"epoch": 0.7279344858962693,
"grad_norm": 1.5176778250654925,
"learning_rate": 4.743352726525311e-06,
"loss": 0.1119,
"step": 1600
},
{
"epoch": 0.7283894449499545,
"grad_norm": 1.612075524542219,
"learning_rate": 4.743037233760728e-06,
"loss": 0.1548,
"step": 1601
},
{
"epoch": 0.7288444040036397,
"grad_norm": 2.082336981370237,
"learning_rate": 4.742721557705271e-06,
"loss": 0.1907,
"step": 1602
},
{
"epoch": 0.7292993630573248,
"grad_norm": 1.8874163681919673,
"learning_rate": 4.7424056983847374e-06,
"loss": 0.1872,
"step": 1603
},
{
"epoch": 0.72975432211101,
"grad_norm": 1.9161874420851024,
"learning_rate": 4.7420896558249366e-06,
"loss": 0.1199,
"step": 1604
},
{
"epoch": 0.7302092811646952,
"grad_norm": 1.9339794473206677,
"learning_rate": 4.741773430051694e-06,
"loss": 0.1467,
"step": 1605
},
{
"epoch": 0.7306642402183804,
"grad_norm": 1.5901851811892251,
"learning_rate": 4.74145702109085e-06,
"loss": 0.1094,
"step": 1606
},
{
"epoch": 0.7311191992720655,
"grad_norm": 2.678117310973907,
"learning_rate": 4.741140428968261e-06,
"loss": 0.2545,
"step": 1607
},
{
"epoch": 0.7315741583257507,
"grad_norm": 1.4456239768846677,
"learning_rate": 4.740823653709797e-06,
"loss": 0.101,
"step": 1608
},
{
"epoch": 0.7320291173794359,
"grad_norm": 1.5614448809750465,
"learning_rate": 4.740506695341343e-06,
"loss": 0.135,
"step": 1609
},
{
"epoch": 0.732484076433121,
"grad_norm": 1.9409375225046157,
"learning_rate": 4.740189553888801e-06,
"loss": 0.2674,
"step": 1610
},
{
"epoch": 0.7329390354868062,
"grad_norm": 1.757285590607046,
"learning_rate": 4.739872229378085e-06,
"loss": 0.1358,
"step": 1611
},
{
"epoch": 0.7333939945404914,
"grad_norm": 1.7119351957596494,
"learning_rate": 4.739554721835125e-06,
"loss": 0.1405,
"step": 1612
},
{
"epoch": 0.7338489535941766,
"grad_norm": 1.5407585285384973,
"learning_rate": 4.739237031285867e-06,
"loss": 0.1789,
"step": 1613
},
{
"epoch": 0.7343039126478617,
"grad_norm": 1.8412394540639878,
"learning_rate": 4.738919157756272e-06,
"loss": 0.1741,
"step": 1614
},
{
"epoch": 0.7347588717015469,
"grad_norm": 1.9093990086684758,
"learning_rate": 4.738601101272313e-06,
"loss": 0.1972,
"step": 1615
},
{
"epoch": 0.7352138307552321,
"grad_norm": 1.6531050134000445,
"learning_rate": 4.738282861859983e-06,
"loss": 0.1828,
"step": 1616
},
{
"epoch": 0.7356687898089171,
"grad_norm": 1.6958094821678005,
"learning_rate": 4.737964439545284e-06,
"loss": 0.1623,
"step": 1617
},
{
"epoch": 0.7361237488626023,
"grad_norm": 1.9487516983862898,
"learning_rate": 4.737645834354238e-06,
"loss": 0.1761,
"step": 1618
},
{
"epoch": 0.7365787079162875,
"grad_norm": 1.5339742875273046,
"learning_rate": 4.737327046312879e-06,
"loss": 0.1188,
"step": 1619
},
{
"epoch": 0.7370336669699727,
"grad_norm": 1.8259875586922627,
"learning_rate": 4.737008075447259e-06,
"loss": 0.13,
"step": 1620
},
{
"epoch": 0.7374886260236578,
"grad_norm": 2.112705655098723,
"learning_rate": 4.73668892178344e-06,
"loss": 0.162,
"step": 1621
},
{
"epoch": 0.737943585077343,
"grad_norm": 2.1191881288248755,
"learning_rate": 4.736369585347503e-06,
"loss": 0.1882,
"step": 1622
},
{
"epoch": 0.7383985441310282,
"grad_norm": 2.42511490554677,
"learning_rate": 4.736050066165544e-06,
"loss": 0.168,
"step": 1623
},
{
"epoch": 0.7388535031847133,
"grad_norm": 2.5180747249974678,
"learning_rate": 4.735730364263671e-06,
"loss": 0.2462,
"step": 1624
},
{
"epoch": 0.7393084622383985,
"grad_norm": 1.899152814897376,
"learning_rate": 4.735410479668009e-06,
"loss": 0.1649,
"step": 1625
},
{
"epoch": 0.7397634212920837,
"grad_norm": 2.5891320586414506,
"learning_rate": 4.735090412404697e-06,
"loss": 0.2112,
"step": 1626
},
{
"epoch": 0.7402183803457689,
"grad_norm": 1.6256945799338343,
"learning_rate": 4.734770162499891e-06,
"loss": 0.0995,
"step": 1627
},
{
"epoch": 0.740673339399454,
"grad_norm": 2.115890838067561,
"learning_rate": 4.734449729979759e-06,
"loss": 0.1863,
"step": 1628
},
{
"epoch": 0.7411282984531392,
"grad_norm": 1.8207130234699649,
"learning_rate": 4.734129114870486e-06,
"loss": 0.1621,
"step": 1629
},
{
"epoch": 0.7415832575068244,
"grad_norm": 2.419448299752305,
"learning_rate": 4.733808317198271e-06,
"loss": 0.1682,
"step": 1630
},
{
"epoch": 0.7420382165605095,
"grad_norm": 1.864719563201482,
"learning_rate": 4.733487336989327e-06,
"loss": 0.1534,
"step": 1631
},
{
"epoch": 0.7424931756141947,
"grad_norm": 2.480364363656269,
"learning_rate": 4.733166174269886e-06,
"loss": 0.186,
"step": 1632
},
{
"epoch": 0.7429481346678799,
"grad_norm": 2.0606766178805116,
"learning_rate": 4.732844829066189e-06,
"loss": 0.2189,
"step": 1633
},
{
"epoch": 0.7434030937215651,
"grad_norm": 2.162055464706376,
"learning_rate": 4.732523301404497e-06,
"loss": 0.1969,
"step": 1634
},
{
"epoch": 0.7438580527752502,
"grad_norm": 2.12376584678073,
"learning_rate": 4.732201591311082e-06,
"loss": 0.2101,
"step": 1635
},
{
"epoch": 0.7443130118289354,
"grad_norm": 1.5079389097876976,
"learning_rate": 4.731879698812233e-06,
"loss": 0.1802,
"step": 1636
},
{
"epoch": 0.7447679708826206,
"grad_norm": 1.744034863658637,
"learning_rate": 4.731557623934255e-06,
"loss": 0.1398,
"step": 1637
},
{
"epoch": 0.7452229299363057,
"grad_norm": 2.7848754471064043,
"learning_rate": 4.7312353667034645e-06,
"loss": 0.2499,
"step": 1638
},
{
"epoch": 0.7456778889899909,
"grad_norm": 2.58334353852049,
"learning_rate": 4.730912927146197e-06,
"loss": 0.2203,
"step": 1639
},
{
"epoch": 0.7461328480436761,
"grad_norm": 2.0325933883862066,
"learning_rate": 4.7305903052888e-06,
"loss": 0.1563,
"step": 1640
},
{
"epoch": 0.7465878070973613,
"grad_norm": 2.3443549071357057,
"learning_rate": 4.730267501157636e-06,
"loss": 0.1896,
"step": 1641
},
{
"epoch": 0.7470427661510464,
"grad_norm": 2.003548520587404,
"learning_rate": 4.729944514779084e-06,
"loss": 0.1705,
"step": 1642
},
{
"epoch": 0.7474977252047316,
"grad_norm": 1.3567793569480755,
"learning_rate": 4.729621346179536e-06,
"loss": 0.1429,
"step": 1643
},
{
"epoch": 0.7479526842584168,
"grad_norm": 1.9172209433761784,
"learning_rate": 4.7292979953854e-06,
"loss": 0.1224,
"step": 1644
},
{
"epoch": 0.7484076433121019,
"grad_norm": 1.7854487682262081,
"learning_rate": 4.7289744624231004e-06,
"loss": 0.1753,
"step": 1645
},
{
"epoch": 0.7488626023657871,
"grad_norm": 2.0357381373480377,
"learning_rate": 4.728650747319073e-06,
"loss": 0.1844,
"step": 1646
},
{
"epoch": 0.7493175614194723,
"grad_norm": 2.295347780668863,
"learning_rate": 4.728326850099771e-06,
"loss": 0.1949,
"step": 1647
},
{
"epoch": 0.7497725204731575,
"grad_norm": 2.2592022682113564,
"learning_rate": 4.728002770791663e-06,
"loss": 0.1641,
"step": 1648
},
{
"epoch": 0.7502274795268425,
"grad_norm": 1.8794487431290805,
"learning_rate": 4.727678509421229e-06,
"loss": 0.1672,
"step": 1649
},
{
"epoch": 0.7506824385805277,
"grad_norm": 1.471409298797821,
"learning_rate": 4.727354066014968e-06,
"loss": 0.1251,
"step": 1650
},
{
"epoch": 0.7511373976342129,
"grad_norm": 1.2272497564159228,
"learning_rate": 4.727029440599391e-06,
"loss": 0.1165,
"step": 1651
},
{
"epoch": 0.7515923566878981,
"grad_norm": 1.7826119947445478,
"learning_rate": 4.726704633201025e-06,
"loss": 0.1367,
"step": 1652
},
{
"epoch": 0.7520473157415832,
"grad_norm": 1.5654538387161951,
"learning_rate": 4.726379643846412e-06,
"loss": 0.1622,
"step": 1653
},
{
"epoch": 0.7525022747952684,
"grad_norm": 2.0792625449816255,
"learning_rate": 4.726054472562109e-06,
"loss": 0.1741,
"step": 1654
},
{
"epoch": 0.7529572338489536,
"grad_norm": 1.5223527837461277,
"learning_rate": 4.725729119374687e-06,
"loss": 0.1198,
"step": 1655
},
{
"epoch": 0.7534121929026387,
"grad_norm": 1.5290108835892176,
"learning_rate": 4.725403584310734e-06,
"loss": 0.1026,
"step": 1656
},
{
"epoch": 0.7538671519563239,
"grad_norm": 2.155319535005024,
"learning_rate": 4.725077867396849e-06,
"loss": 0.1652,
"step": 1657
},
{
"epoch": 0.7543221110100091,
"grad_norm": 1.565904420652083,
"learning_rate": 4.724751968659648e-06,
"loss": 0.1628,
"step": 1658
},
{
"epoch": 0.7547770700636943,
"grad_norm": 2.9773420234850345,
"learning_rate": 4.724425888125764e-06,
"loss": 0.2409,
"step": 1659
},
{
"epoch": 0.7552320291173794,
"grad_norm": 2.3428961739867304,
"learning_rate": 4.724099625821842e-06,
"loss": 0.2216,
"step": 1660
},
{
"epoch": 0.7556869881710646,
"grad_norm": 1.7855741504776685,
"learning_rate": 4.723773181774543e-06,
"loss": 0.1468,
"step": 1661
},
{
"epoch": 0.7561419472247498,
"grad_norm": 1.96972618488323,
"learning_rate": 4.723446556010542e-06,
"loss": 0.1981,
"step": 1662
},
{
"epoch": 0.7565969062784349,
"grad_norm": 1.6758348642722924,
"learning_rate": 4.7231197485565275e-06,
"loss": 0.169,
"step": 1663
},
{
"epoch": 0.7570518653321201,
"grad_norm": 1.3954523503838552,
"learning_rate": 4.722792759439209e-06,
"loss": 0.1224,
"step": 1664
},
{
"epoch": 0.7575068243858053,
"grad_norm": 2.060909913997174,
"learning_rate": 4.722465588685302e-06,
"loss": 0.2087,
"step": 1665
},
{
"epoch": 0.7579617834394905,
"grad_norm": 1.5474467660765128,
"learning_rate": 4.722138236321545e-06,
"loss": 0.1013,
"step": 1666
},
{
"epoch": 0.7584167424931756,
"grad_norm": 2.430153104930812,
"learning_rate": 4.721810702374687e-06,
"loss": 0.1439,
"step": 1667
},
{
"epoch": 0.7588717015468608,
"grad_norm": 1.7773306327385723,
"learning_rate": 4.721482986871491e-06,
"loss": 0.1485,
"step": 1668
},
{
"epoch": 0.759326660600546,
"grad_norm": 2.927464615752266,
"learning_rate": 4.721155089838738e-06,
"loss": 0.1962,
"step": 1669
},
{
"epoch": 0.7597816196542311,
"grad_norm": 1.9730589581225906,
"learning_rate": 4.720827011303222e-06,
"loss": 0.1503,
"step": 1670
},
{
"epoch": 0.7602365787079163,
"grad_norm": 1.953497394359563,
"learning_rate": 4.720498751291751e-06,
"loss": 0.182,
"step": 1671
},
{
"epoch": 0.7606915377616015,
"grad_norm": 1.7839379977035983,
"learning_rate": 4.72017030983115e-06,
"loss": 0.2198,
"step": 1672
},
{
"epoch": 0.7611464968152867,
"grad_norm": 1.7993088459777005,
"learning_rate": 4.7198416869482575e-06,
"loss": 0.1696,
"step": 1673
},
{
"epoch": 0.7616014558689718,
"grad_norm": 1.892794250792964,
"learning_rate": 4.719512882669926e-06,
"loss": 0.1776,
"step": 1674
},
{
"epoch": 0.762056414922657,
"grad_norm": 2.0202484520052035,
"learning_rate": 4.719183897023027e-06,
"loss": 0.1673,
"step": 1675
},
{
"epoch": 0.7625113739763422,
"grad_norm": 2.2601047076044414,
"learning_rate": 4.718854730034441e-06,
"loss": 0.2183,
"step": 1676
},
{
"epoch": 0.7629663330300273,
"grad_norm": 1.8760309869672118,
"learning_rate": 4.718525381731066e-06,
"loss": 0.1476,
"step": 1677
},
{
"epoch": 0.7634212920837125,
"grad_norm": 1.5663417379599454,
"learning_rate": 4.718195852139816e-06,
"loss": 0.2014,
"step": 1678
},
{
"epoch": 0.7638762511373977,
"grad_norm": 2.338496392531513,
"learning_rate": 4.717866141287618e-06,
"loss": 0.2422,
"step": 1679
},
{
"epoch": 0.7643312101910829,
"grad_norm": 1.9053967868206603,
"learning_rate": 4.717536249201416e-06,
"loss": 0.1953,
"step": 1680
},
{
"epoch": 0.7647861692447679,
"grad_norm": 1.831121224420973,
"learning_rate": 4.7172061759081646e-06,
"loss": 0.1626,
"step": 1681
},
{
"epoch": 0.7652411282984531,
"grad_norm": 2.234380631915828,
"learning_rate": 4.716875921434838e-06,
"loss": 0.1754,
"step": 1682
},
{
"epoch": 0.7656960873521383,
"grad_norm": 1.9990356821604962,
"learning_rate": 4.716545485808421e-06,
"loss": 0.1613,
"step": 1683
},
{
"epoch": 0.7661510464058234,
"grad_norm": 1.956500719133962,
"learning_rate": 4.716214869055918e-06,
"loss": 0.1747,
"step": 1684
},
{
"epoch": 0.7666060054595086,
"grad_norm": 1.7944596997359672,
"learning_rate": 4.715884071204344e-06,
"loss": 0.116,
"step": 1685
},
{
"epoch": 0.7670609645131938,
"grad_norm": 1.93926106516618,
"learning_rate": 4.715553092280731e-06,
"loss": 0.2121,
"step": 1686
},
{
"epoch": 0.767515923566879,
"grad_norm": 2.4656357214922626,
"learning_rate": 4.7152219323121246e-06,
"loss": 0.1772,
"step": 1687
},
{
"epoch": 0.7679708826205641,
"grad_norm": 2.2402320393494253,
"learning_rate": 4.714890591325586e-06,
"loss": 0.2021,
"step": 1688
},
{
"epoch": 0.7684258416742493,
"grad_norm": 1.7903156076682725,
"learning_rate": 4.714559069348189e-06,
"loss": 0.1825,
"step": 1689
},
{
"epoch": 0.7688808007279345,
"grad_norm": 1.6420985192646667,
"learning_rate": 4.714227366407027e-06,
"loss": 0.1475,
"step": 1690
},
{
"epoch": 0.7693357597816196,
"grad_norm": 2.2750484746487936,
"learning_rate": 4.7138954825292035e-06,
"loss": 0.1492,
"step": 1691
},
{
"epoch": 0.7697907188353048,
"grad_norm": 1.992613507205851,
"learning_rate": 4.71356341774184e-06,
"loss": 0.2004,
"step": 1692
},
{
"epoch": 0.77024567788899,
"grad_norm": 1.8507536466532999,
"learning_rate": 4.713231172072069e-06,
"loss": 0.1665,
"step": 1693
},
{
"epoch": 0.7707006369426752,
"grad_norm": 2.074124213121433,
"learning_rate": 4.712898745547043e-06,
"loss": 0.1901,
"step": 1694
},
{
"epoch": 0.7711555959963603,
"grad_norm": 2.2217772464991628,
"learning_rate": 4.712566138193923e-06,
"loss": 0.2007,
"step": 1695
},
{
"epoch": 0.7716105550500455,
"grad_norm": 2.1110958043430936,
"learning_rate": 4.712233350039892e-06,
"loss": 0.1711,
"step": 1696
},
{
"epoch": 0.7720655141037307,
"grad_norm": 1.7733407712061509,
"learning_rate": 4.711900381112141e-06,
"loss": 0.1401,
"step": 1697
},
{
"epoch": 0.7725204731574158,
"grad_norm": 1.9082417250906683,
"learning_rate": 4.71156723143788e-06,
"loss": 0.1707,
"step": 1698
},
{
"epoch": 0.772975432211101,
"grad_norm": 1.8677365381806925,
"learning_rate": 4.711233901044332e-06,
"loss": 0.1868,
"step": 1699
},
{
"epoch": 0.7734303912647862,
"grad_norm": 2.0411961738002464,
"learning_rate": 4.710900389958735e-06,
"loss": 0.1744,
"step": 1700
},
{
"epoch": 0.7738853503184714,
"grad_norm": 2.1935749697701,
"learning_rate": 4.710566698208343e-06,
"loss": 0.2385,
"step": 1701
},
{
"epoch": 0.7743403093721565,
"grad_norm": 1.7404480081781704,
"learning_rate": 4.710232825820424e-06,
"loss": 0.1499,
"step": 1702
},
{
"epoch": 0.7747952684258417,
"grad_norm": 1.477154965489664,
"learning_rate": 4.709898772822258e-06,
"loss": 0.1207,
"step": 1703
},
{
"epoch": 0.7752502274795269,
"grad_norm": 1.7903520569742504,
"learning_rate": 4.709564539241145e-06,
"loss": 0.1257,
"step": 1704
},
{
"epoch": 0.775705186533212,
"grad_norm": 1.509438293191361,
"learning_rate": 4.709230125104396e-06,
"loss": 0.1333,
"step": 1705
},
{
"epoch": 0.7761601455868972,
"grad_norm": 1.352600254451033,
"learning_rate": 4.708895530439339e-06,
"loss": 0.1297,
"step": 1706
},
{
"epoch": 0.7766151046405824,
"grad_norm": 2.80931496450313,
"learning_rate": 4.708560755273313e-06,
"loss": 0.1572,
"step": 1707
},
{
"epoch": 0.7770700636942676,
"grad_norm": 2.614552054035137,
"learning_rate": 4.7082257996336765e-06,
"loss": 0.2392,
"step": 1708
},
{
"epoch": 0.7775250227479527,
"grad_norm": 1.3897711262928594,
"learning_rate": 4.707890663547801e-06,
"loss": 0.1898,
"step": 1709
},
{
"epoch": 0.7779799818016379,
"grad_norm": 1.3068004754745945,
"learning_rate": 4.7075553470430695e-06,
"loss": 0.1541,
"step": 1710
},
{
"epoch": 0.778434940855323,
"grad_norm": 2.0471283874239337,
"learning_rate": 4.707219850146885e-06,
"loss": 0.189,
"step": 1711
},
{
"epoch": 0.7788898999090081,
"grad_norm": 1.406237335222361,
"learning_rate": 4.706884172886662e-06,
"loss": 0.1534,
"step": 1712
},
{
"epoch": 0.7793448589626933,
"grad_norm": 1.430209112364991,
"learning_rate": 4.706548315289831e-06,
"loss": 0.1505,
"step": 1713
},
{
"epoch": 0.7797998180163785,
"grad_norm": 1.9880980157191188,
"learning_rate": 4.706212277383836e-06,
"loss": 0.1455,
"step": 1714
},
{
"epoch": 0.7802547770700637,
"grad_norm": 1.9444624934450598,
"learning_rate": 4.705876059196136e-06,
"loss": 0.1919,
"step": 1715
},
{
"epoch": 0.7807097361237488,
"grad_norm": 1.845006648683808,
"learning_rate": 4.705539660754208e-06,
"loss": 0.1379,
"step": 1716
},
{
"epoch": 0.781164695177434,
"grad_norm": 1.7044046674717437,
"learning_rate": 4.705203082085538e-06,
"loss": 0.1323,
"step": 1717
},
{
"epoch": 0.7816196542311192,
"grad_norm": 1.7912067195327883,
"learning_rate": 4.70486632321763e-06,
"loss": 0.2117,
"step": 1718
},
{
"epoch": 0.7820746132848043,
"grad_norm": 1.9320743936658202,
"learning_rate": 4.7045293841780034e-06,
"loss": 0.1375,
"step": 1719
},
{
"epoch": 0.7825295723384895,
"grad_norm": 1.7315009532080885,
"learning_rate": 4.704192264994193e-06,
"loss": 0.1162,
"step": 1720
},
{
"epoch": 0.7829845313921747,
"grad_norm": 1.6176947094849203,
"learning_rate": 4.703854965693743e-06,
"loss": 0.1318,
"step": 1721
},
{
"epoch": 0.7834394904458599,
"grad_norm": 2.40560948473341,
"learning_rate": 4.703517486304218e-06,
"loss": 0.1747,
"step": 1722
},
{
"epoch": 0.783894449499545,
"grad_norm": 1.6675266396651778,
"learning_rate": 4.703179826853195e-06,
"loss": 0.1853,
"step": 1723
},
{
"epoch": 0.7843494085532302,
"grad_norm": 1.8036543539560768,
"learning_rate": 4.702841987368265e-06,
"loss": 0.1358,
"step": 1724
},
{
"epoch": 0.7848043676069154,
"grad_norm": 2.164797051503019,
"learning_rate": 4.702503967877038e-06,
"loss": 0.1531,
"step": 1725
},
{
"epoch": 0.7852593266606005,
"grad_norm": 1.6083401375044635,
"learning_rate": 4.702165768407132e-06,
"loss": 0.1984,
"step": 1726
},
{
"epoch": 0.7857142857142857,
"grad_norm": 1.9227015105668148,
"learning_rate": 4.701827388986185e-06,
"loss": 0.1962,
"step": 1727
},
{
"epoch": 0.7861692447679709,
"grad_norm": 2.234973410496376,
"learning_rate": 4.701488829641845e-06,
"loss": 0.1313,
"step": 1728
},
{
"epoch": 0.7866242038216561,
"grad_norm": 1.4707235359776172,
"learning_rate": 4.701150090401782e-06,
"loss": 0.1384,
"step": 1729
},
{
"epoch": 0.7870791628753412,
"grad_norm": 1.4795549767962248,
"learning_rate": 4.700811171293673e-06,
"loss": 0.1192,
"step": 1730
},
{
"epoch": 0.7875341219290264,
"grad_norm": 1.4765672888773027,
"learning_rate": 4.700472072345214e-06,
"loss": 0.1445,
"step": 1731
},
{
"epoch": 0.7879890809827116,
"grad_norm": 1.7959851809677527,
"learning_rate": 4.700132793584113e-06,
"loss": 0.176,
"step": 1732
},
{
"epoch": 0.7884440400363967,
"grad_norm": 2.0011742977871365,
"learning_rate": 4.699793335038098e-06,
"loss": 0.2073,
"step": 1733
},
{
"epoch": 0.7888989990900819,
"grad_norm": 1.5877933891450462,
"learning_rate": 4.699453696734905e-06,
"loss": 0.1163,
"step": 1734
},
{
"epoch": 0.7893539581437671,
"grad_norm": 1.994398441190682,
"learning_rate": 4.699113878702288e-06,
"loss": 0.1997,
"step": 1735
},
{
"epoch": 0.7898089171974523,
"grad_norm": 1.6186509072614172,
"learning_rate": 4.698773880968017e-06,
"loss": 0.1359,
"step": 1736
},
{
"epoch": 0.7902638762511374,
"grad_norm": 1.3756660961296079,
"learning_rate": 4.698433703559874e-06,
"loss": 0.1717,
"step": 1737
},
{
"epoch": 0.7907188353048226,
"grad_norm": 1.4461545675657563,
"learning_rate": 4.698093346505656e-06,
"loss": 0.1381,
"step": 1738
},
{
"epoch": 0.7911737943585078,
"grad_norm": 1.975346854852977,
"learning_rate": 4.697752809833177e-06,
"loss": 0.1651,
"step": 1739
},
{
"epoch": 0.7916287534121929,
"grad_norm": 2.098203427770575,
"learning_rate": 4.697412093570263e-06,
"loss": 0.1966,
"step": 1740
},
{
"epoch": 0.792083712465878,
"grad_norm": 1.7884148647415081,
"learning_rate": 4.697071197744756e-06,
"loss": 0.1603,
"step": 1741
},
{
"epoch": 0.7925386715195633,
"grad_norm": 2.20000836754146,
"learning_rate": 4.6967301223845115e-06,
"loss": 0.168,
"step": 1742
},
{
"epoch": 0.7929936305732485,
"grad_norm": 1.469643335454165,
"learning_rate": 4.696388867517403e-06,
"loss": 0.1574,
"step": 1743
},
{
"epoch": 0.7934485896269335,
"grad_norm": 1.7067059652811334,
"learning_rate": 4.696047433171316e-06,
"loss": 0.098,
"step": 1744
},
{
"epoch": 0.7939035486806187,
"grad_norm": 2.0780505106943896,
"learning_rate": 4.695705819374149e-06,
"loss": 0.178,
"step": 1745
},
{
"epoch": 0.7943585077343039,
"grad_norm": 1.8450097546428101,
"learning_rate": 4.695364026153818e-06,
"loss": 0.1637,
"step": 1746
},
{
"epoch": 0.794813466787989,
"grad_norm": 1.4237762817404553,
"learning_rate": 4.695022053538253e-06,
"loss": 0.1416,
"step": 1747
},
{
"epoch": 0.7952684258416742,
"grad_norm": 2.485744457764155,
"learning_rate": 4.694679901555398e-06,
"loss": 0.2207,
"step": 1748
},
{
"epoch": 0.7957233848953594,
"grad_norm": 2.5149587392089128,
"learning_rate": 4.694337570233213e-06,
"loss": 0.1485,
"step": 1749
},
{
"epoch": 0.7961783439490446,
"grad_norm": 2.0342522616249736,
"learning_rate": 4.693995059599672e-06,
"loss": 0.2071,
"step": 1750
},
{
"epoch": 0.7966333030027297,
"grad_norm": 1.7181022322257762,
"learning_rate": 4.693652369682762e-06,
"loss": 0.2112,
"step": 1751
},
{
"epoch": 0.7970882620564149,
"grad_norm": 1.843190625559269,
"learning_rate": 4.693309500510487e-06,
"loss": 0.1632,
"step": 1752
},
{
"epoch": 0.7975432211101001,
"grad_norm": 2.7841529899485917,
"learning_rate": 4.692966452110864e-06,
"loss": 0.1534,
"step": 1753
},
{
"epoch": 0.7979981801637852,
"grad_norm": 1.5395427013532956,
"learning_rate": 4.6926232245119265e-06,
"loss": 0.2195,
"step": 1754
},
{
"epoch": 0.7984531392174704,
"grad_norm": 2.5074996998585335,
"learning_rate": 4.69227981774172e-06,
"loss": 0.1856,
"step": 1755
},
{
"epoch": 0.7989080982711556,
"grad_norm": 2.449264992514986,
"learning_rate": 4.691936231828308e-06,
"loss": 0.1779,
"step": 1756
},
{
"epoch": 0.7993630573248408,
"grad_norm": 2.481345422810722,
"learning_rate": 4.691592466799766e-06,
"loss": 0.1889,
"step": 1757
},
{
"epoch": 0.7998180163785259,
"grad_norm": 1.637751747233988,
"learning_rate": 4.691248522684184e-06,
"loss": 0.1349,
"step": 1758
},
{
"epoch": 0.8002729754322111,
"grad_norm": 1.6804430027452057,
"learning_rate": 4.690904399509668e-06,
"loss": 0.1435,
"step": 1759
},
{
"epoch": 0.8007279344858963,
"grad_norm": 2.742847873433655,
"learning_rate": 4.69056009730434e-06,
"loss": 0.2232,
"step": 1760
},
{
"epoch": 0.8011828935395814,
"grad_norm": 2.40569741832729,
"learning_rate": 4.690215616096332e-06,
"loss": 0.1711,
"step": 1761
},
{
"epoch": 0.8016378525932666,
"grad_norm": 2.4832090753479834,
"learning_rate": 4.689870955913796e-06,
"loss": 0.1587,
"step": 1762
},
{
"epoch": 0.8020928116469518,
"grad_norm": 2.0194488171697063,
"learning_rate": 4.689526116784894e-06,
"loss": 0.167,
"step": 1763
},
{
"epoch": 0.802547770700637,
"grad_norm": 3.338733219322262,
"learning_rate": 4.689181098737805e-06,
"loss": 0.2404,
"step": 1764
},
{
"epoch": 0.8030027297543221,
"grad_norm": 2.150659967515375,
"learning_rate": 4.6888359018007235e-06,
"loss": 0.1288,
"step": 1765
},
{
"epoch": 0.8034576888080073,
"grad_norm": 1.9131033030180753,
"learning_rate": 4.6884905260018565e-06,
"loss": 0.1638,
"step": 1766
},
{
"epoch": 0.8039126478616925,
"grad_norm": 1.7799343855450172,
"learning_rate": 4.688144971369427e-06,
"loss": 0.2032,
"step": 1767
},
{
"epoch": 0.8043676069153776,
"grad_norm": 1.9191485121544656,
"learning_rate": 4.687799237931673e-06,
"loss": 0.1597,
"step": 1768
},
{
"epoch": 0.8048225659690628,
"grad_norm": 1.5130848101685814,
"learning_rate": 4.687453325716844e-06,
"loss": 0.1572,
"step": 1769
},
{
"epoch": 0.805277525022748,
"grad_norm": 2.380748372992281,
"learning_rate": 4.687107234753208e-06,
"loss": 0.1617,
"step": 1770
},
{
"epoch": 0.8057324840764332,
"grad_norm": 2.7874285940928067,
"learning_rate": 4.686760965069046e-06,
"loss": 0.1679,
"step": 1771
},
{
"epoch": 0.8061874431301183,
"grad_norm": 1.9146816786227654,
"learning_rate": 4.686414516692653e-06,
"loss": 0.2267,
"step": 1772
},
{
"epoch": 0.8066424021838035,
"grad_norm": 1.6656788150165645,
"learning_rate": 4.68606788965234e-06,
"loss": 0.1608,
"step": 1773
},
{
"epoch": 0.8070973612374887,
"grad_norm": 2.859758352959496,
"learning_rate": 4.68572108397643e-06,
"loss": 0.2065,
"step": 1774
},
{
"epoch": 0.8075523202911737,
"grad_norm": 1.7922493594029372,
"learning_rate": 4.6853740996932645e-06,
"loss": 0.1331,
"step": 1775
},
{
"epoch": 0.8080072793448589,
"grad_norm": 1.9382561831132192,
"learning_rate": 4.685026936831196e-06,
"loss": 0.1693,
"step": 1776
},
{
"epoch": 0.8084622383985441,
"grad_norm": 2.2029297725133237,
"learning_rate": 4.684679595418595e-06,
"loss": 0.1988,
"step": 1777
},
{
"epoch": 0.8089171974522293,
"grad_norm": 1.6643742621321755,
"learning_rate": 4.684332075483843e-06,
"loss": 0.1776,
"step": 1778
},
{
"epoch": 0.8093721565059144,
"grad_norm": 1.928150435175855,
"learning_rate": 4.6839843770553374e-06,
"loss": 0.2135,
"step": 1779
},
{
"epoch": 0.8098271155595996,
"grad_norm": 1.5299034058186116,
"learning_rate": 4.683636500161491e-06,
"loss": 0.1287,
"step": 1780
},
{
"epoch": 0.8102820746132848,
"grad_norm": 1.7105211102821978,
"learning_rate": 4.683288444830732e-06,
"loss": 0.1858,
"step": 1781
},
{
"epoch": 0.8107370336669699,
"grad_norm": 2.065121875110959,
"learning_rate": 4.6829402110915015e-06,
"loss": 0.1573,
"step": 1782
},
{
"epoch": 0.8111919927206551,
"grad_norm": 1.7915836692891514,
"learning_rate": 4.682591798972253e-06,
"loss": 0.163,
"step": 1783
},
{
"epoch": 0.8116469517743403,
"grad_norm": 1.9011358499015634,
"learning_rate": 4.682243208501461e-06,
"loss": 0.1565,
"step": 1784
},
{
"epoch": 0.8121019108280255,
"grad_norm": 1.8705464277674988,
"learning_rate": 4.681894439707609e-06,
"loss": 0.1532,
"step": 1785
},
{
"epoch": 0.8125568698817106,
"grad_norm": 1.5282025887885624,
"learning_rate": 4.681545492619195e-06,
"loss": 0.1212,
"step": 1786
},
{
"epoch": 0.8130118289353958,
"grad_norm": 2.4618870744714823,
"learning_rate": 4.681196367264736e-06,
"loss": 0.1737,
"step": 1787
},
{
"epoch": 0.813466787989081,
"grad_norm": 1.5010216528583702,
"learning_rate": 4.680847063672761e-06,
"loss": 0.1349,
"step": 1788
},
{
"epoch": 0.8139217470427661,
"grad_norm": 1.577176673126615,
"learning_rate": 4.680497581871811e-06,
"loss": 0.1736,
"step": 1789
},
{
"epoch": 0.8143767060964513,
"grad_norm": 2.2216456467027603,
"learning_rate": 4.680147921890447e-06,
"loss": 0.1589,
"step": 1790
},
{
"epoch": 0.8148316651501365,
"grad_norm": 2.2828861135151377,
"learning_rate": 4.67979808375724e-06,
"loss": 0.1864,
"step": 1791
},
{
"epoch": 0.8152866242038217,
"grad_norm": 2.411410847612128,
"learning_rate": 4.679448067500777e-06,
"loss": 0.1704,
"step": 1792
},
{
"epoch": 0.8157415832575068,
"grad_norm": 2.6745924756823274,
"learning_rate": 4.67909787314966e-06,
"loss": 0.1855,
"step": 1793
},
{
"epoch": 0.816196542311192,
"grad_norm": 1.7549666443082432,
"learning_rate": 4.678747500732505e-06,
"loss": 0.2204,
"step": 1794
},
{
"epoch": 0.8166515013648772,
"grad_norm": 2.4603767836599086,
"learning_rate": 4.6783969502779455e-06,
"loss": 0.1805,
"step": 1795
},
{
"epoch": 0.8171064604185623,
"grad_norm": 1.5762472297440564,
"learning_rate": 4.6780462218146236e-06,
"loss": 0.1393,
"step": 1796
},
{
"epoch": 0.8175614194722475,
"grad_norm": 1.6619849736476204,
"learning_rate": 4.6776953153712005e-06,
"loss": 0.2041,
"step": 1797
},
{
"epoch": 0.8180163785259327,
"grad_norm": 1.7094043878723117,
"learning_rate": 4.67734423097635e-06,
"loss": 0.1603,
"step": 1798
},
{
"epoch": 0.8184713375796179,
"grad_norm": 1.2928545358221282,
"learning_rate": 4.676992968658762e-06,
"loss": 0.1517,
"step": 1799
},
{
"epoch": 0.818926296633303,
"grad_norm": 1.4763652797153222,
"learning_rate": 4.67664152844714e-06,
"loss": 0.0939,
"step": 1800
},
{
"epoch": 0.8193812556869882,
"grad_norm": 2.260551569771672,
"learning_rate": 4.676289910370202e-06,
"loss": 0.1902,
"step": 1801
},
{
"epoch": 0.8198362147406734,
"grad_norm": 2.047407982208326,
"learning_rate": 4.675938114456682e-06,
"loss": 0.1767,
"step": 1802
},
{
"epoch": 0.8202911737943585,
"grad_norm": 1.5430069759954768,
"learning_rate": 4.675586140735323e-06,
"loss": 0.1955,
"step": 1803
},
{
"epoch": 0.8207461328480437,
"grad_norm": 2.2295561077574404,
"learning_rate": 4.675233989234891e-06,
"loss": 0.211,
"step": 1804
},
{
"epoch": 0.8212010919017289,
"grad_norm": 1.639085591327469,
"learning_rate": 4.67488165998416e-06,
"loss": 0.1163,
"step": 1805
},
{
"epoch": 0.821656050955414,
"grad_norm": 1.8522836776109448,
"learning_rate": 4.674529153011922e-06,
"loss": 0.1879,
"step": 1806
},
{
"epoch": 0.8221110100090991,
"grad_norm": 2.1812381655305653,
"learning_rate": 4.674176468346982e-06,
"loss": 0.1773,
"step": 1807
},
{
"epoch": 0.8225659690627843,
"grad_norm": 1.9367383257783326,
"learning_rate": 4.673823606018158e-06,
"loss": 0.2019,
"step": 1808
},
{
"epoch": 0.8230209281164695,
"grad_norm": 1.8576560873873327,
"learning_rate": 4.673470566054288e-06,
"loss": 0.1492,
"step": 1809
},
{
"epoch": 0.8234758871701547,
"grad_norm": 1.9497069876088635,
"learning_rate": 4.673117348484217e-06,
"loss": 0.1745,
"step": 1810
},
{
"epoch": 0.8239308462238398,
"grad_norm": 1.4193615554141685,
"learning_rate": 4.672763953336811e-06,
"loss": 0.1463,
"step": 1811
},
{
"epoch": 0.824385805277525,
"grad_norm": 2.8057971610463928,
"learning_rate": 4.672410380640946e-06,
"loss": 0.2285,
"step": 1812
},
{
"epoch": 0.8248407643312102,
"grad_norm": 1.8069198589432123,
"learning_rate": 4.672056630425516e-06,
"loss": 0.1228,
"step": 1813
},
{
"epoch": 0.8252957233848953,
"grad_norm": 1.3408435512517318,
"learning_rate": 4.671702702719426e-06,
"loss": 0.1436,
"step": 1814
},
{
"epoch": 0.8257506824385805,
"grad_norm": 2.0862527734688197,
"learning_rate": 4.671348597551599e-06,
"loss": 0.2169,
"step": 1815
},
{
"epoch": 0.8262056414922657,
"grad_norm": 2.132580252859084,
"learning_rate": 4.670994314950971e-06,
"loss": 0.2017,
"step": 1816
},
{
"epoch": 0.8266606005459509,
"grad_norm": 2.5991268132353853,
"learning_rate": 4.6706398549464905e-06,
"loss": 0.2089,
"step": 1817
},
{
"epoch": 0.827115559599636,
"grad_norm": 2.3181044896129275,
"learning_rate": 4.670285217567124e-06,
"loss": 0.1531,
"step": 1818
},
{
"epoch": 0.8275705186533212,
"grad_norm": 1.7235570690460182,
"learning_rate": 4.6699304028418516e-06,
"loss": 0.1933,
"step": 1819
},
{
"epoch": 0.8280254777070064,
"grad_norm": 3.3016739021057884,
"learning_rate": 4.669575410799665e-06,
"loss": 0.2017,
"step": 1820
},
{
"epoch": 0.8284804367606915,
"grad_norm": 1.3897879186817867,
"learning_rate": 4.669220241469573e-06,
"loss": 0.1393,
"step": 1821
},
{
"epoch": 0.8289353958143767,
"grad_norm": 1.7530097372349214,
"learning_rate": 4.668864894880599e-06,
"loss": 0.2163,
"step": 1822
},
{
"epoch": 0.8293903548680619,
"grad_norm": 2.7080878088048337,
"learning_rate": 4.668509371061781e-06,
"loss": 0.2166,
"step": 1823
},
{
"epoch": 0.8298453139217471,
"grad_norm": 1.9706360861102925,
"learning_rate": 4.668153670042171e-06,
"loss": 0.2253,
"step": 1824
},
{
"epoch": 0.8303002729754322,
"grad_norm": 1.830442854507149,
"learning_rate": 4.667797791850833e-06,
"loss": 0.1526,
"step": 1825
},
{
"epoch": 0.8307552320291174,
"grad_norm": 1.7672909680061333,
"learning_rate": 4.6674417365168495e-06,
"loss": 0.156,
"step": 1826
},
{
"epoch": 0.8312101910828026,
"grad_norm": 1.627604242773907,
"learning_rate": 4.667085504069315e-06,
"loss": 0.1965,
"step": 1827
},
{
"epoch": 0.8316651501364877,
"grad_norm": 1.6049507259721845,
"learning_rate": 4.66672909453734e-06,
"loss": 0.1678,
"step": 1828
},
{
"epoch": 0.8321201091901729,
"grad_norm": 2.31296929571475,
"learning_rate": 4.6663725079500485e-06,
"loss": 0.2126,
"step": 1829
},
{
"epoch": 0.8325750682438581,
"grad_norm": 1.7717243650944572,
"learning_rate": 4.666015744336578e-06,
"loss": 0.1333,
"step": 1830
},
{
"epoch": 0.8330300272975433,
"grad_norm": 1.9376666152516604,
"learning_rate": 4.665658803726083e-06,
"loss": 0.161,
"step": 1831
},
{
"epoch": 0.8334849863512284,
"grad_norm": 1.9363986365280477,
"learning_rate": 4.6653016861477315e-06,
"loss": 0.1736,
"step": 1832
},
{
"epoch": 0.8339399454049136,
"grad_norm": 1.0684481587552732,
"learning_rate": 4.664944391630704e-06,
"loss": 0.1187,
"step": 1833
},
{
"epoch": 0.8343949044585988,
"grad_norm": 1.9806679260858633,
"learning_rate": 4.664586920204197e-06,
"loss": 0.1945,
"step": 1834
},
{
"epoch": 0.8348498635122839,
"grad_norm": 2.002852794100086,
"learning_rate": 4.664229271897422e-06,
"loss": 0.1449,
"step": 1835
},
{
"epoch": 0.835304822565969,
"grad_norm": 1.416210291100934,
"learning_rate": 4.663871446739606e-06,
"loss": 0.2015,
"step": 1836
},
{
"epoch": 0.8357597816196543,
"grad_norm": 1.8546810159993223,
"learning_rate": 4.663513444759986e-06,
"loss": 0.1461,
"step": 1837
},
{
"epoch": 0.8362147406733395,
"grad_norm": 2.054627126988846,
"learning_rate": 4.663155265987818e-06,
"loss": 0.1779,
"step": 1838
},
{
"epoch": 0.8366696997270245,
"grad_norm": 1.8928121217305771,
"learning_rate": 4.66279691045237e-06,
"loss": 0.1843,
"step": 1839
},
{
"epoch": 0.8371246587807097,
"grad_norm": 2.3586323101492552,
"learning_rate": 4.662438378182927e-06,
"loss": 0.2396,
"step": 1840
},
{
"epoch": 0.8375796178343949,
"grad_norm": 1.8299500333063181,
"learning_rate": 4.662079669208783e-06,
"loss": 0.1645,
"step": 1841
},
{
"epoch": 0.83803457688808,
"grad_norm": 2.3480837865967215,
"learning_rate": 4.661720783559254e-06,
"loss": 0.1788,
"step": 1842
},
{
"epoch": 0.8384895359417652,
"grad_norm": 1.883623814508302,
"learning_rate": 4.661361721263664e-06,
"loss": 0.1624,
"step": 1843
},
{
"epoch": 0.8389444949954504,
"grad_norm": 2.160772908461247,
"learning_rate": 4.661002482351355e-06,
"loss": 0.1908,
"step": 1844
},
{
"epoch": 0.8393994540491356,
"grad_norm": 2.187162279477086,
"learning_rate": 4.660643066851682e-06,
"loss": 0.1808,
"step": 1845
},
{
"epoch": 0.8398544131028207,
"grad_norm": 1.8531325552871911,
"learning_rate": 4.6602834747940155e-06,
"loss": 0.1914,
"step": 1846
},
{
"epoch": 0.8403093721565059,
"grad_norm": 1.4831331336407363,
"learning_rate": 4.6599237062077385e-06,
"loss": 0.142,
"step": 1847
},
{
"epoch": 0.8407643312101911,
"grad_norm": 1.931468706645427,
"learning_rate": 4.65956376112225e-06,
"loss": 0.2224,
"step": 1848
},
{
"epoch": 0.8412192902638762,
"grad_norm": 1.3694171323558038,
"learning_rate": 4.659203639566965e-06,
"loss": 0.1375,
"step": 1849
},
{
"epoch": 0.8416742493175614,
"grad_norm": 1.936812425945626,
"learning_rate": 4.658843341571308e-06,
"loss": 0.1342,
"step": 1850
},
{
"epoch": 0.8421292083712466,
"grad_norm": 1.6211061965620477,
"learning_rate": 4.6584828671647235e-06,
"loss": 0.1241,
"step": 1851
},
{
"epoch": 0.8425841674249318,
"grad_norm": 1.1366286902231244,
"learning_rate": 4.658122216376666e-06,
"loss": 0.1378,
"step": 1852
},
{
"epoch": 0.8430391264786169,
"grad_norm": 1.6359146658906643,
"learning_rate": 4.657761389236607e-06,
"loss": 0.2118,
"step": 1853
},
{
"epoch": 0.8434940855323021,
"grad_norm": 2.5329878550243734,
"learning_rate": 4.657400385774032e-06,
"loss": 0.2193,
"step": 1854
},
{
"epoch": 0.8439490445859873,
"grad_norm": 2.5278755724681425,
"learning_rate": 4.65703920601844e-06,
"loss": 0.2768,
"step": 1855
},
{
"epoch": 0.8444040036396724,
"grad_norm": 1.590463345818293,
"learning_rate": 4.656677849999345e-06,
"loss": 0.139,
"step": 1856
},
{
"epoch": 0.8448589626933576,
"grad_norm": 2.5309928033982154,
"learning_rate": 4.656316317746275e-06,
"loss": 0.1896,
"step": 1857
},
{
"epoch": 0.8453139217470428,
"grad_norm": 1.9131067732573241,
"learning_rate": 4.655954609288775e-06,
"loss": 0.1584,
"step": 1858
},
{
"epoch": 0.845768880800728,
"grad_norm": 1.676858006295649,
"learning_rate": 4.655592724656399e-06,
"loss": 0.1413,
"step": 1859
},
{
"epoch": 0.8462238398544131,
"grad_norm": 1.2591774278723207,
"learning_rate": 4.655230663878721e-06,
"loss": 0.106,
"step": 1860
},
{
"epoch": 0.8466787989080983,
"grad_norm": 1.7932854876030564,
"learning_rate": 4.654868426985326e-06,
"loss": 0.1417,
"step": 1861
},
{
"epoch": 0.8471337579617835,
"grad_norm": 1.6149020601443298,
"learning_rate": 4.654506014005814e-06,
"loss": 0.1632,
"step": 1862
},
{
"epoch": 0.8475887170154686,
"grad_norm": 2.4429847082643734,
"learning_rate": 4.6541434249698e-06,
"loss": 0.1726,
"step": 1863
},
{
"epoch": 0.8480436760691538,
"grad_norm": 1.958537494840022,
"learning_rate": 4.6537806599069144e-06,
"loss": 0.1918,
"step": 1864
},
{
"epoch": 0.848498635122839,
"grad_norm": 2.0524656641640573,
"learning_rate": 4.653417718846799e-06,
"loss": 0.1824,
"step": 1865
},
{
"epoch": 0.8489535941765242,
"grad_norm": 2.471476245561928,
"learning_rate": 4.6530546018191126e-06,
"loss": 0.1833,
"step": 1866
},
{
"epoch": 0.8494085532302093,
"grad_norm": 1.792641798980951,
"learning_rate": 4.652691308853526e-06,
"loss": 0.1409,
"step": 1867
},
{
"epoch": 0.8498635122838945,
"grad_norm": 1.663452952464092,
"learning_rate": 4.652327839979729e-06,
"loss": 0.1707,
"step": 1868
},
{
"epoch": 0.8503184713375797,
"grad_norm": 2.201926398578509,
"learning_rate": 4.651964195227419e-06,
"loss": 0.1399,
"step": 1869
},
{
"epoch": 0.8507734303912647,
"grad_norm": 1.8923698849228574,
"learning_rate": 4.651600374626315e-06,
"loss": 0.1381,
"step": 1870
},
{
"epoch": 0.8512283894449499,
"grad_norm": 1.4952325363772294,
"learning_rate": 4.651236378206144e-06,
"loss": 0.1521,
"step": 1871
},
{
"epoch": 0.8516833484986351,
"grad_norm": 1.6339894998223452,
"learning_rate": 4.650872205996651e-06,
"loss": 0.1813,
"step": 1872
},
{
"epoch": 0.8521383075523203,
"grad_norm": 2.105965789292229,
"learning_rate": 4.650507858027595e-06,
"loss": 0.1482,
"step": 1873
},
{
"epoch": 0.8525932666060054,
"grad_norm": 1.9949585656638686,
"learning_rate": 4.6501433343287475e-06,
"loss": 0.1851,
"step": 1874
},
{
"epoch": 0.8530482256596906,
"grad_norm": 2.4070104220391326,
"learning_rate": 4.6497786349298975e-06,
"loss": 0.1662,
"step": 1875
},
{
"epoch": 0.8535031847133758,
"grad_norm": 1.7461886999738794,
"learning_rate": 4.649413759860846e-06,
"loss": 0.1254,
"step": 1876
},
{
"epoch": 0.8539581437670609,
"grad_norm": 2.2792475326190016,
"learning_rate": 4.649048709151408e-06,
"loss": 0.2312,
"step": 1877
},
{
"epoch": 0.8544131028207461,
"grad_norm": 1.3426843322261688,
"learning_rate": 4.648683482831415e-06,
"loss": 0.1172,
"step": 1878
},
{
"epoch": 0.8548680618744313,
"grad_norm": 2.382337203322208,
"learning_rate": 4.648318080930711e-06,
"loss": 0.2074,
"step": 1879
},
{
"epoch": 0.8553230209281165,
"grad_norm": 1.712854915430822,
"learning_rate": 4.647952503479154e-06,
"loss": 0.1704,
"step": 1880
},
{
"epoch": 0.8557779799818016,
"grad_norm": 1.8081149874596518,
"learning_rate": 4.6475867505066195e-06,
"loss": 0.1917,
"step": 1881
},
{
"epoch": 0.8562329390354868,
"grad_norm": 1.9751613678879965,
"learning_rate": 4.647220822042995e-06,
"loss": 0.1735,
"step": 1882
},
{
"epoch": 0.856687898089172,
"grad_norm": 2.1327662706521906,
"learning_rate": 4.64685471811818e-06,
"loss": 0.1449,
"step": 1883
},
{
"epoch": 0.8571428571428571,
"grad_norm": 2.064198155606807,
"learning_rate": 4.646488438762094e-06,
"loss": 0.23,
"step": 1884
},
{
"epoch": 0.8575978161965423,
"grad_norm": 1.506998926934666,
"learning_rate": 4.646121984004666e-06,
"loss": 0.165,
"step": 1885
},
{
"epoch": 0.8580527752502275,
"grad_norm": 1.8322392109933523,
"learning_rate": 4.64575535387584e-06,
"loss": 0.2264,
"step": 1886
},
{
"epoch": 0.8585077343039127,
"grad_norm": 2.0388479228852048,
"learning_rate": 4.645388548405578e-06,
"loss": 0.2175,
"step": 1887
},
{
"epoch": 0.8589626933575978,
"grad_norm": 2.097249131206244,
"learning_rate": 4.645021567623852e-06,
"loss": 0.2196,
"step": 1888
},
{
"epoch": 0.859417652411283,
"grad_norm": 1.5275188180484371,
"learning_rate": 4.644654411560651e-06,
"loss": 0.1417,
"step": 1889
},
{
"epoch": 0.8598726114649682,
"grad_norm": 1.8944498906948435,
"learning_rate": 4.644287080245975e-06,
"loss": 0.1795,
"step": 1890
},
{
"epoch": 0.8603275705186533,
"grad_norm": 1.983029598334522,
"learning_rate": 4.643919573709843e-06,
"loss": 0.1986,
"step": 1891
},
{
"epoch": 0.8607825295723385,
"grad_norm": 1.6266032809421398,
"learning_rate": 4.6435518919822854e-06,
"loss": 0.207,
"step": 1892
},
{
"epoch": 0.8612374886260237,
"grad_norm": 2.19323813493903,
"learning_rate": 4.643184035093348e-06,
"loss": 0.1393,
"step": 1893
},
{
"epoch": 0.8616924476797089,
"grad_norm": 1.8257509692409855,
"learning_rate": 4.642816003073089e-06,
"loss": 0.1634,
"step": 1894
},
{
"epoch": 0.862147406733394,
"grad_norm": 1.2900997861243053,
"learning_rate": 4.6424477959515836e-06,
"loss": 0.1654,
"step": 1895
},
{
"epoch": 0.8626023657870792,
"grad_norm": 1.540771601167976,
"learning_rate": 4.642079413758919e-06,
"loss": 0.1518,
"step": 1896
},
{
"epoch": 0.8630573248407644,
"grad_norm": 1.899942137953783,
"learning_rate": 4.641710856525199e-06,
"loss": 0.1821,
"step": 1897
},
{
"epoch": 0.8635122838944495,
"grad_norm": 1.4129439458546442,
"learning_rate": 4.641342124280539e-06,
"loss": 0.1716,
"step": 1898
},
{
"epoch": 0.8639672429481347,
"grad_norm": 2.3313958542346995,
"learning_rate": 4.6409732170550705e-06,
"loss": 0.1687,
"step": 1899
},
{
"epoch": 0.8644222020018199,
"grad_norm": 1.4646430511341277,
"learning_rate": 4.64060413487894e-06,
"loss": 0.1321,
"step": 1900
},
{
"epoch": 0.864877161055505,
"grad_norm": 1.6578645771032987,
"learning_rate": 4.640234877782306e-06,
"loss": 0.1339,
"step": 1901
},
{
"epoch": 0.8653321201091901,
"grad_norm": 2.115428055628657,
"learning_rate": 4.639865445795344e-06,
"loss": 0.155,
"step": 1902
},
{
"epoch": 0.8657870791628753,
"grad_norm": 1.4246658971760144,
"learning_rate": 4.63949583894824e-06,
"loss": 0.1211,
"step": 1903
},
{
"epoch": 0.8662420382165605,
"grad_norm": 1.8915165798317974,
"learning_rate": 4.639126057271199e-06,
"loss": 0.1943,
"step": 1904
},
{
"epoch": 0.8666969972702456,
"grad_norm": 1.4359286477489568,
"learning_rate": 4.6387561007944355e-06,
"loss": 0.1927,
"step": 1905
},
{
"epoch": 0.8671519563239308,
"grad_norm": 1.7402908671263166,
"learning_rate": 4.638385969548183e-06,
"loss": 0.197,
"step": 1906
},
{
"epoch": 0.867606915377616,
"grad_norm": 1.6362731205557584,
"learning_rate": 4.638015663562686e-06,
"loss": 0.1383,
"step": 1907
},
{
"epoch": 0.8680618744313012,
"grad_norm": 2.4913116222464837,
"learning_rate": 4.637645182868204e-06,
"loss": 0.2,
"step": 1908
},
{
"epoch": 0.8685168334849863,
"grad_norm": 1.254842356705368,
"learning_rate": 4.637274527495011e-06,
"loss": 0.121,
"step": 1909
},
{
"epoch": 0.8689717925386715,
"grad_norm": 1.5120107885745528,
"learning_rate": 4.6369036974733955e-06,
"loss": 0.1464,
"step": 1910
},
{
"epoch": 0.8694267515923567,
"grad_norm": 1.392142485713207,
"learning_rate": 4.63653269283366e-06,
"loss": 0.1325,
"step": 1911
},
{
"epoch": 0.8698817106460418,
"grad_norm": 1.6362489180779098,
"learning_rate": 4.636161513606122e-06,
"loss": 0.1887,
"step": 1912
},
{
"epoch": 0.870336669699727,
"grad_norm": 1.7061998927826107,
"learning_rate": 4.6357901598211105e-06,
"loss": 0.1559,
"step": 1913
},
{
"epoch": 0.8707916287534122,
"grad_norm": 1.7490187306928824,
"learning_rate": 4.635418631508974e-06,
"loss": 0.1504,
"step": 1914
},
{
"epoch": 0.8712465878070974,
"grad_norm": 1.7459918799385958,
"learning_rate": 4.635046928700069e-06,
"loss": 0.1737,
"step": 1915
},
{
"epoch": 0.8717015468607825,
"grad_norm": 2.128565340614342,
"learning_rate": 4.634675051424771e-06,
"loss": 0.1843,
"step": 1916
},
{
"epoch": 0.8721565059144677,
"grad_norm": 1.5616930523249197,
"learning_rate": 4.634302999713468e-06,
"loss": 0.1004,
"step": 1917
},
{
"epoch": 0.8726114649681529,
"grad_norm": 1.886440296737102,
"learning_rate": 4.633930773596563e-06,
"loss": 0.2085,
"step": 1918
},
{
"epoch": 0.873066424021838,
"grad_norm": 1.6874199025414718,
"learning_rate": 4.633558373104472e-06,
"loss": 0.1965,
"step": 1919
},
{
"epoch": 0.8735213830755232,
"grad_norm": 1.4248884120885352,
"learning_rate": 4.633185798267625e-06,
"loss": 0.1814,
"step": 1920
},
{
"epoch": 0.8739763421292084,
"grad_norm": 2.0576525781987107,
"learning_rate": 4.632813049116467e-06,
"loss": 0.2251,
"step": 1921
},
{
"epoch": 0.8744313011828936,
"grad_norm": 2.422851032077204,
"learning_rate": 4.63244012568146e-06,
"loss": 0.1949,
"step": 1922
},
{
"epoch": 0.8748862602365787,
"grad_norm": 2.1417664356799087,
"learning_rate": 4.632067027993076e-06,
"loss": 0.1548,
"step": 1923
},
{
"epoch": 0.8753412192902639,
"grad_norm": 1.4407274073506169,
"learning_rate": 4.631693756081802e-06,
"loss": 0.1252,
"step": 1924
},
{
"epoch": 0.8757961783439491,
"grad_norm": 1.6004631673541039,
"learning_rate": 4.631320309978141e-06,
"loss": 0.1876,
"step": 1925
},
{
"epoch": 0.8762511373976342,
"grad_norm": 1.7251546761372085,
"learning_rate": 4.630946689712609e-06,
"loss": 0.1624,
"step": 1926
},
{
"epoch": 0.8767060964513194,
"grad_norm": 1.7738030549432209,
"learning_rate": 4.630572895315737e-06,
"loss": 0.1748,
"step": 1927
},
{
"epoch": 0.8771610555050046,
"grad_norm": 1.0086649768907636,
"learning_rate": 4.63019892681807e-06,
"loss": 0.1032,
"step": 1928
},
{
"epoch": 0.8776160145586898,
"grad_norm": 1.2701304891541718,
"learning_rate": 4.629824784250166e-06,
"loss": 0.1192,
"step": 1929
},
{
"epoch": 0.8780709736123748,
"grad_norm": 1.6784044296991356,
"learning_rate": 4.629450467642599e-06,
"loss": 0.1265,
"step": 1930
},
{
"epoch": 0.87852593266606,
"grad_norm": 1.976065902819502,
"learning_rate": 4.629075977025957e-06,
"loss": 0.1681,
"step": 1931
},
{
"epoch": 0.8789808917197452,
"grad_norm": 1.6213814808866245,
"learning_rate": 4.62870131243084e-06,
"loss": 0.1493,
"step": 1932
},
{
"epoch": 0.8794358507734303,
"grad_norm": 1.9807101332336867,
"learning_rate": 4.628326473887865e-06,
"loss": 0.1095,
"step": 1933
},
{
"epoch": 0.8798908098271155,
"grad_norm": 1.3613443516857038,
"learning_rate": 4.627951461427663e-06,
"loss": 0.0886,
"step": 1934
},
{
"epoch": 0.8803457688808007,
"grad_norm": 2.294295361155117,
"learning_rate": 4.627576275080876e-06,
"loss": 0.1782,
"step": 1935
},
{
"epoch": 0.8808007279344859,
"grad_norm": 1.465162455531879,
"learning_rate": 4.627200914878165e-06,
"loss": 0.1689,
"step": 1936
},
{
"epoch": 0.881255686988171,
"grad_norm": 1.9852567754309711,
"learning_rate": 4.6268253808502005e-06,
"loss": 0.1953,
"step": 1937
},
{
"epoch": 0.8817106460418562,
"grad_norm": 1.3259365892059651,
"learning_rate": 4.626449673027671e-06,
"loss": 0.1186,
"step": 1938
},
{
"epoch": 0.8821656050955414,
"grad_norm": 2.311627846572585,
"learning_rate": 4.626073791441278e-06,
"loss": 0.175,
"step": 1939
},
{
"epoch": 0.8826205641492265,
"grad_norm": 1.403685443623727,
"learning_rate": 4.625697736121735e-06,
"loss": 0.1632,
"step": 1940
},
{
"epoch": 0.8830755232029117,
"grad_norm": 1.8370812337880758,
"learning_rate": 4.6253215070997735e-06,
"loss": 0.1805,
"step": 1941
},
{
"epoch": 0.8835304822565969,
"grad_norm": 1.7617734494239499,
"learning_rate": 4.624945104406135e-06,
"loss": 0.1484,
"step": 1942
},
{
"epoch": 0.8839854413102821,
"grad_norm": 1.2929099916167694,
"learning_rate": 4.624568528071579e-06,
"loss": 0.1109,
"step": 1943
},
{
"epoch": 0.8844404003639672,
"grad_norm": 1.6991526267122765,
"learning_rate": 4.624191778126879e-06,
"loss": 0.1833,
"step": 1944
},
{
"epoch": 0.8848953594176524,
"grad_norm": 1.947027254377722,
"learning_rate": 4.623814854602818e-06,
"loss": 0.2251,
"step": 1945
},
{
"epoch": 0.8853503184713376,
"grad_norm": 1.7473125338322357,
"learning_rate": 4.623437757530198e-06,
"loss": 0.1144,
"step": 1946
},
{
"epoch": 0.8858052775250227,
"grad_norm": 1.56986818124434,
"learning_rate": 4.623060486939835e-06,
"loss": 0.1507,
"step": 1947
},
{
"epoch": 0.8862602365787079,
"grad_norm": 2.2731317429688995,
"learning_rate": 4.622683042862556e-06,
"loss": 0.1854,
"step": 1948
},
{
"epoch": 0.8867151956323931,
"grad_norm": 1.5668080033034493,
"learning_rate": 4.622305425329205e-06,
"loss": 0.1093,
"step": 1949
},
{
"epoch": 0.8871701546860783,
"grad_norm": 1.4666243413929643,
"learning_rate": 4.621927634370638e-06,
"loss": 0.1179,
"step": 1950
},
{
"epoch": 0.8876251137397634,
"grad_norm": 2.142207445885291,
"learning_rate": 4.621549670017727e-06,
"loss": 0.2196,
"step": 1951
},
{
"epoch": 0.8880800727934486,
"grad_norm": 1.9929367654553447,
"learning_rate": 4.6211715323013595e-06,
"loss": 0.1926,
"step": 1952
},
{
"epoch": 0.8885350318471338,
"grad_norm": 1.8377495474805912,
"learning_rate": 4.6207932212524325e-06,
"loss": 0.1879,
"step": 1953
},
{
"epoch": 0.8889899909008189,
"grad_norm": 1.8025632169370749,
"learning_rate": 4.620414736901861e-06,
"loss": 0.1627,
"step": 1954
},
{
"epoch": 0.8894449499545041,
"grad_norm": 1.7867128092311804,
"learning_rate": 4.620036079280573e-06,
"loss": 0.2169,
"step": 1955
},
{
"epoch": 0.8898999090081893,
"grad_norm": 2.4571527122530776,
"learning_rate": 4.619657248419511e-06,
"loss": 0.2337,
"step": 1956
},
{
"epoch": 0.8903548680618745,
"grad_norm": 1.5424608043537418,
"learning_rate": 4.61927824434963e-06,
"loss": 0.134,
"step": 1957
},
{
"epoch": 0.8908098271155596,
"grad_norm": 1.8248865805885555,
"learning_rate": 4.6188990671019015e-06,
"loss": 0.1473,
"step": 1958
},
{
"epoch": 0.8912647861692448,
"grad_norm": 1.2825883167116863,
"learning_rate": 4.618519716707311e-06,
"loss": 0.1377,
"step": 1959
},
{
"epoch": 0.89171974522293,
"grad_norm": 1.9837251078508047,
"learning_rate": 4.618140193196856e-06,
"loss": 0.1736,
"step": 1960
},
{
"epoch": 0.892174704276615,
"grad_norm": 1.604956750795707,
"learning_rate": 4.61776049660155e-06,
"loss": 0.1711,
"step": 1961
},
{
"epoch": 0.8926296633303002,
"grad_norm": 1.5703167687380166,
"learning_rate": 4.61738062695242e-06,
"loss": 0.1519,
"step": 1962
},
{
"epoch": 0.8930846223839854,
"grad_norm": 2.2186984451911638,
"learning_rate": 4.617000584280506e-06,
"loss": 0.1443,
"step": 1963
},
{
"epoch": 0.8935395814376706,
"grad_norm": 1.906102770647992,
"learning_rate": 4.616620368616866e-06,
"loss": 0.1878,
"step": 1964
},
{
"epoch": 0.8939945404913557,
"grad_norm": 2.0871942985325167,
"learning_rate": 4.616239979992568e-06,
"loss": 0.2384,
"step": 1965
},
{
"epoch": 0.8944494995450409,
"grad_norm": 1.6638677246444422,
"learning_rate": 4.615859418438695e-06,
"loss": 0.1792,
"step": 1966
},
{
"epoch": 0.8949044585987261,
"grad_norm": 1.387205154257509,
"learning_rate": 4.615478683986345e-06,
"loss": 0.144,
"step": 1967
},
{
"epoch": 0.8953594176524113,
"grad_norm": 1.8836562093395437,
"learning_rate": 4.6150977766666315e-06,
"loss": 0.2174,
"step": 1968
},
{
"epoch": 0.8958143767060964,
"grad_norm": 1.9229400987313323,
"learning_rate": 4.614716696510679e-06,
"loss": 0.2241,
"step": 1969
},
{
"epoch": 0.8962693357597816,
"grad_norm": 1.8744101552937114,
"learning_rate": 4.614335443549628e-06,
"loss": 0.1402,
"step": 1970
},
{
"epoch": 0.8967242948134668,
"grad_norm": 1.7357579966910537,
"learning_rate": 4.613954017814633e-06,
"loss": 0.1286,
"step": 1971
},
{
"epoch": 0.8971792538671519,
"grad_norm": 1.8840478367784224,
"learning_rate": 4.613572419336862e-06,
"loss": 0.1342,
"step": 1972
},
{
"epoch": 0.8976342129208371,
"grad_norm": 1.5927521655138008,
"learning_rate": 4.613190648147497e-06,
"loss": 0.1513,
"step": 1973
},
{
"epoch": 0.8980891719745223,
"grad_norm": 2.065610545817281,
"learning_rate": 4.612808704277736e-06,
"loss": 0.2084,
"step": 1974
},
{
"epoch": 0.8985441310282075,
"grad_norm": 1.5284731538672136,
"learning_rate": 4.612426587758789e-06,
"loss": 0.188,
"step": 1975
},
{
"epoch": 0.8989990900818926,
"grad_norm": 2.023375971468293,
"learning_rate": 4.612044298621881e-06,
"loss": 0.1344,
"step": 1976
},
{
"epoch": 0.8994540491355778,
"grad_norm": 1.9534402095489405,
"learning_rate": 4.611661836898252e-06,
"loss": 0.1738,
"step": 1977
},
{
"epoch": 0.899909008189263,
"grad_norm": 1.9156260955002997,
"learning_rate": 4.611279202619151e-06,
"loss": 0.1668,
"step": 1978
},
{
"epoch": 0.9003639672429481,
"grad_norm": 1.9526723286463348,
"learning_rate": 4.61089639581585e-06,
"loss": 0.1669,
"step": 1979
},
{
"epoch": 0.9008189262966333,
"grad_norm": 1.9056078059584818,
"learning_rate": 4.610513416519628e-06,
"loss": 0.1507,
"step": 1980
},
{
"epoch": 0.9012738853503185,
"grad_norm": 1.5105931587228634,
"learning_rate": 4.6101302647617806e-06,
"loss": 0.1488,
"step": 1981
},
{
"epoch": 0.9017288444040037,
"grad_norm": 2.0835062062044347,
"learning_rate": 4.609746940573617e-06,
"loss": 0.1324,
"step": 1982
},
{
"epoch": 0.9021838034576888,
"grad_norm": 1.9577939305337912,
"learning_rate": 4.609363443986461e-06,
"loss": 0.1636,
"step": 1983
},
{
"epoch": 0.902638762511374,
"grad_norm": 1.7800989438629395,
"learning_rate": 4.60897977503165e-06,
"loss": 0.1754,
"step": 1984
},
{
"epoch": 0.9030937215650592,
"grad_norm": 2.1110656440447544,
"learning_rate": 4.608595933740536e-06,
"loss": 0.2122,
"step": 1985
},
{
"epoch": 0.9035486806187443,
"grad_norm": 1.286237936407134,
"learning_rate": 4.608211920144485e-06,
"loss": 0.202,
"step": 1986
},
{
"epoch": 0.9040036396724295,
"grad_norm": 2.2604741864786178,
"learning_rate": 4.607827734274876e-06,
"loss": 0.1669,
"step": 1987
},
{
"epoch": 0.9044585987261147,
"grad_norm": 1.7607840905259224,
"learning_rate": 4.607443376163104e-06,
"loss": 0.1375,
"step": 1988
},
{
"epoch": 0.9049135577797999,
"grad_norm": 1.7402029650347348,
"learning_rate": 4.607058845840576e-06,
"loss": 0.1431,
"step": 1989
},
{
"epoch": 0.905368516833485,
"grad_norm": 1.666160268732321,
"learning_rate": 4.606674143338714e-06,
"loss": 0.1485,
"step": 1990
},
{
"epoch": 0.9058234758871702,
"grad_norm": 2.0612124207721654,
"learning_rate": 4.606289268688955e-06,
"loss": 0.1419,
"step": 1991
},
{
"epoch": 0.9062784349408554,
"grad_norm": 1.9143034406546822,
"learning_rate": 4.605904221922749e-06,
"loss": 0.1842,
"step": 1992
},
{
"epoch": 0.9067333939945404,
"grad_norm": 2.410587966058405,
"learning_rate": 4.6055190030715605e-06,
"loss": 0.1858,
"step": 1993
},
{
"epoch": 0.9071883530482256,
"grad_norm": 1.4389936850061738,
"learning_rate": 4.605133612166868e-06,
"loss": 0.1387,
"step": 1994
},
{
"epoch": 0.9076433121019108,
"grad_norm": 1.546723165322591,
"learning_rate": 4.604748049240162e-06,
"loss": 0.1353,
"step": 1995
},
{
"epoch": 0.908098271155596,
"grad_norm": 1.510897129777589,
"learning_rate": 4.604362314322951e-06,
"loss": 0.1322,
"step": 1996
},
{
"epoch": 0.9085532302092811,
"grad_norm": 2.3885439589368147,
"learning_rate": 4.603976407446756e-06,
"loss": 0.1656,
"step": 1997
},
{
"epoch": 0.9090081892629663,
"grad_norm": 1.193637078798613,
"learning_rate": 4.603590328643108e-06,
"loss": 0.1057,
"step": 1998
},
{
"epoch": 0.9094631483166515,
"grad_norm": 1.910033395843472,
"learning_rate": 4.60320407794356e-06,
"loss": 0.1519,
"step": 1999
},
{
"epoch": 0.9099181073703366,
"grad_norm": 1.6867999496406765,
"learning_rate": 4.602817655379672e-06,
"loss": 0.1776,
"step": 2000
},
{
"epoch": 0.9103730664240218,
"grad_norm": 1.7117660414525686,
"learning_rate": 4.602431060983022e-06,
"loss": 0.1451,
"step": 2001
},
{
"epoch": 0.910828025477707,
"grad_norm": 1.4990428536514322,
"learning_rate": 4.6020442947852e-06,
"loss": 0.1409,
"step": 2002
},
{
"epoch": 0.9112829845313922,
"grad_norm": 1.446262498955875,
"learning_rate": 4.6016573568178105e-06,
"loss": 0.1135,
"step": 2003
},
{
"epoch": 0.9117379435850773,
"grad_norm": 1.6571232403743137,
"learning_rate": 4.601270247112473e-06,
"loss": 0.2404,
"step": 2004
},
{
"epoch": 0.9121929026387625,
"grad_norm": 2.0064329107593646,
"learning_rate": 4.60088296570082e-06,
"loss": 0.1905,
"step": 2005
},
{
"epoch": 0.9126478616924477,
"grad_norm": 1.4125062029338067,
"learning_rate": 4.600495512614499e-06,
"loss": 0.1117,
"step": 2006
},
{
"epoch": 0.9131028207461328,
"grad_norm": 1.8059848267053757,
"learning_rate": 4.60010788788517e-06,
"loss": 0.2289,
"step": 2007
},
{
"epoch": 0.913557779799818,
"grad_norm": 1.8237596303340968,
"learning_rate": 4.5997200915445095e-06,
"loss": 0.1983,
"step": 2008
},
{
"epoch": 0.9140127388535032,
"grad_norm": 1.6824481144619179,
"learning_rate": 4.599332123624204e-06,
"loss": 0.1361,
"step": 2009
},
{
"epoch": 0.9144676979071884,
"grad_norm": 1.5469841434239995,
"learning_rate": 4.598943984155959e-06,
"loss": 0.1561,
"step": 2010
},
{
"epoch": 0.9149226569608735,
"grad_norm": 1.1721008124510859,
"learning_rate": 4.598555673171489e-06,
"loss": 0.0997,
"step": 2011
},
{
"epoch": 0.9153776160145587,
"grad_norm": 1.367389738430673,
"learning_rate": 4.5981671907025275e-06,
"loss": 0.124,
"step": 2012
},
{
"epoch": 0.9158325750682439,
"grad_norm": 1.9852471647698953,
"learning_rate": 4.597778536780818e-06,
"loss": 0.1746,
"step": 2013
},
{
"epoch": 0.916287534121929,
"grad_norm": 2.1379896488178405,
"learning_rate": 4.597389711438121e-06,
"loss": 0.2387,
"step": 2014
},
{
"epoch": 0.9167424931756142,
"grad_norm": 1.4433682072802856,
"learning_rate": 4.597000714706207e-06,
"loss": 0.1261,
"step": 2015
},
{
"epoch": 0.9171974522292994,
"grad_norm": 1.92195373557543,
"learning_rate": 4.596611546616865e-06,
"loss": 0.1982,
"step": 2016
},
{
"epoch": 0.9176524112829846,
"grad_norm": 1.9323067168518875,
"learning_rate": 4.596222207201896e-06,
"loss": 0.1767,
"step": 2017
},
{
"epoch": 0.9181073703366697,
"grad_norm": 1.7925696405315172,
"learning_rate": 4.595832696493115e-06,
"loss": 0.1692,
"step": 2018
},
{
"epoch": 0.9185623293903549,
"grad_norm": 1.6896362560345692,
"learning_rate": 4.59544301452235e-06,
"loss": 0.1527,
"step": 2019
},
{
"epoch": 0.9190172884440401,
"grad_norm": 2.6520358388003307,
"learning_rate": 4.595053161321444e-06,
"loss": 0.2183,
"step": 2020
},
{
"epoch": 0.9194722474977252,
"grad_norm": 1.8502691763569332,
"learning_rate": 4.594663136922256e-06,
"loss": 0.2027,
"step": 2021
},
{
"epoch": 0.9199272065514104,
"grad_norm": 1.66876391954138,
"learning_rate": 4.594272941356655e-06,
"loss": 0.1592,
"step": 2022
},
{
"epoch": 0.9203821656050956,
"grad_norm": 2.000282499671209,
"learning_rate": 4.593882574656528e-06,
"loss": 0.1899,
"step": 2023
},
{
"epoch": 0.9208371246587808,
"grad_norm": 2.1057167872680864,
"learning_rate": 4.5934920368537724e-06,
"loss": 0.1649,
"step": 2024
},
{
"epoch": 0.9212920837124658,
"grad_norm": 2.3421388058050603,
"learning_rate": 4.593101327980301e-06,
"loss": 0.1953,
"step": 2025
},
{
"epoch": 0.921747042766151,
"grad_norm": 1.4619166894313524,
"learning_rate": 4.592710448068043e-06,
"loss": 0.1645,
"step": 2026
},
{
"epoch": 0.9222020018198362,
"grad_norm": 2.1135622970646457,
"learning_rate": 4.592319397148936e-06,
"loss": 0.1391,
"step": 2027
},
{
"epoch": 0.9226569608735213,
"grad_norm": 1.2948388707877838,
"learning_rate": 4.5919281752549386e-06,
"loss": 0.1465,
"step": 2028
},
{
"epoch": 0.9231119199272065,
"grad_norm": 2.587913347360957,
"learning_rate": 4.5915367824180165e-06,
"loss": 0.2171,
"step": 2029
},
{
"epoch": 0.9235668789808917,
"grad_norm": 1.2685293245744347,
"learning_rate": 4.591145218670154e-06,
"loss": 0.1127,
"step": 2030
},
{
"epoch": 0.9240218380345769,
"grad_norm": 1.99832008478398,
"learning_rate": 4.590753484043348e-06,
"loss": 0.1795,
"step": 2031
},
{
"epoch": 0.924476797088262,
"grad_norm": 1.9341588389439468,
"learning_rate": 4.590361578569609e-06,
"loss": 0.1625,
"step": 2032
},
{
"epoch": 0.9249317561419472,
"grad_norm": 1.906987896729889,
"learning_rate": 4.589969502280962e-06,
"loss": 0.1292,
"step": 2033
},
{
"epoch": 0.9253867151956324,
"grad_norm": 1.3759296704205837,
"learning_rate": 4.589577255209445e-06,
"loss": 0.1618,
"step": 2034
},
{
"epoch": 0.9258416742493175,
"grad_norm": 1.7824080215785223,
"learning_rate": 4.589184837387112e-06,
"loss": 0.1571,
"step": 2035
},
{
"epoch": 0.9262966333030027,
"grad_norm": 1.969233090292503,
"learning_rate": 4.588792248846028e-06,
"loss": 0.1565,
"step": 2036
},
{
"epoch": 0.9267515923566879,
"grad_norm": 2.0350441155725982,
"learning_rate": 4.588399489618274e-06,
"loss": 0.2092,
"step": 2037
},
{
"epoch": 0.9272065514103731,
"grad_norm": 1.3739303279350978,
"learning_rate": 4.588006559735945e-06,
"loss": 0.1144,
"step": 2038
},
{
"epoch": 0.9276615104640582,
"grad_norm": 1.8231719010868002,
"learning_rate": 4.587613459231149e-06,
"loss": 0.19,
"step": 2039
},
{
"epoch": 0.9281164695177434,
"grad_norm": 1.7222249399366698,
"learning_rate": 4.5872201881360105e-06,
"loss": 0.1818,
"step": 2040
},
{
"epoch": 0.9285714285714286,
"grad_norm": 1.9962016913755094,
"learning_rate": 4.586826746482662e-06,
"loss": 0.1858,
"step": 2041
},
{
"epoch": 0.9290263876251137,
"grad_norm": 1.581565012958607,
"learning_rate": 4.586433134303257e-06,
"loss": 0.1388,
"step": 2042
},
{
"epoch": 0.9294813466787989,
"grad_norm": 2.2212237230761342,
"learning_rate": 4.586039351629959e-06,
"loss": 0.1627,
"step": 2043
},
{
"epoch": 0.9299363057324841,
"grad_norm": 2.4442840318574954,
"learning_rate": 4.585645398494944e-06,
"loss": 0.1421,
"step": 2044
},
{
"epoch": 0.9303912647861693,
"grad_norm": 1.63124630524275,
"learning_rate": 4.585251274930406e-06,
"loss": 0.1553,
"step": 2045
},
{
"epoch": 0.9308462238398544,
"grad_norm": 1.9068361286149722,
"learning_rate": 4.584856980968552e-06,
"loss": 0.195,
"step": 2046
},
{
"epoch": 0.9313011828935396,
"grad_norm": 1.8750052649788462,
"learning_rate": 4.584462516641599e-06,
"loss": 0.1843,
"step": 2047
},
{
"epoch": 0.9317561419472248,
"grad_norm": 1.8692305314343534,
"learning_rate": 4.584067881981784e-06,
"loss": 0.1607,
"step": 2048
},
{
"epoch": 0.9322111010009099,
"grad_norm": 1.7454178600595318,
"learning_rate": 4.583673077021352e-06,
"loss": 0.1166,
"step": 2049
},
{
"epoch": 0.9326660600545951,
"grad_norm": 1.7370379964519336,
"learning_rate": 4.583278101792567e-06,
"loss": 0.1658,
"step": 2050
},
{
"epoch": 0.9331210191082803,
"grad_norm": 1.6957581344539345,
"learning_rate": 4.582882956327704e-06,
"loss": 0.1394,
"step": 2051
},
{
"epoch": 0.9335759781619655,
"grad_norm": 1.8052091804015933,
"learning_rate": 4.58248764065905e-06,
"loss": 0.1571,
"step": 2052
},
{
"epoch": 0.9340309372156506,
"grad_norm": 1.5675006184278855,
"learning_rate": 4.582092154818912e-06,
"loss": 0.145,
"step": 2053
},
{
"epoch": 0.9344858962693358,
"grad_norm": 1.6024320375744705,
"learning_rate": 4.581696498839605e-06,
"loss": 0.2042,
"step": 2054
},
{
"epoch": 0.934940855323021,
"grad_norm": 1.8058483639041405,
"learning_rate": 4.581300672753462e-06,
"loss": 0.1661,
"step": 2055
},
{
"epoch": 0.935395814376706,
"grad_norm": 1.9556770558432066,
"learning_rate": 4.580904676592826e-06,
"loss": 0.1767,
"step": 2056
},
{
"epoch": 0.9358507734303912,
"grad_norm": 1.5186464139909968,
"learning_rate": 4.580508510390057e-06,
"loss": 0.1131,
"step": 2057
},
{
"epoch": 0.9363057324840764,
"grad_norm": 1.5844512517498417,
"learning_rate": 4.580112174177529e-06,
"loss": 0.1815,
"step": 2058
},
{
"epoch": 0.9367606915377616,
"grad_norm": 1.382066796659836,
"learning_rate": 4.5797156679876274e-06,
"loss": 0.1073,
"step": 2059
},
{
"epoch": 0.9372156505914467,
"grad_norm": 2.7590592902292332,
"learning_rate": 4.5793189918527524e-06,
"loss": 0.3083,
"step": 2060
},
{
"epoch": 0.9376706096451319,
"grad_norm": 2.097729619621905,
"learning_rate": 4.5789221458053205e-06,
"loss": 0.1572,
"step": 2061
},
{
"epoch": 0.9381255686988171,
"grad_norm": 2.269383743265302,
"learning_rate": 4.578525129877759e-06,
"loss": 0.2157,
"step": 2062
},
{
"epoch": 0.9385805277525022,
"grad_norm": 1.704369436738576,
"learning_rate": 4.5781279441025105e-06,
"loss": 0.1746,
"step": 2063
},
{
"epoch": 0.9390354868061874,
"grad_norm": 1.961199267422335,
"learning_rate": 4.577730588512031e-06,
"loss": 0.1794,
"step": 2064
},
{
"epoch": 0.9394904458598726,
"grad_norm": 2.0070527773957663,
"learning_rate": 4.577333063138791e-06,
"loss": 0.1744,
"step": 2065
},
{
"epoch": 0.9399454049135578,
"grad_norm": 1.4918844273699323,
"learning_rate": 4.576935368015274e-06,
"loss": 0.1614,
"step": 2066
},
{
"epoch": 0.9404003639672429,
"grad_norm": 1.957075251939811,
"learning_rate": 4.576537503173978e-06,
"loss": 0.2007,
"step": 2067
},
{
"epoch": 0.9408553230209281,
"grad_norm": 2.1344327287579916,
"learning_rate": 4.576139468647415e-06,
"loss": 0.1953,
"step": 2068
},
{
"epoch": 0.9413102820746133,
"grad_norm": 2.052141999542276,
"learning_rate": 4.575741264468111e-06,
"loss": 0.1247,
"step": 2069
},
{
"epoch": 0.9417652411282984,
"grad_norm": 1.9687685313144003,
"learning_rate": 4.575342890668603e-06,
"loss": 0.1941,
"step": 2070
},
{
"epoch": 0.9422202001819836,
"grad_norm": 2.1906738543597695,
"learning_rate": 4.574944347281448e-06,
"loss": 0.2436,
"step": 2071
},
{
"epoch": 0.9426751592356688,
"grad_norm": 2.0326378397322253,
"learning_rate": 4.5745456343392114e-06,
"loss": 0.1916,
"step": 2072
},
{
"epoch": 0.943130118289354,
"grad_norm": 1.9398275581691273,
"learning_rate": 4.574146751874473e-06,
"loss": 0.2243,
"step": 2073
},
{
"epoch": 0.9435850773430391,
"grad_norm": 1.583576444036144,
"learning_rate": 4.57374769991983e-06,
"loss": 0.1335,
"step": 2074
},
{
"epoch": 0.9440400363967243,
"grad_norm": 1.49493272878593,
"learning_rate": 4.573348478507888e-06,
"loss": 0.132,
"step": 2075
},
{
"epoch": 0.9444949954504095,
"grad_norm": 2.191087505295727,
"learning_rate": 4.5729490876712725e-06,
"loss": 0.2728,
"step": 2076
},
{
"epoch": 0.9449499545040946,
"grad_norm": 1.5696743668055735,
"learning_rate": 4.572549527442619e-06,
"loss": 0.1167,
"step": 2077
},
{
"epoch": 0.9454049135577798,
"grad_norm": 1.4703104600885406,
"learning_rate": 4.572149797854578e-06,
"loss": 0.1481,
"step": 2078
},
{
"epoch": 0.945859872611465,
"grad_norm": 1.3375471658633535,
"learning_rate": 4.571749898939813e-06,
"loss": 0.1448,
"step": 2079
},
{
"epoch": 0.9463148316651502,
"grad_norm": 1.1353706299658501,
"learning_rate": 4.5713498307310024e-06,
"loss": 0.1095,
"step": 2080
},
{
"epoch": 0.9467697907188353,
"grad_norm": 1.170226192835475,
"learning_rate": 4.570949593260837e-06,
"loss": 0.1025,
"step": 2081
},
{
"epoch": 0.9472247497725205,
"grad_norm": 1.611590656998796,
"learning_rate": 4.570549186562024e-06,
"loss": 0.1648,
"step": 2082
},
{
"epoch": 0.9476797088262057,
"grad_norm": 1.9894469425244659,
"learning_rate": 4.570148610667281e-06,
"loss": 0.2171,
"step": 2083
},
{
"epoch": 0.9481346678798908,
"grad_norm": 2.6290643290299403,
"learning_rate": 4.569747865609343e-06,
"loss": 0.2035,
"step": 2084
},
{
"epoch": 0.948589626933576,
"grad_norm": 1.9997278123807103,
"learning_rate": 4.569346951420957e-06,
"loss": 0.219,
"step": 2085
},
{
"epoch": 0.9490445859872612,
"grad_norm": 2.3647369288676465,
"learning_rate": 4.568945868134882e-06,
"loss": 0.1821,
"step": 2086
},
{
"epoch": 0.9494995450409464,
"grad_norm": 1.4361032491832602,
"learning_rate": 4.568544615783894e-06,
"loss": 0.174,
"step": 2087
},
{
"epoch": 0.9499545040946314,
"grad_norm": 2.4948435319990794,
"learning_rate": 4.568143194400782e-06,
"loss": 0.162,
"step": 2088
},
{
"epoch": 0.9504094631483166,
"grad_norm": 2.3391791745125823,
"learning_rate": 4.567741604018348e-06,
"loss": 0.1731,
"step": 2089
},
{
"epoch": 0.9508644222020018,
"grad_norm": 1.9417130047261684,
"learning_rate": 4.567339844669407e-06,
"loss": 0.2115,
"step": 2090
},
{
"epoch": 0.9513193812556869,
"grad_norm": 1.341309783614821,
"learning_rate": 4.566937916386791e-06,
"loss": 0.1207,
"step": 2091
},
{
"epoch": 0.9517743403093721,
"grad_norm": 1.8063160975644432,
"learning_rate": 4.566535819203342e-06,
"loss": 0.1484,
"step": 2092
},
{
"epoch": 0.9522292993630573,
"grad_norm": 1.4064547804406506,
"learning_rate": 4.566133553151918e-06,
"loss": 0.1696,
"step": 2093
},
{
"epoch": 0.9526842584167425,
"grad_norm": 1.5123792301862293,
"learning_rate": 4.565731118265392e-06,
"loss": 0.1513,
"step": 2094
},
{
"epoch": 0.9531392174704276,
"grad_norm": 2.6660242675499974,
"learning_rate": 4.5653285145766465e-06,
"loss": 0.1967,
"step": 2095
},
{
"epoch": 0.9535941765241128,
"grad_norm": 1.3182075171271719,
"learning_rate": 4.564925742118583e-06,
"loss": 0.1647,
"step": 2096
},
{
"epoch": 0.954049135577798,
"grad_norm": 2.0246143369138583,
"learning_rate": 4.564522800924111e-06,
"loss": 0.1933,
"step": 2097
},
{
"epoch": 0.9545040946314831,
"grad_norm": 1.5229871866624265,
"learning_rate": 4.56411969102616e-06,
"loss": 0.1262,
"step": 2098
},
{
"epoch": 0.9549590536851683,
"grad_norm": 1.6259281484911337,
"learning_rate": 4.5637164124576695e-06,
"loss": 0.22,
"step": 2099
},
{
"epoch": 0.9554140127388535,
"grad_norm": 2.2924228140977534,
"learning_rate": 4.563312965251594e-06,
"loss": 0.1788,
"step": 2100
},
{
"epoch": 0.9558689717925387,
"grad_norm": 2.145017083065323,
"learning_rate": 4.562909349440899e-06,
"loss": 0.1997,
"step": 2101
},
{
"epoch": 0.9563239308462238,
"grad_norm": 1.4998751606083633,
"learning_rate": 4.5625055650585695e-06,
"loss": 0.1268,
"step": 2102
},
{
"epoch": 0.956778889899909,
"grad_norm": 2.212976295267469,
"learning_rate": 4.562101612137599e-06,
"loss": 0.1717,
"step": 2103
},
{
"epoch": 0.9572338489535942,
"grad_norm": 1.679438029199367,
"learning_rate": 4.561697490710998e-06,
"loss": 0.1072,
"step": 2104
},
{
"epoch": 0.9576888080072793,
"grad_norm": 2.079365510674891,
"learning_rate": 4.561293200811787e-06,
"loss": 0.1746,
"step": 2105
},
{
"epoch": 0.9581437670609645,
"grad_norm": 1.686198495026396,
"learning_rate": 4.560888742473005e-06,
"loss": 0.1561,
"step": 2106
},
{
"epoch": 0.9585987261146497,
"grad_norm": 1.6637740262678333,
"learning_rate": 4.560484115727703e-06,
"loss": 0.202,
"step": 2107
},
{
"epoch": 0.9590536851683349,
"grad_norm": 1.3363367490497915,
"learning_rate": 4.560079320608942e-06,
"loss": 0.1505,
"step": 2108
},
{
"epoch": 0.95950864422202,
"grad_norm": 1.3524224143962482,
"learning_rate": 4.5596743571498035e-06,
"loss": 0.1556,
"step": 2109
},
{
"epoch": 0.9599636032757052,
"grad_norm": 2.051012825316942,
"learning_rate": 4.5592692253833775e-06,
"loss": 0.1557,
"step": 2110
},
{
"epoch": 0.9604185623293904,
"grad_norm": 1.8725405774246842,
"learning_rate": 4.5588639253427705e-06,
"loss": 0.1361,
"step": 2111
},
{
"epoch": 0.9608735213830755,
"grad_norm": 1.6129721682768872,
"learning_rate": 4.558458457061101e-06,
"loss": 0.1604,
"step": 2112
},
{
"epoch": 0.9613284804367607,
"grad_norm": 2.4257644594708654,
"learning_rate": 4.5580528205715024e-06,
"loss": 0.1728,
"step": 2113
},
{
"epoch": 0.9617834394904459,
"grad_norm": 2.2020262494310714,
"learning_rate": 4.557647015907121e-06,
"loss": 0.1982,
"step": 2114
},
{
"epoch": 0.9622383985441311,
"grad_norm": 1.3942660783602792,
"learning_rate": 4.557241043101118e-06,
"loss": 0.1263,
"step": 2115
},
{
"epoch": 0.9626933575978162,
"grad_norm": 1.6927990416728342,
"learning_rate": 4.556834902186667e-06,
"loss": 0.2537,
"step": 2116
},
{
"epoch": 0.9631483166515014,
"grad_norm": 2.0785259665220646,
"learning_rate": 4.556428593196956e-06,
"loss": 0.1927,
"step": 2117
},
{
"epoch": 0.9636032757051866,
"grad_norm": 1.7131650413165849,
"learning_rate": 4.556022116165189e-06,
"loss": 0.2146,
"step": 2118
},
{
"epoch": 0.9640582347588716,
"grad_norm": 1.7560312461053569,
"learning_rate": 4.555615471124578e-06,
"loss": 0.1429,
"step": 2119
},
{
"epoch": 0.9645131938125568,
"grad_norm": 1.4424071339171873,
"learning_rate": 4.555208658108354e-06,
"loss": 0.1017,
"step": 2120
},
{
"epoch": 0.964968152866242,
"grad_norm": 2.366476482520588,
"learning_rate": 4.55480167714976e-06,
"loss": 0.1701,
"step": 2121
},
{
"epoch": 0.9654231119199272,
"grad_norm": 1.3193271811867113,
"learning_rate": 4.554394528282052e-06,
"loss": 0.1608,
"step": 2122
},
{
"epoch": 0.9658780709736123,
"grad_norm": 1.6112197038225973,
"learning_rate": 4.553987211538501e-06,
"loss": 0.1663,
"step": 2123
},
{
"epoch": 0.9663330300272975,
"grad_norm": 2.2120821423419477,
"learning_rate": 4.5535797269523906e-06,
"loss": 0.1761,
"step": 2124
},
{
"epoch": 0.9667879890809827,
"grad_norm": 1.9459325657347053,
"learning_rate": 4.55317207455702e-06,
"loss": 0.1648,
"step": 2125
},
{
"epoch": 0.9672429481346679,
"grad_norm": 1.2258892841488513,
"learning_rate": 4.552764254385697e-06,
"loss": 0.113,
"step": 2126
},
{
"epoch": 0.967697907188353,
"grad_norm": 1.7595258140929935,
"learning_rate": 4.552356266471751e-06,
"loss": 0.1773,
"step": 2127
},
{
"epoch": 0.9681528662420382,
"grad_norm": 1.9664757298212556,
"learning_rate": 4.55194811084852e-06,
"loss": 0.165,
"step": 2128
},
{
"epoch": 0.9686078252957234,
"grad_norm": 2.222530250938157,
"learning_rate": 4.551539787549354e-06,
"loss": 0.2096,
"step": 2129
},
{
"epoch": 0.9690627843494085,
"grad_norm": 1.3774868751004326,
"learning_rate": 4.551131296607623e-06,
"loss": 0.1089,
"step": 2130
},
{
"epoch": 0.9695177434030937,
"grad_norm": 1.8067013761642468,
"learning_rate": 4.550722638056703e-06,
"loss": 0.1323,
"step": 2131
},
{
"epoch": 0.9699727024567789,
"grad_norm": 2.24991176799243,
"learning_rate": 4.550313811929993e-06,
"loss": 0.1334,
"step": 2132
},
{
"epoch": 0.9704276615104641,
"grad_norm": 2.72004150671695,
"learning_rate": 4.549904818260895e-06,
"loss": 0.1775,
"step": 2133
},
{
"epoch": 0.9708826205641492,
"grad_norm": 2.342721771224346,
"learning_rate": 4.549495657082834e-06,
"loss": 0.191,
"step": 2134
},
{
"epoch": 0.9713375796178344,
"grad_norm": 2.2728812324499534,
"learning_rate": 4.549086328429242e-06,
"loss": 0.1425,
"step": 2135
},
{
"epoch": 0.9717925386715196,
"grad_norm": 1.453499597882781,
"learning_rate": 4.548676832333569e-06,
"loss": 0.1316,
"step": 2136
},
{
"epoch": 0.9722474977252047,
"grad_norm": 2.01603990428807,
"learning_rate": 4.548267168829279e-06,
"loss": 0.1307,
"step": 2137
},
{
"epoch": 0.9727024567788899,
"grad_norm": 1.6605060275137966,
"learning_rate": 4.547857337949844e-06,
"loss": 0.1399,
"step": 2138
},
{
"epoch": 0.9731574158325751,
"grad_norm": 1.5535531332266466,
"learning_rate": 4.5474473397287556e-06,
"loss": 0.1321,
"step": 2139
},
{
"epoch": 0.9736123748862603,
"grad_norm": 1.5373238474360202,
"learning_rate": 4.547037174199517e-06,
"loss": 0.1343,
"step": 2140
},
{
"epoch": 0.9740673339399454,
"grad_norm": 1.8078338860297858,
"learning_rate": 4.546626841395645e-06,
"loss": 0.1635,
"step": 2141
},
{
"epoch": 0.9745222929936306,
"grad_norm": 2.3652157653146326,
"learning_rate": 4.54621634135067e-06,
"loss": 0.1574,
"step": 2142
},
{
"epoch": 0.9749772520473158,
"grad_norm": 1.582720512511224,
"learning_rate": 4.545805674098136e-06,
"loss": 0.1834,
"step": 2143
},
{
"epoch": 0.9754322111010009,
"grad_norm": 1.603799084987541,
"learning_rate": 4.545394839671601e-06,
"loss": 0.1464,
"step": 2144
},
{
"epoch": 0.9758871701546861,
"grad_norm": 2.2937187508235612,
"learning_rate": 4.544983838104637e-06,
"loss": 0.1689,
"step": 2145
},
{
"epoch": 0.9763421292083713,
"grad_norm": 1.5827694703198016,
"learning_rate": 4.544572669430828e-06,
"loss": 0.1974,
"step": 2146
},
{
"epoch": 0.9767970882620565,
"grad_norm": 1.5229863728993667,
"learning_rate": 4.544161333683775e-06,
"loss": 0.1347,
"step": 2147
},
{
"epoch": 0.9772520473157416,
"grad_norm": 1.7227170284858135,
"learning_rate": 4.543749830897088e-06,
"loss": 0.2186,
"step": 2148
},
{
"epoch": 0.9777070063694268,
"grad_norm": 1.9401788313572834,
"learning_rate": 4.543338161104395e-06,
"loss": 0.1674,
"step": 2149
},
{
"epoch": 0.978161965423112,
"grad_norm": 1.4440321556413929,
"learning_rate": 4.542926324339335e-06,
"loss": 0.1518,
"step": 2150
},
{
"epoch": 0.978616924476797,
"grad_norm": 1.5863469206535143,
"learning_rate": 4.542514320635561e-06,
"loss": 0.1548,
"step": 2151
},
{
"epoch": 0.9790718835304822,
"grad_norm": 1.7952124026440508,
"learning_rate": 4.542102150026741e-06,
"loss": 0.2011,
"step": 2152
},
{
"epoch": 0.9795268425841674,
"grad_norm": 1.2781168765483073,
"learning_rate": 4.541689812546556e-06,
"loss": 0.1708,
"step": 2153
},
{
"epoch": 0.9799818016378526,
"grad_norm": 2.275201017608769,
"learning_rate": 4.541277308228698e-06,
"loss": 0.2655,
"step": 2154
},
{
"epoch": 0.9804367606915377,
"grad_norm": 1.6797512508176873,
"learning_rate": 4.540864637106879e-06,
"loss": 0.1526,
"step": 2155
},
{
"epoch": 0.9808917197452229,
"grad_norm": 1.7795439392430585,
"learning_rate": 4.540451799214817e-06,
"loss": 0.1561,
"step": 2156
},
{
"epoch": 0.9813466787989081,
"grad_norm": 2.2915523451786766,
"learning_rate": 4.540038794586248e-06,
"loss": 0.1603,
"step": 2157
},
{
"epoch": 0.9818016378525932,
"grad_norm": 2.2274131509949537,
"learning_rate": 4.539625623254923e-06,
"loss": 0.1423,
"step": 2158
},
{
"epoch": 0.9822565969062784,
"grad_norm": 1.3978925866840657,
"learning_rate": 4.539212285254601e-06,
"loss": 0.1708,
"step": 2159
},
{
"epoch": 0.9827115559599636,
"grad_norm": 1.7857894009279391,
"learning_rate": 4.5387987806190615e-06,
"loss": 0.1893,
"step": 2160
},
{
"epoch": 0.9831665150136488,
"grad_norm": 1.518791485457489,
"learning_rate": 4.538385109382093e-06,
"loss": 0.1709,
"step": 2161
},
{
"epoch": 0.9836214740673339,
"grad_norm": 1.3743190231639797,
"learning_rate": 4.537971271577498e-06,
"loss": 0.1746,
"step": 2162
},
{
"epoch": 0.9840764331210191,
"grad_norm": 1.1750088863525163,
"learning_rate": 4.537557267239093e-06,
"loss": 0.108,
"step": 2163
},
{
"epoch": 0.9845313921747043,
"grad_norm": 1.2225308832618265,
"learning_rate": 4.537143096400712e-06,
"loss": 0.1061,
"step": 2164
},
{
"epoch": 0.9849863512283894,
"grad_norm": 2.1247362714767415,
"learning_rate": 4.536728759096195e-06,
"loss": 0.179,
"step": 2165
},
{
"epoch": 0.9854413102820746,
"grad_norm": 1.808580318181682,
"learning_rate": 4.536314255359402e-06,
"loss": 0.1335,
"step": 2166
},
{
"epoch": 0.9858962693357598,
"grad_norm": 1.6790298431680175,
"learning_rate": 4.535899585224204e-06,
"loss": 0.1493,
"step": 2167
},
{
"epoch": 0.986351228389445,
"grad_norm": 3.0332484593824245,
"learning_rate": 4.535484748724486e-06,
"loss": 0.2063,
"step": 2168
},
{
"epoch": 0.9868061874431301,
"grad_norm": 1.6421323451507468,
"learning_rate": 4.535069745894147e-06,
"loss": 0.1673,
"step": 2169
},
{
"epoch": 0.9872611464968153,
"grad_norm": 1.9282204111223042,
"learning_rate": 4.534654576767098e-06,
"loss": 0.1428,
"step": 2170
},
{
"epoch": 0.9877161055505005,
"grad_norm": 1.4541197485662065,
"learning_rate": 4.534239241377266e-06,
"loss": 0.1901,
"step": 2171
},
{
"epoch": 0.9881710646041856,
"grad_norm": 3.2268329342995554,
"learning_rate": 4.5338237397585895e-06,
"loss": 0.2441,
"step": 2172
},
{
"epoch": 0.9886260236578708,
"grad_norm": 2.4649363175751646,
"learning_rate": 4.533408071945021e-06,
"loss": 0.1763,
"step": 2173
},
{
"epoch": 0.989080982711556,
"grad_norm": 1.8464040284824113,
"learning_rate": 4.532992237970528e-06,
"loss": 0.1646,
"step": 2174
},
{
"epoch": 0.9895359417652412,
"grad_norm": 2.115464473457186,
"learning_rate": 4.532576237869091e-06,
"loss": 0.1468,
"step": 2175
},
{
"epoch": 0.9899909008189263,
"grad_norm": 1.6765582325152246,
"learning_rate": 4.5321600716747025e-06,
"loss": 0.1377,
"step": 2176
},
{
"epoch": 0.9904458598726115,
"grad_norm": 1.8413627666297776,
"learning_rate": 4.531743739421369e-06,
"loss": 0.181,
"step": 2177
},
{
"epoch": 0.9909008189262967,
"grad_norm": 1.7110916137165555,
"learning_rate": 4.531327241143114e-06,
"loss": 0.1418,
"step": 2178
},
{
"epoch": 0.9913557779799818,
"grad_norm": 2.3165603295554726,
"learning_rate": 4.530910576873969e-06,
"loss": 0.1666,
"step": 2179
},
{
"epoch": 0.991810737033667,
"grad_norm": 2.0264888702689254,
"learning_rate": 4.530493746647984e-06,
"loss": 0.1653,
"step": 2180
},
{
"epoch": 0.9922656960873522,
"grad_norm": 3.7082736074441227,
"learning_rate": 4.530076750499219e-06,
"loss": 0.1955,
"step": 2181
},
{
"epoch": 0.9927206551410374,
"grad_norm": 1.4980795502080217,
"learning_rate": 4.52965958846175e-06,
"loss": 0.1763,
"step": 2182
},
{
"epoch": 0.9931756141947224,
"grad_norm": 1.328886576986546,
"learning_rate": 4.529242260569665e-06,
"loss": 0.135,
"step": 2183
},
{
"epoch": 0.9936305732484076,
"grad_norm": 2.4602783485410478,
"learning_rate": 4.528824766857067e-06,
"loss": 0.225,
"step": 2184
},
{
"epoch": 0.9940855323020928,
"grad_norm": 2.656745825690249,
"learning_rate": 4.5284071073580715e-06,
"loss": 0.1623,
"step": 2185
},
{
"epoch": 0.9945404913557779,
"grad_norm": 2.191300990353365,
"learning_rate": 4.527989282106807e-06,
"loss": 0.145,
"step": 2186
},
{
"epoch": 0.9949954504094631,
"grad_norm": 2.3096174225453043,
"learning_rate": 4.527571291137416e-06,
"loss": 0.2047,
"step": 2187
},
{
"epoch": 0.9954504094631483,
"grad_norm": 2.2206355508554374,
"learning_rate": 4.527153134484056e-06,
"loss": 0.1978,
"step": 2188
},
{
"epoch": 0.9959053685168335,
"grad_norm": 1.5575737643430931,
"learning_rate": 4.5267348121808965e-06,
"loss": 0.1083,
"step": 2189
},
{
"epoch": 0.9963603275705186,
"grad_norm": 1.1842592978237663,
"learning_rate": 4.526316324262121e-06,
"loss": 0.1418,
"step": 2190
},
{
"epoch": 0.9968152866242038,
"grad_norm": 2.066729296311549,
"learning_rate": 4.525897670761926e-06,
"loss": 0.1555,
"step": 2191
},
{
"epoch": 0.997270245677889,
"grad_norm": 1.8945946795231638,
"learning_rate": 4.525478851714522e-06,
"loss": 0.1602,
"step": 2192
},
{
"epoch": 0.9977252047315741,
"grad_norm": 2.288603637382534,
"learning_rate": 4.525059867154133e-06,
"loss": 0.1728,
"step": 2193
},
{
"epoch": 0.9981801637852593,
"grad_norm": 1.548625455808381,
"learning_rate": 4.5246407171149975e-06,
"loss": 0.1535,
"step": 2194
},
{
"epoch": 0.9986351228389445,
"grad_norm": 1.7795058207338135,
"learning_rate": 4.5242214016313655e-06,
"loss": 0.1937,
"step": 2195
},
{
"epoch": 0.9990900818926297,
"grad_norm": 1.8173123394415125,
"learning_rate": 4.523801920737501e-06,
"loss": 0.1855,
"step": 2196
},
{
"epoch": 0.9995450409463148,
"grad_norm": 1.5328423318772029,
"learning_rate": 4.523382274467684e-06,
"loss": 0.1734,
"step": 2197
},
{
"epoch": 1.0,
"grad_norm": 1.6888871167302404,
"learning_rate": 4.522962462856206e-06,
"loss": 0.1061,
"step": 2198
},
{
"epoch": 1.000454959053685,
"grad_norm": 1.0169999119479456,
"learning_rate": 4.522542485937369e-06,
"loss": 0.051,
"step": 2199
},
{
"epoch": 1.0009099181073704,
"grad_norm": 1.6609923808472133,
"learning_rate": 4.522122343745495e-06,
"loss": 0.0982,
"step": 2200
},
{
"epoch": 1.0013648771610555,
"grad_norm": 1.2283700830083324,
"learning_rate": 4.521702036314915e-06,
"loss": 0.068,
"step": 2201
},
{
"epoch": 1.0018198362147406,
"grad_norm": 1.220074312624483,
"learning_rate": 4.521281563679973e-06,
"loss": 0.0629,
"step": 2202
},
{
"epoch": 1.0022747952684259,
"grad_norm": 1.4941719880778739,
"learning_rate": 4.5208609258750314e-06,
"loss": 0.0755,
"step": 2203
},
{
"epoch": 1.002729754322111,
"grad_norm": 1.1143728511252875,
"learning_rate": 4.52044012293446e-06,
"loss": 0.0587,
"step": 2204
},
{
"epoch": 1.0031847133757963,
"grad_norm": 1.5319847923881116,
"learning_rate": 4.520019154892646e-06,
"loss": 0.0851,
"step": 2205
},
{
"epoch": 1.0036396724294814,
"grad_norm": 1.2636498680398078,
"learning_rate": 4.519598021783989e-06,
"loss": 0.0993,
"step": 2206
},
{
"epoch": 1.0040946314831665,
"grad_norm": 1.5487488091959216,
"learning_rate": 4.519176723642903e-06,
"loss": 0.113,
"step": 2207
},
{
"epoch": 1.0045495905368518,
"grad_norm": 1.5557166129958784,
"learning_rate": 4.518755260503813e-06,
"loss": 0.0788,
"step": 2208
},
{
"epoch": 1.0050045495905369,
"grad_norm": 1.2818157097100387,
"learning_rate": 4.51833363240116e-06,
"loss": 0.0743,
"step": 2209
},
{
"epoch": 1.005459508644222,
"grad_norm": 1.200932009259888,
"learning_rate": 4.517911839369398e-06,
"loss": 0.0811,
"step": 2210
},
{
"epoch": 1.0059144676979073,
"grad_norm": 1.4486327355662423,
"learning_rate": 4.517489881442993e-06,
"loss": 0.062,
"step": 2211
},
{
"epoch": 1.0063694267515924,
"grad_norm": 1.3527098955371344,
"learning_rate": 4.517067758656424e-06,
"loss": 0.0627,
"step": 2212
},
{
"epoch": 1.0068243858052774,
"grad_norm": 1.4047497974003487,
"learning_rate": 4.516645471044188e-06,
"loss": 0.0651,
"step": 2213
},
{
"epoch": 1.0072793448589628,
"grad_norm": 1.4164244968906639,
"learning_rate": 4.516223018640791e-06,
"loss": 0.0714,
"step": 2214
},
{
"epoch": 1.0077343039126478,
"grad_norm": 1.5809882117425458,
"learning_rate": 4.515800401480754e-06,
"loss": 0.0989,
"step": 2215
},
{
"epoch": 1.008189262966333,
"grad_norm": 1.6844068994280326,
"learning_rate": 4.515377619598612e-06,
"loss": 0.1007,
"step": 2216
},
{
"epoch": 1.0086442220200182,
"grad_norm": 1.5732620970585767,
"learning_rate": 4.514954673028913e-06,
"loss": 0.0765,
"step": 2217
},
{
"epoch": 1.0090991810737033,
"grad_norm": 1.3651454362527589,
"learning_rate": 4.5145315618062155e-06,
"loss": 0.0817,
"step": 2218
},
{
"epoch": 1.0095541401273886,
"grad_norm": 1.7849697070364972,
"learning_rate": 4.514108285965098e-06,
"loss": 0.0946,
"step": 2219
},
{
"epoch": 1.0100090991810737,
"grad_norm": 1.4164875410963866,
"learning_rate": 4.513684845540146e-06,
"loss": 0.067,
"step": 2220
},
{
"epoch": 1.0104640582347588,
"grad_norm": 1.7807110987231174,
"learning_rate": 4.5132612405659625e-06,
"loss": 0.1131,
"step": 2221
},
{
"epoch": 1.0109190172884441,
"grad_norm": 1.6962102867596296,
"learning_rate": 4.5128374710771625e-06,
"loss": 0.1001,
"step": 2222
},
{
"epoch": 1.0113739763421292,
"grad_norm": 1.9807611103838136,
"learning_rate": 4.512413537108374e-06,
"loss": 0.1216,
"step": 2223
},
{
"epoch": 1.0118289353958143,
"grad_norm": 2.2071849786855195,
"learning_rate": 4.511989438694239e-06,
"loss": 0.0758,
"step": 2224
},
{
"epoch": 1.0122838944494996,
"grad_norm": 1.41006582199038,
"learning_rate": 4.511565175869415e-06,
"loss": 0.0676,
"step": 2225
},
{
"epoch": 1.0127388535031847,
"grad_norm": 1.5005194178509522,
"learning_rate": 4.511140748668566e-06,
"loss": 0.0845,
"step": 2226
},
{
"epoch": 1.0131938125568698,
"grad_norm": 1.2291494575864939,
"learning_rate": 4.510716157126379e-06,
"loss": 0.0611,
"step": 2227
},
{
"epoch": 1.013648771610555,
"grad_norm": 2.4795116846611975,
"learning_rate": 4.510291401277548e-06,
"loss": 0.0983,
"step": 2228
},
{
"epoch": 1.0141037306642402,
"grad_norm": 2.657277286309681,
"learning_rate": 4.509866481156781e-06,
"loss": 0.1101,
"step": 2229
},
{
"epoch": 1.0145586897179253,
"grad_norm": 1.8196308245882602,
"learning_rate": 4.509441396798802e-06,
"loss": 0.0998,
"step": 2230
},
{
"epoch": 1.0150136487716106,
"grad_norm": 1.9314931582074881,
"learning_rate": 4.5090161482383475e-06,
"loss": 0.0936,
"step": 2231
},
{
"epoch": 1.0154686078252957,
"grad_norm": 1.2746342487726179,
"learning_rate": 4.508590735510166e-06,
"loss": 0.0676,
"step": 2232
},
{
"epoch": 1.015923566878981,
"grad_norm": 1.8859048739802027,
"learning_rate": 4.508165158649019e-06,
"loss": 0.0811,
"step": 2233
},
{
"epoch": 1.016378525932666,
"grad_norm": 1.6756178231136896,
"learning_rate": 4.507739417689685e-06,
"loss": 0.0747,
"step": 2234
},
{
"epoch": 1.0168334849863512,
"grad_norm": 1.3984270258928366,
"learning_rate": 4.507313512666953e-06,
"loss": 0.075,
"step": 2235
},
{
"epoch": 1.0172884440400365,
"grad_norm": 1.5242107845200688,
"learning_rate": 4.506887443615625e-06,
"loss": 0.0823,
"step": 2236
},
{
"epoch": 1.0177434030937216,
"grad_norm": 1.5995342787535922,
"learning_rate": 4.506461210570518e-06,
"loss": 0.0971,
"step": 2237
},
{
"epoch": 1.0181983621474067,
"grad_norm": 1.1425078029916038,
"learning_rate": 4.506034813566462e-06,
"loss": 0.1233,
"step": 2238
},
{
"epoch": 1.018653321201092,
"grad_norm": 1.4187790734010148,
"learning_rate": 4.505608252638301e-06,
"loss": 0.0934,
"step": 2239
},
{
"epoch": 1.019108280254777,
"grad_norm": 1.9848336082848856,
"learning_rate": 4.50518152782089e-06,
"loss": 0.1203,
"step": 2240
},
{
"epoch": 1.0195632393084622,
"grad_norm": 1.2043374157232327,
"learning_rate": 4.504754639149101e-06,
"loss": 0.0709,
"step": 2241
},
{
"epoch": 1.0200181983621475,
"grad_norm": 1.36618996999929,
"learning_rate": 4.504327586657814e-06,
"loss": 0.0647,
"step": 2242
},
{
"epoch": 1.0204731574158326,
"grad_norm": 1.563535065138085,
"learning_rate": 4.50390037038193e-06,
"loss": 0.0833,
"step": 2243
},
{
"epoch": 1.0209281164695176,
"grad_norm": 1.5296584792807861,
"learning_rate": 4.503472990356357e-06,
"loss": 0.0946,
"step": 2244
},
{
"epoch": 1.021383075523203,
"grad_norm": 1.512634883619265,
"learning_rate": 4.503045446616018e-06,
"loss": 0.0715,
"step": 2245
},
{
"epoch": 1.021838034576888,
"grad_norm": 1.3010427168043244,
"learning_rate": 4.502617739195852e-06,
"loss": 0.0873,
"step": 2246
},
{
"epoch": 1.0222929936305734,
"grad_norm": 1.387157397416425,
"learning_rate": 4.502189868130807e-06,
"loss": 0.0763,
"step": 2247
},
{
"epoch": 1.0227479526842584,
"grad_norm": 1.828795187833686,
"learning_rate": 4.501761833455849e-06,
"loss": 0.1319,
"step": 2248
},
{
"epoch": 1.0232029117379435,
"grad_norm": 1.3445669290205065,
"learning_rate": 4.501333635205952e-06,
"loss": 0.068,
"step": 2249
},
{
"epoch": 1.0236578707916288,
"grad_norm": 1.5610944674651466,
"learning_rate": 4.5009052734161095e-06,
"loss": 0.0739,
"step": 2250
},
{
"epoch": 1.024112829845314,
"grad_norm": 1.2525841076083186,
"learning_rate": 4.500476748121324e-06,
"loss": 0.1094,
"step": 2251
},
{
"epoch": 1.024567788898999,
"grad_norm": 1.5118810013113924,
"learning_rate": 4.500048059356613e-06,
"loss": 0.1041,
"step": 2252
},
{
"epoch": 1.0250227479526843,
"grad_norm": 1.318153460904525,
"learning_rate": 4.499619207157007e-06,
"loss": 0.0851,
"step": 2253
},
{
"epoch": 1.0254777070063694,
"grad_norm": 1.3005012388734132,
"learning_rate": 4.499190191557549e-06,
"loss": 0.1007,
"step": 2254
},
{
"epoch": 1.0259326660600545,
"grad_norm": 1.7684251321269342,
"learning_rate": 4.498761012593296e-06,
"loss": 0.1144,
"step": 2255
},
{
"epoch": 1.0263876251137398,
"grad_norm": 1.2065670700113398,
"learning_rate": 4.498331670299321e-06,
"loss": 0.1344,
"step": 2256
},
{
"epoch": 1.026842584167425,
"grad_norm": 1.6857989870574055,
"learning_rate": 4.497902164710704e-06,
"loss": 0.0642,
"step": 2257
},
{
"epoch": 1.02729754322111,
"grad_norm": 1.6473004600696095,
"learning_rate": 4.497472495862547e-06,
"loss": 0.0981,
"step": 2258
},
{
"epoch": 1.0277525022747953,
"grad_norm": 1.3689985527437365,
"learning_rate": 4.497042663789957e-06,
"loss": 0.0813,
"step": 2259
},
{
"epoch": 1.0282074613284804,
"grad_norm": 1.6484955662328646,
"learning_rate": 4.496612668528059e-06,
"loss": 0.1318,
"step": 2260
},
{
"epoch": 1.0286624203821657,
"grad_norm": 1.2301308018690613,
"learning_rate": 4.496182510111991e-06,
"loss": 0.1323,
"step": 2261
},
{
"epoch": 1.0291173794358508,
"grad_norm": 1.3974663767006335,
"learning_rate": 4.495752188576902e-06,
"loss": 0.1113,
"step": 2262
},
{
"epoch": 1.0295723384895359,
"grad_norm": 1.9572449646613161,
"learning_rate": 4.4953217039579574e-06,
"loss": 0.1108,
"step": 2263
},
{
"epoch": 1.0300272975432212,
"grad_norm": 1.5604560381918156,
"learning_rate": 4.494891056290335e-06,
"loss": 0.126,
"step": 2264
},
{
"epoch": 1.0304822565969063,
"grad_norm": 1.7509136256359128,
"learning_rate": 4.494460245609223e-06,
"loss": 0.0767,
"step": 2265
},
{
"epoch": 1.0309372156505914,
"grad_norm": 1.5345571279100725,
"learning_rate": 4.494029271949827e-06,
"loss": 0.1008,
"step": 2266
},
{
"epoch": 1.0313921747042767,
"grad_norm": 1.0263814664645543,
"learning_rate": 4.493598135347363e-06,
"loss": 0.0931,
"step": 2267
},
{
"epoch": 1.0318471337579618,
"grad_norm": 2.0480255592331584,
"learning_rate": 4.493166835837064e-06,
"loss": 0.0681,
"step": 2268
},
{
"epoch": 1.0323020928116469,
"grad_norm": 1.8761109395251792,
"learning_rate": 4.492735373454171e-06,
"loss": 0.1086,
"step": 2269
},
{
"epoch": 1.0327570518653322,
"grad_norm": 1.897488467663145,
"learning_rate": 4.492303748233943e-06,
"loss": 0.1267,
"step": 2270
},
{
"epoch": 1.0332120109190173,
"grad_norm": 1.7630394900644286,
"learning_rate": 4.49187196021165e-06,
"loss": 0.148,
"step": 2271
},
{
"epoch": 1.0336669699727024,
"grad_norm": 1.557460432820476,
"learning_rate": 4.491440009422575e-06,
"loss": 0.0822,
"step": 2272
},
{
"epoch": 1.0341219290263877,
"grad_norm": 2.2035963282826474,
"learning_rate": 4.491007895902016e-06,
"loss": 0.1237,
"step": 2273
},
{
"epoch": 1.0345768880800728,
"grad_norm": 1.7055574933768018,
"learning_rate": 4.490575619685283e-06,
"loss": 0.101,
"step": 2274
},
{
"epoch": 1.035031847133758,
"grad_norm": 2.3176332211637103,
"learning_rate": 4.4901431808077e-06,
"loss": 0.0965,
"step": 2275
},
{
"epoch": 1.0354868061874432,
"grad_norm": 1.9372753009751453,
"learning_rate": 4.489710579304603e-06,
"loss": 0.1356,
"step": 2276
},
{
"epoch": 1.0359417652411282,
"grad_norm": 1.3110102653721396,
"learning_rate": 4.489277815211343e-06,
"loss": 0.0544,
"step": 2277
},
{
"epoch": 1.0363967242948136,
"grad_norm": 1.4905691930121885,
"learning_rate": 4.488844888563284e-06,
"loss": 0.1552,
"step": 2278
},
{
"epoch": 1.0368516833484986,
"grad_norm": 1.2129187548833384,
"learning_rate": 4.488411799395802e-06,
"loss": 0.0635,
"step": 2279
},
{
"epoch": 1.0373066424021837,
"grad_norm": 1.7307605999371245,
"learning_rate": 4.487978547744287e-06,
"loss": 0.0718,
"step": 2280
},
{
"epoch": 1.037761601455869,
"grad_norm": 4.002919733780402,
"learning_rate": 4.487545133644143e-06,
"loss": 0.0918,
"step": 2281
},
{
"epoch": 1.0382165605095541,
"grad_norm": 1.434451235166591,
"learning_rate": 4.487111557130787e-06,
"loss": 0.1087,
"step": 2282
},
{
"epoch": 1.0386715195632392,
"grad_norm": 1.6326264823457393,
"learning_rate": 4.486677818239647e-06,
"loss": 0.0943,
"step": 2283
},
{
"epoch": 1.0391264786169245,
"grad_norm": 1.6173934297359729,
"learning_rate": 4.486243917006169e-06,
"loss": 0.0825,
"step": 2284
},
{
"epoch": 1.0395814376706096,
"grad_norm": 1.330454351983684,
"learning_rate": 4.485809853465807e-06,
"loss": 0.0505,
"step": 2285
},
{
"epoch": 1.0400363967242947,
"grad_norm": 1.3258755084207146,
"learning_rate": 4.4853756276540315e-06,
"loss": 0.0877,
"step": 2286
},
{
"epoch": 1.04049135577798,
"grad_norm": 1.4601501745351109,
"learning_rate": 4.484941239606326e-06,
"loss": 0.0861,
"step": 2287
},
{
"epoch": 1.040946314831665,
"grad_norm": 1.978079069134469,
"learning_rate": 4.484506689358186e-06,
"loss": 0.1226,
"step": 2288
},
{
"epoch": 1.0414012738853504,
"grad_norm": 1.3962311543656398,
"learning_rate": 4.484071976945121e-06,
"loss": 0.0687,
"step": 2289
},
{
"epoch": 1.0418562329390355,
"grad_norm": 1.2605481862079213,
"learning_rate": 4.483637102402655e-06,
"loss": 0.1035,
"step": 2290
},
{
"epoch": 1.0423111919927206,
"grad_norm": 1.3191554559607057,
"learning_rate": 4.4832020657663224e-06,
"loss": 0.0789,
"step": 2291
},
{
"epoch": 1.042766151046406,
"grad_norm": 1.7983136808453735,
"learning_rate": 4.482766867071673e-06,
"loss": 0.068,
"step": 2292
},
{
"epoch": 1.043221110100091,
"grad_norm": 1.3901753138130788,
"learning_rate": 4.482331506354269e-06,
"loss": 0.1017,
"step": 2293
},
{
"epoch": 1.043676069153776,
"grad_norm": 1.581469571449512,
"learning_rate": 4.4818959836496876e-06,
"loss": 0.0639,
"step": 2294
},
{
"epoch": 1.0441310282074614,
"grad_norm": 1.269815942746802,
"learning_rate": 4.481460298993515e-06,
"loss": 0.0625,
"step": 2295
},
{
"epoch": 1.0445859872611465,
"grad_norm": 1.3773026873827707,
"learning_rate": 4.481024452421357e-06,
"loss": 0.0815,
"step": 2296
},
{
"epoch": 1.0450409463148316,
"grad_norm": 1.4926712499107542,
"learning_rate": 4.480588443968825e-06,
"loss": 0.0651,
"step": 2297
},
{
"epoch": 1.0454959053685169,
"grad_norm": 1.3393174273757424,
"learning_rate": 4.4801522736715505e-06,
"loss": 0.0853,
"step": 2298
},
{
"epoch": 1.045950864422202,
"grad_norm": 1.5129017760803518,
"learning_rate": 4.479715941565174e-06,
"loss": 0.054,
"step": 2299
},
{
"epoch": 1.046405823475887,
"grad_norm": 2.0616493840890255,
"learning_rate": 4.4792794476853514e-06,
"loss": 0.0808,
"step": 2300
},
{
"epoch": 1.0468607825295724,
"grad_norm": 1.5861310389241974,
"learning_rate": 4.47884279206775e-06,
"loss": 0.0927,
"step": 2301
},
{
"epoch": 1.0473157415832575,
"grad_norm": 0.928390801162424,
"learning_rate": 4.478405974748054e-06,
"loss": 0.0722,
"step": 2302
},
{
"epoch": 1.0477707006369428,
"grad_norm": 1.5458094332092187,
"learning_rate": 4.477968995761954e-06,
"loss": 0.0867,
"step": 2303
},
{
"epoch": 1.0482256596906279,
"grad_norm": 1.5404011995876956,
"learning_rate": 4.477531855145161e-06,
"loss": 0.0902,
"step": 2304
},
{
"epoch": 1.048680618744313,
"grad_norm": 1.3434412855749513,
"learning_rate": 4.477094552933395e-06,
"loss": 0.0655,
"step": 2305
},
{
"epoch": 1.0491355777979983,
"grad_norm": 1.083100442302988,
"learning_rate": 4.476657089162391e-06,
"loss": 0.066,
"step": 2306
},
{
"epoch": 1.0495905368516834,
"grad_norm": 1.3871586676322527,
"learning_rate": 4.476219463867897e-06,
"loss": 0.1087,
"step": 2307
},
{
"epoch": 1.0500454959053684,
"grad_norm": 1.7852029642214748,
"learning_rate": 4.475781677085671e-06,
"loss": 0.0916,
"step": 2308
},
{
"epoch": 1.0505004549590538,
"grad_norm": 1.4206975802030928,
"learning_rate": 4.4753437288514904e-06,
"loss": 0.0664,
"step": 2309
},
{
"epoch": 1.0509554140127388,
"grad_norm": 1.464232148884979,
"learning_rate": 4.47490561920114e-06,
"loss": 0.098,
"step": 2310
},
{
"epoch": 1.051410373066424,
"grad_norm": 1.7389093637922037,
"learning_rate": 4.474467348170421e-06,
"loss": 0.0926,
"step": 2311
},
{
"epoch": 1.0518653321201092,
"grad_norm": 1.6567765919211275,
"learning_rate": 4.474028915795148e-06,
"loss": 0.1079,
"step": 2312
},
{
"epoch": 1.0523202911737943,
"grad_norm": 0.8043045141598315,
"learning_rate": 4.473590322111145e-06,
"loss": 0.0639,
"step": 2313
},
{
"epoch": 1.0527752502274794,
"grad_norm": 1.535130658359192,
"learning_rate": 4.473151567154255e-06,
"loss": 0.0806,
"step": 2314
},
{
"epoch": 1.0532302092811647,
"grad_norm": 1.2136793848488039,
"learning_rate": 4.472712650960328e-06,
"loss": 0.0732,
"step": 2315
},
{
"epoch": 1.0536851683348498,
"grad_norm": 1.4191160149688276,
"learning_rate": 4.472273573565234e-06,
"loss": 0.1603,
"step": 2316
},
{
"epoch": 1.0541401273885351,
"grad_norm": 1.812354142724077,
"learning_rate": 4.471834335004849e-06,
"loss": 0.1629,
"step": 2317
},
{
"epoch": 1.0545950864422202,
"grad_norm": 1.1853207063745665,
"learning_rate": 4.471394935315067e-06,
"loss": 0.0429,
"step": 2318
},
{
"epoch": 1.0550500454959053,
"grad_norm": 1.7435537882257561,
"learning_rate": 4.470955374531794e-06,
"loss": 0.1269,
"step": 2319
},
{
"epoch": 1.0555050045495906,
"grad_norm": 1.7557827405058806,
"learning_rate": 4.470515652690947e-06,
"loss": 0.065,
"step": 2320
},
{
"epoch": 1.0559599636032757,
"grad_norm": 1.413841453700311,
"learning_rate": 4.470075769828461e-06,
"loss": 0.0972,
"step": 2321
},
{
"epoch": 1.0564149226569608,
"grad_norm": 2.027164177434821,
"learning_rate": 4.46963572598028e-06,
"loss": 0.1036,
"step": 2322
},
{
"epoch": 1.056869881710646,
"grad_norm": 1.3937151595286825,
"learning_rate": 4.469195521182362e-06,
"loss": 0.0962,
"step": 2323
},
{
"epoch": 1.0573248407643312,
"grad_norm": 1.6401213468826432,
"learning_rate": 4.468755155470679e-06,
"loss": 0.0932,
"step": 2324
},
{
"epoch": 1.0577797998180163,
"grad_norm": 2.338885175215576,
"learning_rate": 4.468314628881214e-06,
"loss": 0.0962,
"step": 2325
},
{
"epoch": 1.0582347588717016,
"grad_norm": 1.4115973810191336,
"learning_rate": 4.467873941449969e-06,
"loss": 0.1021,
"step": 2326
},
{
"epoch": 1.0586897179253867,
"grad_norm": 1.982422405584423,
"learning_rate": 4.46743309321295e-06,
"loss": 0.1079,
"step": 2327
},
{
"epoch": 1.0591446769790718,
"grad_norm": 1.7740653248101632,
"learning_rate": 4.466992084206185e-06,
"loss": 0.1169,
"step": 2328
},
{
"epoch": 1.059599636032757,
"grad_norm": 1.116268548969285,
"learning_rate": 4.466550914465709e-06,
"loss": 0.0657,
"step": 2329
},
{
"epoch": 1.0600545950864422,
"grad_norm": 1.8360092943419488,
"learning_rate": 4.466109584027573e-06,
"loss": 0.127,
"step": 2330
},
{
"epoch": 1.0605095541401275,
"grad_norm": 1.3810676537742754,
"learning_rate": 4.465668092927841e-06,
"loss": 0.0856,
"step": 2331
},
{
"epoch": 1.0609645131938126,
"grad_norm": 2.185972325771388,
"learning_rate": 4.465226441202589e-06,
"loss": 0.0851,
"step": 2332
},
{
"epoch": 1.0614194722474977,
"grad_norm": 1.3875472079527142,
"learning_rate": 4.464784628887908e-06,
"loss": 0.0792,
"step": 2333
},
{
"epoch": 1.061874431301183,
"grad_norm": 1.2775951274791801,
"learning_rate": 4.4643426560199e-06,
"loss": 0.104,
"step": 2334
},
{
"epoch": 1.062329390354868,
"grad_norm": 1.5319736940172268,
"learning_rate": 4.46390052263468e-06,
"loss": 0.1104,
"step": 2335
},
{
"epoch": 1.0627843494085532,
"grad_norm": 1.812780273198809,
"learning_rate": 4.463458228768378e-06,
"loss": 0.0949,
"step": 2336
},
{
"epoch": 1.0632393084622385,
"grad_norm": 1.5756060982683149,
"learning_rate": 4.463015774457137e-06,
"loss": 0.082,
"step": 2337
},
{
"epoch": 1.0636942675159236,
"grad_norm": 2.6744844011663917,
"learning_rate": 4.462573159737113e-06,
"loss": 0.1212,
"step": 2338
},
{
"epoch": 1.0641492265696086,
"grad_norm": 1.2563398274616853,
"learning_rate": 4.462130384644472e-06,
"loss": 0.0768,
"step": 2339
},
{
"epoch": 1.064604185623294,
"grad_norm": 1.8057420294279858,
"learning_rate": 4.461687449215397e-06,
"loss": 0.1099,
"step": 2340
},
{
"epoch": 1.065059144676979,
"grad_norm": 1.6208315079433049,
"learning_rate": 4.4612443534860826e-06,
"loss": 0.1144,
"step": 2341
},
{
"epoch": 1.0655141037306644,
"grad_norm": 1.9711864344243992,
"learning_rate": 4.460801097492737e-06,
"loss": 0.0856,
"step": 2342
},
{
"epoch": 1.0659690627843494,
"grad_norm": 1.3323713152755212,
"learning_rate": 4.460357681271579e-06,
"loss": 0.0715,
"step": 2343
},
{
"epoch": 1.0664240218380345,
"grad_norm": 1.6353594143577714,
"learning_rate": 4.4599141048588454e-06,
"loss": 0.111,
"step": 2344
},
{
"epoch": 1.0668789808917198,
"grad_norm": 1.921680218643112,
"learning_rate": 4.4594703682907825e-06,
"loss": 0.1084,
"step": 2345
},
{
"epoch": 1.067333939945405,
"grad_norm": 1.6583549389810224,
"learning_rate": 4.459026471603649e-06,
"loss": 0.1051,
"step": 2346
},
{
"epoch": 1.06778889899909,
"grad_norm": 1.7686266077660249,
"learning_rate": 4.45858241483372e-06,
"loss": 0.1108,
"step": 2347
},
{
"epoch": 1.0682438580527753,
"grad_norm": 1.2657212497494363,
"learning_rate": 4.458138198017281e-06,
"loss": 0.0775,
"step": 2348
},
{
"epoch": 1.0686988171064604,
"grad_norm": 1.294854322669401,
"learning_rate": 4.457693821190631e-06,
"loss": 0.0991,
"step": 2349
},
{
"epoch": 1.0691537761601455,
"grad_norm": 1.6787540486710895,
"learning_rate": 4.4572492843900815e-06,
"loss": 0.1061,
"step": 2350
},
{
"epoch": 1.0696087352138308,
"grad_norm": 1.2916611688046353,
"learning_rate": 4.456804587651961e-06,
"loss": 0.0997,
"step": 2351
},
{
"epoch": 1.070063694267516,
"grad_norm": 1.1797535857178234,
"learning_rate": 4.456359731012606e-06,
"loss": 0.1019,
"step": 2352
},
{
"epoch": 1.070518653321201,
"grad_norm": 1.4074451049825587,
"learning_rate": 4.455914714508369e-06,
"loss": 0.0639,
"step": 2353
},
{
"epoch": 1.0709736123748863,
"grad_norm": 0.7791870489522308,
"learning_rate": 4.455469538175614e-06,
"loss": 0.0293,
"step": 2354
},
{
"epoch": 1.0714285714285714,
"grad_norm": 1.3432260603887558,
"learning_rate": 4.455024202050719e-06,
"loss": 0.086,
"step": 2355
},
{
"epoch": 1.0718835304822565,
"grad_norm": 1.4625155799519551,
"learning_rate": 4.454578706170075e-06,
"loss": 0.0726,
"step": 2356
},
{
"epoch": 1.0723384895359418,
"grad_norm": 1.9522119831099414,
"learning_rate": 4.454133050570087e-06,
"loss": 0.0687,
"step": 2357
},
{
"epoch": 1.0727934485896269,
"grad_norm": 1.561587548295498,
"learning_rate": 4.453687235287169e-06,
"loss": 0.133,
"step": 2358
},
{
"epoch": 1.0732484076433122,
"grad_norm": 1.2057828723386872,
"learning_rate": 4.453241260357754e-06,
"loss": 0.0913,
"step": 2359
},
{
"epoch": 1.0737033666969973,
"grad_norm": 1.666054721084408,
"learning_rate": 4.452795125818283e-06,
"loss": 0.0971,
"step": 2360
},
{
"epoch": 1.0741583257506824,
"grad_norm": 1.758685408172953,
"learning_rate": 4.4523488317052146e-06,
"loss": 0.1075,
"step": 2361
},
{
"epoch": 1.0746132848043677,
"grad_norm": 1.105397570856634,
"learning_rate": 4.451902378055015e-06,
"loss": 0.0573,
"step": 2362
},
{
"epoch": 1.0750682438580528,
"grad_norm": 1.192901271256021,
"learning_rate": 4.451455764904169e-06,
"loss": 0.0809,
"step": 2363
},
{
"epoch": 1.0755232029117379,
"grad_norm": 1.819087657943071,
"learning_rate": 4.45100899228917e-06,
"loss": 0.0997,
"step": 2364
},
{
"epoch": 1.0759781619654232,
"grad_norm": 1.3969388862666674,
"learning_rate": 4.4505620602465275e-06,
"loss": 0.0601,
"step": 2365
},
{
"epoch": 1.0764331210191083,
"grad_norm": 2.1004515911969937,
"learning_rate": 4.450114968812761e-06,
"loss": 0.1059,
"step": 2366
},
{
"epoch": 1.0768880800727934,
"grad_norm": 1.3898874863369548,
"learning_rate": 4.449667718024406e-06,
"loss": 0.1217,
"step": 2367
},
{
"epoch": 1.0773430391264787,
"grad_norm": 1.624148028385408,
"learning_rate": 4.449220307918011e-06,
"loss": 0.1426,
"step": 2368
},
{
"epoch": 1.0777979981801638,
"grad_norm": 1.3957158550214264,
"learning_rate": 4.448772738530134e-06,
"loss": 0.065,
"step": 2369
},
{
"epoch": 1.078252957233849,
"grad_norm": 1.2170939851594698,
"learning_rate": 4.44832500989735e-06,
"loss": 0.0431,
"step": 2370
},
{
"epoch": 1.0787079162875342,
"grad_norm": 1.4145038782998978,
"learning_rate": 4.447877122056243e-06,
"loss": 0.0672,
"step": 2371
},
{
"epoch": 1.0791628753412192,
"grad_norm": 1.6983412550072923,
"learning_rate": 4.447429075043416e-06,
"loss": 0.0645,
"step": 2372
},
{
"epoch": 1.0796178343949046,
"grad_norm": 1.9437215682706028,
"learning_rate": 4.4469808688954786e-06,
"loss": 0.0798,
"step": 2373
},
{
"epoch": 1.0800727934485896,
"grad_norm": 1.3885506691120681,
"learning_rate": 4.446532503649058e-06,
"loss": 0.1103,
"step": 2374
},
{
"epoch": 1.0805277525022747,
"grad_norm": 1.3760694731918508,
"learning_rate": 4.44608397934079e-06,
"loss": 0.0658,
"step": 2375
},
{
"epoch": 1.08098271155596,
"grad_norm": 1.4014742842676748,
"learning_rate": 4.445635296007329e-06,
"loss": 0.0777,
"step": 2376
},
{
"epoch": 1.0814376706096451,
"grad_norm": 1.5083231204611136,
"learning_rate": 4.445186453685339e-06,
"loss": 0.0765,
"step": 2377
},
{
"epoch": 1.0818926296633302,
"grad_norm": 2.31100453638565,
"learning_rate": 4.444737452411494e-06,
"loss": 0.1285,
"step": 2378
},
{
"epoch": 1.0823475887170155,
"grad_norm": 2.400477978408628,
"learning_rate": 4.444288292222488e-06,
"loss": 0.1032,
"step": 2379
},
{
"epoch": 1.0828025477707006,
"grad_norm": 1.2288090886103258,
"learning_rate": 4.443838973155023e-06,
"loss": 0.0732,
"step": 2380
},
{
"epoch": 1.0832575068243857,
"grad_norm": 1.7401608518222071,
"learning_rate": 4.443389495245816e-06,
"loss": 0.1038,
"step": 2381
},
{
"epoch": 1.083712465878071,
"grad_norm": 1.0676718989217244,
"learning_rate": 4.442939858531594e-06,
"loss": 0.0977,
"step": 2382
},
{
"epoch": 1.084167424931756,
"grad_norm": 2.16417029576833,
"learning_rate": 4.442490063049103e-06,
"loss": 0.1247,
"step": 2383
},
{
"epoch": 1.0846223839854412,
"grad_norm": 1.7397604358649068,
"learning_rate": 4.442040108835095e-06,
"loss": 0.0734,
"step": 2384
},
{
"epoch": 1.0850773430391265,
"grad_norm": 1.3344372550818824,
"learning_rate": 4.44158999592634e-06,
"loss": 0.0738,
"step": 2385
},
{
"epoch": 1.0855323020928116,
"grad_norm": 1.464102086807412,
"learning_rate": 4.441139724359617e-06,
"loss": 0.069,
"step": 2386
},
{
"epoch": 1.085987261146497,
"grad_norm": 1.2702083100987853,
"learning_rate": 4.440689294171724e-06,
"loss": 0.0731,
"step": 2387
},
{
"epoch": 1.086442220200182,
"grad_norm": 1.7208341236115763,
"learning_rate": 4.440238705399465e-06,
"loss": 0.0894,
"step": 2388
},
{
"epoch": 1.086897179253867,
"grad_norm": 1.717461266806642,
"learning_rate": 4.439787958079662e-06,
"loss": 0.0913,
"step": 2389
},
{
"epoch": 1.0873521383075524,
"grad_norm": 1.5936201417077822,
"learning_rate": 4.439337052249146e-06,
"loss": 0.0853,
"step": 2390
},
{
"epoch": 1.0878070973612375,
"grad_norm": 1.5280204524637513,
"learning_rate": 4.4388859879447645e-06,
"loss": 0.0725,
"step": 2391
},
{
"epoch": 1.0882620564149226,
"grad_norm": 1.7709159752994665,
"learning_rate": 4.438434765203376e-06,
"loss": 0.1374,
"step": 2392
},
{
"epoch": 1.0887170154686079,
"grad_norm": 1.7267099736271705,
"learning_rate": 4.4379833840618524e-06,
"loss": 0.1174,
"step": 2393
},
{
"epoch": 1.089171974522293,
"grad_norm": 1.4910726524631923,
"learning_rate": 4.4375318445570785e-06,
"loss": 0.0655,
"step": 2394
},
{
"epoch": 1.089626933575978,
"grad_norm": 1.8163886098625441,
"learning_rate": 4.437080146725951e-06,
"loss": 0.0546,
"step": 2395
},
{
"epoch": 1.0900818926296634,
"grad_norm": 1.2219692369480206,
"learning_rate": 4.436628290605384e-06,
"loss": 0.0672,
"step": 2396
},
{
"epoch": 1.0905368516833485,
"grad_norm": 1.6116626987809923,
"learning_rate": 4.436176276232297e-06,
"loss": 0.1028,
"step": 2397
},
{
"epoch": 1.0909918107370338,
"grad_norm": 2.3052452656431255,
"learning_rate": 4.4357241036436294e-06,
"loss": 0.0939,
"step": 2398
},
{
"epoch": 1.0914467697907189,
"grad_norm": 0.9223535743607304,
"learning_rate": 4.435271772876329e-06,
"loss": 0.0689,
"step": 2399
},
{
"epoch": 1.091901728844404,
"grad_norm": 1.531866494757431,
"learning_rate": 4.434819283967359e-06,
"loss": 0.1145,
"step": 2400
},
{
"epoch": 1.0923566878980893,
"grad_norm": 2.012408668977357,
"learning_rate": 4.434366636953695e-06,
"loss": 0.0655,
"step": 2401
},
{
"epoch": 1.0928116469517744,
"grad_norm": 1.4296585397558859,
"learning_rate": 4.433913831872324e-06,
"loss": 0.0663,
"step": 2402
},
{
"epoch": 1.0932666060054594,
"grad_norm": 1.5463695757532308,
"learning_rate": 4.43346086876025e-06,
"loss": 0.1785,
"step": 2403
},
{
"epoch": 1.0937215650591448,
"grad_norm": 2.2667173046164253,
"learning_rate": 4.433007747654484e-06,
"loss": 0.0963,
"step": 2404
},
{
"epoch": 1.0941765241128298,
"grad_norm": 1.7874869125348338,
"learning_rate": 4.432554468592054e-06,
"loss": 0.1245,
"step": 2405
},
{
"epoch": 1.094631483166515,
"grad_norm": 2.0669862144476387,
"learning_rate": 4.432101031610001e-06,
"loss": 0.1237,
"step": 2406
},
{
"epoch": 1.0950864422202002,
"grad_norm": 1.6979511768981763,
"learning_rate": 4.431647436745376e-06,
"loss": 0.0888,
"step": 2407
},
{
"epoch": 1.0955414012738853,
"grad_norm": 1.9257787054792377,
"learning_rate": 4.431193684035246e-06,
"loss": 0.0816,
"step": 2408
},
{
"epoch": 1.0959963603275704,
"grad_norm": 1.541493056259052,
"learning_rate": 4.43073977351669e-06,
"loss": 0.0766,
"step": 2409
},
{
"epoch": 1.0964513193812557,
"grad_norm": 2.051380197110344,
"learning_rate": 4.430285705226799e-06,
"loss": 0.0692,
"step": 2410
},
{
"epoch": 1.0969062784349408,
"grad_norm": 1.574334878171295,
"learning_rate": 4.429831479202676e-06,
"loss": 0.0867,
"step": 2411
},
{
"epoch": 1.097361237488626,
"grad_norm": 1.297944277206769,
"learning_rate": 4.429377095481441e-06,
"loss": 0.0729,
"step": 2412
},
{
"epoch": 1.0978161965423112,
"grad_norm": 1.4644868521714023,
"learning_rate": 4.428922554100221e-06,
"loss": 0.1372,
"step": 2413
},
{
"epoch": 1.0982711555959963,
"grad_norm": 1.1220705548281613,
"learning_rate": 4.428467855096163e-06,
"loss": 0.0775,
"step": 2414
},
{
"epoch": 1.0987261146496816,
"grad_norm": 2.3884661536435043,
"learning_rate": 4.428012998506419e-06,
"loss": 0.0783,
"step": 2415
},
{
"epoch": 1.0991810737033667,
"grad_norm": 1.3934936655417303,
"learning_rate": 4.42755798436816e-06,
"loss": 0.0993,
"step": 2416
},
{
"epoch": 1.0996360327570518,
"grad_norm": 1.7787119321180418,
"learning_rate": 4.427102812718568e-06,
"loss": 0.0923,
"step": 2417
},
{
"epoch": 1.100090991810737,
"grad_norm": 2.0287950182704018,
"learning_rate": 4.426647483594836e-06,
"loss": 0.1214,
"step": 2418
},
{
"epoch": 1.1005459508644222,
"grad_norm": 1.2227878126042278,
"learning_rate": 4.4261919970341724e-06,
"loss": 0.109,
"step": 2419
},
{
"epoch": 1.1010009099181073,
"grad_norm": 1.4547250907863465,
"learning_rate": 4.425736353073798e-06,
"loss": 0.0639,
"step": 2420
},
{
"epoch": 1.1014558689717926,
"grad_norm": 1.361745944169816,
"learning_rate": 4.425280551750945e-06,
"loss": 0.0779,
"step": 2421
},
{
"epoch": 1.1019108280254777,
"grad_norm": 1.4312448198815029,
"learning_rate": 4.42482459310286e-06,
"loss": 0.097,
"step": 2422
},
{
"epoch": 1.1023657870791628,
"grad_norm": 1.5917118093221942,
"learning_rate": 4.424368477166801e-06,
"loss": 0.0981,
"step": 2423
},
{
"epoch": 1.102820746132848,
"grad_norm": 1.4650250955165152,
"learning_rate": 4.423912203980041e-06,
"loss": 0.114,
"step": 2424
},
{
"epoch": 1.1032757051865332,
"grad_norm": 1.6849750447492673,
"learning_rate": 4.423455773579865e-06,
"loss": 0.072,
"step": 2425
},
{
"epoch": 1.1037306642402185,
"grad_norm": 1.678029572619772,
"learning_rate": 4.422999186003568e-06,
"loss": 0.0943,
"step": 2426
},
{
"epoch": 1.1041856232939036,
"grad_norm": 1.1098076423379506,
"learning_rate": 4.422542441288462e-06,
"loss": 0.0731,
"step": 2427
},
{
"epoch": 1.1046405823475887,
"grad_norm": 1.4743567185549873,
"learning_rate": 4.42208553947187e-06,
"loss": 0.109,
"step": 2428
},
{
"epoch": 1.105095541401274,
"grad_norm": 1.3759474671598095,
"learning_rate": 4.4216284805911275e-06,
"loss": 0.0924,
"step": 2429
},
{
"epoch": 1.105550500454959,
"grad_norm": 2.0527322032275794,
"learning_rate": 4.421171264683584e-06,
"loss": 0.106,
"step": 2430
},
{
"epoch": 1.1060054595086442,
"grad_norm": 1.664729158421169,
"learning_rate": 4.4207138917866e-06,
"loss": 0.1339,
"step": 2431
},
{
"epoch": 1.1064604185623295,
"grad_norm": 1.8178200019923791,
"learning_rate": 4.420256361937551e-06,
"loss": 0.093,
"step": 2432
},
{
"epoch": 1.1069153776160146,
"grad_norm": 1.1183446921626512,
"learning_rate": 4.419798675173824e-06,
"loss": 0.0646,
"step": 2433
},
{
"epoch": 1.1073703366696996,
"grad_norm": 1.3726858689513264,
"learning_rate": 4.419340831532819e-06,
"loss": 0.0813,
"step": 2434
},
{
"epoch": 1.107825295723385,
"grad_norm": 1.3403945446236318,
"learning_rate": 4.418882831051949e-06,
"loss": 0.0754,
"step": 2435
},
{
"epoch": 1.10828025477707,
"grad_norm": 1.6141383424379385,
"learning_rate": 4.418424673768639e-06,
"loss": 0.0661,
"step": 2436
},
{
"epoch": 1.1087352138307551,
"grad_norm": 1.0940032242798146,
"learning_rate": 4.417966359720329e-06,
"loss": 0.0318,
"step": 2437
},
{
"epoch": 1.1091901728844404,
"grad_norm": 1.3623311010378927,
"learning_rate": 4.417507888944469e-06,
"loss": 0.0637,
"step": 2438
},
{
"epoch": 1.1096451319381255,
"grad_norm": 2.141865035990428,
"learning_rate": 4.417049261478525e-06,
"loss": 0.1037,
"step": 2439
},
{
"epoch": 1.1101000909918108,
"grad_norm": 1.420497893607898,
"learning_rate": 4.416590477359971e-06,
"loss": 0.0564,
"step": 2440
},
{
"epoch": 1.110555050045496,
"grad_norm": 1.2732829960352239,
"learning_rate": 4.416131536626299e-06,
"loss": 0.1076,
"step": 2441
},
{
"epoch": 1.111010009099181,
"grad_norm": 1.4336397689648444,
"learning_rate": 4.415672439315011e-06,
"loss": 0.1066,
"step": 2442
},
{
"epoch": 1.1114649681528663,
"grad_norm": 1.0286658142783538,
"learning_rate": 4.415213185463623e-06,
"loss": 0.0992,
"step": 2443
},
{
"epoch": 1.1119199272065514,
"grad_norm": 1.5137672717842037,
"learning_rate": 4.414753775109661e-06,
"loss": 0.0474,
"step": 2444
},
{
"epoch": 1.1123748862602365,
"grad_norm": 1.7400780554313313,
"learning_rate": 4.414294208290669e-06,
"loss": 0.1138,
"step": 2445
},
{
"epoch": 1.1128298453139218,
"grad_norm": 1.644624340954533,
"learning_rate": 4.413834485044199e-06,
"loss": 0.08,
"step": 2446
},
{
"epoch": 1.113284804367607,
"grad_norm": 1.4630415788998294,
"learning_rate": 4.413374605407817e-06,
"loss": 0.0523,
"step": 2447
},
{
"epoch": 1.113739763421292,
"grad_norm": 1.8356228780285462,
"learning_rate": 4.412914569419103e-06,
"loss": 0.0811,
"step": 2448
},
{
"epoch": 1.1141947224749773,
"grad_norm": 1.324899907458732,
"learning_rate": 4.412454377115649e-06,
"loss": 0.0888,
"step": 2449
},
{
"epoch": 1.1146496815286624,
"grad_norm": 1.4895058777507912,
"learning_rate": 4.411994028535061e-06,
"loss": 0.1094,
"step": 2450
},
{
"epoch": 1.1151046405823477,
"grad_norm": 1.6376764275236961,
"learning_rate": 4.411533523714954e-06,
"loss": 0.0661,
"step": 2451
},
{
"epoch": 1.1155595996360328,
"grad_norm": 1.3175933666660855,
"learning_rate": 4.41107286269296e-06,
"loss": 0.0832,
"step": 2452
},
{
"epoch": 1.1160145586897179,
"grad_norm": 1.4664317140231247,
"learning_rate": 4.410612045506722e-06,
"loss": 0.1019,
"step": 2453
},
{
"epoch": 1.1164695177434032,
"grad_norm": 1.697124490095177,
"learning_rate": 4.410151072193897e-06,
"loss": 0.1164,
"step": 2454
},
{
"epoch": 1.1169244767970883,
"grad_norm": 1.520297101782584,
"learning_rate": 4.409689942792152e-06,
"loss": 0.0824,
"step": 2455
},
{
"epoch": 1.1173794358507734,
"grad_norm": 1.693914191969565,
"learning_rate": 4.409228657339168e-06,
"loss": 0.13,
"step": 2456
},
{
"epoch": 1.1178343949044587,
"grad_norm": 2.024825308244833,
"learning_rate": 4.4087672158726415e-06,
"loss": 0.0874,
"step": 2457
},
{
"epoch": 1.1182893539581438,
"grad_norm": 1.6218817682748383,
"learning_rate": 4.408305618430277e-06,
"loss": 0.0877,
"step": 2458
},
{
"epoch": 1.1187443130118289,
"grad_norm": 2.1554598427149054,
"learning_rate": 4.407843865049797e-06,
"loss": 0.0932,
"step": 2459
},
{
"epoch": 1.1191992720655142,
"grad_norm": 1.711228616600094,
"learning_rate": 4.40738195576893e-06,
"loss": 0.064,
"step": 2460
},
{
"epoch": 1.1196542311191993,
"grad_norm": 1.8471856875898178,
"learning_rate": 4.406919890625424e-06,
"loss": 0.0987,
"step": 2461
},
{
"epoch": 1.1201091901728844,
"grad_norm": 1.1003500159856345,
"learning_rate": 4.406457669657036e-06,
"loss": 0.0759,
"step": 2462
},
{
"epoch": 1.1205641492265697,
"grad_norm": 2.109594577114758,
"learning_rate": 4.405995292901537e-06,
"loss": 0.0942,
"step": 2463
},
{
"epoch": 1.1210191082802548,
"grad_norm": 1.8182386073569805,
"learning_rate": 4.40553276039671e-06,
"loss": 0.1389,
"step": 2464
},
{
"epoch": 1.1214740673339398,
"grad_norm": 1.4379586293025806,
"learning_rate": 4.4050700721803505e-06,
"loss": 0.099,
"step": 2465
},
{
"epoch": 1.1219290263876252,
"grad_norm": 1.4425166537042247,
"learning_rate": 4.404607228290269e-06,
"loss": 0.0861,
"step": 2466
},
{
"epoch": 1.1223839854413102,
"grad_norm": 1.4093172987847846,
"learning_rate": 4.404144228764285e-06,
"loss": 0.0621,
"step": 2467
},
{
"epoch": 1.1228389444949956,
"grad_norm": 1.8641838091648237,
"learning_rate": 4.403681073640235e-06,
"loss": 0.1364,
"step": 2468
},
{
"epoch": 1.1232939035486806,
"grad_norm": 1.4149844792642807,
"learning_rate": 4.403217762955963e-06,
"loss": 0.0738,
"step": 2469
},
{
"epoch": 1.1237488626023657,
"grad_norm": 1.167003064546788,
"learning_rate": 4.402754296749331e-06,
"loss": 0.1399,
"step": 2470
},
{
"epoch": 1.124203821656051,
"grad_norm": 1.3706100775947843,
"learning_rate": 4.402290675058211e-06,
"loss": 0.0743,
"step": 2471
},
{
"epoch": 1.1246587807097361,
"grad_norm": 1.3145920684357588,
"learning_rate": 4.401826897920487e-06,
"loss": 0.1099,
"step": 2472
},
{
"epoch": 1.1251137397634212,
"grad_norm": 1.5982593223467985,
"learning_rate": 4.4013629653740575e-06,
"loss": 0.0645,
"step": 2473
},
{
"epoch": 1.1255686988171065,
"grad_norm": 1.652131477085118,
"learning_rate": 4.400898877456833e-06,
"loss": 0.1091,
"step": 2474
},
{
"epoch": 1.1260236578707916,
"grad_norm": 1.1449819643243202,
"learning_rate": 4.400434634206737e-06,
"loss": 0.068,
"step": 2475
},
{
"epoch": 1.1264786169244767,
"grad_norm": 1.144310552102497,
"learning_rate": 4.399970235661705e-06,
"loss": 0.0685,
"step": 2476
},
{
"epoch": 1.126933575978162,
"grad_norm": 1.2448262081573807,
"learning_rate": 4.399505681859685e-06,
"loss": 0.0932,
"step": 2477
},
{
"epoch": 1.127388535031847,
"grad_norm": 1.1408663298803172,
"learning_rate": 4.399040972838639e-06,
"loss": 0.0423,
"step": 2478
},
{
"epoch": 1.1278434940855324,
"grad_norm": 1.699409897859247,
"learning_rate": 4.398576108636541e-06,
"loss": 0.0787,
"step": 2479
},
{
"epoch": 1.1282984531392175,
"grad_norm": 1.7864933002408017,
"learning_rate": 4.398111089291378e-06,
"loss": 0.0892,
"step": 2480
},
{
"epoch": 1.1287534121929026,
"grad_norm": 2.14798840196358,
"learning_rate": 4.3976459148411464e-06,
"loss": 0.1009,
"step": 2481
},
{
"epoch": 1.129208371246588,
"grad_norm": 1.5385879391737598,
"learning_rate": 4.3971805853238616e-06,
"loss": 0.081,
"step": 2482
},
{
"epoch": 1.129663330300273,
"grad_norm": 2.531930467512664,
"learning_rate": 4.396715100777547e-06,
"loss": 0.0686,
"step": 2483
},
{
"epoch": 1.130118289353958,
"grad_norm": 1.8968573987064818,
"learning_rate": 4.39624946124024e-06,
"loss": 0.1027,
"step": 2484
},
{
"epoch": 1.1305732484076434,
"grad_norm": 1.5129833288445977,
"learning_rate": 4.39578366674999e-06,
"loss": 0.072,
"step": 2485
},
{
"epoch": 1.1310282074613285,
"grad_norm": 1.4623536249588729,
"learning_rate": 4.395317717344861e-06,
"loss": 0.0924,
"step": 2486
},
{
"epoch": 1.1314831665150136,
"grad_norm": 1.9901397225611637,
"learning_rate": 4.394851613062927e-06,
"loss": 0.0852,
"step": 2487
},
{
"epoch": 1.1319381255686989,
"grad_norm": 1.3624251358159498,
"learning_rate": 4.394385353942275e-06,
"loss": 0.0543,
"step": 2488
},
{
"epoch": 1.132393084622384,
"grad_norm": 2.097016286942742,
"learning_rate": 4.393918940021008e-06,
"loss": 0.1261,
"step": 2489
},
{
"epoch": 1.132848043676069,
"grad_norm": 1.7568839339292304,
"learning_rate": 4.393452371337238e-06,
"loss": 0.0754,
"step": 2490
},
{
"epoch": 1.1333030027297544,
"grad_norm": 1.4870006844681243,
"learning_rate": 4.39298564792909e-06,
"loss": 0.0765,
"step": 2491
},
{
"epoch": 1.1337579617834395,
"grad_norm": 2.3747689669640204,
"learning_rate": 4.392518769834705e-06,
"loss": 0.1088,
"step": 2492
},
{
"epoch": 1.1342129208371245,
"grad_norm": 1.8391194648070115,
"learning_rate": 4.392051737092231e-06,
"loss": 0.1038,
"step": 2493
},
{
"epoch": 1.1346678798908099,
"grad_norm": 1.3181948862231594,
"learning_rate": 4.391584549739834e-06,
"loss": 0.0953,
"step": 2494
},
{
"epoch": 1.135122838944495,
"grad_norm": 1.768253423337537,
"learning_rate": 4.391117207815691e-06,
"loss": 0.0861,
"step": 2495
},
{
"epoch": 1.1355777979981803,
"grad_norm": 1.7733681614801209,
"learning_rate": 4.3906497113579895e-06,
"loss": 0.0869,
"step": 2496
},
{
"epoch": 1.1360327570518653,
"grad_norm": 1.7107321819304122,
"learning_rate": 4.390182060404931e-06,
"loss": 0.0522,
"step": 2497
},
{
"epoch": 1.1364877161055504,
"grad_norm": 1.434552421646011,
"learning_rate": 4.389714254994732e-06,
"loss": 0.0846,
"step": 2498
},
{
"epoch": 1.1369426751592357,
"grad_norm": 1.5226850377251067,
"learning_rate": 4.389246295165617e-06,
"loss": 0.083,
"step": 2499
},
{
"epoch": 1.1373976342129208,
"grad_norm": 1.1587798025261624,
"learning_rate": 4.388778180955826e-06,
"loss": 0.0715,
"step": 2500
},
{
"epoch": 1.137852593266606,
"grad_norm": 2.2145425207872735,
"learning_rate": 4.388309912403612e-06,
"loss": 0.126,
"step": 2501
},
{
"epoch": 1.1383075523202912,
"grad_norm": 1.860918476304708,
"learning_rate": 4.38784148954724e-06,
"loss": 0.0825,
"step": 2502
},
{
"epoch": 1.1387625113739763,
"grad_norm": 1.5494754816427427,
"learning_rate": 4.387372912424987e-06,
"loss": 0.0664,
"step": 2503
},
{
"epoch": 1.1392174704276614,
"grad_norm": 1.4756280948745337,
"learning_rate": 4.386904181075142e-06,
"loss": 0.1292,
"step": 2504
},
{
"epoch": 1.1396724294813467,
"grad_norm": 1.4970335285969478,
"learning_rate": 4.386435295536008e-06,
"loss": 0.0617,
"step": 2505
},
{
"epoch": 1.1401273885350318,
"grad_norm": 1.3926364015804897,
"learning_rate": 4.385966255845902e-06,
"loss": 0.0978,
"step": 2506
},
{
"epoch": 1.1405823475887171,
"grad_norm": 1.392316755067547,
"learning_rate": 4.38549706204315e-06,
"loss": 0.1051,
"step": 2507
},
{
"epoch": 1.1410373066424022,
"grad_norm": 1.337875750299131,
"learning_rate": 4.385027714166094e-06,
"loss": 0.0818,
"step": 2508
},
{
"epoch": 1.1414922656960873,
"grad_norm": 1.7636561267412383,
"learning_rate": 4.384558212253084e-06,
"loss": 0.058,
"step": 2509
},
{
"epoch": 1.1419472247497726,
"grad_norm": 1.4667430941313127,
"learning_rate": 4.384088556342488e-06,
"loss": 0.0757,
"step": 2510
},
{
"epoch": 1.1424021838034577,
"grad_norm": 1.4237110238919748,
"learning_rate": 4.383618746472686e-06,
"loss": 0.0769,
"step": 2511
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.5730790632789893,
"learning_rate": 4.383148782682064e-06,
"loss": 0.0653,
"step": 2512
},
{
"epoch": 1.143312101910828,
"grad_norm": 1.4241196656590642,
"learning_rate": 4.382678665009028e-06,
"loss": 0.1399,
"step": 2513
},
{
"epoch": 1.1437670609645132,
"grad_norm": 1.343619807338348,
"learning_rate": 4.382208393491994e-06,
"loss": 0.1179,
"step": 2514
},
{
"epoch": 1.1442220200181983,
"grad_norm": 1.5009441966445611,
"learning_rate": 4.381737968169389e-06,
"loss": 0.0771,
"step": 2515
},
{
"epoch": 1.1446769790718836,
"grad_norm": 1.986426705123048,
"learning_rate": 4.381267389079657e-06,
"loss": 0.0701,
"step": 2516
},
{
"epoch": 1.1451319381255687,
"grad_norm": 1.55910702321473,
"learning_rate": 4.380796656261248e-06,
"loss": 0.0972,
"step": 2517
},
{
"epoch": 1.1455868971792538,
"grad_norm": 1.3317020576259018,
"learning_rate": 4.38032576975263e-06,
"loss": 0.0611,
"step": 2518
},
{
"epoch": 1.146041856232939,
"grad_norm": 1.2157043472122377,
"learning_rate": 4.3798547295922825e-06,
"loss": 0.0699,
"step": 2519
},
{
"epoch": 1.1464968152866242,
"grad_norm": 2.724328439334893,
"learning_rate": 4.3793835358186955e-06,
"loss": 0.0797,
"step": 2520
},
{
"epoch": 1.1469517743403093,
"grad_norm": 1.7128126611421937,
"learning_rate": 4.378912188470374e-06,
"loss": 0.1045,
"step": 2521
},
{
"epoch": 1.1474067333939946,
"grad_norm": 1.4469267749443473,
"learning_rate": 4.378440687585832e-06,
"loss": 0.0924,
"step": 2522
},
{
"epoch": 1.1478616924476797,
"grad_norm": 1.8130770437623378,
"learning_rate": 4.3779690332036005e-06,
"loss": 0.1218,
"step": 2523
},
{
"epoch": 1.148316651501365,
"grad_norm": 1.7468548582501024,
"learning_rate": 4.3774972253622205e-06,
"loss": 0.1111,
"step": 2524
},
{
"epoch": 1.14877161055505,
"grad_norm": 1.4797480492586725,
"learning_rate": 4.377025264100246e-06,
"loss": 0.0854,
"step": 2525
},
{
"epoch": 1.1492265696087351,
"grad_norm": 1.7116967965378072,
"learning_rate": 4.376553149456244e-06,
"loss": 0.0594,
"step": 2526
},
{
"epoch": 1.1496815286624205,
"grad_norm": 1.643705257307874,
"learning_rate": 4.376080881468793e-06,
"loss": 0.0696,
"step": 2527
},
{
"epoch": 1.1501364877161055,
"grad_norm": 1.1326114868014416,
"learning_rate": 4.375608460176483e-06,
"loss": 0.0705,
"step": 2528
},
{
"epoch": 1.1505914467697906,
"grad_norm": 1.7031789207462111,
"learning_rate": 4.375135885617922e-06,
"loss": 0.0812,
"step": 2529
},
{
"epoch": 1.151046405823476,
"grad_norm": 1.41010135204267,
"learning_rate": 4.3746631578317236e-06,
"loss": 0.086,
"step": 2530
},
{
"epoch": 1.151501364877161,
"grad_norm": 1.6943016984534656,
"learning_rate": 4.374190276856517e-06,
"loss": 0.0754,
"step": 2531
},
{
"epoch": 1.1519563239308463,
"grad_norm": 2.0617449393261165,
"learning_rate": 4.373717242730946e-06,
"loss": 0.09,
"step": 2532
},
{
"epoch": 1.1524112829845314,
"grad_norm": 1.7367594980944636,
"learning_rate": 4.373244055493663e-06,
"loss": 0.0623,
"step": 2533
},
{
"epoch": 1.1528662420382165,
"grad_norm": 1.9342760133428794,
"learning_rate": 4.372770715183336e-06,
"loss": 0.1147,
"step": 2534
},
{
"epoch": 1.1533212010919018,
"grad_norm": 2.0637174188437255,
"learning_rate": 4.372297221838642e-06,
"loss": 0.1456,
"step": 2535
},
{
"epoch": 1.153776160145587,
"grad_norm": 1.640815829478928,
"learning_rate": 4.3718235754982755e-06,
"loss": 0.1097,
"step": 2536
},
{
"epoch": 1.154231119199272,
"grad_norm": 1.4969972221702579,
"learning_rate": 4.371349776200939e-06,
"loss": 0.1089,
"step": 2537
},
{
"epoch": 1.1546860782529573,
"grad_norm": 1.7453973329666645,
"learning_rate": 4.37087582398535e-06,
"loss": 0.081,
"step": 2538
},
{
"epoch": 1.1551410373066424,
"grad_norm": 1.3301344902434764,
"learning_rate": 4.370401718890237e-06,
"loss": 0.0839,
"step": 2539
},
{
"epoch": 1.1555959963603275,
"grad_norm": 1.3726509501801365,
"learning_rate": 4.369927460954342e-06,
"loss": 0.0757,
"step": 2540
},
{
"epoch": 1.1560509554140128,
"grad_norm": 1.7575525897527056,
"learning_rate": 4.36945305021642e-06,
"loss": 0.0984,
"step": 2541
},
{
"epoch": 1.156505914467698,
"grad_norm": 1.0573468860101436,
"learning_rate": 4.368978486715237e-06,
"loss": 0.0858,
"step": 2542
},
{
"epoch": 1.156960873521383,
"grad_norm": 1.2811400584279555,
"learning_rate": 4.368503770489573e-06,
"loss": 0.0956,
"step": 2543
},
{
"epoch": 1.1574158325750683,
"grad_norm": 1.3937719698326214,
"learning_rate": 4.368028901578218e-06,
"loss": 0.0721,
"step": 2544
},
{
"epoch": 1.1578707916287534,
"grad_norm": 1.3592341439150106,
"learning_rate": 4.367553880019977e-06,
"loss": 0.072,
"step": 2545
},
{
"epoch": 1.1583257506824385,
"grad_norm": 1.6455271567667071,
"learning_rate": 4.367078705853667e-06,
"loss": 0.0688,
"step": 2546
},
{
"epoch": 1.1587807097361238,
"grad_norm": 1.6810345974728753,
"learning_rate": 4.366603379118117e-06,
"loss": 0.1038,
"step": 2547
},
{
"epoch": 1.1592356687898089,
"grad_norm": 1.4578278036788574,
"learning_rate": 4.366127899852169e-06,
"loss": 0.0865,
"step": 2548
},
{
"epoch": 1.159690627843494,
"grad_norm": 1.3103780377545284,
"learning_rate": 4.365652268094675e-06,
"loss": 0.0674,
"step": 2549
},
{
"epoch": 1.1601455868971793,
"grad_norm": 1.7957120553998775,
"learning_rate": 4.365176483884504e-06,
"loss": 0.1312,
"step": 2550
},
{
"epoch": 1.1606005459508644,
"grad_norm": 1.6492238946584739,
"learning_rate": 4.364700547260533e-06,
"loss": 0.0907,
"step": 2551
},
{
"epoch": 1.1610555050045497,
"grad_norm": 1.3864243311454894,
"learning_rate": 4.3642244582616545e-06,
"loss": 0.0977,
"step": 2552
},
{
"epoch": 1.1615104640582348,
"grad_norm": 1.5321223648985156,
"learning_rate": 4.363748216926772e-06,
"loss": 0.0975,
"step": 2553
},
{
"epoch": 1.1619654231119199,
"grad_norm": 1.428088888774431,
"learning_rate": 4.363271823294802e-06,
"loss": 0.1138,
"step": 2554
},
{
"epoch": 1.1624203821656052,
"grad_norm": 1.9030961957887997,
"learning_rate": 4.362795277404673e-06,
"loss": 0.1121,
"step": 2555
},
{
"epoch": 1.1628753412192903,
"grad_norm": 1.1462755051031488,
"learning_rate": 4.362318579295326e-06,
"loss": 0.0467,
"step": 2556
},
{
"epoch": 1.1633303002729753,
"grad_norm": 1.4980767963568005,
"learning_rate": 4.361841729005715e-06,
"loss": 0.1018,
"step": 2557
},
{
"epoch": 1.1637852593266607,
"grad_norm": 2.2145503141446614,
"learning_rate": 4.361364726574806e-06,
"loss": 0.0853,
"step": 2558
},
{
"epoch": 1.1642402183803457,
"grad_norm": 1.1989117424823872,
"learning_rate": 4.360887572041578e-06,
"loss": 0.0868,
"step": 2559
},
{
"epoch": 1.164695177434031,
"grad_norm": 1.9066512245156881,
"learning_rate": 4.36041026544502e-06,
"loss": 0.1471,
"step": 2560
},
{
"epoch": 1.1651501364877161,
"grad_norm": 1.428837377276699,
"learning_rate": 4.359932806824138e-06,
"loss": 0.0718,
"step": 2561
},
{
"epoch": 1.1656050955414012,
"grad_norm": 1.417125208635274,
"learning_rate": 4.359455196217946e-06,
"loss": 0.0614,
"step": 2562
},
{
"epoch": 1.1660600545950865,
"grad_norm": 1.6663939403921464,
"learning_rate": 4.358977433665471e-06,
"loss": 0.0586,
"step": 2563
},
{
"epoch": 1.1665150136487716,
"grad_norm": 1.3921354785427886,
"learning_rate": 4.3584995192057565e-06,
"loss": 0.0691,
"step": 2564
},
{
"epoch": 1.1669699727024567,
"grad_norm": 1.1683109281081594,
"learning_rate": 4.358021452877854e-06,
"loss": 0.0952,
"step": 2565
},
{
"epoch": 1.167424931756142,
"grad_norm": 1.5985810446894706,
"learning_rate": 4.357543234720829e-06,
"loss": 0.0771,
"step": 2566
},
{
"epoch": 1.1678798908098271,
"grad_norm": 1.726758001874974,
"learning_rate": 4.357064864773761e-06,
"loss": 0.0852,
"step": 2567
},
{
"epoch": 1.1683348498635122,
"grad_norm": 1.376146728666042,
"learning_rate": 4.3565863430757375e-06,
"loss": 0.0816,
"step": 2568
},
{
"epoch": 1.1687898089171975,
"grad_norm": 1.266164839412077,
"learning_rate": 4.356107669665862e-06,
"loss": 0.095,
"step": 2569
},
{
"epoch": 1.1692447679708826,
"grad_norm": 1.7363433482517434,
"learning_rate": 4.355628844583249e-06,
"loss": 0.1348,
"step": 2570
},
{
"epoch": 1.1696997270245677,
"grad_norm": 1.5900315387927095,
"learning_rate": 4.355149867867029e-06,
"loss": 0.0785,
"step": 2571
},
{
"epoch": 1.170154686078253,
"grad_norm": 1.7031570854225535,
"learning_rate": 4.354670739556338e-06,
"loss": 0.0903,
"step": 2572
},
{
"epoch": 1.170609645131938,
"grad_norm": 1.553459320102983,
"learning_rate": 4.35419145969033e-06,
"loss": 0.0808,
"step": 2573
},
{
"epoch": 1.1710646041856232,
"grad_norm": 1.624748274996521,
"learning_rate": 4.35371202830817e-06,
"loss": 0.0946,
"step": 2574
},
{
"epoch": 1.1715195632393085,
"grad_norm": 1.998220943026382,
"learning_rate": 4.353232445449034e-06,
"loss": 0.1007,
"step": 2575
},
{
"epoch": 1.1719745222929936,
"grad_norm": 1.3879277679859046,
"learning_rate": 4.352752711152112e-06,
"loss": 0.0752,
"step": 2576
},
{
"epoch": 1.1724294813466787,
"grad_norm": 2.043253151446217,
"learning_rate": 4.352272825456605e-06,
"loss": 0.1392,
"step": 2577
},
{
"epoch": 1.172884440400364,
"grad_norm": 1.4430794602564747,
"learning_rate": 4.3517927884017275e-06,
"loss": 0.1071,
"step": 2578
},
{
"epoch": 1.173339399454049,
"grad_norm": 1.3026567584819855,
"learning_rate": 4.351312600026706e-06,
"loss": 0.0907,
"step": 2579
},
{
"epoch": 1.1737943585077344,
"grad_norm": 1.4101005705511307,
"learning_rate": 4.350832260370779e-06,
"loss": 0.1012,
"step": 2580
},
{
"epoch": 1.1742493175614195,
"grad_norm": 1.3419121345653944,
"learning_rate": 4.350351769473198e-06,
"loss": 0.0696,
"step": 2581
},
{
"epoch": 1.1747042766151046,
"grad_norm": 1.350413613603601,
"learning_rate": 4.349871127373226e-06,
"loss": 0.0917,
"step": 2582
},
{
"epoch": 1.1751592356687899,
"grad_norm": 1.5328058199569599,
"learning_rate": 4.349390334110141e-06,
"loss": 0.1113,
"step": 2583
},
{
"epoch": 1.175614194722475,
"grad_norm": 1.1093873947356732,
"learning_rate": 4.348909389723228e-06,
"loss": 0.0659,
"step": 2584
},
{
"epoch": 1.17606915377616,
"grad_norm": 1.6756868000210596,
"learning_rate": 4.348428294251791e-06,
"loss": 0.0998,
"step": 2585
},
{
"epoch": 1.1765241128298454,
"grad_norm": 1.4020895191217355,
"learning_rate": 4.34794704773514e-06,
"loss": 0.0756,
"step": 2586
},
{
"epoch": 1.1769790718835305,
"grad_norm": 1.619901575556969,
"learning_rate": 4.347465650212602e-06,
"loss": 0.1049,
"step": 2587
},
{
"epoch": 1.1774340309372158,
"grad_norm": 1.2820911146358447,
"learning_rate": 4.346984101723513e-06,
"loss": 0.099,
"step": 2588
},
{
"epoch": 1.1778889899909009,
"grad_norm": 1.5114352969050147,
"learning_rate": 4.3465024023072255e-06,
"loss": 0.1257,
"step": 2589
},
{
"epoch": 1.178343949044586,
"grad_norm": 1.3539463988206946,
"learning_rate": 4.3460205520031006e-06,
"loss": 0.0593,
"step": 2590
},
{
"epoch": 1.1787989080982713,
"grad_norm": 1.951842216649359,
"learning_rate": 4.345538550850512e-06,
"loss": 0.1236,
"step": 2591
},
{
"epoch": 1.1792538671519563,
"grad_norm": 1.8285849146657949,
"learning_rate": 4.345056398888847e-06,
"loss": 0.0928,
"step": 2592
},
{
"epoch": 1.1797088262056414,
"grad_norm": 1.5041066242121004,
"learning_rate": 4.3445740961575066e-06,
"loss": 0.0687,
"step": 2593
},
{
"epoch": 1.1801637852593267,
"grad_norm": 1.6575747108346124,
"learning_rate": 4.3440916426959e-06,
"loss": 0.0904,
"step": 2594
},
{
"epoch": 1.1806187443130118,
"grad_norm": 1.3214979838016756,
"learning_rate": 4.343609038543452e-06,
"loss": 0.0899,
"step": 2595
},
{
"epoch": 1.181073703366697,
"grad_norm": 1.4859231565076656,
"learning_rate": 4.3431262837396e-06,
"loss": 0.0978,
"step": 2596
},
{
"epoch": 1.1815286624203822,
"grad_norm": 1.6150637319977543,
"learning_rate": 4.342643378323791e-06,
"loss": 0.0842,
"step": 2597
},
{
"epoch": 1.1819836214740673,
"grad_norm": 1.413038987453138,
"learning_rate": 4.342160322335487e-06,
"loss": 0.0654,
"step": 2598
},
{
"epoch": 1.1824385805277524,
"grad_norm": 2.182860548460036,
"learning_rate": 4.34167711581416e-06,
"loss": 0.0841,
"step": 2599
},
{
"epoch": 1.1828935395814377,
"grad_norm": 1.275297167024451,
"learning_rate": 4.3411937587992955e-06,
"loss": 0.0722,
"step": 2600
},
{
"epoch": 1.1833484986351228,
"grad_norm": 1.1799530738898074,
"learning_rate": 4.340710251330393e-06,
"loss": 0.0662,
"step": 2601
},
{
"epoch": 1.183803457688808,
"grad_norm": 1.872220715095368,
"learning_rate": 4.34022659344696e-06,
"loss": 0.1292,
"step": 2602
},
{
"epoch": 1.1842584167424932,
"grad_norm": 1.6772862778704278,
"learning_rate": 4.339742785188521e-06,
"loss": 0.0966,
"step": 2603
},
{
"epoch": 1.1847133757961783,
"grad_norm": 1.6082753483614305,
"learning_rate": 4.339258826594611e-06,
"loss": 0.0582,
"step": 2604
},
{
"epoch": 1.1851683348498634,
"grad_norm": 1.6117792608004555,
"learning_rate": 4.338774717704774e-06,
"loss": 0.0643,
"step": 2605
},
{
"epoch": 1.1856232939035487,
"grad_norm": 1.7422517232972539,
"learning_rate": 4.338290458558572e-06,
"loss": 0.1766,
"step": 2606
},
{
"epoch": 1.1860782529572338,
"grad_norm": 2.1476781837506818,
"learning_rate": 4.3378060491955744e-06,
"loss": 0.1463,
"step": 2607
},
{
"epoch": 1.186533212010919,
"grad_norm": 1.8922581543540133,
"learning_rate": 4.337321489655366e-06,
"loss": 0.1528,
"step": 2608
},
{
"epoch": 1.1869881710646042,
"grad_norm": 1.7516502810489014,
"learning_rate": 4.336836779977543e-06,
"loss": 0.1038,
"step": 2609
},
{
"epoch": 1.1874431301182893,
"grad_norm": 1.4511814170214454,
"learning_rate": 4.336351920201714e-06,
"loss": 0.1005,
"step": 2610
},
{
"epoch": 1.1878980891719746,
"grad_norm": 1.5620930461894496,
"learning_rate": 4.335866910367498e-06,
"loss": 0.0492,
"step": 2611
},
{
"epoch": 1.1883530482256597,
"grad_norm": 2.7082970498760117,
"learning_rate": 4.3353817505145294e-06,
"loss": 0.0909,
"step": 2612
},
{
"epoch": 1.1888080072793448,
"grad_norm": 1.5743219982804768,
"learning_rate": 4.334896440682452e-06,
"loss": 0.077,
"step": 2613
},
{
"epoch": 1.18926296633303,
"grad_norm": 1.3966339148129352,
"learning_rate": 4.334410980910924e-06,
"loss": 0.1218,
"step": 2614
},
{
"epoch": 1.1897179253867152,
"grad_norm": 1.4856452151376027,
"learning_rate": 4.333925371239615e-06,
"loss": 0.1035,
"step": 2615
},
{
"epoch": 1.1901728844404005,
"grad_norm": 1.6127438575709883,
"learning_rate": 4.3334396117082065e-06,
"loss": 0.1052,
"step": 2616
},
{
"epoch": 1.1906278434940856,
"grad_norm": 1.7288330036362787,
"learning_rate": 4.332953702356393e-06,
"loss": 0.1607,
"step": 2617
},
{
"epoch": 1.1910828025477707,
"grad_norm": 1.2779780017213267,
"learning_rate": 4.33246764322388e-06,
"loss": 0.0664,
"step": 2618
},
{
"epoch": 1.191537761601456,
"grad_norm": 1.843632743904082,
"learning_rate": 4.331981434350387e-06,
"loss": 0.1535,
"step": 2619
},
{
"epoch": 1.191992720655141,
"grad_norm": 1.3210812550635276,
"learning_rate": 4.331495075775644e-06,
"loss": 0.1404,
"step": 2620
},
{
"epoch": 1.1924476797088261,
"grad_norm": 1.3878492439329282,
"learning_rate": 4.331008567539395e-06,
"loss": 0.0747,
"step": 2621
},
{
"epoch": 1.1929026387625115,
"grad_norm": 1.3357463507965919,
"learning_rate": 4.330521909681394e-06,
"loss": 0.0766,
"step": 2622
},
{
"epoch": 1.1933575978161965,
"grad_norm": 1.6211605147229922,
"learning_rate": 4.330035102241409e-06,
"loss": 0.1197,
"step": 2623
},
{
"epoch": 1.1938125568698816,
"grad_norm": 1.496864935979414,
"learning_rate": 4.32954814525922e-06,
"loss": 0.0701,
"step": 2624
},
{
"epoch": 1.194267515923567,
"grad_norm": 1.3041113510202,
"learning_rate": 4.329061038774619e-06,
"loss": 0.071,
"step": 2625
},
{
"epoch": 1.194722474977252,
"grad_norm": 1.3390637893903103,
"learning_rate": 4.32857378282741e-06,
"loss": 0.0951,
"step": 2626
},
{
"epoch": 1.1951774340309371,
"grad_norm": 1.3209742325562313,
"learning_rate": 4.328086377457409e-06,
"loss": 0.0844,
"step": 2627
},
{
"epoch": 1.1956323930846224,
"grad_norm": 1.8118172786335158,
"learning_rate": 4.327598822704444e-06,
"loss": 0.1175,
"step": 2628
},
{
"epoch": 1.1960873521383075,
"grad_norm": 1.6299368669430234,
"learning_rate": 4.327111118608357e-06,
"loss": 0.1467,
"step": 2629
},
{
"epoch": 1.1965423111919926,
"grad_norm": 1.5688063002459107,
"learning_rate": 4.326623265209001e-06,
"loss": 0.0803,
"step": 2630
},
{
"epoch": 1.196997270245678,
"grad_norm": 1.6465294755773725,
"learning_rate": 4.326135262546241e-06,
"loss": 0.0705,
"step": 2631
},
{
"epoch": 1.197452229299363,
"grad_norm": 1.6238105525738482,
"learning_rate": 4.325647110659954e-06,
"loss": 0.1254,
"step": 2632
},
{
"epoch": 1.197907188353048,
"grad_norm": 1.7891444626148267,
"learning_rate": 4.325158809590028e-06,
"loss": 0.0718,
"step": 2633
},
{
"epoch": 1.1983621474067334,
"grad_norm": 1.047556103709193,
"learning_rate": 4.324670359376368e-06,
"loss": 0.0548,
"step": 2634
},
{
"epoch": 1.1988171064604185,
"grad_norm": 1.4266407858751808,
"learning_rate": 4.3241817600588865e-06,
"loss": 0.0799,
"step": 2635
},
{
"epoch": 1.1992720655141038,
"grad_norm": 1.0758052671422083,
"learning_rate": 4.3236930116775086e-06,
"loss": 0.0469,
"step": 2636
},
{
"epoch": 1.199727024567789,
"grad_norm": 1.8000162783707994,
"learning_rate": 4.323204114272174e-06,
"loss": 0.1349,
"step": 2637
},
{
"epoch": 1.200181983621474,
"grad_norm": 2.2216878566032836,
"learning_rate": 4.3227150678828335e-06,
"loss": 0.1198,
"step": 2638
},
{
"epoch": 1.2006369426751593,
"grad_norm": 1.674728333776232,
"learning_rate": 4.322225872549448e-06,
"loss": 0.1025,
"step": 2639
},
{
"epoch": 1.2010919017288444,
"grad_norm": 1.689368542839076,
"learning_rate": 4.321736528311994e-06,
"loss": 0.1048,
"step": 2640
},
{
"epoch": 1.2015468607825295,
"grad_norm": 1.4354075881450123,
"learning_rate": 4.321247035210456e-06,
"loss": 0.0692,
"step": 2641
},
{
"epoch": 1.2020018198362148,
"grad_norm": 1.6563738642729477,
"learning_rate": 4.320757393284837e-06,
"loss": 0.0767,
"step": 2642
},
{
"epoch": 1.2024567788898999,
"grad_norm": 1.379611923602435,
"learning_rate": 4.3202676025751455e-06,
"loss": 0.0591,
"step": 2643
},
{
"epoch": 1.2029117379435852,
"grad_norm": 1.6479290456698004,
"learning_rate": 4.319777663121406e-06,
"loss": 0.0961,
"step": 2644
},
{
"epoch": 1.2033666969972703,
"grad_norm": 1.9415821059711678,
"learning_rate": 4.319287574963653e-06,
"loss": 0.1624,
"step": 2645
},
{
"epoch": 1.2038216560509554,
"grad_norm": 1.5187755572188995,
"learning_rate": 4.318797338141936e-06,
"loss": 0.0799,
"step": 2646
},
{
"epoch": 1.2042766151046407,
"grad_norm": 1.2261158559841066,
"learning_rate": 4.318306952696314e-06,
"loss": 0.0789,
"step": 2647
},
{
"epoch": 1.2047315741583258,
"grad_norm": 1.5350997195388667,
"learning_rate": 4.317816418666859e-06,
"loss": 0.0648,
"step": 2648
},
{
"epoch": 1.2051865332120109,
"grad_norm": 2.0282859482323135,
"learning_rate": 4.317325736093656e-06,
"loss": 0.1003,
"step": 2649
},
{
"epoch": 1.2056414922656962,
"grad_norm": 1.099438335437198,
"learning_rate": 4.316834905016801e-06,
"loss": 0.0749,
"step": 2650
},
{
"epoch": 1.2060964513193813,
"grad_norm": 1.6955258737212886,
"learning_rate": 4.3163439254764015e-06,
"loss": 0.0799,
"step": 2651
},
{
"epoch": 1.2065514103730663,
"grad_norm": 1.4782312844645842,
"learning_rate": 4.31585279751258e-06,
"loss": 0.0812,
"step": 2652
},
{
"epoch": 1.2070063694267517,
"grad_norm": 0.962225205333111,
"learning_rate": 4.315361521165467e-06,
"loss": 0.0421,
"step": 2653
},
{
"epoch": 1.2074613284804367,
"grad_norm": 1.475944438171979,
"learning_rate": 4.314870096475209e-06,
"loss": 0.0797,
"step": 2654
},
{
"epoch": 1.2079162875341218,
"grad_norm": 1.9568750202890988,
"learning_rate": 4.3143785234819624e-06,
"loss": 0.1064,
"step": 2655
},
{
"epoch": 1.2083712465878071,
"grad_norm": 1.2968330567546162,
"learning_rate": 4.3138868022258974e-06,
"loss": 0.0541,
"step": 2656
},
{
"epoch": 1.2088262056414922,
"grad_norm": 1.3512605939635933,
"learning_rate": 4.313394932747194e-06,
"loss": 0.084,
"step": 2657
},
{
"epoch": 1.2092811646951773,
"grad_norm": 1.2788458917599885,
"learning_rate": 4.312902915086045e-06,
"loss": 0.078,
"step": 2658
},
{
"epoch": 1.2097361237488626,
"grad_norm": 1.2087340265742859,
"learning_rate": 4.312410749282658e-06,
"loss": 0.083,
"step": 2659
},
{
"epoch": 1.2101910828025477,
"grad_norm": 1.51675138627556,
"learning_rate": 4.311918435377248e-06,
"loss": 0.098,
"step": 2660
},
{
"epoch": 1.210646041856233,
"grad_norm": 1.767606141999641,
"learning_rate": 4.311425973410047e-06,
"loss": 0.1403,
"step": 2661
},
{
"epoch": 1.2111010009099181,
"grad_norm": 1.8607859425213837,
"learning_rate": 4.310933363421296e-06,
"loss": 0.1002,
"step": 2662
},
{
"epoch": 1.2115559599636032,
"grad_norm": 2.188295719120762,
"learning_rate": 4.310440605451248e-06,
"loss": 0.1062,
"step": 2663
},
{
"epoch": 1.2120109190172885,
"grad_norm": 1.6007893169355347,
"learning_rate": 4.30994769954017e-06,
"loss": 0.0855,
"step": 2664
},
{
"epoch": 1.2124658780709736,
"grad_norm": 1.7264264512353125,
"learning_rate": 4.30945464572834e-06,
"loss": 0.1561,
"step": 2665
},
{
"epoch": 1.2129208371246587,
"grad_norm": 1.4708066988612976,
"learning_rate": 4.3089614440560465e-06,
"loss": 0.0607,
"step": 2666
},
{
"epoch": 1.213375796178344,
"grad_norm": 1.5600890024513265,
"learning_rate": 4.3084680945635946e-06,
"loss": 0.1364,
"step": 2667
},
{
"epoch": 1.213830755232029,
"grad_norm": 1.876498244558624,
"learning_rate": 4.307974597291296e-06,
"loss": 0.1076,
"step": 2668
},
{
"epoch": 1.2142857142857142,
"grad_norm": 1.37065103914952,
"learning_rate": 4.307480952279478e-06,
"loss": 0.0523,
"step": 2669
},
{
"epoch": 1.2147406733393995,
"grad_norm": 1.4444820040999051,
"learning_rate": 4.3069871595684795e-06,
"loss": 0.0739,
"step": 2670
},
{
"epoch": 1.2151956323930846,
"grad_norm": 1.5069719193608038,
"learning_rate": 4.30649321919865e-06,
"loss": 0.0911,
"step": 2671
},
{
"epoch": 1.21565059144677,
"grad_norm": 1.2934622383879057,
"learning_rate": 4.305999131210353e-06,
"loss": 0.0837,
"step": 2672
},
{
"epoch": 1.216105550500455,
"grad_norm": 1.5853581830621495,
"learning_rate": 4.305504895643963e-06,
"loss": 0.0833,
"step": 2673
},
{
"epoch": 1.21656050955414,
"grad_norm": 1.3709517382273528,
"learning_rate": 4.305010512539867e-06,
"loss": 0.1159,
"step": 2674
},
{
"epoch": 1.2170154686078254,
"grad_norm": 1.4168456459509742,
"learning_rate": 4.304515981938462e-06,
"loss": 0.0606,
"step": 2675
},
{
"epoch": 1.2174704276615105,
"grad_norm": 1.5616363029677887,
"learning_rate": 4.304021303880161e-06,
"loss": 0.0996,
"step": 2676
},
{
"epoch": 1.2179253867151956,
"grad_norm": 1.708179628273713,
"learning_rate": 4.303526478405386e-06,
"loss": 0.1065,
"step": 2677
},
{
"epoch": 1.2183803457688809,
"grad_norm": 2.116672264038859,
"learning_rate": 4.3030315055545715e-06,
"loss": 0.128,
"step": 2678
},
{
"epoch": 1.218835304822566,
"grad_norm": 1.6986733358840764,
"learning_rate": 4.302536385368165e-06,
"loss": 0.082,
"step": 2679
},
{
"epoch": 1.219290263876251,
"grad_norm": 1.6851973141425958,
"learning_rate": 4.3020411178866246e-06,
"loss": 0.0666,
"step": 2680
},
{
"epoch": 1.2197452229299364,
"grad_norm": 1.3268862435295075,
"learning_rate": 4.3015457031504226e-06,
"loss": 0.0615,
"step": 2681
},
{
"epoch": 1.2202001819836215,
"grad_norm": 2.894618285414545,
"learning_rate": 4.301050141200041e-06,
"loss": 0.1161,
"step": 2682
},
{
"epoch": 1.2206551410373065,
"grad_norm": 1.8518976016980668,
"learning_rate": 4.300554432075975e-06,
"loss": 0.0677,
"step": 2683
},
{
"epoch": 1.2211101000909919,
"grad_norm": 1.9252846318661894,
"learning_rate": 4.300058575818733e-06,
"loss": 0.1195,
"step": 2684
},
{
"epoch": 1.221565059144677,
"grad_norm": 1.7916218908549502,
"learning_rate": 4.299562572468833e-06,
"loss": 0.1264,
"step": 2685
},
{
"epoch": 1.222020018198362,
"grad_norm": 1.3194566331820348,
"learning_rate": 4.299066422066807e-06,
"loss": 0.044,
"step": 2686
},
{
"epoch": 1.2224749772520473,
"grad_norm": 1.702059632495899,
"learning_rate": 4.2985701246531965e-06,
"loss": 0.1094,
"step": 2687
},
{
"epoch": 1.2229299363057324,
"grad_norm": 1.3985606136942172,
"learning_rate": 4.2980736802685575e-06,
"loss": 0.0476,
"step": 2688
},
{
"epoch": 1.2233848953594177,
"grad_norm": 1.8905242980121515,
"learning_rate": 4.297577088953458e-06,
"loss": 0.0676,
"step": 2689
},
{
"epoch": 1.2238398544131028,
"grad_norm": 0.8842330436141602,
"learning_rate": 4.2970803507484756e-06,
"loss": 0.0528,
"step": 2690
},
{
"epoch": 1.224294813466788,
"grad_norm": 1.5087671057266334,
"learning_rate": 4.296583465694204e-06,
"loss": 0.0781,
"step": 2691
},
{
"epoch": 1.2247497725204732,
"grad_norm": 2.1139760440967112,
"learning_rate": 4.296086433831244e-06,
"loss": 0.0995,
"step": 2692
},
{
"epoch": 1.2252047315741583,
"grad_norm": 1.3607345905968589,
"learning_rate": 4.295589255200212e-06,
"loss": 0.0842,
"step": 2693
},
{
"epoch": 1.2256596906278434,
"grad_norm": 1.7864471189286306,
"learning_rate": 4.295091929841734e-06,
"loss": 0.0839,
"step": 2694
},
{
"epoch": 1.2261146496815287,
"grad_norm": 1.4725627389737213,
"learning_rate": 4.2945944577964516e-06,
"loss": 0.1817,
"step": 2695
},
{
"epoch": 1.2265696087352138,
"grad_norm": 1.1876699089763878,
"learning_rate": 4.294096839105013e-06,
"loss": 0.0614,
"step": 2696
},
{
"epoch": 1.2270245677888991,
"grad_norm": 1.4225833533824312,
"learning_rate": 4.293599073808083e-06,
"loss": 0.0796,
"step": 2697
},
{
"epoch": 1.2274795268425842,
"grad_norm": 1.3288722678195426,
"learning_rate": 4.293101161946337e-06,
"loss": 0.0555,
"step": 2698
},
{
"epoch": 1.2279344858962693,
"grad_norm": 1.2424148095147949,
"learning_rate": 4.292603103560462e-06,
"loss": 0.0488,
"step": 2699
},
{
"epoch": 1.2283894449499546,
"grad_norm": 1.2746073892843495,
"learning_rate": 4.292104898691157e-06,
"loss": 0.0965,
"step": 2700
},
{
"epoch": 1.2288444040036397,
"grad_norm": 1.9553417584027957,
"learning_rate": 4.291606547379131e-06,
"loss": 0.0863,
"step": 2701
},
{
"epoch": 1.2292993630573248,
"grad_norm": 1.6292687158685326,
"learning_rate": 4.291108049665109e-06,
"loss": 0.1039,
"step": 2702
},
{
"epoch": 1.22975432211101,
"grad_norm": 1.6141920925692421,
"learning_rate": 4.290609405589827e-06,
"loss": 0.0702,
"step": 2703
},
{
"epoch": 1.2302092811646952,
"grad_norm": 1.568358524006938,
"learning_rate": 4.29011061519403e-06,
"loss": 0.1305,
"step": 2704
},
{
"epoch": 1.2306642402183803,
"grad_norm": 1.5832578242534308,
"learning_rate": 4.289611678518478e-06,
"loss": 0.0943,
"step": 2705
},
{
"epoch": 1.2311191992720656,
"grad_norm": 1.7204606734278,
"learning_rate": 4.289112595603941e-06,
"loss": 0.1271,
"step": 2706
},
{
"epoch": 1.2315741583257507,
"grad_norm": 1.878311333320497,
"learning_rate": 4.288613366491202e-06,
"loss": 0.0753,
"step": 2707
},
{
"epoch": 1.2320291173794358,
"grad_norm": 1.6190494499887427,
"learning_rate": 4.288113991221057e-06,
"loss": 0.0815,
"step": 2708
},
{
"epoch": 1.232484076433121,
"grad_norm": 1.4265449920467896,
"learning_rate": 4.2876144698343115e-06,
"loss": 0.0905,
"step": 2709
},
{
"epoch": 1.2329390354868062,
"grad_norm": 1.5792299252383166,
"learning_rate": 4.287114802371783e-06,
"loss": 0.0933,
"step": 2710
},
{
"epoch": 1.2333939945404913,
"grad_norm": 1.5541962345380622,
"learning_rate": 4.286614988874304e-06,
"loss": 0.1018,
"step": 2711
},
{
"epoch": 1.2338489535941766,
"grad_norm": 1.4933850317503654,
"learning_rate": 4.286115029382717e-06,
"loss": 0.1448,
"step": 2712
},
{
"epoch": 1.2343039126478617,
"grad_norm": 1.778907316114548,
"learning_rate": 4.285614923937876e-06,
"loss": 0.1101,
"step": 2713
},
{
"epoch": 1.2347588717015467,
"grad_norm": 1.3970757565526302,
"learning_rate": 4.285114672580647e-06,
"loss": 0.0862,
"step": 2714
},
{
"epoch": 1.235213830755232,
"grad_norm": 1.9653421473113715,
"learning_rate": 4.284614275351907e-06,
"loss": 0.1155,
"step": 2715
},
{
"epoch": 1.2356687898089171,
"grad_norm": 1.4818183158109117,
"learning_rate": 4.2841137322925495e-06,
"loss": 0.1109,
"step": 2716
},
{
"epoch": 1.2361237488626025,
"grad_norm": 1.395827472007909,
"learning_rate": 4.283613043443474e-06,
"loss": 0.0615,
"step": 2717
},
{
"epoch": 1.2365787079162875,
"grad_norm": 1.2600494580099084,
"learning_rate": 4.2831122088455955e-06,
"loss": 0.0588,
"step": 2718
},
{
"epoch": 1.2370336669699726,
"grad_norm": 1.731274261725021,
"learning_rate": 4.2826112285398395e-06,
"loss": 0.1502,
"step": 2719
},
{
"epoch": 1.237488626023658,
"grad_norm": 1.0227517272317024,
"learning_rate": 4.282110102567145e-06,
"loss": 0.0517,
"step": 2720
},
{
"epoch": 1.237943585077343,
"grad_norm": 1.3776885997310226,
"learning_rate": 4.28160883096846e-06,
"loss": 0.0663,
"step": 2721
},
{
"epoch": 1.2383985441310281,
"grad_norm": 1.2572442124919356,
"learning_rate": 4.281107413784747e-06,
"loss": 0.067,
"step": 2722
},
{
"epoch": 1.2388535031847134,
"grad_norm": 1.2741809908905852,
"learning_rate": 4.28060585105698e-06,
"loss": 0.1001,
"step": 2723
},
{
"epoch": 1.2393084622383985,
"grad_norm": 1.6333661735440708,
"learning_rate": 4.280104142826143e-06,
"loss": 0.0787,
"step": 2724
},
{
"epoch": 1.2397634212920838,
"grad_norm": 2.1072595872871984,
"learning_rate": 4.2796022891332355e-06,
"loss": 0.1632,
"step": 2725
},
{
"epoch": 1.240218380345769,
"grad_norm": 2.029930265466161,
"learning_rate": 4.279100290019265e-06,
"loss": 0.0732,
"step": 2726
},
{
"epoch": 1.240673339399454,
"grad_norm": 1.3800193403031813,
"learning_rate": 4.278598145525253e-06,
"loss": 0.1215,
"step": 2727
},
{
"epoch": 1.2411282984531393,
"grad_norm": 2.1334796621942074,
"learning_rate": 4.278095855692233e-06,
"loss": 0.1028,
"step": 2728
},
{
"epoch": 1.2415832575068244,
"grad_norm": 1.9037023983095858,
"learning_rate": 4.277593420561249e-06,
"loss": 0.0583,
"step": 2729
},
{
"epoch": 1.2420382165605095,
"grad_norm": 1.5266711911694233,
"learning_rate": 4.277090840173359e-06,
"loss": 0.0727,
"step": 2730
},
{
"epoch": 1.2424931756141948,
"grad_norm": 1.779852269680275,
"learning_rate": 4.276588114569631e-06,
"loss": 0.1165,
"step": 2731
},
{
"epoch": 1.24294813466788,
"grad_norm": 1.1686354520981554,
"learning_rate": 4.2760852437911436e-06,
"loss": 0.0696,
"step": 2732
},
{
"epoch": 1.243403093721565,
"grad_norm": 1.6281358508365982,
"learning_rate": 4.2755822278789926e-06,
"loss": 0.0748,
"step": 2733
},
{
"epoch": 1.2438580527752503,
"grad_norm": 1.9348550299278917,
"learning_rate": 4.2750790668742795e-06,
"loss": 0.0771,
"step": 2734
},
{
"epoch": 1.2443130118289354,
"grad_norm": 1.6843775010519313,
"learning_rate": 4.274575760818122e-06,
"loss": 0.1291,
"step": 2735
},
{
"epoch": 1.2447679708826205,
"grad_norm": 1.7400214741336621,
"learning_rate": 4.274072309751646e-06,
"loss": 0.0736,
"step": 2736
},
{
"epoch": 1.2452229299363058,
"grad_norm": 1.3279822498973282,
"learning_rate": 4.273568713715993e-06,
"loss": 0.105,
"step": 2737
},
{
"epoch": 1.2456778889899909,
"grad_norm": 1.4181047264694318,
"learning_rate": 4.2730649727523145e-06,
"loss": 0.1044,
"step": 2738
},
{
"epoch": 1.246132848043676,
"grad_norm": 1.5420933585436614,
"learning_rate": 4.272561086901773e-06,
"loss": 0.0742,
"step": 2739
},
{
"epoch": 1.2465878070973613,
"grad_norm": 2.0627213117577616,
"learning_rate": 4.272057056205544e-06,
"loss": 0.1002,
"step": 2740
},
{
"epoch": 1.2470427661510464,
"grad_norm": 1.6373337151018261,
"learning_rate": 4.271552880704815e-06,
"loss": 0.0786,
"step": 2741
},
{
"epoch": 1.2474977252047315,
"grad_norm": 1.4066801307959027,
"learning_rate": 4.271048560440786e-06,
"loss": 0.0951,
"step": 2742
},
{
"epoch": 1.2479526842584168,
"grad_norm": 1.4840597932593944,
"learning_rate": 4.2705440954546665e-06,
"loss": 0.1449,
"step": 2743
},
{
"epoch": 1.2484076433121019,
"grad_norm": 1.4874386819240102,
"learning_rate": 4.270039485787678e-06,
"loss": 0.0979,
"step": 2744
},
{
"epoch": 1.2488626023657872,
"grad_norm": 1.4996547701951468,
"learning_rate": 4.269534731481057e-06,
"loss": 0.1153,
"step": 2745
},
{
"epoch": 1.2493175614194723,
"grad_norm": 1.748368630407863,
"learning_rate": 4.269029832576048e-06,
"loss": 0.0701,
"step": 2746
},
{
"epoch": 1.2497725204731573,
"grad_norm": 1.2272157062443403,
"learning_rate": 4.2685247891139114e-06,
"loss": 0.0742,
"step": 2747
},
{
"epoch": 1.2502274795268427,
"grad_norm": 1.2535267297683748,
"learning_rate": 4.268019601135914e-06,
"loss": 0.0663,
"step": 2748
},
{
"epoch": 1.2506824385805277,
"grad_norm": 2.2232595843640954,
"learning_rate": 4.26751426868334e-06,
"loss": 0.0552,
"step": 2749
},
{
"epoch": 1.251137397634213,
"grad_norm": 1.6413257670602424,
"learning_rate": 4.2670087917974826e-06,
"loss": 0.0953,
"step": 2750
},
{
"epoch": 1.2515923566878981,
"grad_norm": 2.525956129850652,
"learning_rate": 4.266503170519645e-06,
"loss": 0.1019,
"step": 2751
},
{
"epoch": 1.2520473157415832,
"grad_norm": 1.7532088817176623,
"learning_rate": 4.265997404891147e-06,
"loss": 0.0962,
"step": 2752
},
{
"epoch": 1.2525022747952685,
"grad_norm": 1.7385955199194223,
"learning_rate": 4.265491494953316e-06,
"loss": 0.0829,
"step": 2753
},
{
"epoch": 1.2529572338489536,
"grad_norm": 1.5355610337039685,
"learning_rate": 4.2649854407474925e-06,
"loss": 0.1359,
"step": 2754
},
{
"epoch": 1.2534121929026387,
"grad_norm": 1.28022022581084,
"learning_rate": 4.26447924231503e-06,
"loss": 0.0558,
"step": 2755
},
{
"epoch": 1.253867151956324,
"grad_norm": 1.3880085094165089,
"learning_rate": 4.263972899697292e-06,
"loss": 0.0976,
"step": 2756
},
{
"epoch": 1.2543221110100091,
"grad_norm": 1.274974064159807,
"learning_rate": 4.263466412935654e-06,
"loss": 0.1164,
"step": 2757
},
{
"epoch": 1.2547770700636942,
"grad_norm": 1.3582086906964457,
"learning_rate": 4.262959782071505e-06,
"loss": 0.0524,
"step": 2758
},
{
"epoch": 1.2552320291173795,
"grad_norm": 1.8565157639016567,
"learning_rate": 4.262453007146244e-06,
"loss": 0.1207,
"step": 2759
},
{
"epoch": 1.2556869881710646,
"grad_norm": 1.1179278766341727,
"learning_rate": 4.261946088201282e-06,
"loss": 0.0628,
"step": 2760
},
{
"epoch": 1.2561419472247497,
"grad_norm": 1.3815222535677334,
"learning_rate": 4.261439025278044e-06,
"loss": 0.0783,
"step": 2761
},
{
"epoch": 1.256596906278435,
"grad_norm": 1.6096595755674274,
"learning_rate": 4.260931818417962e-06,
"loss": 0.0655,
"step": 2762
},
{
"epoch": 1.25705186533212,
"grad_norm": 1.4310899801227122,
"learning_rate": 4.260424467662484e-06,
"loss": 0.0794,
"step": 2763
},
{
"epoch": 1.2575068243858052,
"grad_norm": 1.3830505652727263,
"learning_rate": 4.259916973053069e-06,
"loss": 0.126,
"step": 2764
},
{
"epoch": 1.2579617834394905,
"grad_norm": 1.2593848254260958,
"learning_rate": 4.2594093346311865e-06,
"loss": 0.0952,
"step": 2765
},
{
"epoch": 1.2584167424931756,
"grad_norm": 1.7618010142299456,
"learning_rate": 4.258901552438319e-06,
"loss": 0.1159,
"step": 2766
},
{
"epoch": 1.2588717015468607,
"grad_norm": 1.4438782108606985,
"learning_rate": 4.25839362651596e-06,
"loss": 0.0862,
"step": 2767
},
{
"epoch": 1.259326660600546,
"grad_norm": 1.960220687441142,
"learning_rate": 4.257885556905613e-06,
"loss": 0.0847,
"step": 2768
},
{
"epoch": 1.259781619654231,
"grad_norm": 1.588478187298156,
"learning_rate": 4.257377343648799e-06,
"loss": 0.0798,
"step": 2769
},
{
"epoch": 1.2602365787079162,
"grad_norm": 1.3801501508630765,
"learning_rate": 4.256868986787044e-06,
"loss": 0.0942,
"step": 2770
},
{
"epoch": 1.2606915377616015,
"grad_norm": 1.429324437514992,
"learning_rate": 4.256360486361889e-06,
"loss": 0.0588,
"step": 2771
},
{
"epoch": 1.2611464968152866,
"grad_norm": 1.6843373956104633,
"learning_rate": 4.255851842414887e-06,
"loss": 0.0655,
"step": 2772
},
{
"epoch": 1.2616014558689717,
"grad_norm": 1.8180982857396182,
"learning_rate": 4.255343054987601e-06,
"loss": 0.1242,
"step": 2773
},
{
"epoch": 1.262056414922657,
"grad_norm": 1.417537186445061,
"learning_rate": 4.2548341241216085e-06,
"loss": 0.0584,
"step": 2774
},
{
"epoch": 1.262511373976342,
"grad_norm": 1.8094891195148863,
"learning_rate": 4.254325049858496e-06,
"loss": 0.104,
"step": 2775
}
],
"logging_steps": 1,
"max_steps": 10990,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 555,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 24592138002432.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}