chess_T5_seq2seq / trainer_state.json
belpekkan's picture
New training done with grandmaster moves used as training data
dbbfa1c verified
{
"best_global_step": 72742,
"best_metric": 0.4625195264816284,
"best_model_checkpoint": "./chess_t5_model_hikaru/checkpoint-72742",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 72742,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005498886475488713,
"grad_norm": 1.238595962524414,
"learning_rate": 1.9900000000000003e-05,
"loss": 0.506280403137207,
"step": 200
},
{
"epoch": 0.010997772950977426,
"grad_norm": 1.4552940130233765,
"learning_rate": 3.99e-05,
"loss": 0.49826000213623045,
"step": 400
},
{
"epoch": 0.01649665942646614,
"grad_norm": 1.404646873474121,
"learning_rate": 4.9954425345032365e-05,
"loss": 0.49065006256103516,
"step": 600
},
{
"epoch": 0.021995545901954853,
"grad_norm": 1.1736574172973633,
"learning_rate": 4.9862355334996736e-05,
"loss": 0.4841666030883789,
"step": 800
},
{
"epoch": 0.02749443237744357,
"grad_norm": 0.9726278185844421,
"learning_rate": 4.97702853249611e-05,
"loss": 0.49401542663574216,
"step": 1000
},
{
"epoch": 0.03299331885293228,
"grad_norm": 1.2402087450027466,
"learning_rate": 4.967821531492547e-05,
"loss": 0.49149925231933594,
"step": 1200
},
{
"epoch": 0.03849220532842099,
"grad_norm": 1.2934446334838867,
"learning_rate": 4.958614530488984e-05,
"loss": 0.4847659683227539,
"step": 1400
},
{
"epoch": 0.043991091803909706,
"grad_norm": 1.5672705173492432,
"learning_rate": 4.949407529485421e-05,
"loss": 0.4918820571899414,
"step": 1600
},
{
"epoch": 0.049489978279398425,
"grad_norm": 1.2068564891815186,
"learning_rate": 4.940200528481858e-05,
"loss": 0.48373069763183596,
"step": 1800
},
{
"epoch": 0.05498886475488714,
"grad_norm": 1.1447815895080566,
"learning_rate": 4.9309935274782946e-05,
"loss": 0.49397098541259765,
"step": 2000
},
{
"epoch": 0.06048775123037585,
"grad_norm": 1.3082820177078247,
"learning_rate": 4.9217865264747316e-05,
"loss": 0.4879715728759766,
"step": 2200
},
{
"epoch": 0.06598663770586456,
"grad_norm": 1.322120189666748,
"learning_rate": 4.912579525471169e-05,
"loss": 0.4860882568359375,
"step": 2400
},
{
"epoch": 0.07148552418135327,
"grad_norm": 1.3626155853271484,
"learning_rate": 4.903372524467605e-05,
"loss": 0.49077301025390624,
"step": 2600
},
{
"epoch": 0.07698441065684199,
"grad_norm": 1.0659453868865967,
"learning_rate": 4.894165523464042e-05,
"loss": 0.48927955627441405,
"step": 2800
},
{
"epoch": 0.0824832971323307,
"grad_norm": 1.1972386837005615,
"learning_rate": 4.884958522460479e-05,
"loss": 0.4904148101806641,
"step": 3000
},
{
"epoch": 0.08798218360781941,
"grad_norm": 1.3156094551086426,
"learning_rate": 4.875751521456916e-05,
"loss": 0.4949393081665039,
"step": 3200
},
{
"epoch": 0.09348107008330812,
"grad_norm": 1.1924458742141724,
"learning_rate": 4.8665445204533527e-05,
"loss": 0.48969757080078125,
"step": 3400
},
{
"epoch": 0.09897995655879685,
"grad_norm": 1.4736772775650024,
"learning_rate": 4.85733751944979e-05,
"loss": 0.4906147384643555,
"step": 3600
},
{
"epoch": 0.10447884303428556,
"grad_norm": 1.2425668239593506,
"learning_rate": 4.848130518446227e-05,
"loss": 0.48961822509765623,
"step": 3800
},
{
"epoch": 0.10997772950977427,
"grad_norm": 1.2657986879348755,
"learning_rate": 4.838923517442663e-05,
"loss": 0.4902804183959961,
"step": 4000
},
{
"epoch": 0.11547661598526299,
"grad_norm": 1.2814760208129883,
"learning_rate": 4.829716516439101e-05,
"loss": 0.4866915130615234,
"step": 4200
},
{
"epoch": 0.1209755024607517,
"grad_norm": 1.3233275413513184,
"learning_rate": 4.820509515435537e-05,
"loss": 0.4842318344116211,
"step": 4400
},
{
"epoch": 0.1264743889362404,
"grad_norm": 1.0813190937042236,
"learning_rate": 4.8113025144319744e-05,
"loss": 0.4887635040283203,
"step": 4600
},
{
"epoch": 0.13197327541172912,
"grad_norm": 1.4319493770599365,
"learning_rate": 4.8020955134284114e-05,
"loss": 0.48523338317871095,
"step": 4800
},
{
"epoch": 0.13747216188721784,
"grad_norm": 1.1767573356628418,
"learning_rate": 4.792888512424848e-05,
"loss": 0.4876384735107422,
"step": 5000
},
{
"epoch": 0.14297104836270655,
"grad_norm": 1.2524778842926025,
"learning_rate": 4.783681511421285e-05,
"loss": 0.48621952056884765,
"step": 5200
},
{
"epoch": 0.14846993483819526,
"grad_norm": 1.7391471862792969,
"learning_rate": 4.774474510417722e-05,
"loss": 0.4950310516357422,
"step": 5400
},
{
"epoch": 0.15396882131368397,
"grad_norm": 1.3185511827468872,
"learning_rate": 4.765267509414158e-05,
"loss": 0.4825564193725586,
"step": 5600
},
{
"epoch": 0.15946770778917269,
"grad_norm": 1.023362636566162,
"learning_rate": 4.756060508410596e-05,
"loss": 0.4869500732421875,
"step": 5800
},
{
"epoch": 0.1649665942646614,
"grad_norm": 1.4824140071868896,
"learning_rate": 4.7468535074070324e-05,
"loss": 0.49057952880859373,
"step": 6000
},
{
"epoch": 0.1704654807401501,
"grad_norm": 1.1914821863174438,
"learning_rate": 4.7376465064034695e-05,
"loss": 0.49073287963867185,
"step": 6200
},
{
"epoch": 0.17596436721563882,
"grad_norm": 1.0815869569778442,
"learning_rate": 4.7284395053999066e-05,
"loss": 0.49227970123291015,
"step": 6400
},
{
"epoch": 0.18146325369112754,
"grad_norm": 1.644206166267395,
"learning_rate": 4.719232504396343e-05,
"loss": 0.48635608673095704,
"step": 6600
},
{
"epoch": 0.18696214016661625,
"grad_norm": 1.1657360792160034,
"learning_rate": 4.710025503392781e-05,
"loss": 0.49577178955078127,
"step": 6800
},
{
"epoch": 0.192461026642105,
"grad_norm": 1.3343608379364014,
"learning_rate": 4.700818502389217e-05,
"loss": 0.48896270751953125,
"step": 7000
},
{
"epoch": 0.1979599131175937,
"grad_norm": 1.036275863647461,
"learning_rate": 4.6916115013856535e-05,
"loss": 0.4918210983276367,
"step": 7200
},
{
"epoch": 0.2034587995930824,
"grad_norm": 1.1466560363769531,
"learning_rate": 4.682404500382091e-05,
"loss": 0.4845957946777344,
"step": 7400
},
{
"epoch": 0.20895768606857112,
"grad_norm": 1.2194637060165405,
"learning_rate": 4.6731974993785276e-05,
"loss": 0.48361312866210937,
"step": 7600
},
{
"epoch": 0.21445657254405984,
"grad_norm": 1.0549407005310059,
"learning_rate": 4.663990498374965e-05,
"loss": 0.4873248291015625,
"step": 7800
},
{
"epoch": 0.21995545901954855,
"grad_norm": 1.5164191722869873,
"learning_rate": 4.654783497371402e-05,
"loss": 0.48396194458007813,
"step": 8000
},
{
"epoch": 0.22545434549503726,
"grad_norm": 0.9566870927810669,
"learning_rate": 4.645576496367838e-05,
"loss": 0.49601322174072265,
"step": 8200
},
{
"epoch": 0.23095323197052597,
"grad_norm": 1.268650770187378,
"learning_rate": 4.636369495364275e-05,
"loss": 0.4893684387207031,
"step": 8400
},
{
"epoch": 0.2364521184460147,
"grad_norm": 1.626772165298462,
"learning_rate": 4.627162494360712e-05,
"loss": 0.4904355621337891,
"step": 8600
},
{
"epoch": 0.2419510049215034,
"grad_norm": 1.2238197326660156,
"learning_rate": 4.6179554933571486e-05,
"loss": 0.484462890625,
"step": 8800
},
{
"epoch": 0.2474498913969921,
"grad_norm": 1.4520012140274048,
"learning_rate": 4.608748492353586e-05,
"loss": 0.4788643264770508,
"step": 9000
},
{
"epoch": 0.2529487778724808,
"grad_norm": 1.1315358877182007,
"learning_rate": 4.599541491350023e-05,
"loss": 0.48716392517089846,
"step": 9200
},
{
"epoch": 0.25844766434796956,
"grad_norm": 1.0476795434951782,
"learning_rate": 4.59033449034646e-05,
"loss": 0.4895347213745117,
"step": 9400
},
{
"epoch": 0.26394655082345825,
"grad_norm": 1.100468635559082,
"learning_rate": 4.581127489342897e-05,
"loss": 0.4899191665649414,
"step": 9600
},
{
"epoch": 0.269445437298947,
"grad_norm": 1.1204516887664795,
"learning_rate": 4.571920488339333e-05,
"loss": 0.4837226486206055,
"step": 9800
},
{
"epoch": 0.2749443237744357,
"grad_norm": 1.271126627922058,
"learning_rate": 4.56271348733577e-05,
"loss": 0.48351455688476563,
"step": 10000
},
{
"epoch": 0.2804432102499244,
"grad_norm": 1.294801115989685,
"learning_rate": 4.5535064863322074e-05,
"loss": 0.4861069107055664,
"step": 10200
},
{
"epoch": 0.2859420967254131,
"grad_norm": 0.9449974894523621,
"learning_rate": 4.5442994853286444e-05,
"loss": 0.4828087997436523,
"step": 10400
},
{
"epoch": 0.29144098320090184,
"grad_norm": 1.017383337020874,
"learning_rate": 4.535092484325081e-05,
"loss": 0.4907358551025391,
"step": 10600
},
{
"epoch": 0.2969398696763905,
"grad_norm": 1.4358981847763062,
"learning_rate": 4.525885483321518e-05,
"loss": 0.482423210144043,
"step": 10800
},
{
"epoch": 0.30243875615187926,
"grad_norm": 1.3579864501953125,
"learning_rate": 4.516678482317955e-05,
"loss": 0.4872136688232422,
"step": 11000
},
{
"epoch": 0.30793764262736795,
"grad_norm": 1.309594750404358,
"learning_rate": 4.5074714813143913e-05,
"loss": 0.48710639953613283,
"step": 11200
},
{
"epoch": 0.3134365291028567,
"grad_norm": 1.4916502237319946,
"learning_rate": 4.4982644803108284e-05,
"loss": 0.4856998062133789,
"step": 11400
},
{
"epoch": 0.31893541557834537,
"grad_norm": 1.1984270811080933,
"learning_rate": 4.4890574793072655e-05,
"loss": 0.48433101654052735,
"step": 11600
},
{
"epoch": 0.3244343020538341,
"grad_norm": 1.376825213432312,
"learning_rate": 4.479850478303702e-05,
"loss": 0.4843286514282227,
"step": 11800
},
{
"epoch": 0.3299331885293228,
"grad_norm": 1.04801607131958,
"learning_rate": 4.4706434773001396e-05,
"loss": 0.4760005187988281,
"step": 12000
},
{
"epoch": 0.33543207500481154,
"grad_norm": 1.277635097503662,
"learning_rate": 4.461436476296576e-05,
"loss": 0.4829146194458008,
"step": 12200
},
{
"epoch": 0.3409309614803002,
"grad_norm": 1.08747398853302,
"learning_rate": 4.452229475293013e-05,
"loss": 0.49240009307861327,
"step": 12400
},
{
"epoch": 0.34642984795578896,
"grad_norm": 1.1133017539978027,
"learning_rate": 4.44302247428945e-05,
"loss": 0.4923815536499023,
"step": 12600
},
{
"epoch": 0.35192873443127765,
"grad_norm": 1.5661677122116089,
"learning_rate": 4.4338154732858865e-05,
"loss": 0.4879690933227539,
"step": 12800
},
{
"epoch": 0.3574276209067664,
"grad_norm": 1.4570703506469727,
"learning_rate": 4.4246084722823236e-05,
"loss": 0.4856468963623047,
"step": 13000
},
{
"epoch": 0.36292650738225507,
"grad_norm": 1.4638596773147583,
"learning_rate": 4.4154014712787606e-05,
"loss": 0.48412940979003904,
"step": 13200
},
{
"epoch": 0.3684253938577438,
"grad_norm": 1.2463369369506836,
"learning_rate": 4.406194470275197e-05,
"loss": 0.4839859771728516,
"step": 13400
},
{
"epoch": 0.3739242803332325,
"grad_norm": 1.0832504034042358,
"learning_rate": 4.396987469271635e-05,
"loss": 0.49135875701904297,
"step": 13600
},
{
"epoch": 0.37942316680872124,
"grad_norm": 1.1107310056686401,
"learning_rate": 4.387780468268071e-05,
"loss": 0.47643829345703126,
"step": 13800
},
{
"epoch": 0.38492205328421,
"grad_norm": 1.1073737144470215,
"learning_rate": 4.378573467264508e-05,
"loss": 0.4878357315063477,
"step": 14000
},
{
"epoch": 0.39042093975969866,
"grad_norm": 1.4523825645446777,
"learning_rate": 4.369366466260945e-05,
"loss": 0.4929097747802734,
"step": 14200
},
{
"epoch": 0.3959198262351874,
"grad_norm": 1.1978082656860352,
"learning_rate": 4.3601594652573816e-05,
"loss": 0.47999427795410154,
"step": 14400
},
{
"epoch": 0.4014187127106761,
"grad_norm": 1.080812692642212,
"learning_rate": 4.3509524642538194e-05,
"loss": 0.4825727081298828,
"step": 14600
},
{
"epoch": 0.4069175991861648,
"grad_norm": 1.053101897239685,
"learning_rate": 4.341745463250256e-05,
"loss": 0.4855875778198242,
"step": 14800
},
{
"epoch": 0.4124164856616535,
"grad_norm": 1.5434905290603638,
"learning_rate": 4.332538462246692e-05,
"loss": 0.48418006896972654,
"step": 15000
},
{
"epoch": 0.41791537213714225,
"grad_norm": 1.3098441362380981,
"learning_rate": 4.32333146124313e-05,
"loss": 0.47957420349121094,
"step": 15200
},
{
"epoch": 0.42341425861263093,
"grad_norm": 1.1274868249893188,
"learning_rate": 4.314124460239566e-05,
"loss": 0.48411872863769534,
"step": 15400
},
{
"epoch": 0.4289131450881197,
"grad_norm": 1.1913822889328003,
"learning_rate": 4.3049174592360033e-05,
"loss": 0.48610164642333986,
"step": 15600
},
{
"epoch": 0.43441203156360836,
"grad_norm": 1.1664844751358032,
"learning_rate": 4.2957104582324404e-05,
"loss": 0.48251495361328123,
"step": 15800
},
{
"epoch": 0.4399109180390971,
"grad_norm": 0.9833515882492065,
"learning_rate": 4.286503457228877e-05,
"loss": 0.48185344696044924,
"step": 16000
},
{
"epoch": 0.4454098045145858,
"grad_norm": 1.3691802024841309,
"learning_rate": 4.277296456225314e-05,
"loss": 0.47827003479003904,
"step": 16200
},
{
"epoch": 0.4509086909900745,
"grad_norm": 1.4538307189941406,
"learning_rate": 4.268089455221751e-05,
"loss": 0.48655067443847655,
"step": 16400
},
{
"epoch": 0.4564075774655632,
"grad_norm": 1.6174641847610474,
"learning_rate": 4.258882454218188e-05,
"loss": 0.4811368179321289,
"step": 16600
},
{
"epoch": 0.46190646394105195,
"grad_norm": 1.379770278930664,
"learning_rate": 4.2496754532146244e-05,
"loss": 0.4825275421142578,
"step": 16800
},
{
"epoch": 0.46740535041654063,
"grad_norm": 1.1480027437210083,
"learning_rate": 4.2404684522110614e-05,
"loss": 0.4793708801269531,
"step": 17000
},
{
"epoch": 0.4729042368920294,
"grad_norm": 1.2923580408096313,
"learning_rate": 4.2312614512074985e-05,
"loss": 0.48294658660888673,
"step": 17200
},
{
"epoch": 0.47840312336751806,
"grad_norm": 1.1704210042953491,
"learning_rate": 4.2220544502039356e-05,
"loss": 0.48764766693115236,
"step": 17400
},
{
"epoch": 0.4839020098430068,
"grad_norm": 0.9645224213600159,
"learning_rate": 4.212847449200372e-05,
"loss": 0.48104751586914063,
"step": 17600
},
{
"epoch": 0.4894008963184955,
"grad_norm": 1.0854864120483398,
"learning_rate": 4.203640448196809e-05,
"loss": 0.48372928619384764,
"step": 17800
},
{
"epoch": 0.4948997827939842,
"grad_norm": 1.058073878288269,
"learning_rate": 4.194433447193246e-05,
"loss": 0.481105842590332,
"step": 18000
},
{
"epoch": 0.500398669269473,
"grad_norm": 1.1038442850112915,
"learning_rate": 4.185226446189683e-05,
"loss": 0.48221038818359374,
"step": 18200
},
{
"epoch": 0.5058975557449616,
"grad_norm": 1.1211503744125366,
"learning_rate": 4.1760194451861195e-05,
"loss": 0.48461868286132814,
"step": 18400
},
{
"epoch": 0.5113964422204503,
"grad_norm": 1.1851303577423096,
"learning_rate": 4.1668124441825566e-05,
"loss": 0.48900299072265624,
"step": 18600
},
{
"epoch": 0.5168953286959391,
"grad_norm": 1.1773110628128052,
"learning_rate": 4.1576054431789936e-05,
"loss": 0.4895766067504883,
"step": 18800
},
{
"epoch": 0.5223942151714278,
"grad_norm": 1.0236694812774658,
"learning_rate": 4.14839844217543e-05,
"loss": 0.47842552185058596,
"step": 19000
},
{
"epoch": 0.5278931016469165,
"grad_norm": 1.2550437450408936,
"learning_rate": 4.139191441171867e-05,
"loss": 0.4883332061767578,
"step": 19200
},
{
"epoch": 0.5333919881224052,
"grad_norm": 1.6811326742172241,
"learning_rate": 4.129984440168304e-05,
"loss": 0.48251426696777344,
"step": 19400
},
{
"epoch": 0.538890874597894,
"grad_norm": 1.1312133073806763,
"learning_rate": 4.1207774391647405e-05,
"loss": 0.4811555480957031,
"step": 19600
},
{
"epoch": 0.5443897610733827,
"grad_norm": 1.106419563293457,
"learning_rate": 4.111570438161178e-05,
"loss": 0.4829677963256836,
"step": 19800
},
{
"epoch": 0.5498886475488713,
"grad_norm": 1.2335270643234253,
"learning_rate": 4.102363437157615e-05,
"loss": 0.48413619995117185,
"step": 20000
},
{
"epoch": 0.55538753402436,
"grad_norm": 1.195844054222107,
"learning_rate": 4.093156436154052e-05,
"loss": 0.4821126937866211,
"step": 20200
},
{
"epoch": 0.5608864204998488,
"grad_norm": 1.0814074277877808,
"learning_rate": 4.083949435150489e-05,
"loss": 0.4847369003295898,
"step": 20400
},
{
"epoch": 0.5663853069753375,
"grad_norm": 1.4510689973831177,
"learning_rate": 4.074742434146925e-05,
"loss": 0.4875687789916992,
"step": 20600
},
{
"epoch": 0.5718841934508262,
"grad_norm": 1.0444058179855347,
"learning_rate": 4.065535433143363e-05,
"loss": 0.4803382110595703,
"step": 20800
},
{
"epoch": 0.5773830799263149,
"grad_norm": 1.1824759244918823,
"learning_rate": 4.056328432139799e-05,
"loss": 0.48757186889648435,
"step": 21000
},
{
"epoch": 0.5828819664018037,
"grad_norm": 1.1672804355621338,
"learning_rate": 4.047121431136236e-05,
"loss": 0.47619979858398437,
"step": 21200
},
{
"epoch": 0.5883808528772924,
"grad_norm": 1.3952018022537231,
"learning_rate": 4.0379144301326734e-05,
"loss": 0.4820771026611328,
"step": 21400
},
{
"epoch": 0.593879739352781,
"grad_norm": 1.5481926202774048,
"learning_rate": 4.02870742912911e-05,
"loss": 0.4789703369140625,
"step": 21600
},
{
"epoch": 0.5993786258282697,
"grad_norm": 1.1940809488296509,
"learning_rate": 4.019500428125547e-05,
"loss": 0.4823886871337891,
"step": 21800
},
{
"epoch": 0.6048775123037585,
"grad_norm": 1.470038890838623,
"learning_rate": 4.010293427121984e-05,
"loss": 0.47876800537109376,
"step": 22000
},
{
"epoch": 0.6103763987792472,
"grad_norm": 1.372512698173523,
"learning_rate": 4.00108642611842e-05,
"loss": 0.48137435913085935,
"step": 22200
},
{
"epoch": 0.6158752852547359,
"grad_norm": 0.9625583291053772,
"learning_rate": 3.991879425114858e-05,
"loss": 0.4751309967041016,
"step": 22400
},
{
"epoch": 0.6213741717302246,
"grad_norm": 1.0047613382339478,
"learning_rate": 3.9826724241112945e-05,
"loss": 0.4809339141845703,
"step": 22600
},
{
"epoch": 0.6268730582057134,
"grad_norm": 1.8941971063613892,
"learning_rate": 3.973465423107731e-05,
"loss": 0.47376441955566406,
"step": 22800
},
{
"epoch": 0.6323719446812021,
"grad_norm": 1.0294033288955688,
"learning_rate": 3.9642584221041686e-05,
"loss": 0.4846999740600586,
"step": 23000
},
{
"epoch": 0.6378708311566907,
"grad_norm": 1.1899781227111816,
"learning_rate": 3.955051421100605e-05,
"loss": 0.4818299865722656,
"step": 23200
},
{
"epoch": 0.6433697176321795,
"grad_norm": 1.5099271535873413,
"learning_rate": 3.945844420097042e-05,
"loss": 0.4828767776489258,
"step": 23400
},
{
"epoch": 0.6488686041076682,
"grad_norm": 1.3377799987792969,
"learning_rate": 3.936637419093479e-05,
"loss": 0.47890872955322267,
"step": 23600
},
{
"epoch": 0.6543674905831569,
"grad_norm": 1.6240547895431519,
"learning_rate": 3.9274304180899155e-05,
"loss": 0.4793845748901367,
"step": 23800
},
{
"epoch": 0.6598663770586456,
"grad_norm": 1.32374107837677,
"learning_rate": 3.9182234170863525e-05,
"loss": 0.48126724243164065,
"step": 24000
},
{
"epoch": 0.6653652635341344,
"grad_norm": 1.1302155256271362,
"learning_rate": 3.9090164160827896e-05,
"loss": 0.4794307708740234,
"step": 24200
},
{
"epoch": 0.6708641500096231,
"grad_norm": 1.2106575965881348,
"learning_rate": 3.899809415079227e-05,
"loss": 0.4764822769165039,
"step": 24400
},
{
"epoch": 0.6763630364851118,
"grad_norm": 1.1682376861572266,
"learning_rate": 3.890602414075663e-05,
"loss": 0.48130035400390625,
"step": 24600
},
{
"epoch": 0.6818619229606004,
"grad_norm": 1.7385523319244385,
"learning_rate": 3.8813954130721e-05,
"loss": 0.48233951568603517,
"step": 24800
},
{
"epoch": 0.6873608094360892,
"grad_norm": 0.9956115484237671,
"learning_rate": 3.872188412068537e-05,
"loss": 0.47769607543945314,
"step": 25000
},
{
"epoch": 0.6928596959115779,
"grad_norm": 0.9261813759803772,
"learning_rate": 3.862981411064974e-05,
"loss": 0.4826504898071289,
"step": 25200
},
{
"epoch": 0.6983585823870666,
"grad_norm": 1.0754562616348267,
"learning_rate": 3.8537744100614106e-05,
"loss": 0.48267059326171874,
"step": 25400
},
{
"epoch": 0.7038574688625553,
"grad_norm": 1.2435545921325684,
"learning_rate": 3.844567409057848e-05,
"loss": 0.48062828063964846,
"step": 25600
},
{
"epoch": 0.7093563553380441,
"grad_norm": 1.1161478757858276,
"learning_rate": 3.835360408054285e-05,
"loss": 0.476544189453125,
"step": 25800
},
{
"epoch": 0.7148552418135328,
"grad_norm": 1.144326090812683,
"learning_rate": 3.826153407050722e-05,
"loss": 0.4830437469482422,
"step": 26000
},
{
"epoch": 0.7203541282890215,
"grad_norm": 1.2163105010986328,
"learning_rate": 3.816946406047158e-05,
"loss": 0.48178863525390625,
"step": 26200
},
{
"epoch": 0.7258530147645101,
"grad_norm": 1.3089566230773926,
"learning_rate": 3.807739405043595e-05,
"loss": 0.4754468536376953,
"step": 26400
},
{
"epoch": 0.7313519012399989,
"grad_norm": 1.2991975545883179,
"learning_rate": 3.798532404040032e-05,
"loss": 0.4895411682128906,
"step": 26600
},
{
"epoch": 0.7368507877154876,
"grad_norm": 1.6097289323806763,
"learning_rate": 3.789325403036469e-05,
"loss": 0.47313800811767576,
"step": 26800
},
{
"epoch": 0.7423496741909763,
"grad_norm": 1.4237576723098755,
"learning_rate": 3.7801184020329065e-05,
"loss": 0.47288108825683595,
"step": 27000
},
{
"epoch": 0.747848560666465,
"grad_norm": 1.7340173721313477,
"learning_rate": 3.770911401029343e-05,
"loss": 0.4713779067993164,
"step": 27200
},
{
"epoch": 0.7533474471419538,
"grad_norm": 1.3480178117752075,
"learning_rate": 3.761704400025779e-05,
"loss": 0.4823367309570312,
"step": 27400
},
{
"epoch": 0.7588463336174425,
"grad_norm": 0.945102870464325,
"learning_rate": 3.752497399022217e-05,
"loss": 0.485689697265625,
"step": 27600
},
{
"epoch": 0.7643452200929312,
"grad_norm": 1.5504003763198853,
"learning_rate": 3.7432903980186534e-05,
"loss": 0.4697317886352539,
"step": 27800
},
{
"epoch": 0.76984410656842,
"grad_norm": 1.4954441785812378,
"learning_rate": 3.7340833970150904e-05,
"loss": 0.4746841049194336,
"step": 28000
},
{
"epoch": 0.7753429930439086,
"grad_norm": 1.660771131515503,
"learning_rate": 3.7248763960115275e-05,
"loss": 0.48746414184570314,
"step": 28200
},
{
"epoch": 0.7808418795193973,
"grad_norm": 1.216834306716919,
"learning_rate": 3.715669395007964e-05,
"loss": 0.4784600067138672,
"step": 28400
},
{
"epoch": 0.786340765994886,
"grad_norm": 1.3025329113006592,
"learning_rate": 3.7064623940044016e-05,
"loss": 0.48134098052978513,
"step": 28600
},
{
"epoch": 0.7918396524703748,
"grad_norm": 0.8612267374992371,
"learning_rate": 3.697255393000838e-05,
"loss": 0.48288066864013673,
"step": 28800
},
{
"epoch": 0.7973385389458635,
"grad_norm": 1.5112066268920898,
"learning_rate": 3.6880483919972744e-05,
"loss": 0.48638771057128904,
"step": 29000
},
{
"epoch": 0.8028374254213522,
"grad_norm": 1.2981903553009033,
"learning_rate": 3.678841390993712e-05,
"loss": 0.4764302444458008,
"step": 29200
},
{
"epoch": 0.8083363118968409,
"grad_norm": 1.2499499320983887,
"learning_rate": 3.6696343899901485e-05,
"loss": 0.47807662963867187,
"step": 29400
},
{
"epoch": 0.8138351983723296,
"grad_norm": 1.4974340200424194,
"learning_rate": 3.6604273889865856e-05,
"loss": 0.48103851318359375,
"step": 29600
},
{
"epoch": 0.8193340848478183,
"grad_norm": 1.6043846607208252,
"learning_rate": 3.6512203879830226e-05,
"loss": 0.4745806121826172,
"step": 29800
},
{
"epoch": 0.824832971323307,
"grad_norm": 1.0718004703521729,
"learning_rate": 3.642013386979459e-05,
"loss": 0.4758340835571289,
"step": 30000
},
{
"epoch": 0.8303318577987957,
"grad_norm": 1.31827712059021,
"learning_rate": 3.632806385975897e-05,
"loss": 0.48326473236083983,
"step": 30200
},
{
"epoch": 0.8358307442742845,
"grad_norm": 1.214794635772705,
"learning_rate": 3.623599384972333e-05,
"loss": 0.4670214080810547,
"step": 30400
},
{
"epoch": 0.8413296307497732,
"grad_norm": 1.3490458726882935,
"learning_rate": 3.61439238396877e-05,
"loss": 0.4771783065795898,
"step": 30600
},
{
"epoch": 0.8468285172252619,
"grad_norm": 1.7430227994918823,
"learning_rate": 3.605185382965207e-05,
"loss": 0.47809303283691407,
"step": 30800
},
{
"epoch": 0.8523274037007506,
"grad_norm": 1.04710054397583,
"learning_rate": 3.5959783819616437e-05,
"loss": 0.47361648559570313,
"step": 31000
},
{
"epoch": 0.8578262901762393,
"grad_norm": 1.239403247833252,
"learning_rate": 3.586771380958081e-05,
"loss": 0.47289577484130857,
"step": 31200
},
{
"epoch": 0.863325176651728,
"grad_norm": 1.0348613262176514,
"learning_rate": 3.577564379954518e-05,
"loss": 0.4844191360473633,
"step": 31400
},
{
"epoch": 0.8688240631272167,
"grad_norm": 1.2087358236312866,
"learning_rate": 3.568357378950954e-05,
"loss": 0.4849067687988281,
"step": 31600
},
{
"epoch": 0.8743229496027054,
"grad_norm": 1.498613715171814,
"learning_rate": 3.559150377947391e-05,
"loss": 0.4736307907104492,
"step": 31800
},
{
"epoch": 0.8798218360781942,
"grad_norm": 1.2673721313476562,
"learning_rate": 3.549943376943828e-05,
"loss": 0.4751145553588867,
"step": 32000
},
{
"epoch": 0.8853207225536829,
"grad_norm": 1.078145980834961,
"learning_rate": 3.5407363759402654e-05,
"loss": 0.48442405700683594,
"step": 32200
},
{
"epoch": 0.8908196090291716,
"grad_norm": 1.8665213584899902,
"learning_rate": 3.531529374936702e-05,
"loss": 0.47800296783447266,
"step": 32400
},
{
"epoch": 0.8963184955046603,
"grad_norm": 1.093640685081482,
"learning_rate": 3.522322373933139e-05,
"loss": 0.4768505859375,
"step": 32600
},
{
"epoch": 0.901817381980149,
"grad_norm": 1.438798189163208,
"learning_rate": 3.513115372929576e-05,
"loss": 0.4752470016479492,
"step": 32800
},
{
"epoch": 0.9073162684556377,
"grad_norm": 1.156036376953125,
"learning_rate": 3.503908371926013e-05,
"loss": 0.47669639587402346,
"step": 33000
},
{
"epoch": 0.9128151549311264,
"grad_norm": 1.223441481590271,
"learning_rate": 3.494701370922449e-05,
"loss": 0.4820696258544922,
"step": 33200
},
{
"epoch": 0.9183140414066152,
"grad_norm": 1.161592721939087,
"learning_rate": 3.4854943699188864e-05,
"loss": 0.46704940795898436,
"step": 33400
},
{
"epoch": 0.9238129278821039,
"grad_norm": 1.217645287513733,
"learning_rate": 3.4762873689153234e-05,
"loss": 0.4787548065185547,
"step": 33600
},
{
"epoch": 0.9293118143575926,
"grad_norm": 1.2599478960037231,
"learning_rate": 3.4670803679117605e-05,
"loss": 0.47969642639160154,
"step": 33800
},
{
"epoch": 0.9348107008330813,
"grad_norm": 1.1119675636291504,
"learning_rate": 3.457873366908197e-05,
"loss": 0.48166824340820313,
"step": 34000
},
{
"epoch": 0.9403095873085701,
"grad_norm": 1.4451464414596558,
"learning_rate": 3.448666365904634e-05,
"loss": 0.4774625015258789,
"step": 34200
},
{
"epoch": 0.9458084737840587,
"grad_norm": 1.121450662612915,
"learning_rate": 3.439459364901071e-05,
"loss": 0.4775946426391602,
"step": 34400
},
{
"epoch": 0.9513073602595474,
"grad_norm": 1.7251038551330566,
"learning_rate": 3.4302523638975074e-05,
"loss": 0.46810245513916016,
"step": 34600
},
{
"epoch": 0.9568062467350361,
"grad_norm": 1.1376259326934814,
"learning_rate": 3.421045362893945e-05,
"loss": 0.4734595108032227,
"step": 34800
},
{
"epoch": 0.9623051332105249,
"grad_norm": 1.3909783363342285,
"learning_rate": 3.4118383618903815e-05,
"loss": 0.4745623016357422,
"step": 35000
},
{
"epoch": 0.9678040196860136,
"grad_norm": 1.4496464729309082,
"learning_rate": 3.4026313608868186e-05,
"loss": 0.4793576431274414,
"step": 35200
},
{
"epoch": 0.9733029061615023,
"grad_norm": 1.188259482383728,
"learning_rate": 3.3934243598832557e-05,
"loss": 0.48435794830322265,
"step": 35400
},
{
"epoch": 0.978801792636991,
"grad_norm": 0.972775936126709,
"learning_rate": 3.384217358879692e-05,
"loss": 0.48073070526123046,
"step": 35600
},
{
"epoch": 0.9843006791124798,
"grad_norm": 1.3712236881256104,
"learning_rate": 3.375010357876129e-05,
"loss": 0.47246246337890624,
"step": 35800
},
{
"epoch": 0.9897995655879684,
"grad_norm": 1.0553455352783203,
"learning_rate": 3.365803356872566e-05,
"loss": 0.4749702835083008,
"step": 36000
},
{
"epoch": 0.9952984520634571,
"grad_norm": 2.0960538387298584,
"learning_rate": 3.3565963558690026e-05,
"loss": 0.48137100219726564,
"step": 36200
},
{
"epoch": 1.0,
"eval_loss": 0.46924570202827454,
"eval_runtime": 158.4469,
"eval_samples_per_second": 408.08,
"eval_steps_per_second": 25.51,
"step": 36371
},
{
"epoch": 1.000797338538946,
"grad_norm": 1.0945351123809814,
"learning_rate": 3.34738935486544e-05,
"loss": 0.47277111053466797,
"step": 36400
},
{
"epoch": 1.0062962250144345,
"grad_norm": 1.1377208232879639,
"learning_rate": 3.338182353861877e-05,
"loss": 0.4704814147949219,
"step": 36600
},
{
"epoch": 1.0117951114899233,
"grad_norm": 1.2042992115020752,
"learning_rate": 3.328975352858314e-05,
"loss": 0.4778765869140625,
"step": 36800
},
{
"epoch": 1.017293997965412,
"grad_norm": 1.2293647527694702,
"learning_rate": 3.319768351854751e-05,
"loss": 0.4779492950439453,
"step": 37000
},
{
"epoch": 1.0227928844409007,
"grad_norm": 1.0912444591522217,
"learning_rate": 3.310561350851187e-05,
"loss": 0.47877525329589843,
"step": 37200
},
{
"epoch": 1.0282917709163895,
"grad_norm": 1.2448941469192505,
"learning_rate": 3.301354349847624e-05,
"loss": 0.4758515930175781,
"step": 37400
},
{
"epoch": 1.033790657391878,
"grad_norm": 1.127113699913025,
"learning_rate": 3.292147348844061e-05,
"loss": 0.47277240753173827,
"step": 37600
},
{
"epoch": 1.0392895438673668,
"grad_norm": 1.184788703918457,
"learning_rate": 3.282940347840498e-05,
"loss": 0.48051612854003906,
"step": 37800
},
{
"epoch": 1.0447884303428556,
"grad_norm": 1.3059478998184204,
"learning_rate": 3.2737333468369354e-05,
"loss": 0.4780512237548828,
"step": 38000
},
{
"epoch": 1.0502873168183442,
"grad_norm": 1.035843014717102,
"learning_rate": 3.264526345833372e-05,
"loss": 0.4778805160522461,
"step": 38200
},
{
"epoch": 1.055786203293833,
"grad_norm": 1.142691731452942,
"learning_rate": 3.255319344829809e-05,
"loss": 0.4747405242919922,
"step": 38400
},
{
"epoch": 1.0612850897693218,
"grad_norm": 1.2115979194641113,
"learning_rate": 3.246112343826246e-05,
"loss": 0.4684751510620117,
"step": 38600
},
{
"epoch": 1.0667839762448104,
"grad_norm": 1.0604227781295776,
"learning_rate": 3.2369053428226823e-05,
"loss": 0.4836904525756836,
"step": 38800
},
{
"epoch": 1.0722828627202992,
"grad_norm": 1.2616559267044067,
"learning_rate": 3.2276983418191194e-05,
"loss": 0.47024051666259764,
"step": 39000
},
{
"epoch": 1.077781749195788,
"grad_norm": 1.1861746311187744,
"learning_rate": 3.2184913408155565e-05,
"loss": 0.47626224517822263,
"step": 39200
},
{
"epoch": 1.0832806356712765,
"grad_norm": 1.0768451690673828,
"learning_rate": 3.209284339811993e-05,
"loss": 0.4712419128417969,
"step": 39400
},
{
"epoch": 1.0887795221467653,
"grad_norm": 1.1116639375686646,
"learning_rate": 3.20007733880843e-05,
"loss": 0.47870445251464844,
"step": 39600
},
{
"epoch": 1.094278408622254,
"grad_norm": 0.9229024648666382,
"learning_rate": 3.190870337804867e-05,
"loss": 0.47164249420166016,
"step": 39800
},
{
"epoch": 1.0997772950977427,
"grad_norm": 1.2584002017974854,
"learning_rate": 3.181663336801304e-05,
"loss": 0.46996349334716797,
"step": 40000
},
{
"epoch": 1.1052761815732315,
"grad_norm": 1.1987744569778442,
"learning_rate": 3.1724563357977404e-05,
"loss": 0.47581478118896486,
"step": 40200
},
{
"epoch": 1.11077506804872,
"grad_norm": 1.897595763206482,
"learning_rate": 3.1632493347941775e-05,
"loss": 0.47223583221435544,
"step": 40400
},
{
"epoch": 1.1162739545242089,
"grad_norm": 1.384735345840454,
"learning_rate": 3.1540423337906146e-05,
"loss": 0.4742586898803711,
"step": 40600
},
{
"epoch": 1.1217728409996977,
"grad_norm": 1.2924162149429321,
"learning_rate": 3.1448353327870516e-05,
"loss": 0.4763710403442383,
"step": 40800
},
{
"epoch": 1.1272717274751862,
"grad_norm": 1.2529865503311157,
"learning_rate": 3.135628331783489e-05,
"loss": 0.4804756546020508,
"step": 41000
},
{
"epoch": 1.132770613950675,
"grad_norm": 1.0378504991531372,
"learning_rate": 3.126421330779925e-05,
"loss": 0.4701519775390625,
"step": 41200
},
{
"epoch": 1.1382695004261638,
"grad_norm": 1.3165602684020996,
"learning_rate": 3.117214329776362e-05,
"loss": 0.4799094009399414,
"step": 41400
},
{
"epoch": 1.1437683869016524,
"grad_norm": 1.3106869459152222,
"learning_rate": 3.108007328772799e-05,
"loss": 0.4807415771484375,
"step": 41600
},
{
"epoch": 1.1492672733771412,
"grad_norm": 1.870168685913086,
"learning_rate": 3.0988003277692356e-05,
"loss": 0.4763855743408203,
"step": 41800
},
{
"epoch": 1.1547661598526298,
"grad_norm": 1.2770658731460571,
"learning_rate": 3.0895933267656726e-05,
"loss": 0.47005424499511717,
"step": 42000
},
{
"epoch": 1.1602650463281186,
"grad_norm": 1.2080628871917725,
"learning_rate": 3.08038632576211e-05,
"loss": 0.4715093231201172,
"step": 42200
},
{
"epoch": 1.1657639328036074,
"grad_norm": 1.8036431074142456,
"learning_rate": 3.071179324758546e-05,
"loss": 0.4677348327636719,
"step": 42400
},
{
"epoch": 1.171262819279096,
"grad_norm": 1.0280815362930298,
"learning_rate": 3.061972323754984e-05,
"loss": 0.4739281463623047,
"step": 42600
},
{
"epoch": 1.1767617057545847,
"grad_norm": 0.9961258769035339,
"learning_rate": 3.05276532275142e-05,
"loss": 0.4825423049926758,
"step": 42800
},
{
"epoch": 1.1822605922300733,
"grad_norm": 1.0836036205291748,
"learning_rate": 3.0435583217478576e-05,
"loss": 0.47616680145263673,
"step": 43000
},
{
"epoch": 1.187759478705562,
"grad_norm": 0.9266841411590576,
"learning_rate": 3.0343513207442943e-05,
"loss": 0.47153358459472655,
"step": 43200
},
{
"epoch": 1.1932583651810509,
"grad_norm": 1.0143980979919434,
"learning_rate": 3.0251443197407307e-05,
"loss": 0.4762028503417969,
"step": 43400
},
{
"epoch": 1.1987572516565395,
"grad_norm": 1.160222053527832,
"learning_rate": 3.015937318737168e-05,
"loss": 0.4718109893798828,
"step": 43600
},
{
"epoch": 1.2042561381320283,
"grad_norm": 1.1540669202804565,
"learning_rate": 3.006730317733605e-05,
"loss": 0.47153167724609374,
"step": 43800
},
{
"epoch": 1.209755024607517,
"grad_norm": 1.3754700422286987,
"learning_rate": 2.9975233167300416e-05,
"loss": 0.4751555252075195,
"step": 44000
},
{
"epoch": 1.2152539110830056,
"grad_norm": 1.095689296722412,
"learning_rate": 2.9883163157264786e-05,
"loss": 0.47820320129394533,
"step": 44200
},
{
"epoch": 1.2207527975584944,
"grad_norm": 1.2152804136276245,
"learning_rate": 2.9791093147229154e-05,
"loss": 0.4785987091064453,
"step": 44400
},
{
"epoch": 1.2262516840339832,
"grad_norm": 1.3621678352355957,
"learning_rate": 2.969902313719352e-05,
"loss": 0.4778928375244141,
"step": 44600
},
{
"epoch": 1.2317505705094718,
"grad_norm": 1.3576879501342773,
"learning_rate": 2.9606953127157895e-05,
"loss": 0.46979766845703125,
"step": 44800
},
{
"epoch": 1.2372494569849606,
"grad_norm": 1.4446898698806763,
"learning_rate": 2.9514883117122262e-05,
"loss": 0.47956855773925783,
"step": 45000
},
{
"epoch": 1.2427483434604492,
"grad_norm": 1.1428676843643188,
"learning_rate": 2.9422813107086626e-05,
"loss": 0.46750675201416014,
"step": 45200
},
{
"epoch": 1.248247229935938,
"grad_norm": 1.1125656366348267,
"learning_rate": 2.9330743097051e-05,
"loss": 0.4821536254882812,
"step": 45400
},
{
"epoch": 1.2537461164114267,
"grad_norm": 0.9081394672393799,
"learning_rate": 2.9238673087015367e-05,
"loss": 0.48335330963134765,
"step": 45600
},
{
"epoch": 1.2592450028869153,
"grad_norm": 1.3965390920639038,
"learning_rate": 2.9146603076979738e-05,
"loss": 0.48991138458251954,
"step": 45800
},
{
"epoch": 1.2647438893624041,
"grad_norm": 0.9960418939590454,
"learning_rate": 2.9054533066944105e-05,
"loss": 0.48175228118896485,
"step": 46000
},
{
"epoch": 1.2702427758378927,
"grad_norm": 0.8425759077072144,
"learning_rate": 2.8962463056908472e-05,
"loss": 0.48490497589111325,
"step": 46200
},
{
"epoch": 1.2757416623133815,
"grad_norm": 0.8783431053161621,
"learning_rate": 2.8870393046872846e-05,
"loss": 0.4830588150024414,
"step": 46400
},
{
"epoch": 1.2812405487888703,
"grad_norm": 1.6315195560455322,
"learning_rate": 2.8778323036837214e-05,
"loss": 0.48057308197021487,
"step": 46600
},
{
"epoch": 1.286739435264359,
"grad_norm": 1.2200597524642944,
"learning_rate": 2.868625302680158e-05,
"loss": 0.48826507568359373,
"step": 46800
},
{
"epoch": 1.2922383217398477,
"grad_norm": 1.008957028388977,
"learning_rate": 2.859418301676595e-05,
"loss": 0.4910233306884766,
"step": 47000
},
{
"epoch": 1.2977372082153364,
"grad_norm": 0.9655813574790955,
"learning_rate": 2.850211300673032e-05,
"loss": 0.48260990142822263,
"step": 47200
},
{
"epoch": 1.303236094690825,
"grad_norm": 1.0368990898132324,
"learning_rate": 2.8410042996694686e-05,
"loss": 0.4869321060180664,
"step": 47400
},
{
"epoch": 1.3087349811663138,
"grad_norm": 1.0914088487625122,
"learning_rate": 2.8317972986659057e-05,
"loss": 0.4798837661743164,
"step": 47600
},
{
"epoch": 1.3142338676418026,
"grad_norm": 1.0549296140670776,
"learning_rate": 2.8225902976623424e-05,
"loss": 0.4868314743041992,
"step": 47800
},
{
"epoch": 1.3197327541172912,
"grad_norm": 0.9864702224731445,
"learning_rate": 2.813383296658779e-05,
"loss": 0.48143596649169923,
"step": 48000
},
{
"epoch": 1.32523164059278,
"grad_norm": 1.276328444480896,
"learning_rate": 2.8041762956552165e-05,
"loss": 0.4901668930053711,
"step": 48200
},
{
"epoch": 1.3307305270682686,
"grad_norm": 0.9716532826423645,
"learning_rate": 2.7949692946516532e-05,
"loss": 0.48207698822021483,
"step": 48400
},
{
"epoch": 1.3362294135437573,
"grad_norm": 1.3309965133666992,
"learning_rate": 2.7857622936480903e-05,
"loss": 0.4830322265625,
"step": 48600
},
{
"epoch": 1.3417283000192461,
"grad_norm": 0.8904381990432739,
"learning_rate": 2.776555292644527e-05,
"loss": 0.488801383972168,
"step": 48800
},
{
"epoch": 1.347227186494735,
"grad_norm": 1.4656221866607666,
"learning_rate": 2.7673482916409638e-05,
"loss": 0.48581710815429685,
"step": 49000
},
{
"epoch": 1.3527260729702235,
"grad_norm": 1.1317617893218994,
"learning_rate": 2.758141290637401e-05,
"loss": 0.4906336212158203,
"step": 49200
},
{
"epoch": 1.3582249594457123,
"grad_norm": 0.944570779800415,
"learning_rate": 2.7489342896338375e-05,
"loss": 0.4796075439453125,
"step": 49400
},
{
"epoch": 1.3637238459212009,
"grad_norm": 0.8989654779434204,
"learning_rate": 2.7397272886302743e-05,
"loss": 0.48385326385498045,
"step": 49600
},
{
"epoch": 1.3692227323966897,
"grad_norm": 1.2828127145767212,
"learning_rate": 2.7305202876267117e-05,
"loss": 0.4900363540649414,
"step": 49800
},
{
"epoch": 1.3747216188721785,
"grad_norm": 1.3695372343063354,
"learning_rate": 2.7213132866231484e-05,
"loss": 0.4815263366699219,
"step": 50000
},
{
"epoch": 1.380220505347667,
"grad_norm": 1.1346147060394287,
"learning_rate": 2.712106285619585e-05,
"loss": 0.48870357513427737,
"step": 50200
},
{
"epoch": 1.3857193918231558,
"grad_norm": 1.2779992818832397,
"learning_rate": 2.7028992846160222e-05,
"loss": 0.4858957290649414,
"step": 50400
},
{
"epoch": 1.3912182782986444,
"grad_norm": 1.0286052227020264,
"learning_rate": 2.693692283612459e-05,
"loss": 0.48650901794433593,
"step": 50600
},
{
"epoch": 1.3967171647741332,
"grad_norm": 1.0637270212173462,
"learning_rate": 2.6844852826088963e-05,
"loss": 0.48736335754394533,
"step": 50800
},
{
"epoch": 1.402216051249622,
"grad_norm": 1.3406178951263428,
"learning_rate": 2.675278281605333e-05,
"loss": 0.4900504684448242,
"step": 51000
},
{
"epoch": 1.4077149377251106,
"grad_norm": 1.1052333116531372,
"learning_rate": 2.6660712806017694e-05,
"loss": 0.4855587387084961,
"step": 51200
},
{
"epoch": 1.4132138242005994,
"grad_norm": 0.931908130645752,
"learning_rate": 2.6568642795982068e-05,
"loss": 0.4813541030883789,
"step": 51400
},
{
"epoch": 1.4187127106760882,
"grad_norm": 0.9499631524085999,
"learning_rate": 2.6476572785946435e-05,
"loss": 0.4899889373779297,
"step": 51600
},
{
"epoch": 1.4242115971515767,
"grad_norm": 1.1931513547897339,
"learning_rate": 2.6384502775910803e-05,
"loss": 0.48534503936767576,
"step": 51800
},
{
"epoch": 1.4297104836270655,
"grad_norm": 1.3906440734863281,
"learning_rate": 2.6292432765875173e-05,
"loss": 0.47944049835205077,
"step": 52000
},
{
"epoch": 1.4352093701025543,
"grad_norm": 1.1049039363861084,
"learning_rate": 2.620036275583954e-05,
"loss": 0.4796323776245117,
"step": 52200
},
{
"epoch": 1.440708256578043,
"grad_norm": 1.035280704498291,
"learning_rate": 2.6108292745803908e-05,
"loss": 0.4778638076782227,
"step": 52400
},
{
"epoch": 1.4462071430535317,
"grad_norm": 0.9371760487556458,
"learning_rate": 2.6016222735768282e-05,
"loss": 0.4937860870361328,
"step": 52600
},
{
"epoch": 1.4517060295290203,
"grad_norm": 0.932565450668335,
"learning_rate": 2.592415272573265e-05,
"loss": 0.48315887451171874,
"step": 52800
},
{
"epoch": 1.457204916004509,
"grad_norm": 1.1414536237716675,
"learning_rate": 2.5832082715697016e-05,
"loss": 0.48177513122558596,
"step": 53000
},
{
"epoch": 1.4627038024799979,
"grad_norm": 1.3313400745391846,
"learning_rate": 2.5740012705661387e-05,
"loss": 0.4810881423950195,
"step": 53200
},
{
"epoch": 1.4682026889554864,
"grad_norm": 0.9843188524246216,
"learning_rate": 2.5647942695625754e-05,
"loss": 0.48992759704589844,
"step": 53400
},
{
"epoch": 1.4737015754309752,
"grad_norm": 1.0765944719314575,
"learning_rate": 2.5555872685590128e-05,
"loss": 0.48404861450195313,
"step": 53600
},
{
"epoch": 1.4792004619064638,
"grad_norm": 0.9720175266265869,
"learning_rate": 2.5463802675554492e-05,
"loss": 0.48842796325683596,
"step": 53800
},
{
"epoch": 1.4846993483819526,
"grad_norm": 0.9759963154792786,
"learning_rate": 2.537173266551886e-05,
"loss": 0.47752620697021486,
"step": 54000
},
{
"epoch": 1.4901982348574414,
"grad_norm": 0.9573367834091187,
"learning_rate": 2.5279662655483233e-05,
"loss": 0.48062808990478517,
"step": 54200
},
{
"epoch": 1.4956971213329302,
"grad_norm": 1.292158603668213,
"learning_rate": 2.51875926454476e-05,
"loss": 0.487774658203125,
"step": 54400
},
{
"epoch": 1.5011960078084188,
"grad_norm": 1.4202347993850708,
"learning_rate": 2.5095522635411968e-05,
"loss": 0.4807822799682617,
"step": 54600
},
{
"epoch": 1.5066948942839073,
"grad_norm": 1.5612984895706177,
"learning_rate": 2.500345262537634e-05,
"loss": 0.4789771270751953,
"step": 54800
},
{
"epoch": 1.5121937807593961,
"grad_norm": 0.886279821395874,
"learning_rate": 2.4911382615340706e-05,
"loss": 0.482733268737793,
"step": 55000
},
{
"epoch": 1.517692667234885,
"grad_norm": 1.2323397397994995,
"learning_rate": 2.4819312605305076e-05,
"loss": 0.48148895263671876,
"step": 55200
},
{
"epoch": 1.5231915537103737,
"grad_norm": 1.1137135028839111,
"learning_rate": 2.4727242595269447e-05,
"loss": 0.48247013092041013,
"step": 55400
},
{
"epoch": 1.5286904401858623,
"grad_norm": 1.1854609251022339,
"learning_rate": 2.463517258523381e-05,
"loss": 0.48267646789550783,
"step": 55600
},
{
"epoch": 1.534189326661351,
"grad_norm": 1.1057685613632202,
"learning_rate": 2.454310257519818e-05,
"loss": 0.48411903381347654,
"step": 55800
},
{
"epoch": 1.5396882131368397,
"grad_norm": 1.2663975954055786,
"learning_rate": 2.4451032565162552e-05,
"loss": 0.4761699295043945,
"step": 56000
},
{
"epoch": 1.5451870996123285,
"grad_norm": 1.0173465013504028,
"learning_rate": 2.435896255512692e-05,
"loss": 0.48153770446777344,
"step": 56200
},
{
"epoch": 1.5506859860878173,
"grad_norm": 1.0407702922821045,
"learning_rate": 2.4266892545091287e-05,
"loss": 0.4878800201416016,
"step": 56400
},
{
"epoch": 1.556184872563306,
"grad_norm": 1.0399770736694336,
"learning_rate": 2.4174822535055657e-05,
"loss": 0.4796760177612305,
"step": 56600
},
{
"epoch": 1.5616837590387946,
"grad_norm": 1.2796666622161865,
"learning_rate": 2.4082752525020028e-05,
"loss": 0.47880504608154295,
"step": 56800
},
{
"epoch": 1.5671826455142832,
"grad_norm": 1.2479208707809448,
"learning_rate": 2.3990682514984395e-05,
"loss": 0.47731819152832033,
"step": 57000
},
{
"epoch": 1.572681531989772,
"grad_norm": 1.1050926446914673,
"learning_rate": 2.3898612504948766e-05,
"loss": 0.483460693359375,
"step": 57200
},
{
"epoch": 1.5781804184652608,
"grad_norm": 0.9544827342033386,
"learning_rate": 2.3806542494913133e-05,
"loss": 0.48048728942871094,
"step": 57400
},
{
"epoch": 1.5836793049407496,
"grad_norm": 1.063852071762085,
"learning_rate": 2.37144724848775e-05,
"loss": 0.485230827331543,
"step": 57600
},
{
"epoch": 1.5891781914162382,
"grad_norm": 1.1819310188293457,
"learning_rate": 2.362240247484187e-05,
"loss": 0.480164794921875,
"step": 57800
},
{
"epoch": 1.594677077891727,
"grad_norm": 1.021468162536621,
"learning_rate": 2.353033246480624e-05,
"loss": 0.4904788589477539,
"step": 58000
},
{
"epoch": 1.6001759643672155,
"grad_norm": 1.3577057123184204,
"learning_rate": 2.343826245477061e-05,
"loss": 0.48077606201171874,
"step": 58200
},
{
"epoch": 1.6056748508427043,
"grad_norm": 1.2617197036743164,
"learning_rate": 2.3346192444734976e-05,
"loss": 0.4806778717041016,
"step": 58400
},
{
"epoch": 1.6111737373181931,
"grad_norm": 1.2320860624313354,
"learning_rate": 2.3254122434699347e-05,
"loss": 0.4775208282470703,
"step": 58600
},
{
"epoch": 1.616672623793682,
"grad_norm": 0.9680395126342773,
"learning_rate": 2.3162052424663717e-05,
"loss": 0.48886814117431643,
"step": 58800
},
{
"epoch": 1.6221715102691705,
"grad_norm": 1.3157929182052612,
"learning_rate": 2.3069982414628084e-05,
"loss": 0.48573501586914064,
"step": 59000
},
{
"epoch": 1.627670396744659,
"grad_norm": 0.900864839553833,
"learning_rate": 2.297791240459245e-05,
"loss": 0.48609561920166017,
"step": 59200
},
{
"epoch": 1.6331692832201479,
"grad_norm": 1.0947906970977783,
"learning_rate": 2.2885842394556822e-05,
"loss": 0.4897247314453125,
"step": 59400
},
{
"epoch": 1.6386681696956367,
"grad_norm": 0.816973865032196,
"learning_rate": 2.2793772384521193e-05,
"loss": 0.47951828002929686,
"step": 59600
},
{
"epoch": 1.6441670561711255,
"grad_norm": 1.2236440181732178,
"learning_rate": 2.270170237448556e-05,
"loss": 0.4842032241821289,
"step": 59800
},
{
"epoch": 1.649665942646614,
"grad_norm": 1.1023343801498413,
"learning_rate": 2.2609632364449927e-05,
"loss": 0.4781660461425781,
"step": 60000
},
{
"epoch": 1.6551648291221026,
"grad_norm": 0.9589300155639648,
"learning_rate": 2.2517562354414298e-05,
"loss": 0.47841606140136717,
"step": 60200
},
{
"epoch": 1.6606637155975914,
"grad_norm": 1.3003031015396118,
"learning_rate": 2.242549234437867e-05,
"loss": 0.48363441467285156,
"step": 60400
},
{
"epoch": 1.6661626020730802,
"grad_norm": 0.9985244870185852,
"learning_rate": 2.2333422334343036e-05,
"loss": 0.48706722259521484,
"step": 60600
},
{
"epoch": 1.671661488548569,
"grad_norm": 1.319917917251587,
"learning_rate": 2.2241352324307403e-05,
"loss": 0.4843954086303711,
"step": 60800
},
{
"epoch": 1.6771603750240578,
"grad_norm": 1.3378630876541138,
"learning_rate": 2.2149282314271774e-05,
"loss": 0.48122127532958986,
"step": 61000
},
{
"epoch": 1.6826592614995464,
"grad_norm": 1.0471312999725342,
"learning_rate": 2.205721230423614e-05,
"loss": 0.48413547515869143,
"step": 61200
},
{
"epoch": 1.688158147975035,
"grad_norm": 1.0439791679382324,
"learning_rate": 2.196514229420051e-05,
"loss": 0.48604167938232423,
"step": 61400
},
{
"epoch": 1.6936570344505237,
"grad_norm": 0.9854567050933838,
"learning_rate": 2.187307228416488e-05,
"loss": 0.4817595291137695,
"step": 61600
},
{
"epoch": 1.6991559209260125,
"grad_norm": 1.1079517602920532,
"learning_rate": 2.178100227412925e-05,
"loss": 0.48393955230712893,
"step": 61800
},
{
"epoch": 1.7046548074015013,
"grad_norm": 1.1403529644012451,
"learning_rate": 2.1688932264093617e-05,
"loss": 0.47360748291015625,
"step": 62000
},
{
"epoch": 1.71015369387699,
"grad_norm": 0.8809356689453125,
"learning_rate": 2.1596862254057987e-05,
"loss": 0.47694496154785154,
"step": 62200
},
{
"epoch": 1.7156525803524785,
"grad_norm": 0.9528295993804932,
"learning_rate": 2.1504792244022358e-05,
"loss": 0.4844463348388672,
"step": 62400
},
{
"epoch": 1.7211514668279673,
"grad_norm": 1.0902634859085083,
"learning_rate": 2.1412722233986722e-05,
"loss": 0.47806488037109374,
"step": 62600
},
{
"epoch": 1.726650353303456,
"grad_norm": 1.0174310207366943,
"learning_rate": 2.1320652223951093e-05,
"loss": 0.48461170196533204,
"step": 62800
},
{
"epoch": 1.7321492397789449,
"grad_norm": 1.1780657768249512,
"learning_rate": 2.1228582213915463e-05,
"loss": 0.4868865203857422,
"step": 63000
},
{
"epoch": 1.7376481262544334,
"grad_norm": 1.257879614830017,
"learning_rate": 2.1136512203879834e-05,
"loss": 0.4772517776489258,
"step": 63200
},
{
"epoch": 1.7431470127299222,
"grad_norm": 2.5110182762145996,
"learning_rate": 2.10444421938442e-05,
"loss": 0.48027557373046875,
"step": 63400
},
{
"epoch": 1.7486458992054108,
"grad_norm": 1.061119556427002,
"learning_rate": 2.0952372183808568e-05,
"loss": 0.4825307846069336,
"step": 63600
},
{
"epoch": 1.7541447856808996,
"grad_norm": 1.3090649843215942,
"learning_rate": 2.086030217377294e-05,
"loss": 0.4777912902832031,
"step": 63800
},
{
"epoch": 1.7596436721563884,
"grad_norm": 0.8455436825752258,
"learning_rate": 2.0768232163737306e-05,
"loss": 0.4868216705322266,
"step": 64000
},
{
"epoch": 1.7651425586318772,
"grad_norm": 1.1341484785079956,
"learning_rate": 2.0676162153701677e-05,
"loss": 0.4825804901123047,
"step": 64200
},
{
"epoch": 1.7706414451073658,
"grad_norm": 0.9106566905975342,
"learning_rate": 2.0584092143666044e-05,
"loss": 0.480031852722168,
"step": 64400
},
{
"epoch": 1.7761403315828543,
"grad_norm": 0.8978875279426575,
"learning_rate": 2.0492022133630415e-05,
"loss": 0.48035388946533203,
"step": 64600
},
{
"epoch": 1.7816392180583431,
"grad_norm": 1.508074164390564,
"learning_rate": 2.0399952123594782e-05,
"loss": 0.4823148727416992,
"step": 64800
},
{
"epoch": 1.787138104533832,
"grad_norm": 1.0851056575775146,
"learning_rate": 2.0307882113559153e-05,
"loss": 0.4738383102416992,
"step": 65000
},
{
"epoch": 1.7926369910093207,
"grad_norm": 1.0651288032531738,
"learning_rate": 2.021581210352352e-05,
"loss": 0.4777484130859375,
"step": 65200
},
{
"epoch": 1.7981358774848093,
"grad_norm": 1.3095803260803223,
"learning_rate": 2.0123742093487887e-05,
"loss": 0.48325523376464846,
"step": 65400
},
{
"epoch": 1.803634763960298,
"grad_norm": 1.1658202409744263,
"learning_rate": 2.0031672083452258e-05,
"loss": 0.4814822769165039,
"step": 65600
},
{
"epoch": 1.8091336504357867,
"grad_norm": 0.974337637424469,
"learning_rate": 1.9939602073416628e-05,
"loss": 0.47399234771728516,
"step": 65800
},
{
"epoch": 1.8146325369112755,
"grad_norm": 0.914979875087738,
"learning_rate": 1.9847532063380995e-05,
"loss": 0.48681838989257814,
"step": 66000
},
{
"epoch": 1.8201314233867643,
"grad_norm": 0.7990674376487732,
"learning_rate": 1.9755462053345363e-05,
"loss": 0.47843902587890624,
"step": 66200
},
{
"epoch": 1.825630309862253,
"grad_norm": 1.2652182579040527,
"learning_rate": 1.9663392043309733e-05,
"loss": 0.4840336990356445,
"step": 66400
},
{
"epoch": 1.8311291963377416,
"grad_norm": 0.9367465376853943,
"learning_rate": 1.9571322033274104e-05,
"loss": 0.48031715393066404,
"step": 66600
},
{
"epoch": 1.8366280828132302,
"grad_norm": 0.9445034861564636,
"learning_rate": 1.947925202323847e-05,
"loss": 0.48153636932373045,
"step": 66800
},
{
"epoch": 1.842126969288719,
"grad_norm": 1.062595009803772,
"learning_rate": 1.938718201320284e-05,
"loss": 0.4798342514038086,
"step": 67000
},
{
"epoch": 1.8476258557642078,
"grad_norm": 1.0887633562088013,
"learning_rate": 1.929511200316721e-05,
"loss": 0.4826160430908203,
"step": 67200
},
{
"epoch": 1.8531247422396966,
"grad_norm": 1.4558460712432861,
"learning_rate": 1.920304199313158e-05,
"loss": 0.48820636749267576,
"step": 67400
},
{
"epoch": 1.8586236287151852,
"grad_norm": 0.9983727931976318,
"learning_rate": 1.9110971983095947e-05,
"loss": 0.4826961135864258,
"step": 67600
},
{
"epoch": 1.8641225151906737,
"grad_norm": 0.9502201676368713,
"learning_rate": 1.9018901973060314e-05,
"loss": 0.4772541046142578,
"step": 67800
},
{
"epoch": 1.8696214016661625,
"grad_norm": 0.9462329149246216,
"learning_rate": 1.8926831963024685e-05,
"loss": 0.4827272415161133,
"step": 68000
},
{
"epoch": 1.8751202881416513,
"grad_norm": 1.2585595846176147,
"learning_rate": 1.8834761952989056e-05,
"loss": 0.48325294494628906,
"step": 68200
},
{
"epoch": 1.8806191746171401,
"grad_norm": 1.0165777206420898,
"learning_rate": 1.8742691942953423e-05,
"loss": 0.4868499755859375,
"step": 68400
},
{
"epoch": 1.8861180610926287,
"grad_norm": 1.1448917388916016,
"learning_rate": 1.8650621932917793e-05,
"loss": 0.47457069396972656,
"step": 68600
},
{
"epoch": 1.8916169475681175,
"grad_norm": 0.9723443984985352,
"learning_rate": 1.855855192288216e-05,
"loss": 0.4808235168457031,
"step": 68800
},
{
"epoch": 1.897115834043606,
"grad_norm": 1.8042104244232178,
"learning_rate": 1.8466481912846528e-05,
"loss": 0.4818389892578125,
"step": 69000
},
{
"epoch": 1.9026147205190949,
"grad_norm": 1.1425598859786987,
"learning_rate": 1.83744119028109e-05,
"loss": 0.47744728088378907,
"step": 69200
},
{
"epoch": 1.9081136069945837,
"grad_norm": 1.3648266792297363,
"learning_rate": 1.828234189277527e-05,
"loss": 0.47696762084960936,
"step": 69400
},
{
"epoch": 1.9136124934700725,
"grad_norm": 1.2545722723007202,
"learning_rate": 1.8190271882739636e-05,
"loss": 0.4733824920654297,
"step": 69600
},
{
"epoch": 1.919111379945561,
"grad_norm": 1.1813223361968994,
"learning_rate": 1.8098201872704004e-05,
"loss": 0.4728484344482422,
"step": 69800
},
{
"epoch": 1.9246102664210496,
"grad_norm": 1.2796030044555664,
"learning_rate": 1.8006131862668374e-05,
"loss": 0.4804762649536133,
"step": 70000
},
{
"epoch": 1.9301091528965384,
"grad_norm": 1.3735687732696533,
"learning_rate": 1.7914061852632745e-05,
"loss": 0.4790033721923828,
"step": 70200
},
{
"epoch": 1.9356080393720272,
"grad_norm": 1.2554829120635986,
"learning_rate": 1.7821991842597112e-05,
"loss": 0.48504138946533204,
"step": 70400
},
{
"epoch": 1.941106925847516,
"grad_norm": 1.08273184299469,
"learning_rate": 1.772992183256148e-05,
"loss": 0.4772909545898438,
"step": 70600
},
{
"epoch": 1.9466058123230046,
"grad_norm": 0.6954657435417175,
"learning_rate": 1.763785182252585e-05,
"loss": 0.49507545471191405,
"step": 70800
},
{
"epoch": 1.9521046987984934,
"grad_norm": 1.014246940612793,
"learning_rate": 1.754578181249022e-05,
"loss": 0.4824806213378906,
"step": 71000
},
{
"epoch": 1.957603585273982,
"grad_norm": 1.005923867225647,
"learning_rate": 1.7453711802454588e-05,
"loss": 0.4811605453491211,
"step": 71200
},
{
"epoch": 1.9631024717494707,
"grad_norm": 1.1930160522460938,
"learning_rate": 1.7361641792418955e-05,
"loss": 0.4723471450805664,
"step": 71400
},
{
"epoch": 1.9686013582249595,
"grad_norm": 1.132750153541565,
"learning_rate": 1.7269571782383326e-05,
"loss": 0.4810772323608398,
"step": 71600
},
{
"epoch": 1.9741002447004483,
"grad_norm": 1.2968944311141968,
"learning_rate": 1.7177501772347693e-05,
"loss": 0.47881488800048827,
"step": 71800
},
{
"epoch": 1.9795991311759369,
"grad_norm": 1.342724084854126,
"learning_rate": 1.7085431762312064e-05,
"loss": 0.4765338134765625,
"step": 72000
},
{
"epoch": 1.9850980176514255,
"grad_norm": 1.0654747486114502,
"learning_rate": 1.699336175227643e-05,
"loss": 0.4823237228393555,
"step": 72200
},
{
"epoch": 1.9905969041269143,
"grad_norm": 1.0994575023651123,
"learning_rate": 1.69012917422408e-05,
"loss": 0.48160724639892577,
"step": 72400
},
{
"epoch": 1.996095790602403,
"grad_norm": 1.0896570682525635,
"learning_rate": 1.680922173220517e-05,
"loss": 0.4756970977783203,
"step": 72600
},
{
"epoch": 2.0,
"eval_loss": 0.4625195264816284,
"eval_runtime": 158.6666,
"eval_samples_per_second": 407.515,
"eval_steps_per_second": 25.475,
"step": 72742
}
],
"logging_steps": 200,
"max_steps": 109113,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.875850713799066e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}