{ "best_global_step": 72742, "best_metric": 0.4625195264816284, "best_model_checkpoint": "./chess_t5_model_hikaru/checkpoint-72742", "epoch": 2.0, "eval_steps": 500, "global_step": 72742, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005498886475488713, "grad_norm": 1.238595962524414, "learning_rate": 1.9900000000000003e-05, "loss": 0.506280403137207, "step": 200 }, { "epoch": 0.010997772950977426, "grad_norm": 1.4552940130233765, "learning_rate": 3.99e-05, "loss": 0.49826000213623045, "step": 400 }, { "epoch": 0.01649665942646614, "grad_norm": 1.404646873474121, "learning_rate": 4.9954425345032365e-05, "loss": 0.49065006256103516, "step": 600 }, { "epoch": 0.021995545901954853, "grad_norm": 1.1736574172973633, "learning_rate": 4.9862355334996736e-05, "loss": 0.4841666030883789, "step": 800 }, { "epoch": 0.02749443237744357, "grad_norm": 0.9726278185844421, "learning_rate": 4.97702853249611e-05, "loss": 0.49401542663574216, "step": 1000 }, { "epoch": 0.03299331885293228, "grad_norm": 1.2402087450027466, "learning_rate": 4.967821531492547e-05, "loss": 0.49149925231933594, "step": 1200 }, { "epoch": 0.03849220532842099, "grad_norm": 1.2934446334838867, "learning_rate": 4.958614530488984e-05, "loss": 0.4847659683227539, "step": 1400 }, { "epoch": 0.043991091803909706, "grad_norm": 1.5672705173492432, "learning_rate": 4.949407529485421e-05, "loss": 0.4918820571899414, "step": 1600 }, { "epoch": 0.049489978279398425, "grad_norm": 1.2068564891815186, "learning_rate": 4.940200528481858e-05, "loss": 0.48373069763183596, "step": 1800 }, { "epoch": 0.05498886475488714, "grad_norm": 1.1447815895080566, "learning_rate": 4.9309935274782946e-05, "loss": 0.49397098541259765, "step": 2000 }, { "epoch": 0.06048775123037585, "grad_norm": 1.3082820177078247, "learning_rate": 4.9217865264747316e-05, "loss": 0.4879715728759766, "step": 2200 }, { "epoch": 0.06598663770586456, "grad_norm": 1.322120189666748, "learning_rate": 4.912579525471169e-05, "loss": 0.4860882568359375, "step": 2400 }, { "epoch": 0.07148552418135327, "grad_norm": 1.3626155853271484, "learning_rate": 4.903372524467605e-05, "loss": 0.49077301025390624, "step": 2600 }, { "epoch": 0.07698441065684199, "grad_norm": 1.0659453868865967, "learning_rate": 4.894165523464042e-05, "loss": 0.48927955627441405, "step": 2800 }, { "epoch": 0.0824832971323307, "grad_norm": 1.1972386837005615, "learning_rate": 4.884958522460479e-05, "loss": 0.4904148101806641, "step": 3000 }, { "epoch": 0.08798218360781941, "grad_norm": 1.3156094551086426, "learning_rate": 4.875751521456916e-05, "loss": 0.4949393081665039, "step": 3200 }, { "epoch": 0.09348107008330812, "grad_norm": 1.1924458742141724, "learning_rate": 4.8665445204533527e-05, "loss": 0.48969757080078125, "step": 3400 }, { "epoch": 0.09897995655879685, "grad_norm": 1.4736772775650024, "learning_rate": 4.85733751944979e-05, "loss": 0.4906147384643555, "step": 3600 }, { "epoch": 0.10447884303428556, "grad_norm": 1.2425668239593506, "learning_rate": 4.848130518446227e-05, "loss": 0.48961822509765623, "step": 3800 }, { "epoch": 0.10997772950977427, "grad_norm": 1.2657986879348755, "learning_rate": 4.838923517442663e-05, "loss": 0.4902804183959961, "step": 4000 }, { "epoch": 0.11547661598526299, "grad_norm": 1.2814760208129883, "learning_rate": 4.829716516439101e-05, "loss": 0.4866915130615234, "step": 4200 }, { "epoch": 0.1209755024607517, "grad_norm": 1.3233275413513184, "learning_rate": 4.820509515435537e-05, "loss": 0.4842318344116211, "step": 4400 }, { "epoch": 0.1264743889362404, "grad_norm": 1.0813190937042236, "learning_rate": 4.8113025144319744e-05, "loss": 0.4887635040283203, "step": 4600 }, { "epoch": 0.13197327541172912, "grad_norm": 1.4319493770599365, "learning_rate": 4.8020955134284114e-05, "loss": 0.48523338317871095, "step": 4800 }, { "epoch": 0.13747216188721784, "grad_norm": 1.1767573356628418, "learning_rate": 4.792888512424848e-05, "loss": 0.4876384735107422, "step": 5000 }, { "epoch": 0.14297104836270655, "grad_norm": 1.2524778842926025, "learning_rate": 4.783681511421285e-05, "loss": 0.48621952056884765, "step": 5200 }, { "epoch": 0.14846993483819526, "grad_norm": 1.7391471862792969, "learning_rate": 4.774474510417722e-05, "loss": 0.4950310516357422, "step": 5400 }, { "epoch": 0.15396882131368397, "grad_norm": 1.3185511827468872, "learning_rate": 4.765267509414158e-05, "loss": 0.4825564193725586, "step": 5600 }, { "epoch": 0.15946770778917269, "grad_norm": 1.023362636566162, "learning_rate": 4.756060508410596e-05, "loss": 0.4869500732421875, "step": 5800 }, { "epoch": 0.1649665942646614, "grad_norm": 1.4824140071868896, "learning_rate": 4.7468535074070324e-05, "loss": 0.49057952880859373, "step": 6000 }, { "epoch": 0.1704654807401501, "grad_norm": 1.1914821863174438, "learning_rate": 4.7376465064034695e-05, "loss": 0.49073287963867185, "step": 6200 }, { "epoch": 0.17596436721563882, "grad_norm": 1.0815869569778442, "learning_rate": 4.7284395053999066e-05, "loss": 0.49227970123291015, "step": 6400 }, { "epoch": 0.18146325369112754, "grad_norm": 1.644206166267395, "learning_rate": 4.719232504396343e-05, "loss": 0.48635608673095704, "step": 6600 }, { "epoch": 0.18696214016661625, "grad_norm": 1.1657360792160034, "learning_rate": 4.710025503392781e-05, "loss": 0.49577178955078127, "step": 6800 }, { "epoch": 0.192461026642105, "grad_norm": 1.3343608379364014, "learning_rate": 4.700818502389217e-05, "loss": 0.48896270751953125, "step": 7000 }, { "epoch": 0.1979599131175937, "grad_norm": 1.036275863647461, "learning_rate": 4.6916115013856535e-05, "loss": 0.4918210983276367, "step": 7200 }, { "epoch": 0.2034587995930824, "grad_norm": 1.1466560363769531, "learning_rate": 4.682404500382091e-05, "loss": 0.4845957946777344, "step": 7400 }, { "epoch": 0.20895768606857112, "grad_norm": 1.2194637060165405, "learning_rate": 4.6731974993785276e-05, "loss": 0.48361312866210937, "step": 7600 }, { "epoch": 0.21445657254405984, "grad_norm": 1.0549407005310059, "learning_rate": 4.663990498374965e-05, "loss": 0.4873248291015625, "step": 7800 }, { "epoch": 0.21995545901954855, "grad_norm": 1.5164191722869873, "learning_rate": 4.654783497371402e-05, "loss": 0.48396194458007813, "step": 8000 }, { "epoch": 0.22545434549503726, "grad_norm": 0.9566870927810669, "learning_rate": 4.645576496367838e-05, "loss": 0.49601322174072265, "step": 8200 }, { "epoch": 0.23095323197052597, "grad_norm": 1.268650770187378, "learning_rate": 4.636369495364275e-05, "loss": 0.4893684387207031, "step": 8400 }, { "epoch": 0.2364521184460147, "grad_norm": 1.626772165298462, "learning_rate": 4.627162494360712e-05, "loss": 0.4904355621337891, "step": 8600 }, { "epoch": 0.2419510049215034, "grad_norm": 1.2238197326660156, "learning_rate": 4.6179554933571486e-05, "loss": 0.484462890625, "step": 8800 }, { "epoch": 0.2474498913969921, "grad_norm": 1.4520012140274048, "learning_rate": 4.608748492353586e-05, "loss": 0.4788643264770508, "step": 9000 }, { "epoch": 0.2529487778724808, "grad_norm": 1.1315358877182007, "learning_rate": 4.599541491350023e-05, "loss": 0.48716392517089846, "step": 9200 }, { "epoch": 0.25844766434796956, "grad_norm": 1.0476795434951782, "learning_rate": 4.59033449034646e-05, "loss": 0.4895347213745117, "step": 9400 }, { "epoch": 0.26394655082345825, "grad_norm": 1.100468635559082, "learning_rate": 4.581127489342897e-05, "loss": 0.4899191665649414, "step": 9600 }, { "epoch": 0.269445437298947, "grad_norm": 1.1204516887664795, "learning_rate": 4.571920488339333e-05, "loss": 0.4837226486206055, "step": 9800 }, { "epoch": 0.2749443237744357, "grad_norm": 1.271126627922058, "learning_rate": 4.56271348733577e-05, "loss": 0.48351455688476563, "step": 10000 }, { "epoch": 0.2804432102499244, "grad_norm": 1.294801115989685, "learning_rate": 4.5535064863322074e-05, "loss": 0.4861069107055664, "step": 10200 }, { "epoch": 0.2859420967254131, "grad_norm": 0.9449974894523621, "learning_rate": 4.5442994853286444e-05, "loss": 0.4828087997436523, "step": 10400 }, { "epoch": 0.29144098320090184, "grad_norm": 1.017383337020874, "learning_rate": 4.535092484325081e-05, "loss": 0.4907358551025391, "step": 10600 }, { "epoch": 0.2969398696763905, "grad_norm": 1.4358981847763062, "learning_rate": 4.525885483321518e-05, "loss": 0.482423210144043, "step": 10800 }, { "epoch": 0.30243875615187926, "grad_norm": 1.3579864501953125, "learning_rate": 4.516678482317955e-05, "loss": 0.4872136688232422, "step": 11000 }, { "epoch": 0.30793764262736795, "grad_norm": 1.309594750404358, "learning_rate": 4.5074714813143913e-05, "loss": 0.48710639953613283, "step": 11200 }, { "epoch": 0.3134365291028567, "grad_norm": 1.4916502237319946, "learning_rate": 4.4982644803108284e-05, "loss": 0.4856998062133789, "step": 11400 }, { "epoch": 0.31893541557834537, "grad_norm": 1.1984270811080933, "learning_rate": 4.4890574793072655e-05, "loss": 0.48433101654052735, "step": 11600 }, { "epoch": 0.3244343020538341, "grad_norm": 1.376825213432312, "learning_rate": 4.479850478303702e-05, "loss": 0.4843286514282227, "step": 11800 }, { "epoch": 0.3299331885293228, "grad_norm": 1.04801607131958, "learning_rate": 4.4706434773001396e-05, "loss": 0.4760005187988281, "step": 12000 }, { "epoch": 0.33543207500481154, "grad_norm": 1.277635097503662, "learning_rate": 4.461436476296576e-05, "loss": 0.4829146194458008, "step": 12200 }, { "epoch": 0.3409309614803002, "grad_norm": 1.08747398853302, "learning_rate": 4.452229475293013e-05, "loss": 0.49240009307861327, "step": 12400 }, { "epoch": 0.34642984795578896, "grad_norm": 1.1133017539978027, "learning_rate": 4.44302247428945e-05, "loss": 0.4923815536499023, "step": 12600 }, { "epoch": 0.35192873443127765, "grad_norm": 1.5661677122116089, "learning_rate": 4.4338154732858865e-05, "loss": 0.4879690933227539, "step": 12800 }, { "epoch": 0.3574276209067664, "grad_norm": 1.4570703506469727, "learning_rate": 4.4246084722823236e-05, "loss": 0.4856468963623047, "step": 13000 }, { "epoch": 0.36292650738225507, "grad_norm": 1.4638596773147583, "learning_rate": 4.4154014712787606e-05, "loss": 0.48412940979003904, "step": 13200 }, { "epoch": 0.3684253938577438, "grad_norm": 1.2463369369506836, "learning_rate": 4.406194470275197e-05, "loss": 0.4839859771728516, "step": 13400 }, { "epoch": 0.3739242803332325, "grad_norm": 1.0832504034042358, "learning_rate": 4.396987469271635e-05, "loss": 0.49135875701904297, "step": 13600 }, { "epoch": 0.37942316680872124, "grad_norm": 1.1107310056686401, "learning_rate": 4.387780468268071e-05, "loss": 0.47643829345703126, "step": 13800 }, { "epoch": 0.38492205328421, "grad_norm": 1.1073737144470215, "learning_rate": 4.378573467264508e-05, "loss": 0.4878357315063477, "step": 14000 }, { "epoch": 0.39042093975969866, "grad_norm": 1.4523825645446777, "learning_rate": 4.369366466260945e-05, "loss": 0.4929097747802734, "step": 14200 }, { "epoch": 0.3959198262351874, "grad_norm": 1.1978082656860352, "learning_rate": 4.3601594652573816e-05, "loss": 0.47999427795410154, "step": 14400 }, { "epoch": 0.4014187127106761, "grad_norm": 1.080812692642212, "learning_rate": 4.3509524642538194e-05, "loss": 0.4825727081298828, "step": 14600 }, { "epoch": 0.4069175991861648, "grad_norm": 1.053101897239685, "learning_rate": 4.341745463250256e-05, "loss": 0.4855875778198242, "step": 14800 }, { "epoch": 0.4124164856616535, "grad_norm": 1.5434905290603638, "learning_rate": 4.332538462246692e-05, "loss": 0.48418006896972654, "step": 15000 }, { "epoch": 0.41791537213714225, "grad_norm": 1.3098441362380981, "learning_rate": 4.32333146124313e-05, "loss": 0.47957420349121094, "step": 15200 }, { "epoch": 0.42341425861263093, "grad_norm": 1.1274868249893188, "learning_rate": 4.314124460239566e-05, "loss": 0.48411872863769534, "step": 15400 }, { "epoch": 0.4289131450881197, "grad_norm": 1.1913822889328003, "learning_rate": 4.3049174592360033e-05, "loss": 0.48610164642333986, "step": 15600 }, { "epoch": 0.43441203156360836, "grad_norm": 1.1664844751358032, "learning_rate": 4.2957104582324404e-05, "loss": 0.48251495361328123, "step": 15800 }, { "epoch": 0.4399109180390971, "grad_norm": 0.9833515882492065, "learning_rate": 4.286503457228877e-05, "loss": 0.48185344696044924, "step": 16000 }, { "epoch": 0.4454098045145858, "grad_norm": 1.3691802024841309, "learning_rate": 4.277296456225314e-05, "loss": 0.47827003479003904, "step": 16200 }, { "epoch": 0.4509086909900745, "grad_norm": 1.4538307189941406, "learning_rate": 4.268089455221751e-05, "loss": 0.48655067443847655, "step": 16400 }, { "epoch": 0.4564075774655632, "grad_norm": 1.6174641847610474, "learning_rate": 4.258882454218188e-05, "loss": 0.4811368179321289, "step": 16600 }, { "epoch": 0.46190646394105195, "grad_norm": 1.379770278930664, "learning_rate": 4.2496754532146244e-05, "loss": 0.4825275421142578, "step": 16800 }, { "epoch": 0.46740535041654063, "grad_norm": 1.1480027437210083, "learning_rate": 4.2404684522110614e-05, "loss": 0.4793708801269531, "step": 17000 }, { "epoch": 0.4729042368920294, "grad_norm": 1.2923580408096313, "learning_rate": 4.2312614512074985e-05, "loss": 0.48294658660888673, "step": 17200 }, { "epoch": 0.47840312336751806, "grad_norm": 1.1704210042953491, "learning_rate": 4.2220544502039356e-05, "loss": 0.48764766693115236, "step": 17400 }, { "epoch": 0.4839020098430068, "grad_norm": 0.9645224213600159, "learning_rate": 4.212847449200372e-05, "loss": 0.48104751586914063, "step": 17600 }, { "epoch": 0.4894008963184955, "grad_norm": 1.0854864120483398, "learning_rate": 4.203640448196809e-05, "loss": 0.48372928619384764, "step": 17800 }, { "epoch": 0.4948997827939842, "grad_norm": 1.058073878288269, "learning_rate": 4.194433447193246e-05, "loss": 0.481105842590332, "step": 18000 }, { "epoch": 0.500398669269473, "grad_norm": 1.1038442850112915, "learning_rate": 4.185226446189683e-05, "loss": 0.48221038818359374, "step": 18200 }, { "epoch": 0.5058975557449616, "grad_norm": 1.1211503744125366, "learning_rate": 4.1760194451861195e-05, "loss": 0.48461868286132814, "step": 18400 }, { "epoch": 0.5113964422204503, "grad_norm": 1.1851303577423096, "learning_rate": 4.1668124441825566e-05, "loss": 0.48900299072265624, "step": 18600 }, { "epoch": 0.5168953286959391, "grad_norm": 1.1773110628128052, "learning_rate": 4.1576054431789936e-05, "loss": 0.4895766067504883, "step": 18800 }, { "epoch": 0.5223942151714278, "grad_norm": 1.0236694812774658, "learning_rate": 4.14839844217543e-05, "loss": 0.47842552185058596, "step": 19000 }, { "epoch": 0.5278931016469165, "grad_norm": 1.2550437450408936, "learning_rate": 4.139191441171867e-05, "loss": 0.4883332061767578, "step": 19200 }, { "epoch": 0.5333919881224052, "grad_norm": 1.6811326742172241, "learning_rate": 4.129984440168304e-05, "loss": 0.48251426696777344, "step": 19400 }, { "epoch": 0.538890874597894, "grad_norm": 1.1312133073806763, "learning_rate": 4.1207774391647405e-05, "loss": 0.4811555480957031, "step": 19600 }, { "epoch": 0.5443897610733827, "grad_norm": 1.106419563293457, "learning_rate": 4.111570438161178e-05, "loss": 0.4829677963256836, "step": 19800 }, { "epoch": 0.5498886475488713, "grad_norm": 1.2335270643234253, "learning_rate": 4.102363437157615e-05, "loss": 0.48413619995117185, "step": 20000 }, { "epoch": 0.55538753402436, "grad_norm": 1.195844054222107, "learning_rate": 4.093156436154052e-05, "loss": 0.4821126937866211, "step": 20200 }, { "epoch": 0.5608864204998488, "grad_norm": 1.0814074277877808, "learning_rate": 4.083949435150489e-05, "loss": 0.4847369003295898, "step": 20400 }, { "epoch": 0.5663853069753375, "grad_norm": 1.4510689973831177, "learning_rate": 4.074742434146925e-05, "loss": 0.4875687789916992, "step": 20600 }, { "epoch": 0.5718841934508262, "grad_norm": 1.0444058179855347, "learning_rate": 4.065535433143363e-05, "loss": 0.4803382110595703, "step": 20800 }, { "epoch": 0.5773830799263149, "grad_norm": 1.1824759244918823, "learning_rate": 4.056328432139799e-05, "loss": 0.48757186889648435, "step": 21000 }, { "epoch": 0.5828819664018037, "grad_norm": 1.1672804355621338, "learning_rate": 4.047121431136236e-05, "loss": 0.47619979858398437, "step": 21200 }, { "epoch": 0.5883808528772924, "grad_norm": 1.3952018022537231, "learning_rate": 4.0379144301326734e-05, "loss": 0.4820771026611328, "step": 21400 }, { "epoch": 0.593879739352781, "grad_norm": 1.5481926202774048, "learning_rate": 4.02870742912911e-05, "loss": 0.4789703369140625, "step": 21600 }, { "epoch": 0.5993786258282697, "grad_norm": 1.1940809488296509, "learning_rate": 4.019500428125547e-05, "loss": 0.4823886871337891, "step": 21800 }, { "epoch": 0.6048775123037585, "grad_norm": 1.470038890838623, "learning_rate": 4.010293427121984e-05, "loss": 0.47876800537109376, "step": 22000 }, { "epoch": 0.6103763987792472, "grad_norm": 1.372512698173523, "learning_rate": 4.00108642611842e-05, "loss": 0.48137435913085935, "step": 22200 }, { "epoch": 0.6158752852547359, "grad_norm": 0.9625583291053772, "learning_rate": 3.991879425114858e-05, "loss": 0.4751309967041016, "step": 22400 }, { "epoch": 0.6213741717302246, "grad_norm": 1.0047613382339478, "learning_rate": 3.9826724241112945e-05, "loss": 0.4809339141845703, "step": 22600 }, { "epoch": 0.6268730582057134, "grad_norm": 1.8941971063613892, "learning_rate": 3.973465423107731e-05, "loss": 0.47376441955566406, "step": 22800 }, { "epoch": 0.6323719446812021, "grad_norm": 1.0294033288955688, "learning_rate": 3.9642584221041686e-05, "loss": 0.4846999740600586, "step": 23000 }, { "epoch": 0.6378708311566907, "grad_norm": 1.1899781227111816, "learning_rate": 3.955051421100605e-05, "loss": 0.4818299865722656, "step": 23200 }, { "epoch": 0.6433697176321795, "grad_norm": 1.5099271535873413, "learning_rate": 3.945844420097042e-05, "loss": 0.4828767776489258, "step": 23400 }, { "epoch": 0.6488686041076682, "grad_norm": 1.3377799987792969, "learning_rate": 3.936637419093479e-05, "loss": 0.47890872955322267, "step": 23600 }, { "epoch": 0.6543674905831569, "grad_norm": 1.6240547895431519, "learning_rate": 3.9274304180899155e-05, "loss": 0.4793845748901367, "step": 23800 }, { "epoch": 0.6598663770586456, "grad_norm": 1.32374107837677, "learning_rate": 3.9182234170863525e-05, "loss": 0.48126724243164065, "step": 24000 }, { "epoch": 0.6653652635341344, "grad_norm": 1.1302155256271362, "learning_rate": 3.9090164160827896e-05, "loss": 0.4794307708740234, "step": 24200 }, { "epoch": 0.6708641500096231, "grad_norm": 1.2106575965881348, "learning_rate": 3.899809415079227e-05, "loss": 0.4764822769165039, "step": 24400 }, { "epoch": 0.6763630364851118, "grad_norm": 1.1682376861572266, "learning_rate": 3.890602414075663e-05, "loss": 0.48130035400390625, "step": 24600 }, { "epoch": 0.6818619229606004, "grad_norm": 1.7385523319244385, "learning_rate": 3.8813954130721e-05, "loss": 0.48233951568603517, "step": 24800 }, { "epoch": 0.6873608094360892, "grad_norm": 0.9956115484237671, "learning_rate": 3.872188412068537e-05, "loss": 0.47769607543945314, "step": 25000 }, { "epoch": 0.6928596959115779, "grad_norm": 0.9261813759803772, "learning_rate": 3.862981411064974e-05, "loss": 0.4826504898071289, "step": 25200 }, { "epoch": 0.6983585823870666, "grad_norm": 1.0754562616348267, "learning_rate": 3.8537744100614106e-05, "loss": 0.48267059326171874, "step": 25400 }, { "epoch": 0.7038574688625553, "grad_norm": 1.2435545921325684, "learning_rate": 3.844567409057848e-05, "loss": 0.48062828063964846, "step": 25600 }, { "epoch": 0.7093563553380441, "grad_norm": 1.1161478757858276, "learning_rate": 3.835360408054285e-05, "loss": 0.476544189453125, "step": 25800 }, { "epoch": 0.7148552418135328, "grad_norm": 1.144326090812683, "learning_rate": 3.826153407050722e-05, "loss": 0.4830437469482422, "step": 26000 }, { "epoch": 0.7203541282890215, "grad_norm": 1.2163105010986328, "learning_rate": 3.816946406047158e-05, "loss": 0.48178863525390625, "step": 26200 }, { "epoch": 0.7258530147645101, "grad_norm": 1.3089566230773926, "learning_rate": 3.807739405043595e-05, "loss": 0.4754468536376953, "step": 26400 }, { "epoch": 0.7313519012399989, "grad_norm": 1.2991975545883179, "learning_rate": 3.798532404040032e-05, "loss": 0.4895411682128906, "step": 26600 }, { "epoch": 0.7368507877154876, "grad_norm": 1.6097289323806763, "learning_rate": 3.789325403036469e-05, "loss": 0.47313800811767576, "step": 26800 }, { "epoch": 0.7423496741909763, "grad_norm": 1.4237576723098755, "learning_rate": 3.7801184020329065e-05, "loss": 0.47288108825683595, "step": 27000 }, { "epoch": 0.747848560666465, "grad_norm": 1.7340173721313477, "learning_rate": 3.770911401029343e-05, "loss": 0.4713779067993164, "step": 27200 }, { "epoch": 0.7533474471419538, "grad_norm": 1.3480178117752075, "learning_rate": 3.761704400025779e-05, "loss": 0.4823367309570312, "step": 27400 }, { "epoch": 0.7588463336174425, "grad_norm": 0.945102870464325, "learning_rate": 3.752497399022217e-05, "loss": 0.485689697265625, "step": 27600 }, { "epoch": 0.7643452200929312, "grad_norm": 1.5504003763198853, "learning_rate": 3.7432903980186534e-05, "loss": 0.4697317886352539, "step": 27800 }, { "epoch": 0.76984410656842, "grad_norm": 1.4954441785812378, "learning_rate": 3.7340833970150904e-05, "loss": 0.4746841049194336, "step": 28000 }, { "epoch": 0.7753429930439086, "grad_norm": 1.660771131515503, "learning_rate": 3.7248763960115275e-05, "loss": 0.48746414184570314, "step": 28200 }, { "epoch": 0.7808418795193973, "grad_norm": 1.216834306716919, "learning_rate": 3.715669395007964e-05, "loss": 0.4784600067138672, "step": 28400 }, { "epoch": 0.786340765994886, "grad_norm": 1.3025329113006592, "learning_rate": 3.7064623940044016e-05, "loss": 0.48134098052978513, "step": 28600 }, { "epoch": 0.7918396524703748, "grad_norm": 0.8612267374992371, "learning_rate": 3.697255393000838e-05, "loss": 0.48288066864013673, "step": 28800 }, { "epoch": 0.7973385389458635, "grad_norm": 1.5112066268920898, "learning_rate": 3.6880483919972744e-05, "loss": 0.48638771057128904, "step": 29000 }, { "epoch": 0.8028374254213522, "grad_norm": 1.2981903553009033, "learning_rate": 3.678841390993712e-05, "loss": 0.4764302444458008, "step": 29200 }, { "epoch": 0.8083363118968409, "grad_norm": 1.2499499320983887, "learning_rate": 3.6696343899901485e-05, "loss": 0.47807662963867187, "step": 29400 }, { "epoch": 0.8138351983723296, "grad_norm": 1.4974340200424194, "learning_rate": 3.6604273889865856e-05, "loss": 0.48103851318359375, "step": 29600 }, { "epoch": 0.8193340848478183, "grad_norm": 1.6043846607208252, "learning_rate": 3.6512203879830226e-05, "loss": 0.4745806121826172, "step": 29800 }, { "epoch": 0.824832971323307, "grad_norm": 1.0718004703521729, "learning_rate": 3.642013386979459e-05, "loss": 0.4758340835571289, "step": 30000 }, { "epoch": 0.8303318577987957, "grad_norm": 1.31827712059021, "learning_rate": 3.632806385975897e-05, "loss": 0.48326473236083983, "step": 30200 }, { "epoch": 0.8358307442742845, "grad_norm": 1.214794635772705, "learning_rate": 3.623599384972333e-05, "loss": 0.4670214080810547, "step": 30400 }, { "epoch": 0.8413296307497732, "grad_norm": 1.3490458726882935, "learning_rate": 3.61439238396877e-05, "loss": 0.4771783065795898, "step": 30600 }, { "epoch": 0.8468285172252619, "grad_norm": 1.7430227994918823, "learning_rate": 3.605185382965207e-05, "loss": 0.47809303283691407, "step": 30800 }, { "epoch": 0.8523274037007506, "grad_norm": 1.04710054397583, "learning_rate": 3.5959783819616437e-05, "loss": 0.47361648559570313, "step": 31000 }, { "epoch": 0.8578262901762393, "grad_norm": 1.239403247833252, "learning_rate": 3.586771380958081e-05, "loss": 0.47289577484130857, "step": 31200 }, { "epoch": 0.863325176651728, "grad_norm": 1.0348613262176514, "learning_rate": 3.577564379954518e-05, "loss": 0.4844191360473633, "step": 31400 }, { "epoch": 0.8688240631272167, "grad_norm": 1.2087358236312866, "learning_rate": 3.568357378950954e-05, "loss": 0.4849067687988281, "step": 31600 }, { "epoch": 0.8743229496027054, "grad_norm": 1.498613715171814, "learning_rate": 3.559150377947391e-05, "loss": 0.4736307907104492, "step": 31800 }, { "epoch": 0.8798218360781942, "grad_norm": 1.2673721313476562, "learning_rate": 3.549943376943828e-05, "loss": 0.4751145553588867, "step": 32000 }, { "epoch": 0.8853207225536829, "grad_norm": 1.078145980834961, "learning_rate": 3.5407363759402654e-05, "loss": 0.48442405700683594, "step": 32200 }, { "epoch": 0.8908196090291716, "grad_norm": 1.8665213584899902, "learning_rate": 3.531529374936702e-05, "loss": 0.47800296783447266, "step": 32400 }, { "epoch": 0.8963184955046603, "grad_norm": 1.093640685081482, "learning_rate": 3.522322373933139e-05, "loss": 0.4768505859375, "step": 32600 }, { "epoch": 0.901817381980149, "grad_norm": 1.438798189163208, "learning_rate": 3.513115372929576e-05, "loss": 0.4752470016479492, "step": 32800 }, { "epoch": 0.9073162684556377, "grad_norm": 1.156036376953125, "learning_rate": 3.503908371926013e-05, "loss": 0.47669639587402346, "step": 33000 }, { "epoch": 0.9128151549311264, "grad_norm": 1.223441481590271, "learning_rate": 3.494701370922449e-05, "loss": 0.4820696258544922, "step": 33200 }, { "epoch": 0.9183140414066152, "grad_norm": 1.161592721939087, "learning_rate": 3.4854943699188864e-05, "loss": 0.46704940795898436, "step": 33400 }, { "epoch": 0.9238129278821039, "grad_norm": 1.217645287513733, "learning_rate": 3.4762873689153234e-05, "loss": 0.4787548065185547, "step": 33600 }, { "epoch": 0.9293118143575926, "grad_norm": 1.2599478960037231, "learning_rate": 3.4670803679117605e-05, "loss": 0.47969642639160154, "step": 33800 }, { "epoch": 0.9348107008330813, "grad_norm": 1.1119675636291504, "learning_rate": 3.457873366908197e-05, "loss": 0.48166824340820313, "step": 34000 }, { "epoch": 0.9403095873085701, "grad_norm": 1.4451464414596558, "learning_rate": 3.448666365904634e-05, "loss": 0.4774625015258789, "step": 34200 }, { "epoch": 0.9458084737840587, "grad_norm": 1.121450662612915, "learning_rate": 3.439459364901071e-05, "loss": 0.4775946426391602, "step": 34400 }, { "epoch": 0.9513073602595474, "grad_norm": 1.7251038551330566, "learning_rate": 3.4302523638975074e-05, "loss": 0.46810245513916016, "step": 34600 }, { "epoch": 0.9568062467350361, "grad_norm": 1.1376259326934814, "learning_rate": 3.421045362893945e-05, "loss": 0.4734595108032227, "step": 34800 }, { "epoch": 0.9623051332105249, "grad_norm": 1.3909783363342285, "learning_rate": 3.4118383618903815e-05, "loss": 0.4745623016357422, "step": 35000 }, { "epoch": 0.9678040196860136, "grad_norm": 1.4496464729309082, "learning_rate": 3.4026313608868186e-05, "loss": 0.4793576431274414, "step": 35200 }, { "epoch": 0.9733029061615023, "grad_norm": 1.188259482383728, "learning_rate": 3.3934243598832557e-05, "loss": 0.48435794830322265, "step": 35400 }, { "epoch": 0.978801792636991, "grad_norm": 0.972775936126709, "learning_rate": 3.384217358879692e-05, "loss": 0.48073070526123046, "step": 35600 }, { "epoch": 0.9843006791124798, "grad_norm": 1.3712236881256104, "learning_rate": 3.375010357876129e-05, "loss": 0.47246246337890624, "step": 35800 }, { "epoch": 0.9897995655879684, "grad_norm": 1.0553455352783203, "learning_rate": 3.365803356872566e-05, "loss": 0.4749702835083008, "step": 36000 }, { "epoch": 0.9952984520634571, "grad_norm": 2.0960538387298584, "learning_rate": 3.3565963558690026e-05, "loss": 0.48137100219726564, "step": 36200 }, { "epoch": 1.0, "eval_loss": 0.46924570202827454, "eval_runtime": 158.4469, "eval_samples_per_second": 408.08, "eval_steps_per_second": 25.51, "step": 36371 }, { "epoch": 1.000797338538946, "grad_norm": 1.0945351123809814, "learning_rate": 3.34738935486544e-05, "loss": 0.47277111053466797, "step": 36400 }, { "epoch": 1.0062962250144345, "grad_norm": 1.1377208232879639, "learning_rate": 3.338182353861877e-05, "loss": 0.4704814147949219, "step": 36600 }, { "epoch": 1.0117951114899233, "grad_norm": 1.2042992115020752, "learning_rate": 3.328975352858314e-05, "loss": 0.4778765869140625, "step": 36800 }, { "epoch": 1.017293997965412, "grad_norm": 1.2293647527694702, "learning_rate": 3.319768351854751e-05, "loss": 0.4779492950439453, "step": 37000 }, { "epoch": 1.0227928844409007, "grad_norm": 1.0912444591522217, "learning_rate": 3.310561350851187e-05, "loss": 0.47877525329589843, "step": 37200 }, { "epoch": 1.0282917709163895, "grad_norm": 1.2448941469192505, "learning_rate": 3.301354349847624e-05, "loss": 0.4758515930175781, "step": 37400 }, { "epoch": 1.033790657391878, "grad_norm": 1.127113699913025, "learning_rate": 3.292147348844061e-05, "loss": 0.47277240753173827, "step": 37600 }, { "epoch": 1.0392895438673668, "grad_norm": 1.184788703918457, "learning_rate": 3.282940347840498e-05, "loss": 0.48051612854003906, "step": 37800 }, { "epoch": 1.0447884303428556, "grad_norm": 1.3059478998184204, "learning_rate": 3.2737333468369354e-05, "loss": 0.4780512237548828, "step": 38000 }, { "epoch": 1.0502873168183442, "grad_norm": 1.035843014717102, "learning_rate": 3.264526345833372e-05, "loss": 0.4778805160522461, "step": 38200 }, { "epoch": 1.055786203293833, "grad_norm": 1.142691731452942, "learning_rate": 3.255319344829809e-05, "loss": 0.4747405242919922, "step": 38400 }, { "epoch": 1.0612850897693218, "grad_norm": 1.2115979194641113, "learning_rate": 3.246112343826246e-05, "loss": 0.4684751510620117, "step": 38600 }, { "epoch": 1.0667839762448104, "grad_norm": 1.0604227781295776, "learning_rate": 3.2369053428226823e-05, "loss": 0.4836904525756836, "step": 38800 }, { "epoch": 1.0722828627202992, "grad_norm": 1.2616559267044067, "learning_rate": 3.2276983418191194e-05, "loss": 0.47024051666259764, "step": 39000 }, { "epoch": 1.077781749195788, "grad_norm": 1.1861746311187744, "learning_rate": 3.2184913408155565e-05, "loss": 0.47626224517822263, "step": 39200 }, { "epoch": 1.0832806356712765, "grad_norm": 1.0768451690673828, "learning_rate": 3.209284339811993e-05, "loss": 0.4712419128417969, "step": 39400 }, { "epoch": 1.0887795221467653, "grad_norm": 1.1116639375686646, "learning_rate": 3.20007733880843e-05, "loss": 0.47870445251464844, "step": 39600 }, { "epoch": 1.094278408622254, "grad_norm": 0.9229024648666382, "learning_rate": 3.190870337804867e-05, "loss": 0.47164249420166016, "step": 39800 }, { "epoch": 1.0997772950977427, "grad_norm": 1.2584002017974854, "learning_rate": 3.181663336801304e-05, "loss": 0.46996349334716797, "step": 40000 }, { "epoch": 1.1052761815732315, "grad_norm": 1.1987744569778442, "learning_rate": 3.1724563357977404e-05, "loss": 0.47581478118896486, "step": 40200 }, { "epoch": 1.11077506804872, "grad_norm": 1.897595763206482, "learning_rate": 3.1632493347941775e-05, "loss": 0.47223583221435544, "step": 40400 }, { "epoch": 1.1162739545242089, "grad_norm": 1.384735345840454, "learning_rate": 3.1540423337906146e-05, "loss": 0.4742586898803711, "step": 40600 }, { "epoch": 1.1217728409996977, "grad_norm": 1.2924162149429321, "learning_rate": 3.1448353327870516e-05, "loss": 0.4763710403442383, "step": 40800 }, { "epoch": 1.1272717274751862, "grad_norm": 1.2529865503311157, "learning_rate": 3.135628331783489e-05, "loss": 0.4804756546020508, "step": 41000 }, { "epoch": 1.132770613950675, "grad_norm": 1.0378504991531372, "learning_rate": 3.126421330779925e-05, "loss": 0.4701519775390625, "step": 41200 }, { "epoch": 1.1382695004261638, "grad_norm": 1.3165602684020996, "learning_rate": 3.117214329776362e-05, "loss": 0.4799094009399414, "step": 41400 }, { "epoch": 1.1437683869016524, "grad_norm": 1.3106869459152222, "learning_rate": 3.108007328772799e-05, "loss": 0.4807415771484375, "step": 41600 }, { "epoch": 1.1492672733771412, "grad_norm": 1.870168685913086, "learning_rate": 3.0988003277692356e-05, "loss": 0.4763855743408203, "step": 41800 }, { "epoch": 1.1547661598526298, "grad_norm": 1.2770658731460571, "learning_rate": 3.0895933267656726e-05, "loss": 0.47005424499511717, "step": 42000 }, { "epoch": 1.1602650463281186, "grad_norm": 1.2080628871917725, "learning_rate": 3.08038632576211e-05, "loss": 0.4715093231201172, "step": 42200 }, { "epoch": 1.1657639328036074, "grad_norm": 1.8036431074142456, "learning_rate": 3.071179324758546e-05, "loss": 0.4677348327636719, "step": 42400 }, { "epoch": 1.171262819279096, "grad_norm": 1.0280815362930298, "learning_rate": 3.061972323754984e-05, "loss": 0.4739281463623047, "step": 42600 }, { "epoch": 1.1767617057545847, "grad_norm": 0.9961258769035339, "learning_rate": 3.05276532275142e-05, "loss": 0.4825423049926758, "step": 42800 }, { "epoch": 1.1822605922300733, "grad_norm": 1.0836036205291748, "learning_rate": 3.0435583217478576e-05, "loss": 0.47616680145263673, "step": 43000 }, { "epoch": 1.187759478705562, "grad_norm": 0.9266841411590576, "learning_rate": 3.0343513207442943e-05, "loss": 0.47153358459472655, "step": 43200 }, { "epoch": 1.1932583651810509, "grad_norm": 1.0143980979919434, "learning_rate": 3.0251443197407307e-05, "loss": 0.4762028503417969, "step": 43400 }, { "epoch": 1.1987572516565395, "grad_norm": 1.160222053527832, "learning_rate": 3.015937318737168e-05, "loss": 0.4718109893798828, "step": 43600 }, { "epoch": 1.2042561381320283, "grad_norm": 1.1540669202804565, "learning_rate": 3.006730317733605e-05, "loss": 0.47153167724609374, "step": 43800 }, { "epoch": 1.209755024607517, "grad_norm": 1.3754700422286987, "learning_rate": 2.9975233167300416e-05, "loss": 0.4751555252075195, "step": 44000 }, { "epoch": 1.2152539110830056, "grad_norm": 1.095689296722412, "learning_rate": 2.9883163157264786e-05, "loss": 0.47820320129394533, "step": 44200 }, { "epoch": 1.2207527975584944, "grad_norm": 1.2152804136276245, "learning_rate": 2.9791093147229154e-05, "loss": 0.4785987091064453, "step": 44400 }, { "epoch": 1.2262516840339832, "grad_norm": 1.3621678352355957, "learning_rate": 2.969902313719352e-05, "loss": 0.4778928375244141, "step": 44600 }, { "epoch": 1.2317505705094718, "grad_norm": 1.3576879501342773, "learning_rate": 2.9606953127157895e-05, "loss": 0.46979766845703125, "step": 44800 }, { "epoch": 1.2372494569849606, "grad_norm": 1.4446898698806763, "learning_rate": 2.9514883117122262e-05, "loss": 0.47956855773925783, "step": 45000 }, { "epoch": 1.2427483434604492, "grad_norm": 1.1428676843643188, "learning_rate": 2.9422813107086626e-05, "loss": 0.46750675201416014, "step": 45200 }, { "epoch": 1.248247229935938, "grad_norm": 1.1125656366348267, "learning_rate": 2.9330743097051e-05, "loss": 0.4821536254882812, "step": 45400 }, { "epoch": 1.2537461164114267, "grad_norm": 0.9081394672393799, "learning_rate": 2.9238673087015367e-05, "loss": 0.48335330963134765, "step": 45600 }, { "epoch": 1.2592450028869153, "grad_norm": 1.3965390920639038, "learning_rate": 2.9146603076979738e-05, "loss": 0.48991138458251954, "step": 45800 }, { "epoch": 1.2647438893624041, "grad_norm": 0.9960418939590454, "learning_rate": 2.9054533066944105e-05, "loss": 0.48175228118896485, "step": 46000 }, { "epoch": 1.2702427758378927, "grad_norm": 0.8425759077072144, "learning_rate": 2.8962463056908472e-05, "loss": 0.48490497589111325, "step": 46200 }, { "epoch": 1.2757416623133815, "grad_norm": 0.8783431053161621, "learning_rate": 2.8870393046872846e-05, "loss": 0.4830588150024414, "step": 46400 }, { "epoch": 1.2812405487888703, "grad_norm": 1.6315195560455322, "learning_rate": 2.8778323036837214e-05, "loss": 0.48057308197021487, "step": 46600 }, { "epoch": 1.286739435264359, "grad_norm": 1.2200597524642944, "learning_rate": 2.868625302680158e-05, "loss": 0.48826507568359373, "step": 46800 }, { "epoch": 1.2922383217398477, "grad_norm": 1.008957028388977, "learning_rate": 2.859418301676595e-05, "loss": 0.4910233306884766, "step": 47000 }, { "epoch": 1.2977372082153364, "grad_norm": 0.9655813574790955, "learning_rate": 2.850211300673032e-05, "loss": 0.48260990142822263, "step": 47200 }, { "epoch": 1.303236094690825, "grad_norm": 1.0368990898132324, "learning_rate": 2.8410042996694686e-05, "loss": 0.4869321060180664, "step": 47400 }, { "epoch": 1.3087349811663138, "grad_norm": 1.0914088487625122, "learning_rate": 2.8317972986659057e-05, "loss": 0.4798837661743164, "step": 47600 }, { "epoch": 1.3142338676418026, "grad_norm": 1.0549296140670776, "learning_rate": 2.8225902976623424e-05, "loss": 0.4868314743041992, "step": 47800 }, { "epoch": 1.3197327541172912, "grad_norm": 0.9864702224731445, "learning_rate": 2.813383296658779e-05, "loss": 0.48143596649169923, "step": 48000 }, { "epoch": 1.32523164059278, "grad_norm": 1.276328444480896, "learning_rate": 2.8041762956552165e-05, "loss": 0.4901668930053711, "step": 48200 }, { "epoch": 1.3307305270682686, "grad_norm": 0.9716532826423645, "learning_rate": 2.7949692946516532e-05, "loss": 0.48207698822021483, "step": 48400 }, { "epoch": 1.3362294135437573, "grad_norm": 1.3309965133666992, "learning_rate": 2.7857622936480903e-05, "loss": 0.4830322265625, "step": 48600 }, { "epoch": 1.3417283000192461, "grad_norm": 0.8904381990432739, "learning_rate": 2.776555292644527e-05, "loss": 0.488801383972168, "step": 48800 }, { "epoch": 1.347227186494735, "grad_norm": 1.4656221866607666, "learning_rate": 2.7673482916409638e-05, "loss": 0.48581710815429685, "step": 49000 }, { "epoch": 1.3527260729702235, "grad_norm": 1.1317617893218994, "learning_rate": 2.758141290637401e-05, "loss": 0.4906336212158203, "step": 49200 }, { "epoch": 1.3582249594457123, "grad_norm": 0.944570779800415, "learning_rate": 2.7489342896338375e-05, "loss": 0.4796075439453125, "step": 49400 }, { "epoch": 1.3637238459212009, "grad_norm": 0.8989654779434204, "learning_rate": 2.7397272886302743e-05, "loss": 0.48385326385498045, "step": 49600 }, { "epoch": 1.3692227323966897, "grad_norm": 1.2828127145767212, "learning_rate": 2.7305202876267117e-05, "loss": 0.4900363540649414, "step": 49800 }, { "epoch": 1.3747216188721785, "grad_norm": 1.3695372343063354, "learning_rate": 2.7213132866231484e-05, "loss": 0.4815263366699219, "step": 50000 }, { "epoch": 1.380220505347667, "grad_norm": 1.1346147060394287, "learning_rate": 2.712106285619585e-05, "loss": 0.48870357513427737, "step": 50200 }, { "epoch": 1.3857193918231558, "grad_norm": 1.2779992818832397, "learning_rate": 2.7028992846160222e-05, "loss": 0.4858957290649414, "step": 50400 }, { "epoch": 1.3912182782986444, "grad_norm": 1.0286052227020264, "learning_rate": 2.693692283612459e-05, "loss": 0.48650901794433593, "step": 50600 }, { "epoch": 1.3967171647741332, "grad_norm": 1.0637270212173462, "learning_rate": 2.6844852826088963e-05, "loss": 0.48736335754394533, "step": 50800 }, { "epoch": 1.402216051249622, "grad_norm": 1.3406178951263428, "learning_rate": 2.675278281605333e-05, "loss": 0.4900504684448242, "step": 51000 }, { "epoch": 1.4077149377251106, "grad_norm": 1.1052333116531372, "learning_rate": 2.6660712806017694e-05, "loss": 0.4855587387084961, "step": 51200 }, { "epoch": 1.4132138242005994, "grad_norm": 0.931908130645752, "learning_rate": 2.6568642795982068e-05, "loss": 0.4813541030883789, "step": 51400 }, { "epoch": 1.4187127106760882, "grad_norm": 0.9499631524085999, "learning_rate": 2.6476572785946435e-05, "loss": 0.4899889373779297, "step": 51600 }, { "epoch": 1.4242115971515767, "grad_norm": 1.1931513547897339, "learning_rate": 2.6384502775910803e-05, "loss": 0.48534503936767576, "step": 51800 }, { "epoch": 1.4297104836270655, "grad_norm": 1.3906440734863281, "learning_rate": 2.6292432765875173e-05, "loss": 0.47944049835205077, "step": 52000 }, { "epoch": 1.4352093701025543, "grad_norm": 1.1049039363861084, "learning_rate": 2.620036275583954e-05, "loss": 0.4796323776245117, "step": 52200 }, { "epoch": 1.440708256578043, "grad_norm": 1.035280704498291, "learning_rate": 2.6108292745803908e-05, "loss": 0.4778638076782227, "step": 52400 }, { "epoch": 1.4462071430535317, "grad_norm": 0.9371760487556458, "learning_rate": 2.6016222735768282e-05, "loss": 0.4937860870361328, "step": 52600 }, { "epoch": 1.4517060295290203, "grad_norm": 0.932565450668335, "learning_rate": 2.592415272573265e-05, "loss": 0.48315887451171874, "step": 52800 }, { "epoch": 1.457204916004509, "grad_norm": 1.1414536237716675, "learning_rate": 2.5832082715697016e-05, "loss": 0.48177513122558596, "step": 53000 }, { "epoch": 1.4627038024799979, "grad_norm": 1.3313400745391846, "learning_rate": 2.5740012705661387e-05, "loss": 0.4810881423950195, "step": 53200 }, { "epoch": 1.4682026889554864, "grad_norm": 0.9843188524246216, "learning_rate": 2.5647942695625754e-05, "loss": 0.48992759704589844, "step": 53400 }, { "epoch": 1.4737015754309752, "grad_norm": 1.0765944719314575, "learning_rate": 2.5555872685590128e-05, "loss": 0.48404861450195313, "step": 53600 }, { "epoch": 1.4792004619064638, "grad_norm": 0.9720175266265869, "learning_rate": 2.5463802675554492e-05, "loss": 0.48842796325683596, "step": 53800 }, { "epoch": 1.4846993483819526, "grad_norm": 0.9759963154792786, "learning_rate": 2.537173266551886e-05, "loss": 0.47752620697021486, "step": 54000 }, { "epoch": 1.4901982348574414, "grad_norm": 0.9573367834091187, "learning_rate": 2.5279662655483233e-05, "loss": 0.48062808990478517, "step": 54200 }, { "epoch": 1.4956971213329302, "grad_norm": 1.292158603668213, "learning_rate": 2.51875926454476e-05, "loss": 0.487774658203125, "step": 54400 }, { "epoch": 1.5011960078084188, "grad_norm": 1.4202347993850708, "learning_rate": 2.5095522635411968e-05, "loss": 0.4807822799682617, "step": 54600 }, { "epoch": 1.5066948942839073, "grad_norm": 1.5612984895706177, "learning_rate": 2.500345262537634e-05, "loss": 0.4789771270751953, "step": 54800 }, { "epoch": 1.5121937807593961, "grad_norm": 0.886279821395874, "learning_rate": 2.4911382615340706e-05, "loss": 0.482733268737793, "step": 55000 }, { "epoch": 1.517692667234885, "grad_norm": 1.2323397397994995, "learning_rate": 2.4819312605305076e-05, "loss": 0.48148895263671876, "step": 55200 }, { "epoch": 1.5231915537103737, "grad_norm": 1.1137135028839111, "learning_rate": 2.4727242595269447e-05, "loss": 0.48247013092041013, "step": 55400 }, { "epoch": 1.5286904401858623, "grad_norm": 1.1854609251022339, "learning_rate": 2.463517258523381e-05, "loss": 0.48267646789550783, "step": 55600 }, { "epoch": 1.534189326661351, "grad_norm": 1.1057685613632202, "learning_rate": 2.454310257519818e-05, "loss": 0.48411903381347654, "step": 55800 }, { "epoch": 1.5396882131368397, "grad_norm": 1.2663975954055786, "learning_rate": 2.4451032565162552e-05, "loss": 0.4761699295043945, "step": 56000 }, { "epoch": 1.5451870996123285, "grad_norm": 1.0173465013504028, "learning_rate": 2.435896255512692e-05, "loss": 0.48153770446777344, "step": 56200 }, { "epoch": 1.5506859860878173, "grad_norm": 1.0407702922821045, "learning_rate": 2.4266892545091287e-05, "loss": 0.4878800201416016, "step": 56400 }, { "epoch": 1.556184872563306, "grad_norm": 1.0399770736694336, "learning_rate": 2.4174822535055657e-05, "loss": 0.4796760177612305, "step": 56600 }, { "epoch": 1.5616837590387946, "grad_norm": 1.2796666622161865, "learning_rate": 2.4082752525020028e-05, "loss": 0.47880504608154295, "step": 56800 }, { "epoch": 1.5671826455142832, "grad_norm": 1.2479208707809448, "learning_rate": 2.3990682514984395e-05, "loss": 0.47731819152832033, "step": 57000 }, { "epoch": 1.572681531989772, "grad_norm": 1.1050926446914673, "learning_rate": 2.3898612504948766e-05, "loss": 0.483460693359375, "step": 57200 }, { "epoch": 1.5781804184652608, "grad_norm": 0.9544827342033386, "learning_rate": 2.3806542494913133e-05, "loss": 0.48048728942871094, "step": 57400 }, { "epoch": 1.5836793049407496, "grad_norm": 1.063852071762085, "learning_rate": 2.37144724848775e-05, "loss": 0.485230827331543, "step": 57600 }, { "epoch": 1.5891781914162382, "grad_norm": 1.1819310188293457, "learning_rate": 2.362240247484187e-05, "loss": 0.480164794921875, "step": 57800 }, { "epoch": 1.594677077891727, "grad_norm": 1.021468162536621, "learning_rate": 2.353033246480624e-05, "loss": 0.4904788589477539, "step": 58000 }, { "epoch": 1.6001759643672155, "grad_norm": 1.3577057123184204, "learning_rate": 2.343826245477061e-05, "loss": 0.48077606201171874, "step": 58200 }, { "epoch": 1.6056748508427043, "grad_norm": 1.2617197036743164, "learning_rate": 2.3346192444734976e-05, "loss": 0.4806778717041016, "step": 58400 }, { "epoch": 1.6111737373181931, "grad_norm": 1.2320860624313354, "learning_rate": 2.3254122434699347e-05, "loss": 0.4775208282470703, "step": 58600 }, { "epoch": 1.616672623793682, "grad_norm": 0.9680395126342773, "learning_rate": 2.3162052424663717e-05, "loss": 0.48886814117431643, "step": 58800 }, { "epoch": 1.6221715102691705, "grad_norm": 1.3157929182052612, "learning_rate": 2.3069982414628084e-05, "loss": 0.48573501586914064, "step": 59000 }, { "epoch": 1.627670396744659, "grad_norm": 0.900864839553833, "learning_rate": 2.297791240459245e-05, "loss": 0.48609561920166017, "step": 59200 }, { "epoch": 1.6331692832201479, "grad_norm": 1.0947906970977783, "learning_rate": 2.2885842394556822e-05, "loss": 0.4897247314453125, "step": 59400 }, { "epoch": 1.6386681696956367, "grad_norm": 0.816973865032196, "learning_rate": 2.2793772384521193e-05, "loss": 0.47951828002929686, "step": 59600 }, { "epoch": 1.6441670561711255, "grad_norm": 1.2236440181732178, "learning_rate": 2.270170237448556e-05, "loss": 0.4842032241821289, "step": 59800 }, { "epoch": 1.649665942646614, "grad_norm": 1.1023343801498413, "learning_rate": 2.2609632364449927e-05, "loss": 0.4781660461425781, "step": 60000 }, { "epoch": 1.6551648291221026, "grad_norm": 0.9589300155639648, "learning_rate": 2.2517562354414298e-05, "loss": 0.47841606140136717, "step": 60200 }, { "epoch": 1.6606637155975914, "grad_norm": 1.3003031015396118, "learning_rate": 2.242549234437867e-05, "loss": 0.48363441467285156, "step": 60400 }, { "epoch": 1.6661626020730802, "grad_norm": 0.9985244870185852, "learning_rate": 2.2333422334343036e-05, "loss": 0.48706722259521484, "step": 60600 }, { "epoch": 1.671661488548569, "grad_norm": 1.319917917251587, "learning_rate": 2.2241352324307403e-05, "loss": 0.4843954086303711, "step": 60800 }, { "epoch": 1.6771603750240578, "grad_norm": 1.3378630876541138, "learning_rate": 2.2149282314271774e-05, "loss": 0.48122127532958986, "step": 61000 }, { "epoch": 1.6826592614995464, "grad_norm": 1.0471312999725342, "learning_rate": 2.205721230423614e-05, "loss": 0.48413547515869143, "step": 61200 }, { "epoch": 1.688158147975035, "grad_norm": 1.0439791679382324, "learning_rate": 2.196514229420051e-05, "loss": 0.48604167938232423, "step": 61400 }, { "epoch": 1.6936570344505237, "grad_norm": 0.9854567050933838, "learning_rate": 2.187307228416488e-05, "loss": 0.4817595291137695, "step": 61600 }, { "epoch": 1.6991559209260125, "grad_norm": 1.1079517602920532, "learning_rate": 2.178100227412925e-05, "loss": 0.48393955230712893, "step": 61800 }, { "epoch": 1.7046548074015013, "grad_norm": 1.1403529644012451, "learning_rate": 2.1688932264093617e-05, "loss": 0.47360748291015625, "step": 62000 }, { "epoch": 1.71015369387699, "grad_norm": 0.8809356689453125, "learning_rate": 2.1596862254057987e-05, "loss": 0.47694496154785154, "step": 62200 }, { "epoch": 1.7156525803524785, "grad_norm": 0.9528295993804932, "learning_rate": 2.1504792244022358e-05, "loss": 0.4844463348388672, "step": 62400 }, { "epoch": 1.7211514668279673, "grad_norm": 1.0902634859085083, "learning_rate": 2.1412722233986722e-05, "loss": 0.47806488037109374, "step": 62600 }, { "epoch": 1.726650353303456, "grad_norm": 1.0174310207366943, "learning_rate": 2.1320652223951093e-05, "loss": 0.48461170196533204, "step": 62800 }, { "epoch": 1.7321492397789449, "grad_norm": 1.1780657768249512, "learning_rate": 2.1228582213915463e-05, "loss": 0.4868865203857422, "step": 63000 }, { "epoch": 1.7376481262544334, "grad_norm": 1.257879614830017, "learning_rate": 2.1136512203879834e-05, "loss": 0.4772517776489258, "step": 63200 }, { "epoch": 1.7431470127299222, "grad_norm": 2.5110182762145996, "learning_rate": 2.10444421938442e-05, "loss": 0.48027557373046875, "step": 63400 }, { "epoch": 1.7486458992054108, "grad_norm": 1.061119556427002, "learning_rate": 2.0952372183808568e-05, "loss": 0.4825307846069336, "step": 63600 }, { "epoch": 1.7541447856808996, "grad_norm": 1.3090649843215942, "learning_rate": 2.086030217377294e-05, "loss": 0.4777912902832031, "step": 63800 }, { "epoch": 1.7596436721563884, "grad_norm": 0.8455436825752258, "learning_rate": 2.0768232163737306e-05, "loss": 0.4868216705322266, "step": 64000 }, { "epoch": 1.7651425586318772, "grad_norm": 1.1341484785079956, "learning_rate": 2.0676162153701677e-05, "loss": 0.4825804901123047, "step": 64200 }, { "epoch": 1.7706414451073658, "grad_norm": 0.9106566905975342, "learning_rate": 2.0584092143666044e-05, "loss": 0.480031852722168, "step": 64400 }, { "epoch": 1.7761403315828543, "grad_norm": 0.8978875279426575, "learning_rate": 2.0492022133630415e-05, "loss": 0.48035388946533203, "step": 64600 }, { "epoch": 1.7816392180583431, "grad_norm": 1.508074164390564, "learning_rate": 2.0399952123594782e-05, "loss": 0.4823148727416992, "step": 64800 }, { "epoch": 1.787138104533832, "grad_norm": 1.0851056575775146, "learning_rate": 2.0307882113559153e-05, "loss": 0.4738383102416992, "step": 65000 }, { "epoch": 1.7926369910093207, "grad_norm": 1.0651288032531738, "learning_rate": 2.021581210352352e-05, "loss": 0.4777484130859375, "step": 65200 }, { "epoch": 1.7981358774848093, "grad_norm": 1.3095803260803223, "learning_rate": 2.0123742093487887e-05, "loss": 0.48325523376464846, "step": 65400 }, { "epoch": 1.803634763960298, "grad_norm": 1.1658202409744263, "learning_rate": 2.0031672083452258e-05, "loss": 0.4814822769165039, "step": 65600 }, { "epoch": 1.8091336504357867, "grad_norm": 0.974337637424469, "learning_rate": 1.9939602073416628e-05, "loss": 0.47399234771728516, "step": 65800 }, { "epoch": 1.8146325369112755, "grad_norm": 0.914979875087738, "learning_rate": 1.9847532063380995e-05, "loss": 0.48681838989257814, "step": 66000 }, { "epoch": 1.8201314233867643, "grad_norm": 0.7990674376487732, "learning_rate": 1.9755462053345363e-05, "loss": 0.47843902587890624, "step": 66200 }, { "epoch": 1.825630309862253, "grad_norm": 1.2652182579040527, "learning_rate": 1.9663392043309733e-05, "loss": 0.4840336990356445, "step": 66400 }, { "epoch": 1.8311291963377416, "grad_norm": 0.9367465376853943, "learning_rate": 1.9571322033274104e-05, "loss": 0.48031715393066404, "step": 66600 }, { "epoch": 1.8366280828132302, "grad_norm": 0.9445034861564636, "learning_rate": 1.947925202323847e-05, "loss": 0.48153636932373045, "step": 66800 }, { "epoch": 1.842126969288719, "grad_norm": 1.062595009803772, "learning_rate": 1.938718201320284e-05, "loss": 0.4798342514038086, "step": 67000 }, { "epoch": 1.8476258557642078, "grad_norm": 1.0887633562088013, "learning_rate": 1.929511200316721e-05, "loss": 0.4826160430908203, "step": 67200 }, { "epoch": 1.8531247422396966, "grad_norm": 1.4558460712432861, "learning_rate": 1.920304199313158e-05, "loss": 0.48820636749267576, "step": 67400 }, { "epoch": 1.8586236287151852, "grad_norm": 0.9983727931976318, "learning_rate": 1.9110971983095947e-05, "loss": 0.4826961135864258, "step": 67600 }, { "epoch": 1.8641225151906737, "grad_norm": 0.9502201676368713, "learning_rate": 1.9018901973060314e-05, "loss": 0.4772541046142578, "step": 67800 }, { "epoch": 1.8696214016661625, "grad_norm": 0.9462329149246216, "learning_rate": 1.8926831963024685e-05, "loss": 0.4827272415161133, "step": 68000 }, { "epoch": 1.8751202881416513, "grad_norm": 1.2585595846176147, "learning_rate": 1.8834761952989056e-05, "loss": 0.48325294494628906, "step": 68200 }, { "epoch": 1.8806191746171401, "grad_norm": 1.0165777206420898, "learning_rate": 1.8742691942953423e-05, "loss": 0.4868499755859375, "step": 68400 }, { "epoch": 1.8861180610926287, "grad_norm": 1.1448917388916016, "learning_rate": 1.8650621932917793e-05, "loss": 0.47457069396972656, "step": 68600 }, { "epoch": 1.8916169475681175, "grad_norm": 0.9723443984985352, "learning_rate": 1.855855192288216e-05, "loss": 0.4808235168457031, "step": 68800 }, { "epoch": 1.897115834043606, "grad_norm": 1.8042104244232178, "learning_rate": 1.8466481912846528e-05, "loss": 0.4818389892578125, "step": 69000 }, { "epoch": 1.9026147205190949, "grad_norm": 1.1425598859786987, "learning_rate": 1.83744119028109e-05, "loss": 0.47744728088378907, "step": 69200 }, { "epoch": 1.9081136069945837, "grad_norm": 1.3648266792297363, "learning_rate": 1.828234189277527e-05, "loss": 0.47696762084960936, "step": 69400 }, { "epoch": 1.9136124934700725, "grad_norm": 1.2545722723007202, "learning_rate": 1.8190271882739636e-05, "loss": 0.4733824920654297, "step": 69600 }, { "epoch": 1.919111379945561, "grad_norm": 1.1813223361968994, "learning_rate": 1.8098201872704004e-05, "loss": 0.4728484344482422, "step": 69800 }, { "epoch": 1.9246102664210496, "grad_norm": 1.2796030044555664, "learning_rate": 1.8006131862668374e-05, "loss": 0.4804762649536133, "step": 70000 }, { "epoch": 1.9301091528965384, "grad_norm": 1.3735687732696533, "learning_rate": 1.7914061852632745e-05, "loss": 0.4790033721923828, "step": 70200 }, { "epoch": 1.9356080393720272, "grad_norm": 1.2554829120635986, "learning_rate": 1.7821991842597112e-05, "loss": 0.48504138946533204, "step": 70400 }, { "epoch": 1.941106925847516, "grad_norm": 1.08273184299469, "learning_rate": 1.772992183256148e-05, "loss": 0.4772909545898438, "step": 70600 }, { "epoch": 1.9466058123230046, "grad_norm": 0.6954657435417175, "learning_rate": 1.763785182252585e-05, "loss": 0.49507545471191405, "step": 70800 }, { "epoch": 1.9521046987984934, "grad_norm": 1.014246940612793, "learning_rate": 1.754578181249022e-05, "loss": 0.4824806213378906, "step": 71000 }, { "epoch": 1.957603585273982, "grad_norm": 1.005923867225647, "learning_rate": 1.7453711802454588e-05, "loss": 0.4811605453491211, "step": 71200 }, { "epoch": 1.9631024717494707, "grad_norm": 1.1930160522460938, "learning_rate": 1.7361641792418955e-05, "loss": 0.4723471450805664, "step": 71400 }, { "epoch": 1.9686013582249595, "grad_norm": 1.132750153541565, "learning_rate": 1.7269571782383326e-05, "loss": 0.4810772323608398, "step": 71600 }, { "epoch": 1.9741002447004483, "grad_norm": 1.2968944311141968, "learning_rate": 1.7177501772347693e-05, "loss": 0.47881488800048827, "step": 71800 }, { "epoch": 1.9795991311759369, "grad_norm": 1.342724084854126, "learning_rate": 1.7085431762312064e-05, "loss": 0.4765338134765625, "step": 72000 }, { "epoch": 1.9850980176514255, "grad_norm": 1.0654747486114502, "learning_rate": 1.699336175227643e-05, "loss": 0.4823237228393555, "step": 72200 }, { "epoch": 1.9905969041269143, "grad_norm": 1.0994575023651123, "learning_rate": 1.69012917422408e-05, "loss": 0.48160724639892577, "step": 72400 }, { "epoch": 1.996095790602403, "grad_norm": 1.0896570682525635, "learning_rate": 1.680922173220517e-05, "loss": 0.4756970977783203, "step": 72600 }, { "epoch": 2.0, "eval_loss": 0.4625195264816284, "eval_runtime": 158.6666, "eval_samples_per_second": 407.515, "eval_steps_per_second": 25.475, "step": 72742 } ], "logging_steps": 200, "max_steps": 109113, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.875850713799066e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }