{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999383363137448, "eval_steps": 500, "global_step": 8108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006166368625516433, "grad_norm": 16.689239750860494, "learning_rate": 4.0983606557377046e-08, "loss": 1.618, "step": 50 }, { "epoch": 0.012332737251032866, "grad_norm": 17.403265975850882, "learning_rate": 8.196721311475409e-08, "loss": 1.6045, "step": 100 }, { "epoch": 0.0184991058765493, "grad_norm": 14.749335037473138, "learning_rate": 1.2295081967213116e-07, "loss": 1.5032, "step": 150 }, { "epoch": 0.024665474502065732, "grad_norm": 8.133780120015619, "learning_rate": 1.6393442622950818e-07, "loss": 1.3224, "step": 200 }, { "epoch": 0.030831843127582168, "grad_norm": 5.828403504971832, "learning_rate": 1.9999971273346704e-07, "loss": 1.1032, "step": 250 }, { "epoch": 0.0369982117530986, "grad_norm": 3.9432862715914587, "learning_rate": 1.9997497692480678e-07, "loss": 0.9673, "step": 300 }, { "epoch": 0.043164580378615036, "grad_norm": 3.4359051761562025, "learning_rate": 1.9991035427741063e-07, "loss": 0.8524, "step": 350 }, { "epoch": 0.049330949004131465, "grad_norm": 3.058893644982214, "learning_rate": 1.9980587057366126e-07, "loss": 0.7977, "step": 400 }, { "epoch": 0.0554973176296479, "grad_norm": 2.780325971119289, "learning_rate": 1.9966156749923613e-07, "loss": 0.7693, "step": 450 }, { "epoch": 0.061663686255164336, "grad_norm": 2.9531788670760784, "learning_rate": 1.994775026264762e-07, "loss": 0.7432, "step": 500 }, { "epoch": 0.06783005488068077, "grad_norm": 3.2391609392351692, "learning_rate": 1.9925374939141637e-07, "loss": 0.7299, "step": 550 }, { "epoch": 0.0739964235061972, "grad_norm": 2.963688989960738, "learning_rate": 1.9899039706448692e-07, "loss": 0.7002, "step": 600 }, { "epoch": 0.08016279213171364, "grad_norm": 2.880854595097375, "learning_rate": 1.9868755071489728e-07, "loss": 0.6761, "step": 650 }, { "epoch": 0.08632916075723007, "grad_norm": 2.6331858272282904, "learning_rate": 1.98345331168717e-07, "loss": 0.6753, "step": 700 }, { "epoch": 0.0924955293827465, "grad_norm": 3.166081780603538, "learning_rate": 1.9796387496066975e-07, "loss": 0.6627, "step": 750 }, { "epoch": 0.09866189800826293, "grad_norm": 3.2004691707941215, "learning_rate": 1.975433342796604e-07, "loss": 0.6398, "step": 800 }, { "epoch": 0.10482826663377937, "grad_norm": 3.4439116163641534, "learning_rate": 1.9708387690805658e-07, "loss": 0.643, "step": 850 }, { "epoch": 0.1109946352592958, "grad_norm": 2.7959822715068237, "learning_rate": 1.965856861547486e-07, "loss": 0.6299, "step": 900 }, { "epoch": 0.11716100388481224, "grad_norm": 2.8720786800067133, "learning_rate": 1.960489607820153e-07, "loss": 0.6156, "step": 950 }, { "epoch": 0.12332737251032867, "grad_norm": 2.8998981058217512, "learning_rate": 1.9547391492622407e-07, "loss": 0.6045, "step": 1000 }, { "epoch": 0.1294937411358451, "grad_norm": 2.937285416581705, "learning_rate": 1.9486077801239723e-07, "loss": 0.604, "step": 1050 }, { "epoch": 0.13566010976136153, "grad_norm": 2.9265674908029258, "learning_rate": 1.9420979466267888e-07, "loss": 0.5918, "step": 1100 }, { "epoch": 0.14182647838687798, "grad_norm": 3.1223514523834224, "learning_rate": 1.9352122459873818e-07, "loss": 0.5857, "step": 1150 }, { "epoch": 0.1479928470123944, "grad_norm": 2.991244269539233, "learning_rate": 1.9279534253814899e-07, "loss": 0.5797, "step": 1200 }, { "epoch": 0.15415921563791082, "grad_norm": 2.9531460521405313, "learning_rate": 1.9203243808478597e-07, "loss": 0.583, "step": 1250 }, { "epoch": 0.16032558426342727, "grad_norm": 2.9620910098760174, "learning_rate": 1.9123281561328205e-07, "loss": 0.5647, "step": 1300 }, { "epoch": 0.1664919528889437, "grad_norm": 2.8183125229693333, "learning_rate": 1.9039679414759247e-07, "loss": 0.5675, "step": 1350 }, { "epoch": 0.17265832151446014, "grad_norm": 3.029807143662261, "learning_rate": 1.8952470723371465e-07, "loss": 0.5669, "step": 1400 }, { "epoch": 0.17882469013997657, "grad_norm": 3.323729247650118, "learning_rate": 1.886169028066135e-07, "loss": 0.5579, "step": 1450 }, { "epoch": 0.184991058765493, "grad_norm": 2.9853732307969123, "learning_rate": 1.8767374305140678e-07, "loss": 0.5578, "step": 1500 }, { "epoch": 0.19115742739100944, "grad_norm": 2.8289118535370226, "learning_rate": 1.8669560425886458e-07, "loss": 0.5565, "step": 1550 }, { "epoch": 0.19732379601652586, "grad_norm": 3.107927650244337, "learning_rate": 1.8568287667528136e-07, "loss": 0.5482, "step": 1600 }, { "epoch": 0.2034901646420423, "grad_norm": 2.9356477568984474, "learning_rate": 1.846359643467799e-07, "loss": 0.5493, "step": 1650 }, { "epoch": 0.20965653326755873, "grad_norm": 2.8886483110859706, "learning_rate": 1.8355528495811004e-07, "loss": 0.5441, "step": 1700 }, { "epoch": 0.21582290189307518, "grad_norm": 3.043206189340112, "learning_rate": 1.8244126966600537e-07, "loss": 0.5309, "step": 1750 }, { "epoch": 0.2219892705185916, "grad_norm": 2.84058010312111, "learning_rate": 1.8129436292716576e-07, "loss": 0.5281, "step": 1800 }, { "epoch": 0.22815563914410802, "grad_norm": 3.2658177471645793, "learning_rate": 1.8011502232093294e-07, "loss": 0.5219, "step": 1850 }, { "epoch": 0.23432200776962447, "grad_norm": 2.838918099928717, "learning_rate": 1.7890371836673115e-07, "loss": 0.5164, "step": 1900 }, { "epoch": 0.2404883763951409, "grad_norm": 3.3022219294232222, "learning_rate": 1.7766093433634462e-07, "loss": 0.524, "step": 1950 }, { "epoch": 0.24665474502065735, "grad_norm": 3.5602190680329637, "learning_rate": 1.7638716606110768e-07, "loss": 0.509, "step": 2000 }, { "epoch": 0.25282111364617377, "grad_norm": 3.0096527122431973, "learning_rate": 1.7508292173408366e-07, "loss": 0.5193, "step": 2050 }, { "epoch": 0.2589874822716902, "grad_norm": 3.3687126826867044, "learning_rate": 1.7374872170731205e-07, "loss": 0.5186, "step": 2100 }, { "epoch": 0.2651538508972066, "grad_norm": 3.1979358817347734, "learning_rate": 1.7238509828420468e-07, "loss": 0.5081, "step": 2150 }, { "epoch": 0.27132021952272306, "grad_norm": 2.7782410513777207, "learning_rate": 1.709925955071734e-07, "loss": 0.5046, "step": 2200 }, { "epoch": 0.2774865881482395, "grad_norm": 3.0956007198543376, "learning_rate": 1.6957176894057456e-07, "loss": 0.5067, "step": 2250 }, { "epoch": 0.28365295677375596, "grad_norm": 2.883657997016742, "learning_rate": 1.681231854490565e-07, "loss": 0.5034, "step": 2300 }, { "epoch": 0.28981932539927235, "grad_norm": 3.037713494095377, "learning_rate": 1.6664742297139842e-07, "loss": 0.5017, "step": 2350 }, { "epoch": 0.2959856940247888, "grad_norm": 2.7886707641373856, "learning_rate": 1.6514507028993141e-07, "loss": 0.5074, "step": 2400 }, { "epoch": 0.30215206265030525, "grad_norm": 3.0522520309780665, "learning_rate": 1.636167267956328e-07, "loss": 0.504, "step": 2450 }, { "epoch": 0.30831843127582165, "grad_norm": 2.9917653849017967, "learning_rate": 1.620630022489884e-07, "loss": 0.492, "step": 2500 }, { "epoch": 0.3144847999013381, "grad_norm": 3.374780491495851, "learning_rate": 1.604845165367171e-07, "loss": 0.5012, "step": 2550 }, { "epoch": 0.32065116852685455, "grad_norm": 3.054737104124034, "learning_rate": 1.588818994244563e-07, "loss": 0.4961, "step": 2600 }, { "epoch": 0.326817537152371, "grad_norm": 3.1630826680292037, "learning_rate": 1.5725579030550487e-07, "loss": 0.4986, "step": 2650 }, { "epoch": 0.3329839057778874, "grad_norm": 2.787165502227459, "learning_rate": 1.5560683794572599e-07, "loss": 0.5005, "step": 2700 }, { "epoch": 0.33915027440340384, "grad_norm": 3.2159871448935853, "learning_rate": 1.5393570022470996e-07, "loss": 0.4912, "step": 2750 }, { "epoch": 0.3453166430289203, "grad_norm": 2.921398178739714, "learning_rate": 1.5224304387330113e-07, "loss": 0.4873, "step": 2800 }, { "epoch": 0.3514830116544367, "grad_norm": 3.033201824114291, "learning_rate": 1.505295442075936e-07, "loss": 0.4848, "step": 2850 }, { "epoch": 0.35764938027995313, "grad_norm": 3.126845883000846, "learning_rate": 1.4879588485950154e-07, "loss": 0.4761, "step": 2900 }, { "epoch": 0.3638157489054696, "grad_norm": 2.899612235662964, "learning_rate": 1.4704275750401168e-07, "loss": 0.4731, "step": 2950 }, { "epoch": 0.369982117530986, "grad_norm": 2.78803166053557, "learning_rate": 1.45270861583227e-07, "loss": 0.4751, "step": 3000 }, { "epoch": 0.3761484861565024, "grad_norm": 3.217869780099078, "learning_rate": 1.4348090402731177e-07, "loss": 0.4833, "step": 3050 }, { "epoch": 0.3823148547820189, "grad_norm": 2.98388792612514, "learning_rate": 1.416735989724485e-07, "loss": 0.4768, "step": 3100 }, { "epoch": 0.3884812234075353, "grad_norm": 3.095979105793261, "learning_rate": 1.3984966747592066e-07, "loss": 0.4781, "step": 3150 }, { "epoch": 0.3946475920330517, "grad_norm": 2.8570658991316944, "learning_rate": 1.380098372284335e-07, "loss": 0.47, "step": 3200 }, { "epoch": 0.40081396065856817, "grad_norm": 2.913522526116864, "learning_rate": 1.3615484226378866e-07, "loss": 0.4761, "step": 3250 }, { "epoch": 0.4069803292840846, "grad_norm": 3.079167327659028, "learning_rate": 1.3428542266602808e-07, "loss": 0.4691, "step": 3300 }, { "epoch": 0.413146697909601, "grad_norm": 3.167335424827754, "learning_rate": 1.3240232427416377e-07, "loss": 0.4762, "step": 3350 }, { "epoch": 0.41931306653511746, "grad_norm": 2.951805565284142, "learning_rate": 1.3050629838461213e-07, "loss": 0.4743, "step": 3400 }, { "epoch": 0.4254794351606339, "grad_norm": 3.344274691992938, "learning_rate": 1.285981014514501e-07, "loss": 0.4651, "step": 3450 }, { "epoch": 0.43164580378615036, "grad_norm": 3.134003729646922, "learning_rate": 1.2667849478461436e-07, "loss": 0.474, "step": 3500 }, { "epoch": 0.43781217241166676, "grad_norm": 3.000847232186744, "learning_rate": 1.2474824424616271e-07, "loss": 0.4729, "step": 3550 }, { "epoch": 0.4439785410371832, "grad_norm": 2.836288640859743, "learning_rate": 1.228081199447195e-07, "loss": 0.4632, "step": 3600 }, { "epoch": 0.45014490966269965, "grad_norm": 3.308502889653925, "learning_rate": 1.2085889592822667e-07, "loss": 0.4601, "step": 3650 }, { "epoch": 0.45631127828821605, "grad_norm": 2.746002613176513, "learning_rate": 1.1890134987512341e-07, "loss": 0.467, "step": 3700 }, { "epoch": 0.4624776469137325, "grad_norm": 3.24735950823672, "learning_rate": 1.1693626278407694e-07, "loss": 0.4617, "step": 3750 }, { "epoch": 0.46864401553924895, "grad_norm": 3.0607507540260075, "learning_rate": 1.1496441866238905e-07, "loss": 0.4569, "step": 3800 }, { "epoch": 0.47481038416476534, "grad_norm": 2.9943145563385998, "learning_rate": 1.1298660421320194e-07, "loss": 0.4619, "step": 3850 }, { "epoch": 0.4809767527902818, "grad_norm": 3.1612704244607177, "learning_rate": 1.1100360852162888e-07, "loss": 0.4637, "step": 3900 }, { "epoch": 0.48714312141579824, "grad_norm": 3.1449471984877055, "learning_rate": 1.0901622273993417e-07, "loss": 0.4701, "step": 3950 }, { "epoch": 0.4933094900413147, "grad_norm": 3.0699714321899387, "learning_rate": 1.070252397718884e-07, "loss": 0.4558, "step": 4000 }, { "epoch": 0.4994758586668311, "grad_norm": 3.4589428619371834, "learning_rate": 1.0503145395642541e-07, "loss": 0.4599, "step": 4050 }, { "epoch": 0.5056422272923475, "grad_norm": 3.0848999815662674, "learning_rate": 1.0303566075072598e-07, "loss": 0.4558, "step": 4100 }, { "epoch": 0.511808595917864, "grad_norm": 2.947163992749446, "learning_rate": 1.0103865641285583e-07, "loss": 0.457, "step": 4150 }, { "epoch": 0.5179749645433804, "grad_norm": 3.5673363307250927, "learning_rate": 9.904123768408389e-08, "loss": 0.4575, "step": 4200 }, { "epoch": 0.5241413331688969, "grad_norm": 3.07648492625604, "learning_rate": 9.704420147100796e-08, "loss": 0.4528, "step": 4250 }, { "epoch": 0.5303077017944132, "grad_norm": 3.2080853332983907, "learning_rate": 9.504834452761424e-08, "loss": 0.455, "step": 4300 }, { "epoch": 0.5364740704199297, "grad_norm": 2.952611892786328, "learning_rate": 9.305446313739767e-08, "loss": 0.4472, "step": 4350 }, { "epoch": 0.5426404390454461, "grad_norm": 3.005908121136174, "learning_rate": 9.106335279567037e-08, "loss": 0.4516, "step": 4400 }, { "epoch": 0.5488068076709626, "grad_norm": 2.822428791661921, "learning_rate": 8.907580789218414e-08, "loss": 0.4528, "step": 4450 }, { "epoch": 0.554973176296479, "grad_norm": 2.8673595096457465, "learning_rate": 8.709262139419424e-08, "loss": 0.4536, "step": 4500 }, { "epoch": 0.5611395449219955, "grad_norm": 3.264575792740317, "learning_rate": 8.511458453009065e-08, "loss": 0.4524, "step": 4550 }, { "epoch": 0.5673059135475119, "grad_norm": 3.3129039957771806, "learning_rate": 8.314248647372302e-08, "loss": 0.4467, "step": 4600 }, { "epoch": 0.5734722821730283, "grad_norm": 3.083187238173955, "learning_rate": 8.117711402954554e-08, "loss": 0.4488, "step": 4650 }, { "epoch": 0.5796386507985447, "grad_norm": 3.1488830656848, "learning_rate": 7.921925131870672e-08, "loss": 0.4579, "step": 4700 }, { "epoch": 0.5858050194240612, "grad_norm": 2.9768313706421874, "learning_rate": 7.726967946621029e-08, "loss": 0.4481, "step": 4750 }, { "epoch": 0.5919713880495776, "grad_norm": 3.0236276200137486, "learning_rate": 7.532917628927079e-08, "loss": 0.4529, "step": 4800 }, { "epoch": 0.5981377566750941, "grad_norm": 3.0681593760022285, "learning_rate": 7.339851598698955e-08, "loss": 0.4527, "step": 4850 }, { "epoch": 0.6043041253006105, "grad_norm": 3.2203600426157495, "learning_rate": 7.147846883147362e-08, "loss": 0.4473, "step": 4900 }, { "epoch": 0.610470493926127, "grad_norm": 3.127241727234972, "learning_rate": 6.956980086052184e-08, "loss": 0.4536, "step": 4950 }, { "epoch": 0.6166368625516433, "grad_norm": 3.215958029153526, "learning_rate": 6.76732735719999e-08, "loss": 0.4505, "step": 5000 }, { "epoch": 0.6228032311771597, "grad_norm": 2.8220120121880936, "learning_rate": 6.578964362002715e-08, "loss": 0.4514, "step": 5050 }, { "epoch": 0.6289695998026762, "grad_norm": 3.0284791997521054, "learning_rate": 6.391966251309539e-08, "loss": 0.4458, "step": 5100 }, { "epoch": 0.6351359684281926, "grad_norm": 3.371033810021987, "learning_rate": 6.206407631424109e-08, "loss": 0.4446, "step": 5150 }, { "epoch": 0.6413023370537091, "grad_norm": 3.122281628753462, "learning_rate": 6.02236253433898e-08, "loss": 0.4473, "step": 5200 }, { "epoch": 0.6474687056792255, "grad_norm": 2.7353573500503074, "learning_rate": 5.8399043881992104e-08, "loss": 0.4399, "step": 5250 }, { "epoch": 0.653635074304742, "grad_norm": 3.194859384027796, "learning_rate": 5.659105988006851e-08, "loss": 0.4499, "step": 5300 }, { "epoch": 0.6598014429302583, "grad_norm": 2.8707279633921194, "learning_rate": 5.480039466578079e-08, "loss": 0.453, "step": 5350 }, { "epoch": 0.6659678115557748, "grad_norm": 3.313196070466103, "learning_rate": 5.3027762657644745e-08, "loss": 0.4433, "step": 5400 }, { "epoch": 0.6721341801812912, "grad_norm": 2.9398335243680056, "learning_rate": 5.1273871079499986e-08, "loss": 0.447, "step": 5450 }, { "epoch": 0.6783005488068077, "grad_norm": 2.9069645999783726, "learning_rate": 4.9539419678350103e-08, "loss": 0.4424, "step": 5500 }, { "epoch": 0.6844669174323241, "grad_norm": 3.0807794080344744, "learning_rate": 4.7825100445185904e-08, "loss": 0.4502, "step": 5550 }, { "epoch": 0.6906332860578406, "grad_norm": 3.1238439553913913, "learning_rate": 4.613159733890279e-08, "loss": 0.4371, "step": 5600 }, { "epoch": 0.6967996546833569, "grad_norm": 2.894912373492253, "learning_rate": 4.445958601342321e-08, "loss": 0.4466, "step": 5650 }, { "epoch": 0.7029660233088734, "grad_norm": 3.1861168079620352, "learning_rate": 4.280973354813196e-08, "loss": 0.4452, "step": 5700 }, { "epoch": 0.7091323919343898, "grad_norm": 3.218055671881565, "learning_rate": 4.118269818173283e-08, "loss": 0.4335, "step": 5750 }, { "epoch": 0.7152987605599063, "grad_norm": 3.7156552734894177, "learning_rate": 3.957912904963225e-08, "loss": 0.4482, "step": 5800 }, { "epoch": 0.7214651291854227, "grad_norm": 3.248126042161764, "learning_rate": 3.7999665924954815e-08, "loss": 0.4407, "step": 5850 }, { "epoch": 0.7276314978109392, "grad_norm": 3.37840785837335, "learning_rate": 3.64449389632943e-08, "loss": 0.4421, "step": 5900 }, { "epoch": 0.7337978664364556, "grad_norm": 2.919668888292714, "learning_rate": 3.491556845130147e-08, "loss": 0.4358, "step": 5950 }, { "epoch": 0.739964235061972, "grad_norm": 3.449594279809231, "learning_rate": 3.3412164559209485e-08, "loss": 0.4393, "step": 6000 }, { "epoch": 0.7461306036874884, "grad_norm": 3.0449172482636713, "learning_rate": 3.193532709739534e-08, "loss": 0.443, "step": 6050 }, { "epoch": 0.7522969723130049, "grad_norm": 2.9659035390086603, "learning_rate": 3.048564527707457e-08, "loss": 0.4541, "step": 6100 }, { "epoch": 0.7584633409385213, "grad_norm": 3.0426691033458266, "learning_rate": 2.9063697475224736e-08, "loss": 0.4411, "step": 6150 }, { "epoch": 0.7646297095640378, "grad_norm": 3.1254929066925543, "learning_rate": 2.767005100383143e-08, "loss": 0.4466, "step": 6200 }, { "epoch": 0.7707960781895542, "grad_norm": 3.059948610503461, "learning_rate": 2.6305261883548624e-08, "loss": 0.4501, "step": 6250 }, { "epoch": 0.7769624468150707, "grad_norm": 3.178741971582532, "learning_rate": 2.4969874621864373e-08, "loss": 0.4405, "step": 6300 }, { "epoch": 0.783128815440587, "grad_norm": 3.178300180527373, "learning_rate": 2.3664421995859463e-08, "loss": 0.4499, "step": 6350 }, { "epoch": 0.7892951840661034, "grad_norm": 3.003275204473159, "learning_rate": 2.2389424839646286e-08, "loss": 0.4399, "step": 6400 }, { "epoch": 0.7954615526916199, "grad_norm": 3.420014772222019, "learning_rate": 2.114539183657268e-08, "loss": 0.4352, "step": 6450 }, { "epoch": 0.8016279213171363, "grad_norm": 2.971627106875043, "learning_rate": 1.9932819316273307e-08, "loss": 0.4382, "step": 6500 }, { "epoch": 0.8077942899426528, "grad_norm": 3.4422374871537533, "learning_rate": 1.8752191056650023e-08, "loss": 0.4377, "step": 6550 }, { "epoch": 0.8139606585681692, "grad_norm": 3.053124764133182, "learning_rate": 1.7603978090859794e-08, "loss": 0.4442, "step": 6600 }, { "epoch": 0.8201270271936857, "grad_norm": 3.1086937331605613, "learning_rate": 1.6488638519387478e-08, "loss": 0.4466, "step": 6650 }, { "epoch": 0.826293395819202, "grad_norm": 3.4399676514136193, "learning_rate": 1.5406617327278205e-08, "loss": 0.4326, "step": 6700 }, { "epoch": 0.8324597644447185, "grad_norm": 2.8487398222000744, "learning_rate": 1.4358346206602612e-08, "loss": 0.4422, "step": 6750 }, { "epoch": 0.8386261330702349, "grad_norm": 2.9651774336393726, "learning_rate": 1.334424338422534e-08, "loss": 0.4305, "step": 6800 }, { "epoch": 0.8447925016957514, "grad_norm": 3.4279808291982556, "learning_rate": 1.236471345494583e-08, "loss": 0.4386, "step": 6850 }, { "epoch": 0.8509588703212678, "grad_norm": 3.298847289113035, "learning_rate": 1.1420147220077847e-08, "loss": 0.4425, "step": 6900 }, { "epoch": 0.8571252389467843, "grad_norm": 3.199726112913922, "learning_rate": 1.0510921531532192e-08, "loss": 0.4339, "step": 6950 }, { "epoch": 0.8632916075723007, "grad_norm": 3.3865902484127637, "learning_rate": 9.63739914146473e-09, "loss": 0.426, "step": 7000 }, { "epoch": 0.8694579761978171, "grad_norm": 3.080132950484914, "learning_rate": 8.799928557549863e-09, "loss": 0.4437, "step": 7050 }, { "epoch": 0.8756243448233335, "grad_norm": 3.2441647152844526, "learning_rate": 7.998843903936992e-09, "loss": 0.4338, "step": 7100 }, { "epoch": 0.88179071344885, "grad_norm": 2.861131038634973, "learning_rate": 7.2344647879456265e-09, "loss": 0.4363, "step": 7150 }, { "epoch": 0.8879570820743664, "grad_norm": 3.131842173102097, "learning_rate": 6.507096172552195e-09, "loss": 0.4333, "step": 7200 }, { "epoch": 0.8941234506998829, "grad_norm": 3.1735067730802604, "learning_rate": 5.817028254719536e-09, "loss": 0.4395, "step": 7250 }, { "epoch": 0.9002898193253993, "grad_norm": 2.941305225791783, "learning_rate": 5.164536349617532e-09, "loss": 0.4418, "step": 7300 }, { "epoch": 0.9064561879509158, "grad_norm": 3.1369522496788806, "learning_rate": 4.5498807807811015e-09, "loss": 0.4413, "step": 7350 }, { "epoch": 0.9126225565764321, "grad_norm": 3.10250834762718, "learning_rate": 3.973306776249341e-09, "loss": 0.4316, "step": 7400 }, { "epoch": 0.9187889252019485, "grad_norm": 3.113181559222609, "learning_rate": 3.4350443707274135e-09, "loss": 0.4391, "step": 7450 }, { "epoch": 0.924955293827465, "grad_norm": 3.2125045204581424, "learning_rate": 2.9353083138099256e-09, "loss": 0.4453, "step": 7500 }, { "epoch": 0.9311216624529814, "grad_norm": 3.3021789642945008, "learning_rate": 2.474297984302709e-09, "loss": 0.4404, "step": 7550 }, { "epoch": 0.9372880310784979, "grad_norm": 3.3940858957593223, "learning_rate": 2.0521973106770285e-09, "loss": 0.4387, "step": 7600 }, { "epoch": 0.9434543997040143, "grad_norm": 2.8912916713122763, "learning_rate": 1.6691746976879028e-09, "loss": 0.4396, "step": 7650 }, { "epoch": 0.9496207683295307, "grad_norm": 3.437089198963669, "learning_rate": 1.3253829591860387e-09, "loss": 0.4375, "step": 7700 }, { "epoch": 0.9557871369550471, "grad_norm": 3.138591510137561, "learning_rate": 1.0209592571498892e-09, "loss": 0.432, "step": 7750 }, { "epoch": 0.9619535055805636, "grad_norm": 3.0903316485258783, "learning_rate": 7.560250469624385e-10, "loss": 0.4381, "step": 7800 }, { "epoch": 0.96811987420608, "grad_norm": 3.1363149734033233, "learning_rate": 5.306860289543413e-10, "loss": 0.4432, "step": 7850 }, { "epoch": 0.9742862428315965, "grad_norm": 3.143737684684351, "learning_rate": 3.450321062328232e-10, "loss": 0.4334, "step": 7900 }, { "epoch": 0.9804526114571129, "grad_norm": 2.8627388485987444, "learning_rate": 1.9913734881326083e-10, "loss": 0.4372, "step": 7950 }, { "epoch": 0.9866189800826294, "grad_norm": 3.044456688116337, "learning_rate": 9.305996406754335e-11, "loss": 0.4376, "step": 8000 }, { "epoch": 0.9927853487081457, "grad_norm": 2.9030301879677096, "learning_rate": 2.6842273501193058e-11, "loss": 0.4348, "step": 8050 }, { "epoch": 0.9989517173336622, "grad_norm": 3.2924748739223664, "learning_rate": 5.10695868449762e-13, "loss": 0.4422, "step": 8100 }, { "epoch": 0.9999383363137448, "step": 8108, "total_flos": 533986133770240.0, "train_loss": 0.5225248140364254, "train_runtime": 45752.7497, "train_samples_per_second": 5.671, "train_steps_per_second": 0.177 } ], "logging_steps": 50, "max_steps": 8108, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 533986133770240.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }