{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9999383363137448,
  "eval_steps": 500,
  "global_step": 8108,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006166368625516433,
      "grad_norm": 16.689239750860494,
      "learning_rate": 4.0983606557377046e-08,
      "loss": 1.618,
      "step": 50
    },
    {
      "epoch": 0.012332737251032866,
      "grad_norm": 17.403265975850882,
      "learning_rate": 8.196721311475409e-08,
      "loss": 1.6045,
      "step": 100
    },
    {
      "epoch": 0.0184991058765493,
      "grad_norm": 14.749335037473138,
      "learning_rate": 1.2295081967213116e-07,
      "loss": 1.5032,
      "step": 150
    },
    {
      "epoch": 0.024665474502065732,
      "grad_norm": 8.133780120015619,
      "learning_rate": 1.6393442622950818e-07,
      "loss": 1.3224,
      "step": 200
    },
    {
      "epoch": 0.030831843127582168,
      "grad_norm": 5.828403504971832,
      "learning_rate": 1.9999971273346704e-07,
      "loss": 1.1032,
      "step": 250
    },
    {
      "epoch": 0.0369982117530986,
      "grad_norm": 3.9432862715914587,
      "learning_rate": 1.9997497692480678e-07,
      "loss": 0.9673,
      "step": 300
    },
    {
      "epoch": 0.043164580378615036,
      "grad_norm": 3.4359051761562025,
      "learning_rate": 1.9991035427741063e-07,
      "loss": 0.8524,
      "step": 350
    },
    {
      "epoch": 0.049330949004131465,
      "grad_norm": 3.058893644982214,
      "learning_rate": 1.9980587057366126e-07,
      "loss": 0.7977,
      "step": 400
    },
    {
      "epoch": 0.0554973176296479,
      "grad_norm": 2.780325971119289,
      "learning_rate": 1.9966156749923613e-07,
      "loss": 0.7693,
      "step": 450
    },
    {
      "epoch": 0.061663686255164336,
      "grad_norm": 2.9531788670760784,
      "learning_rate": 1.994775026264762e-07,
      "loss": 0.7432,
      "step": 500
    },
    {
      "epoch": 0.06783005488068077,
      "grad_norm": 3.2391609392351692,
      "learning_rate": 1.9925374939141637e-07,
      "loss": 0.7299,
      "step": 550
    },
    {
      "epoch": 0.0739964235061972,
      "grad_norm": 2.963688989960738,
      "learning_rate": 1.9899039706448692e-07,
      "loss": 0.7002,
      "step": 600
    },
    {
      "epoch": 0.08016279213171364,
      "grad_norm": 2.880854595097375,
      "learning_rate": 1.9868755071489728e-07,
      "loss": 0.6761,
      "step": 650
    },
    {
      "epoch": 0.08632916075723007,
      "grad_norm": 2.6331858272282904,
      "learning_rate": 1.98345331168717e-07,
      "loss": 0.6753,
      "step": 700
    },
    {
      "epoch": 0.0924955293827465,
      "grad_norm": 3.166081780603538,
      "learning_rate": 1.9796387496066975e-07,
      "loss": 0.6627,
      "step": 750
    },
    {
      "epoch": 0.09866189800826293,
      "grad_norm": 3.2004691707941215,
      "learning_rate": 1.975433342796604e-07,
      "loss": 0.6398,
      "step": 800
    },
    {
      "epoch": 0.10482826663377937,
      "grad_norm": 3.4439116163641534,
      "learning_rate": 1.9708387690805658e-07,
      "loss": 0.643,
      "step": 850
    },
    {
      "epoch": 0.1109946352592958,
      "grad_norm": 2.7959822715068237,
      "learning_rate": 1.965856861547486e-07,
      "loss": 0.6299,
      "step": 900
    },
    {
      "epoch": 0.11716100388481224,
      "grad_norm": 2.8720786800067133,
      "learning_rate": 1.960489607820153e-07,
      "loss": 0.6156,
      "step": 950
    },
    {
      "epoch": 0.12332737251032867,
      "grad_norm": 2.8998981058217512,
      "learning_rate": 1.9547391492622407e-07,
      "loss": 0.6045,
      "step": 1000
    },
    {
      "epoch": 0.1294937411358451,
      "grad_norm": 2.937285416581705,
      "learning_rate": 1.9486077801239723e-07,
      "loss": 0.604,
      "step": 1050
    },
    {
      "epoch": 0.13566010976136153,
      "grad_norm": 2.9265674908029258,
      "learning_rate": 1.9420979466267888e-07,
      "loss": 0.5918,
      "step": 1100
    },
    {
      "epoch": 0.14182647838687798,
      "grad_norm": 3.1223514523834224,
      "learning_rate": 1.9352122459873818e-07,
      "loss": 0.5857,
      "step": 1150
    },
    {
      "epoch": 0.1479928470123944,
      "grad_norm": 2.991244269539233,
      "learning_rate": 1.9279534253814899e-07,
      "loss": 0.5797,
      "step": 1200
    },
    {
      "epoch": 0.15415921563791082,
      "grad_norm": 2.9531460521405313,
      "learning_rate": 1.9203243808478597e-07,
      "loss": 0.583,
      "step": 1250
    },
    {
      "epoch": 0.16032558426342727,
      "grad_norm": 2.9620910098760174,
      "learning_rate": 1.9123281561328205e-07,
      "loss": 0.5647,
      "step": 1300
    },
    {
      "epoch": 0.1664919528889437,
      "grad_norm": 2.8183125229693333,
      "learning_rate": 1.9039679414759247e-07,
      "loss": 0.5675,
      "step": 1350
    },
    {
      "epoch": 0.17265832151446014,
      "grad_norm": 3.029807143662261,
      "learning_rate": 1.8952470723371465e-07,
      "loss": 0.5669,
      "step": 1400
    },
    {
      "epoch": 0.17882469013997657,
      "grad_norm": 3.323729247650118,
      "learning_rate": 1.886169028066135e-07,
      "loss": 0.5579,
      "step": 1450
    },
    {
      "epoch": 0.184991058765493,
      "grad_norm": 2.9853732307969123,
      "learning_rate": 1.8767374305140678e-07,
      "loss": 0.5578,
      "step": 1500
    },
    {
      "epoch": 0.19115742739100944,
      "grad_norm": 2.8289118535370226,
      "learning_rate": 1.8669560425886458e-07,
      "loss": 0.5565,
      "step": 1550
    },
    {
      "epoch": 0.19732379601652586,
      "grad_norm": 3.107927650244337,
      "learning_rate": 1.8568287667528136e-07,
      "loss": 0.5482,
      "step": 1600
    },
    {
      "epoch": 0.2034901646420423,
      "grad_norm": 2.9356477568984474,
      "learning_rate": 1.846359643467799e-07,
      "loss": 0.5493,
      "step": 1650
    },
    {
      "epoch": 0.20965653326755873,
      "grad_norm": 2.8886483110859706,
      "learning_rate": 1.8355528495811004e-07,
      "loss": 0.5441,
      "step": 1700
    },
    {
      "epoch": 0.21582290189307518,
      "grad_norm": 3.043206189340112,
      "learning_rate": 1.8244126966600537e-07,
      "loss": 0.5309,
      "step": 1750
    },
    {
      "epoch": 0.2219892705185916,
      "grad_norm": 2.84058010312111,
      "learning_rate": 1.8129436292716576e-07,
      "loss": 0.5281,
      "step": 1800
    },
    {
      "epoch": 0.22815563914410802,
      "grad_norm": 3.2658177471645793,
      "learning_rate": 1.8011502232093294e-07,
      "loss": 0.5219,
      "step": 1850
    },
    {
      "epoch": 0.23432200776962447,
      "grad_norm": 2.838918099928717,
      "learning_rate": 1.7890371836673115e-07,
      "loss": 0.5164,
      "step": 1900
    },
    {
      "epoch": 0.2404883763951409,
      "grad_norm": 3.3022219294232222,
      "learning_rate": 1.7766093433634462e-07,
      "loss": 0.524,
      "step": 1950
    },
    {
      "epoch": 0.24665474502065735,
      "grad_norm": 3.5602190680329637,
      "learning_rate": 1.7638716606110768e-07,
      "loss": 0.509,
      "step": 2000
    },
    {
      "epoch": 0.25282111364617377,
      "grad_norm": 3.0096527122431973,
      "learning_rate": 1.7508292173408366e-07,
      "loss": 0.5193,
      "step": 2050
    },
    {
      "epoch": 0.2589874822716902,
      "grad_norm": 3.3687126826867044,
      "learning_rate": 1.7374872170731205e-07,
      "loss": 0.5186,
      "step": 2100
    },
    {
      "epoch": 0.2651538508972066,
      "grad_norm": 3.1979358817347734,
      "learning_rate": 1.7238509828420468e-07,
      "loss": 0.5081,
      "step": 2150
    },
    {
      "epoch": 0.27132021952272306,
      "grad_norm": 2.7782410513777207,
      "learning_rate": 1.709925955071734e-07,
      "loss": 0.5046,
      "step": 2200
    },
    {
      "epoch": 0.2774865881482395,
      "grad_norm": 3.0956007198543376,
      "learning_rate": 1.6957176894057456e-07,
      "loss": 0.5067,
      "step": 2250
    },
    {
      "epoch": 0.28365295677375596,
      "grad_norm": 2.883657997016742,
      "learning_rate": 1.681231854490565e-07,
      "loss": 0.5034,
      "step": 2300
    },
    {
      "epoch": 0.28981932539927235,
      "grad_norm": 3.037713494095377,
      "learning_rate": 1.6664742297139842e-07,
      "loss": 0.5017,
      "step": 2350
    },
    {
      "epoch": 0.2959856940247888,
      "grad_norm": 2.7886707641373856,
      "learning_rate": 1.6514507028993141e-07,
      "loss": 0.5074,
      "step": 2400
    },
    {
      "epoch": 0.30215206265030525,
      "grad_norm": 3.0522520309780665,
      "learning_rate": 1.636167267956328e-07,
      "loss": 0.504,
      "step": 2450
    },
    {
      "epoch": 0.30831843127582165,
      "grad_norm": 2.9917653849017967,
      "learning_rate": 1.620630022489884e-07,
      "loss": 0.492,
      "step": 2500
    },
    {
      "epoch": 0.3144847999013381,
      "grad_norm": 3.374780491495851,
      "learning_rate": 1.604845165367171e-07,
      "loss": 0.5012,
      "step": 2550
    },
    {
      "epoch": 0.32065116852685455,
      "grad_norm": 3.054737104124034,
      "learning_rate": 1.588818994244563e-07,
      "loss": 0.4961,
      "step": 2600
    },
    {
      "epoch": 0.326817537152371,
      "grad_norm": 3.1630826680292037,
      "learning_rate": 1.5725579030550487e-07,
      "loss": 0.4986,
      "step": 2650
    },
    {
      "epoch": 0.3329839057778874,
      "grad_norm": 2.787165502227459,
      "learning_rate": 1.5560683794572599e-07,
      "loss": 0.5005,
      "step": 2700
    },
    {
      "epoch": 0.33915027440340384,
      "grad_norm": 3.2159871448935853,
      "learning_rate": 1.5393570022470996e-07,
      "loss": 0.4912,
      "step": 2750
    },
    {
      "epoch": 0.3453166430289203,
      "grad_norm": 2.921398178739714,
      "learning_rate": 1.5224304387330113e-07,
      "loss": 0.4873,
      "step": 2800
    },
    {
      "epoch": 0.3514830116544367,
      "grad_norm": 3.033201824114291,
      "learning_rate": 1.505295442075936e-07,
      "loss": 0.4848,
      "step": 2850
    },
    {
      "epoch": 0.35764938027995313,
      "grad_norm": 3.126845883000846,
      "learning_rate": 1.4879588485950154e-07,
      "loss": 0.4761,
      "step": 2900
    },
    {
      "epoch": 0.3638157489054696,
      "grad_norm": 2.899612235662964,
      "learning_rate": 1.4704275750401168e-07,
      "loss": 0.4731,
      "step": 2950
    },
    {
      "epoch": 0.369982117530986,
      "grad_norm": 2.78803166053557,
      "learning_rate": 1.45270861583227e-07,
      "loss": 0.4751,
      "step": 3000
    },
    {
      "epoch": 0.3761484861565024,
      "grad_norm": 3.217869780099078,
      "learning_rate": 1.4348090402731177e-07,
      "loss": 0.4833,
      "step": 3050
    },
    {
      "epoch": 0.3823148547820189,
      "grad_norm": 2.98388792612514,
      "learning_rate": 1.416735989724485e-07,
      "loss": 0.4768,
      "step": 3100
    },
    {
      "epoch": 0.3884812234075353,
      "grad_norm": 3.095979105793261,
      "learning_rate": 1.3984966747592066e-07,
      "loss": 0.4781,
      "step": 3150
    },
    {
      "epoch": 0.3946475920330517,
      "grad_norm": 2.8570658991316944,
      "learning_rate": 1.380098372284335e-07,
      "loss": 0.47,
      "step": 3200
    },
    {
      "epoch": 0.40081396065856817,
      "grad_norm": 2.913522526116864,
      "learning_rate": 1.3615484226378866e-07,
      "loss": 0.4761,
      "step": 3250
    },
    {
      "epoch": 0.4069803292840846,
      "grad_norm": 3.079167327659028,
      "learning_rate": 1.3428542266602808e-07,
      "loss": 0.4691,
      "step": 3300
    },
    {
      "epoch": 0.413146697909601,
      "grad_norm": 3.167335424827754,
      "learning_rate": 1.3240232427416377e-07,
      "loss": 0.4762,
      "step": 3350
    },
    {
      "epoch": 0.41931306653511746,
      "grad_norm": 2.951805565284142,
      "learning_rate": 1.3050629838461213e-07,
      "loss": 0.4743,
      "step": 3400
    },
    {
      "epoch": 0.4254794351606339,
      "grad_norm": 3.344274691992938,
      "learning_rate": 1.285981014514501e-07,
      "loss": 0.4651,
      "step": 3450
    },
    {
      "epoch": 0.43164580378615036,
      "grad_norm": 3.134003729646922,
      "learning_rate": 1.2667849478461436e-07,
      "loss": 0.474,
      "step": 3500
    },
    {
      "epoch": 0.43781217241166676,
      "grad_norm": 3.000847232186744,
      "learning_rate": 1.2474824424616271e-07,
      "loss": 0.4729,
      "step": 3550
    },
    {
      "epoch": 0.4439785410371832,
      "grad_norm": 2.836288640859743,
      "learning_rate": 1.228081199447195e-07,
      "loss": 0.4632,
      "step": 3600
    },
    {
      "epoch": 0.45014490966269965,
      "grad_norm": 3.308502889653925,
      "learning_rate": 1.2085889592822667e-07,
      "loss": 0.4601,
      "step": 3650
    },
    {
      "epoch": 0.45631127828821605,
      "grad_norm": 2.746002613176513,
      "learning_rate": 1.1890134987512341e-07,
      "loss": 0.467,
      "step": 3700
    },
    {
      "epoch": 0.4624776469137325,
      "grad_norm": 3.24735950823672,
      "learning_rate": 1.1693626278407694e-07,
      "loss": 0.4617,
      "step": 3750
    },
    {
      "epoch": 0.46864401553924895,
      "grad_norm": 3.0607507540260075,
      "learning_rate": 1.1496441866238905e-07,
      "loss": 0.4569,
      "step": 3800
    },
    {
      "epoch": 0.47481038416476534,
      "grad_norm": 2.9943145563385998,
      "learning_rate": 1.1298660421320194e-07,
      "loss": 0.4619,
      "step": 3850
    },
    {
      "epoch": 0.4809767527902818,
      "grad_norm": 3.1612704244607177,
      "learning_rate": 1.1100360852162888e-07,
      "loss": 0.4637,
      "step": 3900
    },
    {
      "epoch": 0.48714312141579824,
      "grad_norm": 3.1449471984877055,
      "learning_rate": 1.0901622273993417e-07,
      "loss": 0.4701,
      "step": 3950
    },
    {
      "epoch": 0.4933094900413147,
      "grad_norm": 3.0699714321899387,
      "learning_rate": 1.070252397718884e-07,
      "loss": 0.4558,
      "step": 4000
    },
    {
      "epoch": 0.4994758586668311,
      "grad_norm": 3.4589428619371834,
      "learning_rate": 1.0503145395642541e-07,
      "loss": 0.4599,
      "step": 4050
    },
    {
      "epoch": 0.5056422272923475,
      "grad_norm": 3.0848999815662674,
      "learning_rate": 1.0303566075072598e-07,
      "loss": 0.4558,
      "step": 4100
    },
    {
      "epoch": 0.511808595917864,
      "grad_norm": 2.947163992749446,
      "learning_rate": 1.0103865641285583e-07,
      "loss": 0.457,
      "step": 4150
    },
    {
      "epoch": 0.5179749645433804,
      "grad_norm": 3.5673363307250927,
      "learning_rate": 9.904123768408389e-08,
      "loss": 0.4575,
      "step": 4200
    },
    {
      "epoch": 0.5241413331688969,
      "grad_norm": 3.07648492625604,
      "learning_rate": 9.704420147100796e-08,
      "loss": 0.4528,
      "step": 4250
    },
    {
      "epoch": 0.5303077017944132,
      "grad_norm": 3.2080853332983907,
      "learning_rate": 9.504834452761424e-08,
      "loss": 0.455,
      "step": 4300
    },
    {
      "epoch": 0.5364740704199297,
      "grad_norm": 2.952611892786328,
      "learning_rate": 9.305446313739767e-08,
      "loss": 0.4472,
      "step": 4350
    },
    {
      "epoch": 0.5426404390454461,
      "grad_norm": 3.005908121136174,
      "learning_rate": 9.106335279567037e-08,
      "loss": 0.4516,
      "step": 4400
    },
    {
      "epoch": 0.5488068076709626,
      "grad_norm": 2.822428791661921,
      "learning_rate": 8.907580789218414e-08,
      "loss": 0.4528,
      "step": 4450
    },
    {
      "epoch": 0.554973176296479,
      "grad_norm": 2.8673595096457465,
      "learning_rate": 8.709262139419424e-08,
      "loss": 0.4536,
      "step": 4500
    },
    {
      "epoch": 0.5611395449219955,
      "grad_norm": 3.264575792740317,
      "learning_rate": 8.511458453009065e-08,
      "loss": 0.4524,
      "step": 4550
    },
    {
      "epoch": 0.5673059135475119,
      "grad_norm": 3.3129039957771806,
      "learning_rate": 8.314248647372302e-08,
      "loss": 0.4467,
      "step": 4600
    },
    {
      "epoch": 0.5734722821730283,
      "grad_norm": 3.083187238173955,
      "learning_rate": 8.117711402954554e-08,
      "loss": 0.4488,
      "step": 4650
    },
    {
      "epoch": 0.5796386507985447,
      "grad_norm": 3.1488830656848,
      "learning_rate": 7.921925131870672e-08,
      "loss": 0.4579,
      "step": 4700
    },
    {
      "epoch": 0.5858050194240612,
      "grad_norm": 2.9768313706421874,
      "learning_rate": 7.726967946621029e-08,
      "loss": 0.4481,
      "step": 4750
    },
    {
      "epoch": 0.5919713880495776,
      "grad_norm": 3.0236276200137486,
      "learning_rate": 7.532917628927079e-08,
      "loss": 0.4529,
      "step": 4800
    },
    {
      "epoch": 0.5981377566750941,
      "grad_norm": 3.0681593760022285,
      "learning_rate": 7.339851598698955e-08,
      "loss": 0.4527,
      "step": 4850
    },
    {
      "epoch": 0.6043041253006105,
      "grad_norm": 3.2203600426157495,
      "learning_rate": 7.147846883147362e-08,
      "loss": 0.4473,
      "step": 4900
    },
    {
      "epoch": 0.610470493926127,
      "grad_norm": 3.127241727234972,
      "learning_rate": 6.956980086052184e-08,
      "loss": 0.4536,
      "step": 4950
    },
    {
      "epoch": 0.6166368625516433,
      "grad_norm": 3.215958029153526,
      "learning_rate": 6.76732735719999e-08,
      "loss": 0.4505,
      "step": 5000
    },
    {
      "epoch": 0.6228032311771597,
      "grad_norm": 2.8220120121880936,
      "learning_rate": 6.578964362002715e-08,
      "loss": 0.4514,
      "step": 5050
    },
    {
      "epoch": 0.6289695998026762,
      "grad_norm": 3.0284791997521054,
      "learning_rate": 6.391966251309539e-08,
      "loss": 0.4458,
      "step": 5100
    },
    {
      "epoch": 0.6351359684281926,
      "grad_norm": 3.371033810021987,
      "learning_rate": 6.206407631424109e-08,
      "loss": 0.4446,
      "step": 5150
    },
    {
      "epoch": 0.6413023370537091,
      "grad_norm": 3.122281628753462,
      "learning_rate": 6.02236253433898e-08,
      "loss": 0.4473,
      "step": 5200
    },
    {
      "epoch": 0.6474687056792255,
      "grad_norm": 2.7353573500503074,
      "learning_rate": 5.8399043881992104e-08,
      "loss": 0.4399,
      "step": 5250
    },
    {
      "epoch": 0.653635074304742,
      "grad_norm": 3.194859384027796,
      "learning_rate": 5.659105988006851e-08,
      "loss": 0.4499,
      "step": 5300
    },
    {
      "epoch": 0.6598014429302583,
      "grad_norm": 2.8707279633921194,
      "learning_rate": 5.480039466578079e-08,
      "loss": 0.453,
      "step": 5350
    },
    {
      "epoch": 0.6659678115557748,
      "grad_norm": 3.313196070466103,
      "learning_rate": 5.3027762657644745e-08,
      "loss": 0.4433,
      "step": 5400
    },
    {
      "epoch": 0.6721341801812912,
      "grad_norm": 2.9398335243680056,
      "learning_rate": 5.1273871079499986e-08,
      "loss": 0.447,
      "step": 5450
    },
    {
      "epoch": 0.6783005488068077,
      "grad_norm": 2.9069645999783726,
      "learning_rate": 4.9539419678350103e-08,
      "loss": 0.4424,
      "step": 5500
    },
    {
      "epoch": 0.6844669174323241,
      "grad_norm": 3.0807794080344744,
      "learning_rate": 4.7825100445185904e-08,
      "loss": 0.4502,
      "step": 5550
    },
    {
      "epoch": 0.6906332860578406,
      "grad_norm": 3.1238439553913913,
      "learning_rate": 4.613159733890279e-08,
      "loss": 0.4371,
      "step": 5600
    },
    {
      "epoch": 0.6967996546833569,
      "grad_norm": 2.894912373492253,
      "learning_rate": 4.445958601342321e-08,
      "loss": 0.4466,
      "step": 5650
    },
    {
      "epoch": 0.7029660233088734,
      "grad_norm": 3.1861168079620352,
      "learning_rate": 4.280973354813196e-08,
      "loss": 0.4452,
      "step": 5700
    },
    {
      "epoch": 0.7091323919343898,
      "grad_norm": 3.218055671881565,
      "learning_rate": 4.118269818173283e-08,
      "loss": 0.4335,
      "step": 5750
    },
    {
      "epoch": 0.7152987605599063,
      "grad_norm": 3.7156552734894177,
      "learning_rate": 3.957912904963225e-08,
      "loss": 0.4482,
      "step": 5800
    },
    {
      "epoch": 0.7214651291854227,
      "grad_norm": 3.248126042161764,
      "learning_rate": 3.7999665924954815e-08,
      "loss": 0.4407,
      "step": 5850
    },
    {
      "epoch": 0.7276314978109392,
      "grad_norm": 3.37840785837335,
      "learning_rate": 3.64449389632943e-08,
      "loss": 0.4421,
      "step": 5900
    },
    {
      "epoch": 0.7337978664364556,
      "grad_norm": 2.919668888292714,
      "learning_rate": 3.491556845130147e-08,
      "loss": 0.4358,
      "step": 5950
    },
    {
      "epoch": 0.739964235061972,
      "grad_norm": 3.449594279809231,
      "learning_rate": 3.3412164559209485e-08,
      "loss": 0.4393,
      "step": 6000
    },
    {
      "epoch": 0.7461306036874884,
      "grad_norm": 3.0449172482636713,
      "learning_rate": 3.193532709739534e-08,
      "loss": 0.443,
      "step": 6050
    },
    {
      "epoch": 0.7522969723130049,
      "grad_norm": 2.9659035390086603,
      "learning_rate": 3.048564527707457e-08,
      "loss": 0.4541,
      "step": 6100
    },
    {
      "epoch": 0.7584633409385213,
      "grad_norm": 3.0426691033458266,
      "learning_rate": 2.9063697475224736e-08,
      "loss": 0.4411,
      "step": 6150
    },
    {
      "epoch": 0.7646297095640378,
      "grad_norm": 3.1254929066925543,
      "learning_rate": 2.767005100383143e-08,
      "loss": 0.4466,
      "step": 6200
    },
    {
      "epoch": 0.7707960781895542,
      "grad_norm": 3.059948610503461,
      "learning_rate": 2.6305261883548624e-08,
      "loss": 0.4501,
      "step": 6250
    },
    {
      "epoch": 0.7769624468150707,
      "grad_norm": 3.178741971582532,
      "learning_rate": 2.4969874621864373e-08,
      "loss": 0.4405,
      "step": 6300
    },
    {
      "epoch": 0.783128815440587,
      "grad_norm": 3.178300180527373,
      "learning_rate": 2.3664421995859463e-08,
      "loss": 0.4499,
      "step": 6350
    },
    {
      "epoch": 0.7892951840661034,
      "grad_norm": 3.003275204473159,
      "learning_rate": 2.2389424839646286e-08,
      "loss": 0.4399,
      "step": 6400
    },
    {
      "epoch": 0.7954615526916199,
      "grad_norm": 3.420014772222019,
      "learning_rate": 2.114539183657268e-08,
      "loss": 0.4352,
      "step": 6450
    },
    {
      "epoch": 0.8016279213171363,
      "grad_norm": 2.971627106875043,
      "learning_rate": 1.9932819316273307e-08,
      "loss": 0.4382,
      "step": 6500
    },
    {
      "epoch": 0.8077942899426528,
      "grad_norm": 3.4422374871537533,
      "learning_rate": 1.8752191056650023e-08,
      "loss": 0.4377,
      "step": 6550
    },
    {
      "epoch": 0.8139606585681692,
      "grad_norm": 3.053124764133182,
      "learning_rate": 1.7603978090859794e-08,
      "loss": 0.4442,
      "step": 6600
    },
    {
      "epoch": 0.8201270271936857,
      "grad_norm": 3.1086937331605613,
      "learning_rate": 1.6488638519387478e-08,
      "loss": 0.4466,
      "step": 6650
    },
    {
      "epoch": 0.826293395819202,
      "grad_norm": 3.4399676514136193,
      "learning_rate": 1.5406617327278205e-08,
      "loss": 0.4326,
      "step": 6700
    },
    {
      "epoch": 0.8324597644447185,
      "grad_norm": 2.8487398222000744,
      "learning_rate": 1.4358346206602612e-08,
      "loss": 0.4422,
      "step": 6750
    },
    {
      "epoch": 0.8386261330702349,
      "grad_norm": 2.9651774336393726,
      "learning_rate": 1.334424338422534e-08,
      "loss": 0.4305,
      "step": 6800
    },
    {
      "epoch": 0.8447925016957514,
      "grad_norm": 3.4279808291982556,
      "learning_rate": 1.236471345494583e-08,
      "loss": 0.4386,
      "step": 6850
    },
    {
      "epoch": 0.8509588703212678,
      "grad_norm": 3.298847289113035,
      "learning_rate": 1.1420147220077847e-08,
      "loss": 0.4425,
      "step": 6900
    },
    {
      "epoch": 0.8571252389467843,
      "grad_norm": 3.199726112913922,
      "learning_rate": 1.0510921531532192e-08,
      "loss": 0.4339,
      "step": 6950
    },
    {
      "epoch": 0.8632916075723007,
      "grad_norm": 3.3865902484127637,
      "learning_rate": 9.63739914146473e-09,
      "loss": 0.426,
      "step": 7000
    },
    {
      "epoch": 0.8694579761978171,
      "grad_norm": 3.080132950484914,
      "learning_rate": 8.799928557549863e-09,
      "loss": 0.4437,
      "step": 7050
    },
    {
      "epoch": 0.8756243448233335,
      "grad_norm": 3.2441647152844526,
      "learning_rate": 7.998843903936992e-09,
      "loss": 0.4338,
      "step": 7100
    },
    {
      "epoch": 0.88179071344885,
      "grad_norm": 2.861131038634973,
      "learning_rate": 7.2344647879456265e-09,
      "loss": 0.4363,
      "step": 7150
    },
    {
      "epoch": 0.8879570820743664,
      "grad_norm": 3.131842173102097,
      "learning_rate": 6.507096172552195e-09,
      "loss": 0.4333,
      "step": 7200
    },
    {
      "epoch": 0.8941234506998829,
      "grad_norm": 3.1735067730802604,
      "learning_rate": 5.817028254719536e-09,
      "loss": 0.4395,
      "step": 7250
    },
    {
      "epoch": 0.9002898193253993,
      "grad_norm": 2.941305225791783,
      "learning_rate": 5.164536349617532e-09,
      "loss": 0.4418,
      "step": 7300
    },
    {
      "epoch": 0.9064561879509158,
      "grad_norm": 3.1369522496788806,
      "learning_rate": 4.5498807807811015e-09,
      "loss": 0.4413,
      "step": 7350
    },
    {
      "epoch": 0.9126225565764321,
      "grad_norm": 3.10250834762718,
      "learning_rate": 3.973306776249341e-09,
      "loss": 0.4316,
      "step": 7400
    },
    {
      "epoch": 0.9187889252019485,
      "grad_norm": 3.113181559222609,
      "learning_rate": 3.4350443707274135e-09,
      "loss": 0.4391,
      "step": 7450
    },
    {
      "epoch": 0.924955293827465,
      "grad_norm": 3.2125045204581424,
      "learning_rate": 2.9353083138099256e-09,
      "loss": 0.4453,
      "step": 7500
    },
    {
      "epoch": 0.9311216624529814,
      "grad_norm": 3.3021789642945008,
      "learning_rate": 2.474297984302709e-09,
      "loss": 0.4404,
      "step": 7550
    },
    {
      "epoch": 0.9372880310784979,
      "grad_norm": 3.3940858957593223,
      "learning_rate": 2.0521973106770285e-09,
      "loss": 0.4387,
      "step": 7600
    },
    {
      "epoch": 0.9434543997040143,
      "grad_norm": 2.8912916713122763,
      "learning_rate": 1.6691746976879028e-09,
      "loss": 0.4396,
      "step": 7650
    },
    {
      "epoch": 0.9496207683295307,
      "grad_norm": 3.437089198963669,
      "learning_rate": 1.3253829591860387e-09,
      "loss": 0.4375,
      "step": 7700
    },
    {
      "epoch": 0.9557871369550471,
      "grad_norm": 3.138591510137561,
      "learning_rate": 1.0209592571498892e-09,
      "loss": 0.432,
      "step": 7750
    },
    {
      "epoch": 0.9619535055805636,
      "grad_norm": 3.0903316485258783,
      "learning_rate": 7.560250469624385e-10,
      "loss": 0.4381,
      "step": 7800
    },
    {
      "epoch": 0.96811987420608,
      "grad_norm": 3.1363149734033233,
      "learning_rate": 5.306860289543413e-10,
      "loss": 0.4432,
      "step": 7850
    },
    {
      "epoch": 0.9742862428315965,
      "grad_norm": 3.143737684684351,
      "learning_rate": 3.450321062328232e-10,
      "loss": 0.4334,
      "step": 7900
    },
    {
      "epoch": 0.9804526114571129,
      "grad_norm": 2.8627388485987444,
      "learning_rate": 1.9913734881326083e-10,
      "loss": 0.4372,
      "step": 7950
    },
    {
      "epoch": 0.9866189800826294,
      "grad_norm": 3.044456688116337,
      "learning_rate": 9.305996406754335e-11,
      "loss": 0.4376,
      "step": 8000
    },
    {
      "epoch": 0.9927853487081457,
      "grad_norm": 2.9030301879677096,
      "learning_rate": 2.6842273501193058e-11,
      "loss": 0.4348,
      "step": 8050
    },
    {
      "epoch": 0.9989517173336622,
      "grad_norm": 3.2924748739223664,
      "learning_rate": 5.10695868449762e-13,
      "loss": 0.4422,
      "step": 8100
    },
    {
      "epoch": 0.9999383363137448,
      "step": 8108,
      "total_flos": 533986133770240.0,
      "train_loss": 0.5225248140364254,
      "train_runtime": 45752.7497,
      "train_samples_per_second": 5.671,
      "train_steps_per_second": 0.177
    }
  ],
  "logging_steps": 50,
  "max_steps": 8108,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 533986133770240.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}