| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.999259807549963, |
| "eval_steps": 100, |
| "global_step": 675, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007401924500370096, |
| "grad_norm": 2.6769801199789844, |
| "learning_rate": 1.4705882352941177e-06, |
| "loss": 1.0786, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.014803849000740192, |
| "grad_norm": 2.289153229357034, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 1.0901, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02220577350111029, |
| "grad_norm": 1.4151517309864259, |
| "learning_rate": 4.411764705882353e-06, |
| "loss": 1.062, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.029607698001480384, |
| "grad_norm": 1.3753052911702501, |
| "learning_rate": 5.882352941176471e-06, |
| "loss": 1.0087, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.037009622501850484, |
| "grad_norm": 1.117549735717063, |
| "learning_rate": 7.352941176470589e-06, |
| "loss": 0.9684, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.04441154700222058, |
| "grad_norm": 0.9514445146454603, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 0.9217, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05181347150259067, |
| "grad_norm": 0.7594358797490329, |
| "learning_rate": 1.0294117647058823e-05, |
| "loss": 0.8859, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.05921539600296077, |
| "grad_norm": 0.6853982401028429, |
| "learning_rate": 1.1764705882352942e-05, |
| "loss": 0.8631, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06661732050333087, |
| "grad_norm": 0.7503960595270591, |
| "learning_rate": 1.323529411764706e-05, |
| "loss": 0.8486, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.07401924500370097, |
| "grad_norm": 0.81742237765088, |
| "learning_rate": 1.4705882352941179e-05, |
| "loss": 0.861, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08142116950407106, |
| "grad_norm": 0.6232853094087558, |
| "learning_rate": 1.6176470588235296e-05, |
| "loss": 0.8545, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.08882309400444116, |
| "grad_norm": 0.7312499750471685, |
| "learning_rate": 1.7647058823529414e-05, |
| "loss": 0.8293, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09622501850481126, |
| "grad_norm": 0.6849629343658746, |
| "learning_rate": 1.911764705882353e-05, |
| "loss": 0.8259, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.10362694300518134, |
| "grad_norm": 0.7670921820036198, |
| "learning_rate": 1.9999464266898485e-05, |
| "loss": 0.8211, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.11102886750555144, |
| "grad_norm": 0.9562639233109864, |
| "learning_rate": 1.9993437928712977e-05, |
| "loss": 0.8165, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.11843079200592153, |
| "grad_norm": 0.8187251839834904, |
| "learning_rate": 1.998071963486563e-05, |
| "loss": 0.8062, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12583271650629163, |
| "grad_norm": 0.6607954272855435, |
| "learning_rate": 1.9961317901970953e-05, |
| "loss": 0.7945, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.13323464100666174, |
| "grad_norm": 0.7512885304003232, |
| "learning_rate": 1.993524572210807e-05, |
| "loss": 0.7947, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14063656550703182, |
| "grad_norm": 0.7919859825103274, |
| "learning_rate": 1.990252055412077e-05, |
| "loss": 0.7905, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.14803849000740193, |
| "grad_norm": 0.7738309429220392, |
| "learning_rate": 1.9863164311926433e-05, |
| "loss": 0.8172, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14803849000740193, |
| "eval_loss": 0.8197109699249268, |
| "eval_runtime": 5.3703, |
| "eval_samples_per_second": 23.835, |
| "eval_steps_per_second": 2.979, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.15544041450777202, |
| "grad_norm": 0.7130351365422478, |
| "learning_rate": 1.981720334984174e-05, |
| "loss": 0.7921, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.16284233900814213, |
| "grad_norm": 0.7061699809541575, |
| "learning_rate": 1.9764668444934853e-05, |
| "loss": 0.7859, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1702442635085122, |
| "grad_norm": 0.7182947479746219, |
| "learning_rate": 1.970559477641606e-05, |
| "loss": 0.7631, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.17764618800888232, |
| "grad_norm": 0.7311916453291761, |
| "learning_rate": 1.9640021902080523e-05, |
| "loss": 0.7929, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1850481125092524, |
| "grad_norm": 0.6285240054314736, |
| "learning_rate": 1.9567993731818988e-05, |
| "loss": 0.7916, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.19245003700962252, |
| "grad_norm": 0.7404725250217843, |
| "learning_rate": 1.9489558498214197e-05, |
| "loss": 0.7842, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1998519615099926, |
| "grad_norm": 0.6486675105037224, |
| "learning_rate": 1.9404768724242667e-05, |
| "loss": 0.7704, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.20725388601036268, |
| "grad_norm": 0.7306207383223645, |
| "learning_rate": 1.931368118810346e-05, |
| "loss": 0.7947, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2146558105107328, |
| "grad_norm": 0.7466637103159883, |
| "learning_rate": 1.92163568851975e-05, |
| "loss": 0.7756, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.22205773501110287, |
| "grad_norm": 0.7971646525282183, |
| "learning_rate": 1.911286098728296e-05, |
| "loss": 0.772, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.22945965951147299, |
| "grad_norm": 0.7009790395257299, |
| "learning_rate": 1.900326279883392e-05, |
| "loss": 0.8015, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.23686158401184307, |
| "grad_norm": 0.7000968794313863, |
| "learning_rate": 1.8887635710631716e-05, |
| "loss": 0.8043, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.24426350851221318, |
| "grad_norm": 0.6947279868702056, |
| "learning_rate": 1.8766057150619865e-05, |
| "loss": 0.7775, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.25166543301258326, |
| "grad_norm": 0.7054404867195952, |
| "learning_rate": 1.8638608532055635e-05, |
| "loss": 0.7947, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.25906735751295334, |
| "grad_norm": 0.6591670530138999, |
| "learning_rate": 1.8505375198992856e-05, |
| "loss": 0.7831, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.2664692820133235, |
| "grad_norm": 0.7106688805726791, |
| "learning_rate": 1.836644636913258e-05, |
| "loss": 0.7543, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.27387120651369357, |
| "grad_norm": 0.7764398857270777, |
| "learning_rate": 1.8221915074079764e-05, |
| "loss": 0.7779, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.28127313101406365, |
| "grad_norm": 0.8612641589834193, |
| "learning_rate": 1.8071878097046064e-05, |
| "loss": 0.7564, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.28867505551443373, |
| "grad_norm": 0.6890423279187838, |
| "learning_rate": 1.7916435908040413e-05, |
| "loss": 0.7723, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.29607698001480387, |
| "grad_norm": 0.6443921271623081, |
| "learning_rate": 1.7755692596590778e-05, |
| "loss": 0.7746, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.29607698001480387, |
| "eval_loss": 0.7898486852645874, |
| "eval_runtime": 5.365, |
| "eval_samples_per_second": 23.858, |
| "eval_steps_per_second": 2.982, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.30347890451517395, |
| "grad_norm": 0.6925911374679931, |
| "learning_rate": 1.7589755802042188e-05, |
| "loss": 0.7729, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.31088082901554404, |
| "grad_norm": 0.6892184297566977, |
| "learning_rate": 1.7418736641477636e-05, |
| "loss": 0.7561, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3182827535159141, |
| "grad_norm": 0.624988978642692, |
| "learning_rate": 1.7242749635310222e-05, |
| "loss": 0.7581, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.32568467801628426, |
| "grad_norm": 0.6494026923603395, |
| "learning_rate": 1.7061912630596252e-05, |
| "loss": 0.7604, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.33308660251665434, |
| "grad_norm": 0.7186278404090587, |
| "learning_rate": 1.6876346722120747e-05, |
| "loss": 0.7752, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.3404885270170244, |
| "grad_norm": 0.6877574111727853, |
| "learning_rate": 1.6686176171308125e-05, |
| "loss": 0.7977, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3478904515173945, |
| "grad_norm": 0.6696578728231912, |
| "learning_rate": 1.6491528323012412e-05, |
| "loss": 0.7595, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.35529237601776464, |
| "grad_norm": 0.6335948217388314, |
| "learning_rate": 1.6292533520242663e-05, |
| "loss": 0.7622, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3626943005181347, |
| "grad_norm": 0.640069468830107, |
| "learning_rate": 1.6089325016880737e-05, |
| "loss": 0.7526, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.3700962250185048, |
| "grad_norm": 0.676843647805683, |
| "learning_rate": 1.588203888844982e-05, |
| "loss": 0.768, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3774981495188749, |
| "grad_norm": 0.6351408648447299, |
| "learning_rate": 1.5670813940993504e-05, |
| "loss": 0.741, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.38490007401924503, |
| "grad_norm": 0.5889957497779358, |
| "learning_rate": 1.5455791618126407e-05, |
| "loss": 0.7332, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3923019985196151, |
| "grad_norm": 0.6263513470697065, |
| "learning_rate": 1.5237115906318565e-05, |
| "loss": 0.7572, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.3997039230199852, |
| "grad_norm": 0.6463851097490673, |
| "learning_rate": 1.5014933238477069e-05, |
| "loss": 0.7378, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4071058475203553, |
| "grad_norm": 0.6864222070004046, |
| "learning_rate": 1.4789392395889468e-05, |
| "loss": 0.7633, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.41450777202072536, |
| "grad_norm": 0.6463393459572199, |
| "learning_rate": 1.4560644408594602e-05, |
| "loss": 0.7439, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4219096965210955, |
| "grad_norm": 0.6563326063729631, |
| "learning_rate": 1.432884245424761e-05, |
| "loss": 0.7555, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.4293116210214656, |
| "grad_norm": 0.6577336494046302, |
| "learning_rate": 1.4094141755546816e-05, |
| "loss": 0.7831, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.43671354552183567, |
| "grad_norm": 0.6230469159629758, |
| "learning_rate": 1.3856699476291176e-05, |
| "loss": 0.7427, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.44411547002220575, |
| "grad_norm": 0.6803603436676453, |
| "learning_rate": 1.3616674616137902e-05, |
| "loss": 0.7643, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.44411547002220575, |
| "eval_loss": 0.7755386829376221, |
| "eval_runtime": 5.3686, |
| "eval_samples_per_second": 23.843, |
| "eval_steps_per_second": 2.98, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4515173945225759, |
| "grad_norm": 0.6538871821580508, |
| "learning_rate": 1.3374227904130724e-05, |
| "loss": 0.7548, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.45891931902294597, |
| "grad_norm": 0.6412401526107059, |
| "learning_rate": 1.3129521691070108e-05, |
| "loss": 0.7327, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.46632124352331605, |
| "grad_norm": 0.7175626595120815, |
| "learning_rate": 1.2882719840797473e-05, |
| "loss": 0.7513, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.47372316802368614, |
| "grad_norm": 0.6865616455390183, |
| "learning_rate": 1.2633987620466229e-05, |
| "loss": 0.7353, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4811250925240563, |
| "grad_norm": 0.6349023571241349, |
| "learning_rate": 1.2383491589873122e-05, |
| "loss": 0.7404, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.48852701702442636, |
| "grad_norm": 0.6069292373819503, |
| "learning_rate": 1.213139948992394e-05, |
| "loss": 0.7499, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.49592894152479644, |
| "grad_norm": 0.6884470180718896, |
| "learning_rate": 1.187788013030837e-05, |
| "loss": 0.7468, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.5033308660251665, |
| "grad_norm": 0.6109069389568509, |
| "learning_rate": 1.1623103276459086e-05, |
| "loss": 0.7506, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5107327905255367, |
| "grad_norm": 0.642078110839711, |
| "learning_rate": 1.1367239535870913e-05, |
| "loss": 0.7425, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.5181347150259067, |
| "grad_norm": 0.6834108569853193, |
| "learning_rate": 1.1110460243856051e-05, |
| "loss": 0.7301, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5255366395262768, |
| "grad_norm": 0.663649951052548, |
| "learning_rate": 1.085293734881197e-05, |
| "loss": 0.7468, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.532938564026647, |
| "grad_norm": 0.5954162815689811, |
| "learning_rate": 1.0594843297078736e-05, |
| "loss": 0.7658, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.540340488527017, |
| "grad_norm": 0.686716585388219, |
| "learning_rate": 1.0336350917462925e-05, |
| "loss": 0.7558, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.5477424130273871, |
| "grad_norm": 0.5992494474064747, |
| "learning_rate": 1.0077633305505402e-05, |
| "loss": 0.7432, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5551443375277573, |
| "grad_norm": 0.6301308510752842, |
| "learning_rate": 9.818863707570476e-06, |
| "loss": 0.7607, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.5625462620281273, |
| "grad_norm": 0.6533374072399002, |
| "learning_rate": 9.560215404834094e-06, |
| "loss": 0.7514, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5699481865284974, |
| "grad_norm": 0.6339343966871546, |
| "learning_rate": 9.30186159724869e-06, |
| "loss": 0.7144, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.5773501110288675, |
| "grad_norm": 0.6024127775100068, |
| "learning_rate": 9.043975287562443e-06, |
| "loss": 0.747, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5847520355292376, |
| "grad_norm": 0.6705486605989903, |
| "learning_rate": 8.786729165470584e-06, |
| "loss": 0.7251, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.5921539600296077, |
| "grad_norm": 0.6833286308389832, |
| "learning_rate": 8.530295491976338e-06, |
| "loss": 0.7306, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5921539600296077, |
| "eval_loss": 0.7636929750442505, |
| "eval_runtime": 5.3692, |
| "eval_samples_per_second": 23.84, |
| "eval_steps_per_second": 2.98, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5995558845299778, |
| "grad_norm": 0.5926825289122618, |
| "learning_rate": 8.274845984038916e-06, |
| "loss": 0.7173, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.6069578090303479, |
| "grad_norm": 0.6485786020572889, |
| "learning_rate": 8.020551699585843e-06, |
| "loss": 0.7469, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6143597335307179, |
| "grad_norm": 0.6038143644627321, |
| "learning_rate": 7.76758292296659e-06, |
| "loss": 0.7264, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.6217616580310881, |
| "grad_norm": 0.6525648213164897, |
| "learning_rate": 7.5161090509242005e-06, |
| "loss": 0.7418, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6291635825314582, |
| "grad_norm": 0.6472092560370133, |
| "learning_rate": 7.2662984791613186e-06, |
| "loss": 0.7345, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.6365655070318282, |
| "grad_norm": 0.6486952224924798, |
| "learning_rate": 7.01831848957653e-06, |
| "loss": 0.7488, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6439674315321984, |
| "grad_norm": 0.5995921591151809, |
| "learning_rate": 6.772335138246548e-06, |
| "loss": 0.7467, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.6513693560325685, |
| "grad_norm": 0.6241512176448558, |
| "learning_rate": 6.528513144229256e-06, |
| "loss": 0.7427, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6587712805329385, |
| "grad_norm": 0.6111647376974468, |
| "learning_rate": 6.287015779262064e-06, |
| "loss": 0.7488, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.6661732050333087, |
| "grad_norm": 0.6498096979963152, |
| "learning_rate": 6.048004758429451e-06, |
| "loss": 0.7273, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6735751295336787, |
| "grad_norm": 0.6032913806566699, |
| "learning_rate": 5.811640131872867e-06, |
| "loss": 0.7497, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.6809770540340488, |
| "grad_norm": 0.6076584860831334, |
| "learning_rate": 5.578080177615575e-06, |
| "loss": 0.7201, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.688378978534419, |
| "grad_norm": 0.6233401316768133, |
| "learning_rate": 5.347481295574141e-06, |
| "loss": 0.717, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.695780903034789, |
| "grad_norm": 0.6074432637453558, |
| "learning_rate": 5.119997902827584e-06, |
| "loss": 0.7285, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7031828275351591, |
| "grad_norm": 0.6053282667804308, |
| "learning_rate": 4.8957823302142916e-06, |
| "loss": 0.7353, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.7105847520355293, |
| "grad_norm": 0.5651527733504048, |
| "learning_rate": 4.674984720325961e-06, |
| "loss": 0.7207, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7179866765358993, |
| "grad_norm": 0.5860040588339941, |
| "learning_rate": 4.457752926966888e-06, |
| "loss": 0.7149, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.7253886010362695, |
| "grad_norm": 0.5804110480769434, |
| "learning_rate": 4.244232416145839e-06, |
| "loss": 0.7337, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7327905255366395, |
| "grad_norm": 0.5927598587788678, |
| "learning_rate": 4.0345661686669745e-06, |
| "loss": 0.7269, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.7401924500370096, |
| "grad_norm": 0.6147179659228765, |
| "learning_rate": 3.828894584384867e-06, |
| "loss": 0.7355, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7401924500370096, |
| "eval_loss": 0.7562806010246277, |
| "eval_runtime": 5.3637, |
| "eval_samples_per_second": 23.864, |
| "eval_steps_per_second": 2.983, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7475943745373798, |
| "grad_norm": 0.5559997594301341, |
| "learning_rate": 3.62735538818787e-06, |
| "loss": 0.7196, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.7549962990377498, |
| "grad_norm": 0.6309244918615329, |
| "learning_rate": 3.4300835377726904e-06, |
| "loss": 0.7233, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7623982235381199, |
| "grad_norm": 0.6184371610771912, |
| "learning_rate": 3.2372111332720045e-06, |
| "loss": 0.7587, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.7698001480384901, |
| "grad_norm": 0.606915064634782, |
| "learning_rate": 3.048867328795588e-06, |
| "loss": 0.7156, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7772020725388601, |
| "grad_norm": 0.5970702305562683, |
| "learning_rate": 2.865178245944218e-06, |
| "loss": 0.7144, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.7846039970392302, |
| "grad_norm": 0.5700255038227583, |
| "learning_rate": 2.686266889354211e-06, |
| "loss": 0.7374, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7920059215396003, |
| "grad_norm": 0.5871155109774989, |
| "learning_rate": 2.5122530643292274e-06, |
| "loss": 0.7428, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.7994078460399704, |
| "grad_norm": 0.6301777356538756, |
| "learning_rate": 2.3432532966144526e-06, |
| "loss": 0.7323, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8068097705403405, |
| "grad_norm": 0.5814869552487476, |
| "learning_rate": 2.1793807543668857e-06, |
| "loss": 0.7338, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.8142116950407106, |
| "grad_norm": 0.5484281351456736, |
| "learning_rate": 2.0207451723739633e-06, |
| "loss": 0.7257, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8216136195410807, |
| "grad_norm": 0.5634326261626533, |
| "learning_rate": 1.8674527785713247e-06, |
| "loss": 0.7325, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.8290155440414507, |
| "grad_norm": 0.6140394590528712, |
| "learning_rate": 1.7196062229088606e-06, |
| "loss": 0.6996, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8364174685418209, |
| "grad_norm": 0.6235750823611041, |
| "learning_rate": 1.577304508612717e-06, |
| "loss": 0.7297, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.843819393042191, |
| "grad_norm": 0.5186238249593603, |
| "learning_rate": 1.4406429258892762e-06, |
| "loss": 0.7503, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.851221317542561, |
| "grad_norm": 0.6162193205928901, |
| "learning_rate": 1.3097129881154936e-06, |
| "loss": 0.7199, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.8586232420429312, |
| "grad_norm": 0.5804633378076518, |
| "learning_rate": 1.1846023705583442e-06, |
| "loss": 0.7162, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8660251665433013, |
| "grad_norm": 0.5772033426035238, |
| "learning_rate": 1.065394851664394e-06, |
| "loss": 0.7344, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.8734270910436713, |
| "grad_norm": 0.5426053523198552, |
| "learning_rate": 9.521702569588199e-07, |
| "loss": 0.7536, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8808290155440415, |
| "grad_norm": 0.5722535438927212, |
| "learning_rate": 8.450044055914497e-07, |
| "loss": 0.722, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.8882309400444115, |
| "grad_norm": 0.5616139648656882, |
| "learning_rate": 7.439690595656013e-07, |
| "loss": 0.7445, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8882309400444115, |
| "eval_loss": 0.7532752156257629, |
| "eval_runtime": 5.3758, |
| "eval_samples_per_second": 23.81, |
| "eval_steps_per_second": 2.976, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8956328645447816, |
| "grad_norm": 0.6286499659968906, |
| "learning_rate": 6.491318756837417e-07, |
| "loss": 0.7298, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.9030347890451518, |
| "grad_norm": 0.5116306332006605, |
| "learning_rate": 5.605563602421149e-07, |
| "loss": 0.7058, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9104367135455218, |
| "grad_norm": 0.5536125071383629, |
| "learning_rate": 4.783018265047179e-07, |
| "loss": 0.7556, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.9178386380458919, |
| "grad_norm": 0.5813134121106769, |
| "learning_rate": 4.024233549850509e-07, |
| "loss": 0.7435, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9252405625462621, |
| "grad_norm": 0.5492708568069725, |
| "learning_rate": 3.329717565622825e-07, |
| "loss": 0.7403, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.9326424870466321, |
| "grad_norm": 0.5511538126684848, |
| "learning_rate": 2.6999353845651113e-07, |
| "loss": 0.7241, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9400444115470022, |
| "grad_norm": 0.5858089750359664, |
| "learning_rate": 2.1353087308590314e-07, |
| "loss": 0.7389, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.9474463360473723, |
| "grad_norm": 0.5558966176000505, |
| "learning_rate": 1.6362156982656085e-07, |
| "loss": 0.729, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.9548482605477424, |
| "grad_norm": 0.5220503199938477, |
| "learning_rate": 1.2029904969404482e-07, |
| "loss": 0.7126, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.9622501850481125, |
| "grad_norm": 0.6027507206871425, |
| "learning_rate": 8.359232296349163e-08, |
| "loss": 0.7163, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.9696521095484826, |
| "grad_norm": 0.5697946889297966, |
| "learning_rate": 5.3525969743324356e-08, |
| "loss": 0.7321, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.9770540340488527, |
| "grad_norm": 0.5452210155995557, |
| "learning_rate": 3.012012351554017e-08, |
| "loss": 0.7063, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9844559585492227, |
| "grad_norm": 0.5740830386758149, |
| "learning_rate": 1.3390457653639221e-08, |
| "loss": 0.7354, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.9918578830495929, |
| "grad_norm": 0.6403583337563233, |
| "learning_rate": 3.3481749271768726e-09, |
| "loss": 0.7462, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.999259807549963, |
| "grad_norm": 0.5720756141987368, |
| "learning_rate": 0.0, |
| "loss": 0.7158, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.999259807549963, |
| "step": 675, |
| "total_flos": 76888336760832.0, |
| "train_loss": 0.7675483689484773, |
| "train_runtime": 4012.107, |
| "train_samples_per_second": 5.386, |
| "train_steps_per_second": 0.168 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 675, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 76888336760832.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |