{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 4233,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007559650366170565,
      "grad_norm": 32.25,
      "learning_rate": 7.311320754716981e-07,
      "loss": 0.6915,
      "step": 32
    },
    {
      "epoch": 0.01511930073234113,
      "grad_norm": 14.5625,
      "learning_rate": 1.4858490566037737e-06,
      "loss": 0.6792,
      "step": 64
    },
    {
      "epoch": 0.022678951098511695,
      "grad_norm": 16.875,
      "learning_rate": 2.2405660377358494e-06,
      "loss": 0.5993,
      "step": 96
    },
    {
      "epoch": 0.03023860146468226,
      "grad_norm": 11.5625,
      "learning_rate": 2.995283018867925e-06,
      "loss": 0.5179,
      "step": 128
    },
    {
      "epoch": 0.03779825183085282,
      "grad_norm": 14.5625,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.5111,
      "step": 160
    },
    {
      "epoch": 0.04535790219702339,
      "grad_norm": 9.8125,
      "learning_rate": 4.504716981132076e-06,
      "loss": 0.5109,
      "step": 192
    },
    {
      "epoch": 0.05291755256319395,
      "grad_norm": 7.34375,
      "learning_rate": 5.259433962264151e-06,
      "loss": 0.5106,
      "step": 224
    },
    {
      "epoch": 0.06047720292936452,
      "grad_norm": 6.1875,
      "learning_rate": 6.014150943396226e-06,
      "loss": 0.4514,
      "step": 256
    },
    {
      "epoch": 0.06803685329553508,
      "grad_norm": 7.875,
      "learning_rate": 6.768867924528303e-06,
      "loss": 0.4862,
      "step": 288
    },
    {
      "epoch": 0.07559650366170564,
      "grad_norm": 6.03125,
      "learning_rate": 7.523584905660378e-06,
      "loss": 0.4711,
      "step": 320
    },
    {
      "epoch": 0.08315615402787621,
      "grad_norm": 6.1875,
      "learning_rate": 8.278301886792453e-06,
      "loss": 0.4783,
      "step": 352
    },
    {
      "epoch": 0.09071580439404678,
      "grad_norm": 7.78125,
      "learning_rate": 9.03301886792453e-06,
      "loss": 0.4457,
      "step": 384
    },
    {
      "epoch": 0.09827545476021735,
      "grad_norm": 6.59375,
      "learning_rate": 9.787735849056604e-06,
      "loss": 0.5125,
      "step": 416
    },
    {
      "epoch": 0.1058351051263879,
      "grad_norm": 13.5625,
      "learning_rate": 9.99910037719311e-06,
      "loss": 0.4171,
      "step": 448
    },
    {
      "epoch": 0.11339475549255847,
      "grad_norm": 3.921875,
      "learning_rate": 9.994856381944038e-06,
      "loss": 0.4538,
      "step": 480
    },
    {
      "epoch": 0.12095440585872903,
      "grad_norm": 6.15625,
      "learning_rate": 9.987133217483066e-06,
      "loss": 0.4629,
      "step": 512
    },
    {
      "epoch": 0.1285140562248996,
      "grad_norm": 6.28125,
      "learning_rate": 9.975936263383488e-06,
      "loss": 0.4744,
      "step": 544
    },
    {
      "epoch": 0.13607370659107015,
      "grad_norm": 7.125,
      "learning_rate": 9.96127331888816e-06,
      "loss": 0.4292,
      "step": 576
    },
    {
      "epoch": 0.14363335695724072,
      "grad_norm": 5.15625,
      "learning_rate": 9.943154597476943e-06,
      "loss": 0.4558,
      "step": 608
    },
    {
      "epoch": 0.1511930073234113,
      "grad_norm": 10.6875,
      "learning_rate": 9.921592719752486e-06,
      "loss": 0.448,
      "step": 640
    },
    {
      "epoch": 0.15875265768958186,
      "grad_norm": 8.125,
      "learning_rate": 9.896602704649348e-06,
      "loss": 0.4117,
      "step": 672
    },
    {
      "epoch": 0.16631230805575242,
      "grad_norm": 4.84375,
      "learning_rate": 9.868201958972548e-06,
      "loss": 0.4303,
      "step": 704
    },
    {
      "epoch": 0.173871958421923,
      "grad_norm": 5.0625,
      "learning_rate": 9.836410265272857e-06,
      "loss": 0.4402,
      "step": 736
    },
    {
      "epoch": 0.18143160878809356,
      "grad_norm": 8.875,
      "learning_rate": 9.801249768067246e-06,
      "loss": 0.4242,
      "step": 768
    },
    {
      "epoch": 0.18899125915426412,
      "grad_norm": 5.625,
      "learning_rate": 9.762744958414113e-06,
      "loss": 0.4771,
      "step": 800
    },
    {
      "epoch": 0.1965509095204347,
      "grad_norm": 14.875,
      "learning_rate": 9.720922656854032e-06,
      "loss": 0.4497,
      "step": 832
    },
    {
      "epoch": 0.20411055988660523,
      "grad_norm": 6.3125,
      "learning_rate": 9.675811994727897e-06,
      "loss": 0.4141,
      "step": 864
    },
    {
      "epoch": 0.2116702102527758,
      "grad_norm": 6.15625,
      "learning_rate": 9.627444393885463e-06,
      "loss": 0.432,
      "step": 896
    },
    {
      "epoch": 0.21922986061894637,
      "grad_norm": 6.28125,
      "learning_rate": 9.575853544798453e-06,
      "loss": 0.4253,
      "step": 928
    },
    {
      "epoch": 0.22678951098511693,
      "grad_norm": 7.4375,
      "learning_rate": 9.521075383093452e-06,
      "loss": 0.4334,
      "step": 960
    },
    {
      "epoch": 0.2343491613512875,
      "grad_norm": 9.0625,
      "learning_rate": 9.463148064520913e-06,
      "loss": 0.4595,
      "step": 992
    },
    {
      "epoch": 0.24190881171745807,
      "grad_norm": 6.71875,
      "learning_rate": 9.402111938377776e-06,
      "loss": 0.4401,
      "step": 1024
    },
    {
      "epoch": 0.24946846208362863,
      "grad_norm": 9.0625,
      "learning_rate": 9.338009519402132e-06,
      "loss": 0.4216,
      "step": 1056
    },
    {
      "epoch": 0.2570281124497992,
      "grad_norm": 7.59375,
      "learning_rate": 9.270885458159576e-06,
      "loss": 0.4391,
      "step": 1088
    },
    {
      "epoch": 0.26458776281596974,
      "grad_norm": 7.875,
      "learning_rate": 9.200786509941827e-06,
      "loss": 0.4116,
      "step": 1120
    },
    {
      "epoch": 0.2721474131821403,
      "grad_norm": 11.5625,
      "learning_rate": 9.127761502199325e-06,
      "loss": 0.4004,
      "step": 1152
    },
    {
      "epoch": 0.2797070635483109,
      "grad_norm": 7.8125,
      "learning_rate": 9.051861300530438e-06,
      "loss": 0.4261,
      "step": 1184
    },
    {
      "epoch": 0.28726671391448144,
      "grad_norm": 6.53125,
      "learning_rate": 8.973138773251015e-06,
      "loss": 0.4075,
      "step": 1216
    },
    {
      "epoch": 0.294826364280652,
      "grad_norm": 6.78125,
      "learning_rate": 8.891648754568943e-06,
      "loss": 0.4398,
      "step": 1248
    },
    {
      "epoch": 0.3023860146468226,
      "grad_norm": 7.40625,
      "learning_rate": 8.807448006389343e-06,
      "loss": 0.4517,
      "step": 1280
    },
    {
      "epoch": 0.30994566501299314,
      "grad_norm": 4.3125,
      "learning_rate": 8.720595178777063e-06,
      "loss": 0.4254,
      "step": 1312
    },
    {
      "epoch": 0.3175053153791637,
      "grad_norm": 7.25,
      "learning_rate": 8.631150769103934e-06,
      "loss": 0.441,
      "step": 1344
    },
    {
      "epoch": 0.3250649657453343,
      "grad_norm": 6.65625,
      "learning_rate": 8.539177079909315e-06,
      "loss": 0.4337,
      "step": 1376
    },
    {
      "epoch": 0.33262461611150484,
      "grad_norm": 5.8125,
      "learning_rate": 8.444738175503222e-06,
      "loss": 0.4537,
      "step": 1408
    },
    {
      "epoch": 0.3401842664776754,
      "grad_norm": 5.5625,
      "learning_rate": 8.347899837342315e-06,
      "loss": 0.4071,
      "step": 1440
    },
    {
      "epoch": 0.347743916843846,
      "grad_norm": 7.875,
      "learning_rate": 8.2487295182098e-06,
      "loss": 0.4612,
      "step": 1472
    },
    {
      "epoch": 0.35530356721001655,
      "grad_norm": 5.1875,
      "learning_rate": 8.147296295231158e-06,
      "loss": 0.4296,
      "step": 1504
    },
    {
      "epoch": 0.3628632175761871,
      "grad_norm": 8.625,
      "learning_rate": 8.04367082175845e-06,
      "loss": 0.4491,
      "step": 1536
    },
    {
      "epoch": 0.3704228679423577,
      "grad_norm": 5.25,
      "learning_rate": 7.937925278156698e-06,
      "loss": 0.4132,
      "step": 1568
    },
    {
      "epoch": 0.37798251830852825,
      "grad_norm": 6.8125,
      "learning_rate": 7.830133321526615e-06,
      "loss": 0.4068,
      "step": 1600
    },
    {
      "epoch": 0.3855421686746988,
      "grad_norm": 4.375,
      "learning_rate": 7.720370034398741e-06,
      "loss": 0.4499,
      "step": 1632
    },
    {
      "epoch": 0.3931018190408694,
      "grad_norm": 6.96875,
      "learning_rate": 7.608711872434648e-06,
      "loss": 0.4256,
      "step": 1664
    },
    {
      "epoch": 0.40066146940703995,
      "grad_norm": 9.875,
      "learning_rate": 7.495236611171741e-06,
      "loss": 0.428,
      "step": 1696
    },
    {
      "epoch": 0.40822111977321046,
      "grad_norm": 5.90625,
      "learning_rate": 7.3800232918486715e-06,
      "loss": 0.4146,
      "step": 1728
    },
    {
      "epoch": 0.41578077013938103,
      "grad_norm": 6.4375,
      "learning_rate": 7.263152166349122e-06,
      "loss": 0.4476,
      "step": 1760
    },
    {
      "epoch": 0.4233404205055516,
      "grad_norm": 7.3125,
      "learning_rate": 7.144704641302337e-06,
      "loss": 0.4387,
      "step": 1792
    },
    {
      "epoch": 0.43090007087172216,
      "grad_norm": 6.84375,
      "learning_rate": 7.024763221379289e-06,
      "loss": 0.4276,
      "step": 1824
    },
    {
      "epoch": 0.43845972123789273,
      "grad_norm": 7.25,
      "learning_rate": 6.903411451824033e-06,
      "loss": 0.4482,
      "step": 1856
    },
    {
      "epoch": 0.4460193716040633,
      "grad_norm": 6.9375,
      "learning_rate": 6.780733860260216e-06,
      "loss": 0.4187,
      "step": 1888
    },
    {
      "epoch": 0.45357902197023386,
      "grad_norm": 6.09375,
      "learning_rate": 6.6568158978133455e-06,
      "loss": 0.402,
      "step": 1920
    },
    {
      "epoch": 0.46113867233640443,
      "grad_norm": 5.34375,
      "learning_rate": 6.531743879589754e-06,
      "loss": 0.4157,
      "step": 1952
    },
    {
      "epoch": 0.468698322702575,
      "grad_norm": 5.8125,
      "learning_rate": 6.405604924553797e-06,
      "loss": 0.4771,
      "step": 1984
    },
    {
      "epoch": 0.47625797306874557,
      "grad_norm": 6.53125,
      "learning_rate": 6.278486894845084e-06,
      "loss": 0.4408,
      "step": 2016
    },
    {
      "epoch": 0.48381762343491613,
      "grad_norm": 5.96875,
      "learning_rate": 6.150478334578085e-06,
      "loss": 0.4434,
      "step": 2048
    },
    {
      "epoch": 0.4913772738010867,
      "grad_norm": 8.3125,
      "learning_rate": 6.021668408166688e-06,
      "loss": 0.4214,
      "step": 2080
    },
    {
      "epoch": 0.49893692416725727,
      "grad_norm": 4.28125,
      "learning_rate": 5.892146838216687e-06,
      "loss": 0.4164,
      "step": 2112
    },
    {
      "epoch": 0.5064965745334278,
      "grad_norm": 4.25,
      "learning_rate": 5.762003843029466e-06,
      "loss": 0.426,
      "step": 2144
    },
    {
      "epoch": 0.5140562248995983,
      "grad_norm": 6.96875,
      "learning_rate": 5.631330073760413e-06,
      "loss": 0.4205,
      "step": 2176
    },
    {
      "epoch": 0.5216158752657689,
      "grad_norm": 6.6875,
      "learning_rate": 5.500216551275807e-06,
      "loss": 0.4429,
      "step": 2208
    },
    {
      "epoch": 0.5291755256319395,
      "grad_norm": 7.15625,
      "learning_rate": 5.368754602752213e-06,
      "loss": 0.431,
      "step": 2240
    },
    {
      "epoch": 0.53673517599811,
      "grad_norm": 5.28125,
      "learning_rate": 5.237035798062489e-06,
      "loss": 0.4224,
      "step": 2272
    },
    {
      "epoch": 0.5442948263642806,
      "grad_norm": 8.125,
      "learning_rate": 5.105151885992754e-06,
      "loss": 0.4194,
      "step": 2304
    },
    {
      "epoch": 0.5518544767304512,
      "grad_norm": 5.5,
      "learning_rate": 4.9731947303347485e-06,
      "loss": 0.434,
      "step": 2336
    },
    {
      "epoch": 0.5594141270966217,
      "grad_norm": 6.09375,
      "learning_rate": 4.841256245898055e-06,
      "loss": 0.4308,
      "step": 2368
    },
    {
      "epoch": 0.5669737774627923,
      "grad_norm": 4.3125,
      "learning_rate": 4.709428334486816e-06,
      "loss": 0.3907,
      "step": 2400
    },
    {
      "epoch": 0.5745334278289629,
      "grad_norm": 6.96875,
      "learning_rate": 4.577802820885482e-06,
      "loss": 0.4226,
      "step": 2432
    },
    {
      "epoch": 0.5820930781951335,
      "grad_norm": 5.75,
      "learning_rate": 4.446471388898236e-06,
      "loss": 0.4216,
      "step": 2464
    },
    {
      "epoch": 0.589652728561304,
      "grad_norm": 6.46875,
      "learning_rate": 4.315525517486586e-06,
      "loss": 0.4632,
      "step": 2496
    },
    {
      "epoch": 0.5972123789274746,
      "grad_norm": 3.46875,
      "learning_rate": 4.185056417049674e-06,
      "loss": 0.4304,
      "step": 2528
    },
    {
      "epoch": 0.6047720292936452,
      "grad_norm": 8.25,
      "learning_rate": 4.055154965891625e-06,
      "loss": 0.451,
      "step": 2560
    },
    {
      "epoch": 0.6123316796598157,
      "grad_norm": 6.78125,
      "learning_rate": 3.925911646920235e-06,
      "loss": 0.3851,
      "step": 2592
    },
    {
      "epoch": 0.6198913300259863,
      "grad_norm": 6.84375,
      "learning_rate": 3.797416484621057e-06,
      "loss": 0.4486,
      "step": 2624
    },
    {
      "epoch": 0.6274509803921569,
      "grad_norm": 5.28125,
      "learning_rate": 3.669758982350821e-06,
      "loss": 0.4258,
      "step": 2656
    },
    {
      "epoch": 0.6350106307583274,
      "grad_norm": 6.53125,
      "learning_rate": 3.5430280599938204e-06,
      "loss": 0.4303,
      "step": 2688
    },
    {
      "epoch": 0.642570281124498,
      "grad_norm": 12.1875,
      "learning_rate": 3.4173119920247454e-06,
      "loss": 0.4466,
      "step": 2720
    },
    {
      "epoch": 0.6501299314906686,
      "grad_norm": 5.1875,
      "learning_rate": 3.2926983460210564e-06,
      "loss": 0.4131,
      "step": 2752
    },
    {
      "epoch": 0.6576895818568391,
      "grad_norm": 4.6875,
      "learning_rate": 3.1692739216677483e-06,
      "loss": 0.4672,
      "step": 2784
    },
    {
      "epoch": 0.6652492322230097,
      "grad_norm": 5.5625,
      "learning_rate": 3.0471246902970032e-06,
      "loss": 0.4291,
      "step": 2816
    },
    {
      "epoch": 0.6728088825891803,
      "grad_norm": 4.71875,
      "learning_rate": 2.926335735004817e-06,
      "loss": 0.4264,
      "step": 2848
    },
    {
      "epoch": 0.6803685329553508,
      "grad_norm": 6.0625,
      "learning_rate": 2.8069911913863414e-06,
      "loss": 0.422,
      "step": 2880
    },
    {
      "epoch": 0.6879281833215214,
      "grad_norm": 6.9375,
      "learning_rate": 2.689174188931202e-06,
      "loss": 0.4005,
      "step": 2912
    },
    {
      "epoch": 0.695487833687692,
      "grad_norm": 6.125,
      "learning_rate": 2.5729667931196103e-06,
      "loss": 0.4137,
      "step": 2944
    },
    {
      "epoch": 0.7030474840538625,
      "grad_norm": 4.0625,
      "learning_rate": 2.4584499482596274e-06,
      "loss": 0.4145,
      "step": 2976
    },
    {
      "epoch": 0.7106071344200331,
      "grad_norm": 6.1875,
      "learning_rate": 2.3457034211053703e-06,
      "loss": 0.4601,
      "step": 3008
    },
    {
      "epoch": 0.7181667847862037,
      "grad_norm": 9.0,
      "learning_rate": 2.234805745295457e-06,
      "loss": 0.4238,
      "step": 3040
    },
    {
      "epoch": 0.7257264351523742,
      "grad_norm": 15.625,
      "learning_rate": 2.125834166650354e-06,
      "loss": 0.4579,
      "step": 3072
    },
    {
      "epoch": 0.7332860855185448,
      "grad_norm": 4.8125,
      "learning_rate": 2.018864589366778e-06,
      "loss": 0.4183,
      "step": 3104
    },
    {
      "epoch": 0.7408457358847154,
      "grad_norm": 6.1875,
      "learning_rate": 1.9139715231466014e-06,
      "loss": 0.4387,
      "step": 3136
    },
    {
      "epoch": 0.7484053862508859,
      "grad_norm": 7.03125,
      "learning_rate": 1.811228031297077e-06,
      "loss": 0.4367,
      "step": 3168
    },
    {
      "epoch": 0.7559650366170565,
      "grad_norm": 5.65625,
      "learning_rate": 1.7107056798385763e-06,
      "loss": 0.451,
      "step": 3200
    },
    {
      "epoch": 0.7635246869832271,
      "grad_norm": 6.65625,
      "learning_rate": 1.6124744876552373e-06,
      "loss": 0.4101,
      "step": 3232
    },
    {
      "epoch": 0.7710843373493976,
      "grad_norm": 6.8125,
      "learning_rate": 1.5166028777232884e-06,
      "loss": 0.3734,
      "step": 3264
    },
    {
      "epoch": 0.7786439877155682,
      "grad_norm": 5.34375,
      "learning_rate": 1.4231576294510013e-06,
      "loss": 0.4194,
      "step": 3296
    },
    {
      "epoch": 0.7862036380817388,
      "grad_norm": 7.8125,
      "learning_rate": 1.3322038321634567e-06,
      "loss": 0.465,
      "step": 3328
    },
    {
      "epoch": 0.7937632884479093,
      "grad_norm": 5.75,
      "learning_rate": 1.2438048397645558e-06,
      "loss": 0.4751,
      "step": 3360
    },
    {
      "epoch": 0.8013229388140799,
      "grad_norm": 8.75,
      "learning_rate": 1.1580222266078367e-06,
      "loss": 0.401,
      "step": 3392
    },
    {
      "epoch": 0.8088825891802505,
      "grad_norm": 8.375,
      "learning_rate": 1.0749157446068242e-06,
      "loss": 0.4418,
      "step": 3424
    },
    {
      "epoch": 0.8164422395464209,
      "grad_norm": 7.375,
      "learning_rate": 9.945432816148175e-07,
      "loss": 0.4405,
      "step": 3456
    },
    {
      "epoch": 0.8240018899125915,
      "grad_norm": 4.84375,
      "learning_rate": 9.169608211030783e-07,
      "loss": 0.4298,
      "step": 3488
    },
    {
      "epoch": 0.8315615402787621,
      "grad_norm": 5.78125,
      "learning_rate": 8.422224031655313e-07,
      "loss": 0.4156,
      "step": 3520
    },
    {
      "epoch": 0.8391211906449326,
      "grad_norm": 21.25,
      "learning_rate": 7.703800868771e-07,
      "loss": 0.4467,
      "step": 3552
    },
    {
      "epoch": 0.8466808410111032,
      "grad_norm": 14.3125,
      "learning_rate": 7.014839140319485e-07,
      "loss": 0.4443,
      "step": 3584
    },
    {
      "epoch": 0.8542404913772738,
      "grad_norm": 6.59375,
      "learning_rate": 6.355818742868447e-07,
      "loss": 0.4381,
      "step": 3616
    },
    {
      "epoch": 0.8618001417434443,
      "grad_norm": 5.46875,
      "learning_rate": 5.727198717339511e-07,
      "loss": 0.405,
      "step": 3648
    },
    {
      "epoch": 0.8693597921096149,
      "grad_norm": 4.3125,
      "learning_rate": 5.129416929263031e-07,
      "loss": 0.4161,
      "step": 3680
    },
    {
      "epoch": 0.8769194424757855,
      "grad_norm": 6.84375,
      "learning_rate": 4.5628897637827354e-07,
      "loss": 0.4294,
      "step": 3712
    },
    {
      "epoch": 0.884479092841956,
      "grad_norm": 6.0,
      "learning_rate": 4.028011835622492e-07,
      "loss": 0.4084,
      "step": 3744
    },
    {
      "epoch": 0.8920387432081266,
      "grad_norm": 5.1875,
      "learning_rate": 3.525155714217227e-07,
      "loss": 0.3902,
      "step": 3776
    },
    {
      "epoch": 0.8995983935742972,
      "grad_norm": 5.8125,
      "learning_rate": 3.054671664199543e-07,
      "loss": 0.423,
      "step": 3808
    },
    {
      "epoch": 0.9071580439404677,
      "grad_norm": 6.75,
      "learning_rate": 2.616887401422796e-07,
      "loss": 0.4108,
      "step": 3840
    },
    {
      "epoch": 0.9147176943066383,
      "grad_norm": 5.9375,
      "learning_rate": 2.212107864690438e-07,
      "loss": 0.4546,
      "step": 3872
    },
    {
      "epoch": 0.9222773446728089,
      "grad_norm": 8.0625,
      "learning_rate": 1.8406150033507764e-07,
      "loss": 0.4434,
      "step": 3904
    },
    {
      "epoch": 0.9298369950389794,
      "grad_norm": 10.4375,
      "learning_rate": 1.502667580905054e-07,
      "loss": 0.4149,
      "step": 3936
    },
    {
      "epoch": 0.93739664540515,
      "grad_norm": 6.0625,
      "learning_rate": 1.1985009947656278e-07,
      "loss": 0.4504,
      "step": 3968
    },
    {
      "epoch": 0.9449562957713206,
      "grad_norm": 5.625,
      "learning_rate": 9.283271122898174e-08,
      "loss": 0.437,
      "step": 4000
    },
    {
      "epoch": 0.9525159461374911,
      "grad_norm": 6.90625,
      "learning_rate": 6.923341232035863e-08,
      "loss": 0.4205,
      "step": 4032
    },
    {
      "epoch": 0.9600755965036617,
      "grad_norm": 9.625,
      "learning_rate": 4.9068640851792636e-08,
      "loss": 0.4221,
      "step": 4064
    },
    {
      "epoch": 0.9676352468698323,
      "grad_norm": 5.09375,
      "learning_rate": 3.235244260292147e-08,
      "loss": 0.4172,
      "step": 4096
    },
    {
      "epoch": 0.9751948972360028,
      "grad_norm": 6.0,
      "learning_rate": 1.909646124832576e-08,
      "loss": 0.415,
      "step": 4128
    },
    {
      "epoch": 0.9827545476021734,
      "grad_norm": 7.34375,
      "learning_rate": 9.30993024712279e-09,
      "loss": 0.4298,
      "step": 4160
    },
    {
      "epoch": 0.990314197968344,
      "grad_norm": 5.25,
      "learning_rate": 2.999666411398483e-09,
      "loss": 0.3673,
      "step": 4192
    },
    {
      "epoch": 0.9978738483345145,
      "grad_norm": 5.28125,
      "learning_rate": 1.7006515795336963e-10,
      "loss": 0.422,
      "step": 4224
    },
    {
      "epoch": 1.0,
      "step": 4233,
      "total_flos": 7.498056185637274e+16,
      "train_loss": 0.4399125433520113,
      "train_runtime": 1541.2359,
      "train_samples_per_second": 10.985,
      "train_steps_per_second": 2.746
    }
  ],
  "logging_steps": 32,
  "max_steps": 4233,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.498056185637274e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}