| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5675488430095608, | |
| "eval_steps": 1024, | |
| "global_step": 12288, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011823934229365849, | |
| "grad_norm": 1.1381809711456299, | |
| "learning_rate": 1.9615384615384617e-05, | |
| "loss": 10.3904, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.023647868458731697, | |
| "grad_norm": 1.2398728132247925, | |
| "learning_rate": 3.930769230769231e-05, | |
| "loss": 7.9162, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.03547180268809755, | |
| "grad_norm": 1.1707866191864014, | |
| "learning_rate": 4.999617095521894e-05, | |
| "loss": 5.6793, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.047295736917463395, | |
| "grad_norm": 0.7125558853149414, | |
| "learning_rate": 4.9961092368776736e-05, | |
| "loss": 3.8181, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.047295736917463395, | |
| "eval_acr_loss": 0.9955622624588883, | |
| "eval_across_var": 0.0022213522599452947, | |
| "eval_bleu": 0.5783933701616512, | |
| "eval_ce_loss": 2.3272575776870936, | |
| "eval_cos_loss": 0.9208345009039526, | |
| "eval_cov": 0.0631596116714826, | |
| "eval_cov_loss": 0.006449785340815511, | |
| "eval_global_kurtosis": 3.5549826104891356, | |
| "eval_global_mean": -0.0015338476934389437, | |
| "eval_global_var": 0.2108087409032534, | |
| "eval_loss": 2.798912787001971, | |
| "eval_mse_loss": 1.9040727988225685, | |
| "eval_per_var": 0.2048030487478596, | |
| "eval_within_var": 0.2086275832633994, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.047295736917463395, | |
| "eval_acr_loss": 0.9955622624588883, | |
| "eval_across_var": 0.0022213522599452947, | |
| "eval_bleu": 0.5783933701616512, | |
| "eval_ce_loss": 2.3272575776870936, | |
| "eval_cos_loss": 0.9208345009039526, | |
| "eval_cov": 0.0631596116714826, | |
| "eval_cov_loss": 0.006449785340815511, | |
| "eval_global_kurtosis": 3.5549826104891356, | |
| "eval_global_mean": -0.0015338476934389437, | |
| "eval_global_var": 0.2108087409032534, | |
| "eval_loss": 2.798912787001971, | |
| "eval_mse_loss": 1.9040727988225685, | |
| "eval_per_var": 0.2048030487478596, | |
| "eval_runtime": 160.1579, | |
| "eval_samples_per_second": 174.784, | |
| "eval_steps_per_second": 2.735, | |
| "eval_within_var": 0.2086275832633994, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.05911967114682925, | |
| "grad_norm": 0.39322802424430847, | |
| "learning_rate": 4.988941132556799e-05, | |
| "loss": 2.7155, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.0709436053761951, | |
| "grad_norm": 0.3898029327392578, | |
| "learning_rate": 4.9781232937269974e-05, | |
| "loss": 2.1064, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.08276753960556095, | |
| "grad_norm": 0.2939172387123108, | |
| "learning_rate": 4.963671583455164e-05, | |
| "loss": 1.7128, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.09459147383492679, | |
| "grad_norm": 0.27661314606666565, | |
| "learning_rate": 4.945607193446079e-05, | |
| "loss": 1.4405, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.09459147383492679, | |
| "eval_acr_loss": 0.9952529187071814, | |
| "eval_across_var": 0.0023763778212280372, | |
| "eval_bleu": 0.8110985901711817, | |
| "eval_ce_loss": 0.6872511814990544, | |
| "eval_cos_loss": 0.7538281038199386, | |
| "eval_cov": 0.06513830733625856, | |
| "eval_cov_loss": 0.006921744725739235, | |
| "eval_global_kurtosis": 3.977810041000854, | |
| "eval_global_mean": -0.0010168063858328345, | |
| "eval_global_var": 0.2243758249500571, | |
| "eval_loss": 1.0947843177677834, | |
| "eval_mse_loss": 1.6233446619826364, | |
| "eval_per_var": 0.21794232930222604, | |
| "eval_within_var": 0.22208662303887547, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.09459147383492679, | |
| "eval_acr_loss": 0.9952529187071814, | |
| "eval_across_var": 0.0023763778212280372, | |
| "eval_bleu": 0.8110985901711817, | |
| "eval_ce_loss": 0.6872511814990544, | |
| "eval_cos_loss": 0.7538281038199386, | |
| "eval_cov": 0.06513830733625856, | |
| "eval_cov_loss": 0.006921744725739235, | |
| "eval_global_kurtosis": 3.977810041000854, | |
| "eval_global_mean": -0.0010168063858328345, | |
| "eval_global_var": 0.2243758249500571, | |
| "eval_loss": 1.0947843177677834, | |
| "eval_mse_loss": 1.6233446619826364, | |
| "eval_per_var": 0.21794232930222604, | |
| "eval_runtime": 155.3139, | |
| "eval_samples_per_second": 180.235, | |
| "eval_steps_per_second": 2.82, | |
| "eval_within_var": 0.22208662303887547, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.10641540806429264, | |
| "grad_norm": 0.19573667645454407, | |
| "learning_rate": 4.923956612967301e-05, | |
| "loss": 1.2426, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 0.1182393422936585, | |
| "grad_norm": 0.19026412069797516, | |
| "learning_rate": 4.898751590005826e-05, | |
| "loss": 1.0857, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.13006327652302435, | |
| "grad_norm": 0.17090244591236115, | |
| "learning_rate": 4.870029084713462e-05, | |
| "loss": 0.9634, | |
| "step": 2816 | |
| }, | |
| { | |
| "epoch": 0.1418872107523902, | |
| "grad_norm": 0.15054035186767578, | |
| "learning_rate": 4.837831215209188e-05, | |
| "loss": 0.866, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.1418872107523902, | |
| "eval_acr_loss": 0.9946045773486568, | |
| "eval_across_var": 0.002701379077032753, | |
| "eval_bleu": 0.8988671192714057, | |
| "eval_ce_loss": 0.32117312840402945, | |
| "eval_cos_loss": 0.5974249733637457, | |
| "eval_cov": 0.06391607136486872, | |
| "eval_cov_loss": 0.006677505914249445, | |
| "eval_global_kurtosis": 4.353016280692462, | |
| "eval_global_mean": 0.0002520815150378502, | |
| "eval_global_var": 0.2451829605450913, | |
| "eval_loss": 0.6672432675753555, | |
| "eval_mse_loss": 1.3342886499073952, | |
| "eval_per_var": 0.23811486649186644, | |
| "eval_within_var": 0.2426148920094586, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.1418872107523902, | |
| "eval_acr_loss": 0.9946045773486568, | |
| "eval_across_var": 0.002701379077032753, | |
| "eval_bleu": 0.8988671192714057, | |
| "eval_ce_loss": 0.32117312840402945, | |
| "eval_cos_loss": 0.5974249733637457, | |
| "eval_cov": 0.06391607136486872, | |
| "eval_cov_loss": 0.006677505914249445, | |
| "eval_global_kurtosis": 4.353016280692462, | |
| "eval_global_mean": 0.0002520815150378502, | |
| "eval_global_var": 0.2451829605450913, | |
| "eval_loss": 0.6672432675753555, | |
| "eval_mse_loss": 1.3342886499073952, | |
| "eval_per_var": 0.23811486649186644, | |
| "eval_runtime": 155.4012, | |
| "eval_samples_per_second": 180.134, | |
| "eval_steps_per_second": 2.819, | |
| "eval_within_var": 0.2426148920094586, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.15371114498175603, | |
| "grad_norm": 0.1389162540435791, | |
| "learning_rate": 4.802205195817963e-05, | |
| "loss": 0.7877, | |
| "step": 3328 | |
| }, | |
| { | |
| "epoch": 0.1655350792111219, | |
| "grad_norm": 0.13399961590766907, | |
| "learning_rate": 4.763203267836576e-05, | |
| "loss": 0.7208, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 0.17735901344048774, | |
| "grad_norm": 0.11527423560619354, | |
| "learning_rate": 4.720882622928019e-05, | |
| "loss": 0.6654, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.18918294766985358, | |
| "grad_norm": 0.11336533725261688, | |
| "learning_rate": 4.675305319256765e-05, | |
| "loss": 0.6197, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.18918294766985358, | |
| "eval_acr_loss": 0.9938907430052213, | |
| "eval_across_var": 0.0030593323472410984, | |
| "eval_bleu": 0.9366627287855721, | |
| "eval_ce_loss": 0.1859381996333327, | |
| "eval_cos_loss": 0.4787035355829213, | |
| "eval_cov": 0.0628127006635274, | |
| "eval_cov_loss": 0.006437324723186286, | |
| "eval_global_kurtosis": 4.670551087758312, | |
| "eval_global_mean": -0.0007838325141227408, | |
| "eval_global_var": 0.2651386696454053, | |
| "eval_loss": 0.48502123158544164, | |
| "eval_mse_loss": 1.1087831166236912, | |
| "eval_per_var": 0.257416468232734, | |
| "eval_within_var": 0.2622626631363342, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.18918294766985358, | |
| "eval_acr_loss": 0.9938907430052213, | |
| "eval_across_var": 0.0030593323472410984, | |
| "eval_bleu": 0.9366627287855721, | |
| "eval_ce_loss": 0.1859381996333327, | |
| "eval_cos_loss": 0.4787035355829213, | |
| "eval_cov": 0.0628127006635274, | |
| "eval_cov_loss": 0.006437324723186286, | |
| "eval_global_kurtosis": 4.670551087758312, | |
| "eval_global_mean": -0.0007838325141227408, | |
| "eval_global_var": 0.2651386696454053, | |
| "eval_loss": 0.48502123158544164, | |
| "eval_mse_loss": 1.1087831166236912, | |
| "eval_per_var": 0.257416468232734, | |
| "eval_runtime": 156.5211, | |
| "eval_samples_per_second": 178.845, | |
| "eval_steps_per_second": 2.798, | |
| "eval_within_var": 0.2622626631363342, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.20100688189921945, | |
| "grad_norm": 0.11432790011167526, | |
| "learning_rate": 4.6265381904878854e-05, | |
| "loss": 0.5778, | |
| "step": 4352 | |
| }, | |
| { | |
| "epoch": 0.2128308161285853, | |
| "grad_norm": 0.1015966460108757, | |
| "learning_rate": 4.57465274778347e-05, | |
| "loss": 0.5464, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 0.22465475035795113, | |
| "grad_norm": 0.0999189242720604, | |
| "learning_rate": 4.519725074940068e-05, | |
| "loss": 0.5131, | |
| "step": 4864 | |
| }, | |
| { | |
| "epoch": 0.236478684587317, | |
| "grad_norm": 0.09335104376077652, | |
| "learning_rate": 4.461835716820895e-05, | |
| "loss": 0.4865, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.236478684587317, | |
| "eval_acr_loss": 0.993116364903646, | |
| "eval_across_var": 0.0034477898343912745, | |
| "eval_bleu": 0.9567762367252449, | |
| "eval_ce_loss": 0.12156322401136024, | |
| "eval_cos_loss": 0.39367829685069655, | |
| "eval_cov": 0.06181878912938784, | |
| "eval_cov_loss": 0.006211797137995629, | |
| "eval_global_kurtosis": 4.93804465472426, | |
| "eval_global_mean": -0.00010290959654333384, | |
| "eval_global_var": 0.2838530518692922, | |
| "eval_loss": 0.3869279193687657, | |
| "eval_mse_loss": 0.946567648213748, | |
| "eval_per_var": 0.2755494836258562, | |
| "eval_within_var": 0.28062804267831043, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.236478684587317, | |
| "eval_acr_loss": 0.993116364903646, | |
| "eval_across_var": 0.0034477898343912745, | |
| "eval_bleu": 0.9567762367252449, | |
| "eval_ce_loss": 0.12156322401136024, | |
| "eval_cos_loss": 0.39367829685069655, | |
| "eval_cov": 0.06181878912938784, | |
| "eval_cov_loss": 0.006211797137995629, | |
| "eval_global_kurtosis": 4.93804465472426, | |
| "eval_global_mean": -0.00010290959654333384, | |
| "eval_global_var": 0.2838530518692922, | |
| "eval_loss": 0.3869279193687657, | |
| "eval_mse_loss": 0.946567648213748, | |
| "eval_per_var": 0.2755494836258562, | |
| "eval_runtime": 155.284, | |
| "eval_samples_per_second": 180.27, | |
| "eval_steps_per_second": 2.821, | |
| "eval_within_var": 0.28062804267831043, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.24830261881668284, | |
| "grad_norm": 0.0941869243979454, | |
| "learning_rate": 4.401069561246422e-05, | |
| "loss": 0.4632, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 0.2601265530460487, | |
| "grad_norm": 0.09946911782026291, | |
| "learning_rate": 4.337515714516545e-05, | |
| "loss": 0.4419, | |
| "step": 5632 | |
| }, | |
| { | |
| "epoch": 0.27195048727541454, | |
| "grad_norm": 0.1010931134223938, | |
| "learning_rate": 4.2712673707468434e-05, | |
| "loss": 0.4267, | |
| "step": 5888 | |
| }, | |
| { | |
| "epoch": 0.2837744215047804, | |
| "grad_norm": 0.08404899388551712, | |
| "learning_rate": 4.202421675210565e-05, | |
| "loss": 0.4103, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.2837744215047804, | |
| "eval_acr_loss": 0.9921849604066648, | |
| "eval_across_var": 0.003915222487135973, | |
| "eval_bleu": 0.968595516462779, | |
| "eval_ce_loss": 0.08580248714445933, | |
| "eval_cos_loss": 0.3347833108956411, | |
| "eval_cov": 0.06079366326876427, | |
| "eval_cov_loss": 0.006003422991130246, | |
| "eval_global_kurtosis": 5.130746393987577, | |
| "eval_global_mean": -5.609749659011353e-05, | |
| "eval_global_var": 0.3024344945062785, | |
| "eval_loss": 0.3278743654625601, | |
| "eval_mse_loss": 0.836360767143502, | |
| "eval_per_var": 0.2935495594320776, | |
| "eval_within_var": 0.2987868603506045, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.2837744215047804, | |
| "eval_acr_loss": 0.9921849604066648, | |
| "eval_across_var": 0.003915222487135973, | |
| "eval_bleu": 0.968595516462779, | |
| "eval_ce_loss": 0.08580248714445933, | |
| "eval_cos_loss": 0.3347833108956411, | |
| "eval_cov": 0.06079366326876427, | |
| "eval_cov_loss": 0.006003422991130246, | |
| "eval_global_kurtosis": 5.130746393987577, | |
| "eval_global_mean": -5.609749659011353e-05, | |
| "eval_global_var": 0.3024344945062785, | |
| "eval_loss": 0.3278743654625601, | |
| "eval_mse_loss": 0.836360767143502, | |
| "eval_per_var": 0.2935495594320776, | |
| "eval_runtime": 154.5518, | |
| "eval_samples_per_second": 181.124, | |
| "eval_steps_per_second": 2.834, | |
| "eval_within_var": 0.2987868603506045, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.2955983557341462, | |
| "grad_norm": 0.08095081150531769, | |
| "learning_rate": 4.131079581886694e-05, | |
| "loss": 0.393, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.30742228996351206, | |
| "grad_norm": 0.08812420070171356, | |
| "learning_rate": 4.057345705423016e-05, | |
| "loss": 0.3806, | |
| "step": 6656 | |
| }, | |
| { | |
| "epoch": 0.3192462241928779, | |
| "grad_norm": 0.08378447592258453, | |
| "learning_rate": 3.981328167731251e-05, | |
| "loss": 0.3703, | |
| "step": 6912 | |
| }, | |
| { | |
| "epoch": 0.3310701584222438, | |
| "grad_norm": 0.0990639477968216, | |
| "learning_rate": 3.9031384394391954e-05, | |
| "loss": 0.3564, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.3310701584222438, | |
| "eval_acr_loss": 0.9909341404699299, | |
| "eval_across_var": 0.004543303069233316, | |
| "eval_bleu": 0.9765803459841506, | |
| "eval_ce_loss": 0.06368093232551938, | |
| "eval_cos_loss": 0.2937984631894386, | |
| "eval_cov": 0.059937150511023114, | |
| "eval_cov_loss": 0.005822429295255033, | |
| "eval_global_kurtosis": 5.257898571284394, | |
| "eval_global_mean": -0.0003388445126955912, | |
| "eval_global_var": 0.3208498323344749, | |
| "eval_loss": 0.28962595475046604, | |
| "eval_mse_loss": 0.762596405532262, | |
| "eval_per_var": 0.311440385095605, | |
| "eval_within_var": 0.31661272981123295, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.3310701584222438, | |
| "eval_acr_loss": 0.9909341404699299, | |
| "eval_across_var": 0.004543303069233316, | |
| "eval_bleu": 0.9765803459841506, | |
| "eval_ce_loss": 0.06368093232551938, | |
| "eval_cos_loss": 0.2937984631894386, | |
| "eval_cov": 0.059937150511023114, | |
| "eval_cov_loss": 0.005822429295255033, | |
| "eval_global_kurtosis": 5.257898571284394, | |
| "eval_global_mean": -0.0003388445126955912, | |
| "eval_global_var": 0.3208498323344749, | |
| "eval_loss": 0.28962595475046604, | |
| "eval_mse_loss": 0.762596405532262, | |
| "eval_per_var": 0.311440385095605, | |
| "eval_runtime": 152.728, | |
| "eval_samples_per_second": 183.287, | |
| "eval_steps_per_second": 2.868, | |
| "eval_within_var": 0.31661272981123295, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.34289409265160964, | |
| "grad_norm": 0.07939422130584717, | |
| "learning_rate": 3.822891176432382e-05, | |
| "loss": 0.3491, | |
| "step": 7424 | |
| }, | |
| { | |
| "epoch": 0.3547180268809755, | |
| "grad_norm": 0.0864938348531723, | |
| "learning_rate": 3.7407040517249335e-05, | |
| "loss": 0.3399, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.3665419611103413, | |
| "grad_norm": 0.08456117659807205, | |
| "learning_rate": 3.6566975829061614e-05, | |
| "loss": 0.3307, | |
| "step": 7936 | |
| }, | |
| { | |
| "epoch": 0.37836589533970716, | |
| "grad_norm": 0.06939388811588287, | |
| "learning_rate": 3.5709949554159355e-05, | |
| "loss": 0.3222, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.37836589533970716, | |
| "eval_acr_loss": 0.9887979859358644, | |
| "eval_across_var": 0.005616884891232943, | |
| "eval_bleu": 0.9814701772398943, | |
| "eval_ce_loss": 0.04945951889877178, | |
| "eval_cos_loss": 0.2659321248395258, | |
| "eval_cov": 0.05915525950253282, | |
| "eval_cov_loss": 0.005664959631766699, | |
| "eval_global_kurtosis": 5.291959479519221, | |
| "eval_global_mean": 0.00022150909519631025, | |
| "eval_global_var": 0.3408019183433219, | |
| "eval_loss": 0.26449544942134046, | |
| "eval_mse_loss": 0.7161998211248825, | |
| "eval_per_var": 0.3309253710045662, | |
| "eval_within_var": 0.3355388471252842, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.37836589533970716, | |
| "eval_acr_loss": 0.9887979859358644, | |
| "eval_across_var": 0.005616884891232943, | |
| "eval_bleu": 0.9814701772398943, | |
| "eval_ce_loss": 0.04945951889877178, | |
| "eval_cos_loss": 0.2659321248395258, | |
| "eval_cov": 0.05915525950253282, | |
| "eval_cov_loss": 0.005664959631766699, | |
| "eval_global_kurtosis": 5.291959479519221, | |
| "eval_global_mean": 0.00022150909519631025, | |
| "eval_global_var": 0.3408019183433219, | |
| "eval_loss": 0.26449544942134046, | |
| "eval_mse_loss": 0.7161998211248825, | |
| "eval_per_var": 0.3309253710045662, | |
| "eval_runtime": 152.3388, | |
| "eval_samples_per_second": 183.755, | |
| "eval_steps_per_second": 2.875, | |
| "eval_within_var": 0.3355388471252842, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.390189829569073, | |
| "grad_norm": 0.07356765121221542, | |
| "learning_rate": 3.483721841907964e-05, | |
| "loss": 0.3166, | |
| "step": 8448 | |
| }, | |
| { | |
| "epoch": 0.4020137637984389, | |
| "grad_norm": 0.10812926292419434, | |
| "learning_rate": 3.395006217965885e-05, | |
| "loss": 0.3106, | |
| "step": 8704 | |
| }, | |
| { | |
| "epoch": 0.41383769802780473, | |
| "grad_norm": 0.08505494147539139, | |
| "learning_rate": 3.3049781744423665e-05, | |
| "loss": 0.3032, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.4256616322571706, | |
| "grad_norm": 0.07096228003501892, | |
| "learning_rate": 3.213769726696439e-05, | |
| "loss": 0.2986, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.4256616322571706, | |
| "eval_acr_loss": 0.9829728993923152, | |
| "eval_across_var": 0.008550605183832993, | |
| "eval_bleu": 0.9851257395247456, | |
| "eval_ce_loss": 0.0397455885402484, | |
| "eval_cos_loss": 0.24592996208362927, | |
| "eval_cov": 0.05898399875588613, | |
| "eval_cov_loss": 0.005632478560929157, | |
| "eval_global_kurtosis": 5.217721328343431, | |
| "eval_global_mean": 0.0015772416439230584, | |
| "eval_global_var": 0.3661223111087329, | |
| "eval_loss": 0.24666939634982854, | |
| "eval_mse_loss": 0.6856855016592975, | |
| "eval_per_var": 0.355949227668379, | |
| "eval_within_var": 0.3579468384180983, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.4256616322571706, | |
| "eval_acr_loss": 0.9829728993923152, | |
| "eval_across_var": 0.008550605183832993, | |
| "eval_bleu": 0.9851257395247456, | |
| "eval_ce_loss": 0.0397455885402484, | |
| "eval_cos_loss": 0.24592996208362927, | |
| "eval_cov": 0.05898399875588613, | |
| "eval_cov_loss": 0.005632478560929157, | |
| "eval_global_kurtosis": 5.217721328343431, | |
| "eval_global_mean": 0.0015772416439230584, | |
| "eval_global_var": 0.3661223111087329, | |
| "eval_loss": 0.24666939634982854, | |
| "eval_mse_loss": 0.6856855016592975, | |
| "eval_per_var": 0.355949227668379, | |
| "eval_runtime": 150.6898, | |
| "eval_samples_per_second": 185.766, | |
| "eval_steps_per_second": 2.907, | |
| "eval_within_var": 0.3579468384180983, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.4374855664865364, | |
| "grad_norm": 0.07909992337226868, | |
| "learning_rate": 3.121514621008757e-05, | |
| "loss": 0.294, | |
| "step": 9472 | |
| }, | |
| { | |
| "epoch": 0.44930950071590225, | |
| "grad_norm": 0.10224120318889618, | |
| "learning_rate": 3.0283481384586697e-05, | |
| "loss": 0.2906, | |
| "step": 9728 | |
| }, | |
| { | |
| "epoch": 0.4611334349452681, | |
| "grad_norm": 0.07880751043558121, | |
| "learning_rate": 2.9344068965507027e-05, | |
| "loss": 0.2855, | |
| "step": 9984 | |
| }, | |
| { | |
| "epoch": 0.472957369174634, | |
| "grad_norm": 0.09268064051866531, | |
| "learning_rate": 2.839828648881323e-05, | |
| "loss": 0.2825, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.472957369174634, | |
| "eval_acr_loss": 0.9428148437036227, | |
| "eval_across_var": 0.029026934383734722, | |
| "eval_bleu": 0.9874667625866159, | |
| "eval_ce_loss": 0.03316931340409673, | |
| "eval_cos_loss": 0.23319579296868687, | |
| "eval_cov": 0.06389001297624144, | |
| "eval_cov_loss": 0.0071894081860151325, | |
| "eval_global_kurtosis": 4.941469875100541, | |
| "eval_global_mean": 0.0006307871102198074, | |
| "eval_global_var": 0.4374793762485731, | |
| "eval_loss": 0.23165297341537258, | |
| "eval_mse_loss": 0.6704898216680849, | |
| "eval_per_var": 0.4275816477597032, | |
| "eval_within_var": 0.40892007232528843, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.472957369174634, | |
| "eval_acr_loss": 0.9428148437036227, | |
| "eval_across_var": 0.029026934383734722, | |
| "eval_bleu": 0.9874667625866159, | |
| "eval_ce_loss": 0.03316931340409673, | |
| "eval_cos_loss": 0.23319579296868687, | |
| "eval_cov": 0.06389001297624144, | |
| "eval_cov_loss": 0.0071894081860151325, | |
| "eval_global_kurtosis": 4.941469875100541, | |
| "eval_global_mean": 0.0006307871102198074, | |
| "eval_global_var": 0.4374793762485731, | |
| "eval_loss": 0.23165297341537258, | |
| "eval_mse_loss": 0.6704898216680849, | |
| "eval_per_var": 0.4275816477597032, | |
| "eval_runtime": 151.8849, | |
| "eval_samples_per_second": 184.304, | |
| "eval_steps_per_second": 2.884, | |
| "eval_within_var": 0.40892007232528843, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.48478130340399983, | |
| "grad_norm": 0.12065292149782181, | |
| "learning_rate": 2.7447520831397623e-05, | |
| "loss": 0.2767, | |
| "step": 10496 | |
| }, | |
| { | |
| "epoch": 0.49660523763336567, | |
| "grad_norm": 0.12913434207439423, | |
| "learning_rate": 2.6493166177391138e-05, | |
| "loss": 0.2652, | |
| "step": 10752 | |
| }, | |
| { | |
| "epoch": 0.5084291718627315, | |
| "grad_norm": 0.14671213924884796, | |
| "learning_rate": 2.5536621973758952e-05, | |
| "loss": 0.2329, | |
| "step": 11008 | |
| }, | |
| { | |
| "epoch": 0.5202531060920974, | |
| "grad_norm": 0.13414837419986725, | |
| "learning_rate": 2.4579290878178904e-05, | |
| "loss": 0.2016, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.5202531060920974, | |
| "eval_acr_loss": 0.02229548650542855, | |
| "eval_across_var": 0.9048832782871647, | |
| "eval_bleu": 0.987608544196938, | |
| "eval_ce_loss": 0.031427863691869666, | |
| "eval_cos_loss": 0.2381418139490907, | |
| "eval_cov": 0.05971070729434218, | |
| "eval_cov_loss": 0.007483123698522715, | |
| "eval_global_kurtosis": 23.39030278872137, | |
| "eval_global_mean": -0.007102423879109561, | |
| "eval_global_var": 1.6357934681792237, | |
| "eval_loss": 0.14085906721889702, | |
| "eval_mse_loss": 0.7002158846757184, | |
| "eval_per_var": 1.6518420911815068, | |
| "eval_within_var": 0.7377716272933298, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.5202531060920974, | |
| "eval_acr_loss": 0.02229548650542855, | |
| "eval_across_var": 0.9048832782871647, | |
| "eval_bleu": 0.987608544196938, | |
| "eval_ce_loss": 0.031427863691869666, | |
| "eval_cos_loss": 0.2381418139490907, | |
| "eval_cov": 0.05971070729434218, | |
| "eval_cov_loss": 0.007483123698522715, | |
| "eval_global_kurtosis": 23.39030278872137, | |
| "eval_global_mean": -0.007102423879109561, | |
| "eval_global_var": 1.6357934681792237, | |
| "eval_loss": 0.14085906721889702, | |
| "eval_mse_loss": 0.7002158846757184, | |
| "eval_per_var": 1.6518420911815068, | |
| "eval_runtime": 152.4278, | |
| "eval_samples_per_second": 183.648, | |
| "eval_steps_per_second": 2.873, | |
| "eval_within_var": 0.7377716272933298, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.5320770403214632, | |
| "grad_norm": 0.1328643560409546, | |
| "learning_rate": 2.362257670221181e-05, | |
| "loss": 0.1901, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 0.5439009745508291, | |
| "grad_norm": 0.1011626198887825, | |
| "learning_rate": 2.2667882352779608e-05, | |
| "loss": 0.1844, | |
| "step": 11776 | |
| }, | |
| { | |
| "epoch": 0.5557249087801949, | |
| "grad_norm": 0.11414045095443726, | |
| "learning_rate": 2.1720315230424133e-05, | |
| "loss": 0.18, | |
| "step": 12032 | |
| }, | |
| { | |
| "epoch": 0.5675488430095608, | |
| "grad_norm": 0.1001119315624237, | |
| "learning_rate": 2.0773833841855016e-05, | |
| "loss": 0.1774, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 0.5675488430095608, | |
| "eval_acr_loss": 0.016273215680705114, | |
| "eval_across_var": 0.9544410968207877, | |
| "eval_bleu": 0.9895506585988, | |
| "eval_ce_loss": 0.026713626858986678, | |
| "eval_cos_loss": 0.22355560778074612, | |
| "eval_cov": 0.055647828263234875, | |
| "eval_cov_loss": 0.006082957281973468, | |
| "eval_global_kurtosis": 32.18490342353577, | |
| "eval_global_mean": -0.013440034569126286, | |
| "eval_global_var": 1.8210059039668949, | |
| "eval_loss": 0.1298266947609649, | |
| "eval_mse_loss": 0.6762152809530633, | |
| "eval_per_var": 1.8439484339326484, | |
| "eval_within_var": 0.8750175644545795, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 0.5675488430095608, | |
| "eval_acr_loss": 0.016273215680705114, | |
| "eval_across_var": 0.9544410968207877, | |
| "eval_bleu": 0.9895506585988, | |
| "eval_ce_loss": 0.026713626858986678, | |
| "eval_cos_loss": 0.22355560778074612, | |
| "eval_cov": 0.055647828263234875, | |
| "eval_cov_loss": 0.006082957281973468, | |
| "eval_global_kurtosis": 32.18490342353577, | |
| "eval_global_mean": -0.013440034569126286, | |
| "eval_global_var": 1.8210059039668949, | |
| "eval_loss": 0.1298266947609649, | |
| "eval_mse_loss": 0.6762152809530633, | |
| "eval_per_var": 1.8439484339326484, | |
| "eval_runtime": 150.5474, | |
| "eval_samples_per_second": 185.941, | |
| "eval_steps_per_second": 2.909, | |
| "eval_within_var": 0.8750175644545795, | |
| "step": 12288 | |
| } | |
| ], | |
| "logging_steps": 256, | |
| "max_steps": 21651, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1024, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |