{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5675488430095608, "eval_steps": 1024, "global_step": 12288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.1381809711456299, "learning_rate": 1.9615384615384617e-05, "loss": 10.3904, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 1.2398728132247925, "learning_rate": 3.930769230769231e-05, "loss": 7.9162, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 1.1707866191864014, "learning_rate": 4.999617095521894e-05, "loss": 5.6793, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.7125558853149414, "learning_rate": 4.9961092368776736e-05, "loss": 3.8181, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_acr_loss": 0.9955622624588883, "eval_across_var": 0.0022213522599452947, "eval_bleu": 0.5783933701616512, "eval_ce_loss": 2.3272575776870936, "eval_cos_loss": 0.9208345009039526, "eval_cov": 0.0631596116714826, "eval_cov_loss": 0.006449785340815511, "eval_global_kurtosis": 3.5549826104891356, "eval_global_mean": -0.0015338476934389437, "eval_global_var": 0.2108087409032534, "eval_loss": 2.798912787001971, "eval_mse_loss": 1.9040727988225685, "eval_per_var": 0.2048030487478596, "eval_within_var": 0.2086275832633994, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_acr_loss": 0.9955622624588883, "eval_across_var": 0.0022213522599452947, "eval_bleu": 0.5783933701616512, "eval_ce_loss": 2.3272575776870936, "eval_cos_loss": 0.9208345009039526, "eval_cov": 0.0631596116714826, "eval_cov_loss": 0.006449785340815511, "eval_global_kurtosis": 3.5549826104891356, "eval_global_mean": -0.0015338476934389437, "eval_global_var": 0.2108087409032534, "eval_loss": 2.798912787001971, "eval_mse_loss": 1.9040727988225685, "eval_per_var": 0.2048030487478596, "eval_runtime": 160.1579, "eval_samples_per_second": 174.784, "eval_steps_per_second": 2.735, "eval_within_var": 0.2086275832633994, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.39322802424430847, "learning_rate": 4.988941132556799e-05, "loss": 2.7155, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.3898029327392578, "learning_rate": 4.9781232937269974e-05, "loss": 2.1064, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.2939172387123108, "learning_rate": 4.963671583455164e-05, "loss": 1.7128, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.27661314606666565, "learning_rate": 4.945607193446079e-05, "loss": 1.4405, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_acr_loss": 0.9952529187071814, "eval_across_var": 0.0023763778212280372, "eval_bleu": 0.8110985901711817, "eval_ce_loss": 0.6872511814990544, "eval_cos_loss": 0.7538281038199386, "eval_cov": 0.06513830733625856, "eval_cov_loss": 0.006921744725739235, "eval_global_kurtosis": 3.977810041000854, "eval_global_mean": -0.0010168063858328345, "eval_global_var": 0.2243758249500571, "eval_loss": 1.0947843177677834, "eval_mse_loss": 1.6233446619826364, "eval_per_var": 0.21794232930222604, "eval_within_var": 0.22208662303887547, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_acr_loss": 0.9952529187071814, "eval_across_var": 0.0023763778212280372, "eval_bleu": 0.8110985901711817, "eval_ce_loss": 0.6872511814990544, "eval_cos_loss": 0.7538281038199386, "eval_cov": 0.06513830733625856, "eval_cov_loss": 0.006921744725739235, "eval_global_kurtosis": 3.977810041000854, "eval_global_mean": -0.0010168063858328345, "eval_global_var": 0.2243758249500571, "eval_loss": 1.0947843177677834, "eval_mse_loss": 1.6233446619826364, "eval_per_var": 0.21794232930222604, "eval_runtime": 155.3139, "eval_samples_per_second": 180.235, "eval_steps_per_second": 2.82, "eval_within_var": 0.22208662303887547, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.19573667645454407, "learning_rate": 4.923956612967301e-05, "loss": 1.2426, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.19026412069797516, "learning_rate": 4.898751590005826e-05, "loss": 1.0857, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.17090244591236115, "learning_rate": 4.870029084713462e-05, "loss": 0.9634, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.15054035186767578, "learning_rate": 4.837831215209188e-05, "loss": 0.866, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_acr_loss": 0.9946045773486568, "eval_across_var": 0.002701379077032753, "eval_bleu": 0.8988671192714057, "eval_ce_loss": 0.32117312840402945, "eval_cos_loss": 0.5974249733637457, "eval_cov": 0.06391607136486872, "eval_cov_loss": 0.006677505914249445, "eval_global_kurtosis": 4.353016280692462, "eval_global_mean": 0.0002520815150378502, "eval_global_var": 0.2451829605450913, "eval_loss": 0.6672432675753555, "eval_mse_loss": 1.3342886499073952, "eval_per_var": 0.23811486649186644, "eval_within_var": 0.2426148920094586, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_acr_loss": 0.9946045773486568, "eval_across_var": 0.002701379077032753, "eval_bleu": 0.8988671192714057, "eval_ce_loss": 0.32117312840402945, "eval_cos_loss": 0.5974249733637457, "eval_cov": 0.06391607136486872, "eval_cov_loss": 0.006677505914249445, "eval_global_kurtosis": 4.353016280692462, "eval_global_mean": 0.0002520815150378502, "eval_global_var": 0.2451829605450913, "eval_loss": 0.6672432675753555, "eval_mse_loss": 1.3342886499073952, "eval_per_var": 0.23811486649186644, "eval_runtime": 155.4012, "eval_samples_per_second": 180.134, "eval_steps_per_second": 2.819, "eval_within_var": 0.2426148920094586, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.1389162540435791, "learning_rate": 4.802205195817963e-05, "loss": 0.7877, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.13399961590766907, "learning_rate": 4.763203267836576e-05, "loss": 0.7208, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.11527423560619354, "learning_rate": 4.720882622928019e-05, "loss": 0.6654, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.11336533725261688, "learning_rate": 4.675305319256765e-05, "loss": 0.6197, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_acr_loss": 0.9938907430052213, "eval_across_var": 0.0030593323472410984, "eval_bleu": 0.9366627287855721, "eval_ce_loss": 0.1859381996333327, "eval_cos_loss": 0.4787035355829213, "eval_cov": 0.0628127006635274, "eval_cov_loss": 0.006437324723186286, "eval_global_kurtosis": 4.670551087758312, "eval_global_mean": -0.0007838325141227408, "eval_global_var": 0.2651386696454053, "eval_loss": 0.48502123158544164, "eval_mse_loss": 1.1087831166236912, "eval_per_var": 0.257416468232734, "eval_within_var": 0.2622626631363342, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_acr_loss": 0.9938907430052213, "eval_across_var": 0.0030593323472410984, "eval_bleu": 0.9366627287855721, "eval_ce_loss": 0.1859381996333327, "eval_cos_loss": 0.4787035355829213, "eval_cov": 0.0628127006635274, "eval_cov_loss": 0.006437324723186286, "eval_global_kurtosis": 4.670551087758312, "eval_global_mean": -0.0007838325141227408, "eval_global_var": 0.2651386696454053, "eval_loss": 0.48502123158544164, "eval_mse_loss": 1.1087831166236912, "eval_per_var": 0.257416468232734, "eval_runtime": 156.5211, "eval_samples_per_second": 178.845, "eval_steps_per_second": 2.798, "eval_within_var": 0.2622626631363342, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.11432790011167526, "learning_rate": 4.6265381904878854e-05, "loss": 0.5778, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.1015966460108757, "learning_rate": 4.57465274778347e-05, "loss": 0.5464, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.0999189242720604, "learning_rate": 4.519725074940068e-05, "loss": 0.5131, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.09335104376077652, "learning_rate": 4.461835716820895e-05, "loss": 0.4865, "step": 5120 }, { "epoch": 0.236478684587317, "eval_acr_loss": 0.993116364903646, "eval_across_var": 0.0034477898343912745, "eval_bleu": 0.9567762367252449, "eval_ce_loss": 0.12156322401136024, "eval_cos_loss": 0.39367829685069655, "eval_cov": 0.06181878912938784, "eval_cov_loss": 0.006211797137995629, "eval_global_kurtosis": 4.93804465472426, "eval_global_mean": -0.00010290959654333384, "eval_global_var": 0.2838530518692922, "eval_loss": 0.3869279193687657, "eval_mse_loss": 0.946567648213748, "eval_per_var": 0.2755494836258562, "eval_within_var": 0.28062804267831043, "step": 5120 }, { "epoch": 0.236478684587317, "eval_acr_loss": 0.993116364903646, "eval_across_var": 0.0034477898343912745, "eval_bleu": 0.9567762367252449, "eval_ce_loss": 0.12156322401136024, "eval_cos_loss": 0.39367829685069655, "eval_cov": 0.06181878912938784, "eval_cov_loss": 0.006211797137995629, "eval_global_kurtosis": 4.93804465472426, "eval_global_mean": -0.00010290959654333384, "eval_global_var": 0.2838530518692922, "eval_loss": 0.3869279193687657, "eval_mse_loss": 0.946567648213748, "eval_per_var": 0.2755494836258562, "eval_runtime": 155.284, "eval_samples_per_second": 180.27, "eval_steps_per_second": 2.821, "eval_within_var": 0.28062804267831043, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.0941869243979454, "learning_rate": 4.401069561246422e-05, "loss": 0.4632, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.09946911782026291, "learning_rate": 4.337515714516545e-05, "loss": 0.4419, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.1010931134223938, "learning_rate": 4.2712673707468434e-05, "loss": 0.4267, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.08404899388551712, "learning_rate": 4.202421675210565e-05, "loss": 0.4103, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_acr_loss": 0.9921849604066648, "eval_across_var": 0.003915222487135973, "eval_bleu": 0.968595516462779, "eval_ce_loss": 0.08580248714445933, "eval_cos_loss": 0.3347833108956411, "eval_cov": 0.06079366326876427, "eval_cov_loss": 0.006003422991130246, "eval_global_kurtosis": 5.130746393987577, "eval_global_mean": -5.609749659011353e-05, "eval_global_var": 0.3024344945062785, "eval_loss": 0.3278743654625601, "eval_mse_loss": 0.836360767143502, "eval_per_var": 0.2935495594320776, "eval_within_var": 0.2987868603506045, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_acr_loss": 0.9921849604066648, "eval_across_var": 0.003915222487135973, "eval_bleu": 0.968595516462779, "eval_ce_loss": 0.08580248714445933, "eval_cos_loss": 0.3347833108956411, "eval_cov": 0.06079366326876427, "eval_cov_loss": 0.006003422991130246, "eval_global_kurtosis": 5.130746393987577, "eval_global_mean": -5.609749659011353e-05, "eval_global_var": 0.3024344945062785, "eval_loss": 0.3278743654625601, "eval_mse_loss": 0.836360767143502, "eval_per_var": 0.2935495594320776, "eval_runtime": 154.5518, "eval_samples_per_second": 181.124, "eval_steps_per_second": 2.834, "eval_within_var": 0.2987868603506045, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.08095081150531769, "learning_rate": 4.131079581886694e-05, "loss": 0.393, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.08812420070171356, "learning_rate": 4.057345705423016e-05, "loss": 0.3806, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.08378447592258453, "learning_rate": 3.981328167731251e-05, "loss": 0.3703, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.0990639477968216, "learning_rate": 3.9031384394391954e-05, "loss": 0.3564, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_acr_loss": 0.9909341404699299, "eval_across_var": 0.004543303069233316, "eval_bleu": 0.9765803459841506, "eval_ce_loss": 0.06368093232551938, "eval_cos_loss": 0.2937984631894386, "eval_cov": 0.059937150511023114, "eval_cov_loss": 0.005822429295255033, "eval_global_kurtosis": 5.257898571284394, "eval_global_mean": -0.0003388445126955912, "eval_global_var": 0.3208498323344749, "eval_loss": 0.28962595475046604, "eval_mse_loss": 0.762596405532262, "eval_per_var": 0.311440385095605, "eval_within_var": 0.31661272981123295, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_acr_loss": 0.9909341404699299, "eval_across_var": 0.004543303069233316, "eval_bleu": 0.9765803459841506, "eval_ce_loss": 0.06368093232551938, "eval_cos_loss": 0.2937984631894386, "eval_cov": 0.059937150511023114, "eval_cov_loss": 0.005822429295255033, "eval_global_kurtosis": 5.257898571284394, "eval_global_mean": -0.0003388445126955912, "eval_global_var": 0.3208498323344749, "eval_loss": 0.28962595475046604, "eval_mse_loss": 0.762596405532262, "eval_per_var": 0.311440385095605, "eval_runtime": 152.728, "eval_samples_per_second": 183.287, "eval_steps_per_second": 2.868, "eval_within_var": 0.31661272981123295, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.07939422130584717, "learning_rate": 3.822891176432382e-05, "loss": 0.3491, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.0864938348531723, "learning_rate": 3.7407040517249335e-05, "loss": 0.3399, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.08456117659807205, "learning_rate": 3.6566975829061614e-05, "loss": 0.3307, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.06939388811588287, "learning_rate": 3.5709949554159355e-05, "loss": 0.3222, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_acr_loss": 0.9887979859358644, "eval_across_var": 0.005616884891232943, "eval_bleu": 0.9814701772398943, "eval_ce_loss": 0.04945951889877178, "eval_cos_loss": 0.2659321248395258, "eval_cov": 0.05915525950253282, "eval_cov_loss": 0.005664959631766699, "eval_global_kurtosis": 5.291959479519221, "eval_global_mean": 0.00022150909519631025, "eval_global_var": 0.3408019183433219, "eval_loss": 0.26449544942134046, "eval_mse_loss": 0.7161998211248825, "eval_per_var": 0.3309253710045662, "eval_within_var": 0.3355388471252842, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_acr_loss": 0.9887979859358644, "eval_across_var": 0.005616884891232943, "eval_bleu": 0.9814701772398943, "eval_ce_loss": 0.04945951889877178, "eval_cos_loss": 0.2659321248395258, "eval_cov": 0.05915525950253282, "eval_cov_loss": 0.005664959631766699, "eval_global_kurtosis": 5.291959479519221, "eval_global_mean": 0.00022150909519631025, "eval_global_var": 0.3408019183433219, "eval_loss": 0.26449544942134046, "eval_mse_loss": 0.7161998211248825, "eval_per_var": 0.3309253710045662, "eval_runtime": 152.3388, "eval_samples_per_second": 183.755, "eval_steps_per_second": 2.875, "eval_within_var": 0.3355388471252842, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.07356765121221542, "learning_rate": 3.483721841907964e-05, "loss": 0.3166, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.10812926292419434, "learning_rate": 3.395006217965885e-05, "loss": 0.3106, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.08505494147539139, "learning_rate": 3.3049781744423665e-05, "loss": 0.3032, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.07096228003501892, "learning_rate": 3.213769726696439e-05, "loss": 0.2986, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_acr_loss": 0.9829728993923152, "eval_across_var": 0.008550605183832993, "eval_bleu": 0.9851257395247456, "eval_ce_loss": 0.0397455885402484, "eval_cos_loss": 0.24592996208362927, "eval_cov": 0.05898399875588613, "eval_cov_loss": 0.005632478560929157, "eval_global_kurtosis": 5.217721328343431, "eval_global_mean": 0.0015772416439230584, "eval_global_var": 0.3661223111087329, "eval_loss": 0.24666939634982854, "eval_mse_loss": 0.6856855016592975, "eval_per_var": 0.355949227668379, "eval_within_var": 0.3579468384180983, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_acr_loss": 0.9829728993923152, "eval_across_var": 0.008550605183832993, "eval_bleu": 0.9851257395247456, "eval_ce_loss": 0.0397455885402484, "eval_cos_loss": 0.24592996208362927, "eval_cov": 0.05898399875588613, "eval_cov_loss": 0.005632478560929157, "eval_global_kurtosis": 5.217721328343431, "eval_global_mean": 0.0015772416439230584, "eval_global_var": 0.3661223111087329, "eval_loss": 0.24666939634982854, "eval_mse_loss": 0.6856855016592975, "eval_per_var": 0.355949227668379, "eval_runtime": 150.6898, "eval_samples_per_second": 185.766, "eval_steps_per_second": 2.907, "eval_within_var": 0.3579468384180983, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.07909992337226868, "learning_rate": 3.121514621008757e-05, "loss": 0.294, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.10224120318889618, "learning_rate": 3.0283481384586697e-05, "loss": 0.2906, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.07880751043558121, "learning_rate": 2.9344068965507027e-05, "loss": 0.2855, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.09268064051866531, "learning_rate": 2.839828648881323e-05, "loss": 0.2825, "step": 10240 }, { "epoch": 0.472957369174634, "eval_acr_loss": 0.9428148437036227, "eval_across_var": 0.029026934383734722, "eval_bleu": 0.9874667625866159, "eval_ce_loss": 0.03316931340409673, "eval_cos_loss": 0.23319579296868687, "eval_cov": 0.06389001297624144, "eval_cov_loss": 0.0071894081860151325, "eval_global_kurtosis": 4.941469875100541, "eval_global_mean": 0.0006307871102198074, "eval_global_var": 0.4374793762485731, "eval_loss": 0.23165297341537258, "eval_mse_loss": 0.6704898216680849, "eval_per_var": 0.4275816477597032, "eval_within_var": 0.40892007232528843, "step": 10240 }, { "epoch": 0.472957369174634, "eval_acr_loss": 0.9428148437036227, "eval_across_var": 0.029026934383734722, "eval_bleu": 0.9874667625866159, "eval_ce_loss": 0.03316931340409673, "eval_cos_loss": 0.23319579296868687, "eval_cov": 0.06389001297624144, "eval_cov_loss": 0.0071894081860151325, "eval_global_kurtosis": 4.941469875100541, "eval_global_mean": 0.0006307871102198074, "eval_global_var": 0.4374793762485731, "eval_loss": 0.23165297341537258, "eval_mse_loss": 0.6704898216680849, "eval_per_var": 0.4275816477597032, "eval_runtime": 151.8849, "eval_samples_per_second": 184.304, "eval_steps_per_second": 2.884, "eval_within_var": 0.40892007232528843, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.12065292149782181, "learning_rate": 2.7447520831397623e-05, "loss": 0.2767, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.12913434207439423, "learning_rate": 2.6493166177391138e-05, "loss": 0.2652, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.14671213924884796, "learning_rate": 2.5536621973758952e-05, "loss": 0.2329, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.13414837419986725, "learning_rate": 2.4579290878178904e-05, "loss": 0.2016, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_acr_loss": 0.02229548650542855, "eval_across_var": 0.9048832782871647, "eval_bleu": 0.987608544196938, "eval_ce_loss": 0.031427863691869666, "eval_cos_loss": 0.2381418139490907, "eval_cov": 0.05971070729434218, "eval_cov_loss": 0.007483123698522715, "eval_global_kurtosis": 23.39030278872137, "eval_global_mean": -0.007102423879109561, "eval_global_var": 1.6357934681792237, "eval_loss": 0.14085906721889702, "eval_mse_loss": 0.7002158846757184, "eval_per_var": 1.6518420911815068, "eval_within_var": 0.7377716272933298, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_acr_loss": 0.02229548650542855, "eval_across_var": 0.9048832782871647, "eval_bleu": 0.987608544196938, "eval_ce_loss": 0.031427863691869666, "eval_cos_loss": 0.2381418139490907, "eval_cov": 0.05971070729434218, "eval_cov_loss": 0.007483123698522715, "eval_global_kurtosis": 23.39030278872137, "eval_global_mean": -0.007102423879109561, "eval_global_var": 1.6357934681792237, "eval_loss": 0.14085906721889702, "eval_mse_loss": 0.7002158846757184, "eval_per_var": 1.6518420911815068, "eval_runtime": 152.4278, "eval_samples_per_second": 183.648, "eval_steps_per_second": 2.873, "eval_within_var": 0.7377716272933298, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.1328643560409546, "learning_rate": 2.362257670221181e-05, "loss": 0.1901, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.1011626198887825, "learning_rate": 2.2667882352779608e-05, "loss": 0.1844, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.11414045095443726, "learning_rate": 2.1720315230424133e-05, "loss": 0.18, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.1001119315624237, "learning_rate": 2.0773833841855016e-05, "loss": 0.1774, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_acr_loss": 0.016273215680705114, "eval_across_var": 0.9544410968207877, "eval_bleu": 0.9895506585988, "eval_ce_loss": 0.026713626858986678, "eval_cos_loss": 0.22355560778074612, "eval_cov": 0.055647828263234875, "eval_cov_loss": 0.006082957281973468, "eval_global_kurtosis": 32.18490342353577, "eval_global_mean": -0.013440034569126286, "eval_global_var": 1.8210059039668949, "eval_loss": 0.1298266947609649, "eval_mse_loss": 0.6762152809530633, "eval_per_var": 1.8439484339326484, "eval_within_var": 0.8750175644545795, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_acr_loss": 0.016273215680705114, "eval_across_var": 0.9544410968207877, "eval_bleu": 0.9895506585988, "eval_ce_loss": 0.026713626858986678, "eval_cos_loss": 0.22355560778074612, "eval_cov": 0.055647828263234875, "eval_cov_loss": 0.006082957281973468, "eval_global_kurtosis": 32.18490342353577, "eval_global_mean": -0.013440034569126286, "eval_global_var": 1.8210059039668949, "eval_loss": 0.1298266947609649, "eval_mse_loss": 0.6762152809530633, "eval_per_var": 1.8439484339326484, "eval_runtime": 150.5474, "eval_samples_per_second": 185.941, "eval_steps_per_second": 2.909, "eval_within_var": 0.8750175644545795, "step": 12288 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }