{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5675488430095608, "eval_steps": 1024, "global_step": 12288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.1181285381317139, "learning_rate": 1.9615384615384617e-05, "loss": 10.3794, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 1.0416101217269897, "learning_rate": 3.930769230769231e-05, "loss": 7.9323, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.8387451171875, "learning_rate": 4.999617095521894e-05, "loss": 5.6072, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.559028148651123, "learning_rate": 4.9961092368776736e-05, "loss": 3.8081, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_acr_loss": 0.9904639322191613, "eval_across_var": 0.004779557895753107, "eval_bleu": 0.5734765992441645, "eval_ce_loss": 2.3428709033417374, "eval_cos_loss": 0.9217384038450511, "eval_cov": 0.07704977793236302, "eval_cov_loss": 0.009401565545225797, "eval_glb_loss": 0.3516415548106851, "eval_global_kurtosis": 3.025142035527861, "eval_global_mean": 0.0031828225747635376, "eval_global_var": 0.4070052752211758, "eval_krt_loss": 0.0007855186364893808, "eval_loss": 2.8181393391465486, "eval_mse_loss": 1.9018371273937835, "eval_per_loss": 0.36342586774259944, "eval_per_var": 0.3971644013984018, "eval_within_var": 0.4021891167038652, "eval_wth_loss": 0.35738333345276035, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_acr_loss": 0.9904639322191613, "eval_across_var": 0.004779557895753107, "eval_bleu": 0.5734765992441645, "eval_ce_loss": 2.3428709033417374, "eval_cos_loss": 0.9217384038450511, "eval_cov": 0.07704977793236302, "eval_cov_loss": 0.009401565545225797, "eval_glb_loss": 0.3516415548106851, "eval_global_kurtosis": 3.025142035527861, "eval_global_mean": 0.0031828225747635376, "eval_global_var": 0.4070052752211758, "eval_krt_loss": 0.0007855186364893808, "eval_loss": 2.8181393391465486, "eval_mse_loss": 1.9018371273937835, "eval_per_loss": 0.36342586774259944, "eval_per_var": 0.3971644013984018, "eval_runtime": 158.2867, "eval_samples_per_second": 176.85, "eval_steps_per_second": 2.767, "eval_within_var": 0.4021891167038652, "eval_wth_loss": 0.35738333345276035, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.3861701786518097, "learning_rate": 4.988941132556799e-05, "loss": 2.7403, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.2994460463523865, "learning_rate": 4.9781232937269974e-05, "loss": 2.1074, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.24324144423007965, "learning_rate": 4.963671583455164e-05, "loss": 1.697, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.2096453607082367, "learning_rate": 4.945607193446079e-05, "loss": 1.4167, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_acr_loss": 0.96901579629885, "eval_across_var": 0.015615421451509135, "eval_bleu": 0.8100885045319792, "eval_ce_loss": 0.6945271475674355, "eval_cos_loss": 0.7685760948211635, "eval_cov": 0.09640809498965468, "eval_cov_loss": 0.014826197332392136, "eval_glb_loss": 0.00014464876967478017, "eval_global_kurtosis": 3.0129096426375925, "eval_global_mean": 0.010630951624482734, "eval_global_var": 0.9908018068635844, "eval_krt_loss": 0.0002954138530850955, "eval_loss": 1.056928829243194, "eval_mse_loss": 1.6510266498343584, "eval_per_loss": 0.0006002873590547744, "eval_per_var": 0.9770318296946348, "eval_within_var": 0.9752532089954098, "eval_wth_loss": 0.0006678303594037685, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_acr_loss": 0.96901579629885, "eval_across_var": 0.015615421451509135, "eval_bleu": 0.8100885045319792, "eval_ce_loss": 0.6945271475674355, "eval_cos_loss": 0.7685760948211635, "eval_cov": 0.09640809498965468, "eval_cov_loss": 0.014826197332392136, "eval_glb_loss": 0.00014464876967478017, "eval_global_kurtosis": 3.0129096426375925, "eval_global_mean": 0.010630951624482734, "eval_global_var": 0.9908018068635844, "eval_krt_loss": 0.0002954138530850955, "eval_loss": 1.056928829243194, "eval_mse_loss": 1.6510266498343584, "eval_per_loss": 0.0006002873590547744, "eval_per_var": 0.9770318296946348, "eval_runtime": 153.1917, "eval_samples_per_second": 182.732, "eval_steps_per_second": 2.859, "eval_within_var": 0.9752532089954098, "eval_wth_loss": 0.0006678303594037685, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.17944040894508362, "learning_rate": 4.923956612967301e-05, "loss": 1.2141, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.1666973978281021, "learning_rate": 4.898751590005826e-05, "loss": 1.0533, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.151300847530365, "learning_rate": 4.870029084713462e-05, "loss": 0.9289, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.12945112586021423, "learning_rate": 4.837831215209188e-05, "loss": 0.8296, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_acr_loss": 0.9623512120823882, "eval_across_var": 0.019007292450682077, "eval_bleu": 0.8976890806937166, "eval_ce_loss": 0.323182335727291, "eval_cos_loss": 0.6147451090486082, "eval_cov": 0.07890243181899258, "eval_cov_loss": 0.010057339255515163, "eval_glb_loss": 0.0003083459862835331, "eval_global_kurtosis": 3.006787543971789, "eval_global_mean": 0.013840801639643978, "eval_global_var": 1.0159304544805936, "eval_krt_loss": 0.0002449526874326567, "eval_loss": 0.6247616745021245, "eval_mse_loss": 1.3700275383039153, "eval_per_loss": 5.431099025081826e-05, "eval_per_var": 0.9992698077197488, "eval_within_var": 0.997042422425257, "eval_wth_loss": 7.23102295562176e-05, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_acr_loss": 0.9623512120823882, "eval_across_var": 0.019007292450682077, "eval_bleu": 0.8976890806937166, "eval_ce_loss": 0.323182335727291, "eval_cos_loss": 0.6147451090486082, "eval_cov": 0.07890243181899258, "eval_cov_loss": 0.010057339255515163, "eval_glb_loss": 0.0003083459862835331, "eval_global_kurtosis": 3.006787543971789, "eval_global_mean": 0.013840801639643978, "eval_global_var": 1.0159304544805936, "eval_krt_loss": 0.0002449526874326567, "eval_loss": 0.6247616745021245, "eval_mse_loss": 1.3700275383039153, "eval_per_loss": 5.431099025081826e-05, "eval_per_var": 0.9992698077197488, "eval_runtime": 153.8502, "eval_samples_per_second": 181.95, "eval_steps_per_second": 2.847, "eval_within_var": 0.997042422425257, "eval_wth_loss": 7.23102295562176e-05, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.12589366734027863, "learning_rate": 4.802205195817963e-05, "loss": 0.7505, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.11897846311330795, "learning_rate": 4.763203267836576e-05, "loss": 0.6825, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.10772541910409927, "learning_rate": 4.720882622928019e-05, "loss": 0.6261, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.10752860456705093, "learning_rate": 4.675305319256765e-05, "loss": 0.5794, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_acr_loss": 0.9460189426870651, "eval_across_var": 0.027373042131974, "eval_bleu": 0.9357963139785648, "eval_ce_loss": 0.18706960009929796, "eval_cos_loss": 0.49577669084888615, "eval_cov": 0.07429023638163527, "eval_cov_loss": 0.0088696325406194, "eval_glb_loss": 0.0008705457051595052, "eval_global_kurtosis": 3.0126239273646105, "eval_global_mean": 0.019379994640611624, "eval_global_var": 1.0280014804509132, "eval_krt_loss": 0.0004827787981839751, "eval_loss": 0.44092568850408403, "eval_mse_loss": 1.145899186395619, "eval_per_loss": 0.000201601960343313, "eval_per_var": 1.0113285709189497, "eval_within_var": 1.0008215422499669, "eval_wth_loss": 0.00011882184727399328, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_acr_loss": 0.9460189426870651, "eval_across_var": 0.027373042131974, "eval_bleu": 0.9357963139785648, "eval_ce_loss": 0.18706960009929796, "eval_cos_loss": 0.49577669084888615, "eval_cov": 0.07429023638163527, "eval_cov_loss": 0.0088696325406194, "eval_glb_loss": 0.0008705457051595052, "eval_global_kurtosis": 3.0126239273646105, "eval_global_mean": 0.019379994640611624, "eval_global_var": 1.0280014804509132, "eval_krt_loss": 0.0004827787981839751, "eval_loss": 0.44092568850408403, "eval_mse_loss": 1.145899186395619, "eval_per_loss": 0.000201601960343313, "eval_per_var": 1.0113285709189497, "eval_runtime": 154.1862, "eval_samples_per_second": 181.553, "eval_steps_per_second": 2.841, "eval_within_var": 1.0008215422499669, "eval_wth_loss": 0.00011882184727399328, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.1053977757692337, "learning_rate": 4.6265381904878854e-05, "loss": 0.5371, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.09793444722890854, "learning_rate": 4.57465274778347e-05, "loss": 0.5051, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.09642181545495987, "learning_rate": 4.519725074940068e-05, "loss": 0.4712, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.09021387249231339, "learning_rate": 4.461835716820895e-05, "loss": 0.4441, "step": 5120 }, { "epoch": 0.236478684587317, "eval_acr_loss": 0.8976050177121271, "eval_across_var": 0.05263728992460661, "eval_bleu": 0.9557004800288741, "eval_ce_loss": 0.12269850065038629, "eval_cos_loss": 0.4099505911815112, "eval_cov": 0.0774190545626427, "eval_cov_loss": 0.00986302298595746, "eval_glb_loss": 0.002168635799460215, "eval_global_kurtosis": 3.0220293960614835, "eval_global_mean": 0.026287226916448168, "eval_global_var": 1.0453655643550228, "eval_krt_loss": 0.0009509958784072548, "eval_loss": 0.3404404183500978, "eval_mse_loss": 0.9835099409160004, "eval_per_loss": 0.0008347246744861342, "eval_per_var": 1.0266046393407535, "eval_within_var": 0.993057064541943, "eval_wth_loss": 0.00020235125328104065, "step": 5120 }, { "epoch": 0.236478684587317, "eval_acr_loss": 0.8976050177121271, "eval_across_var": 0.05263728992460661, "eval_bleu": 0.9557004800288741, "eval_ce_loss": 0.12269850065038629, "eval_cos_loss": 0.4099505911815112, "eval_cov": 0.0774190545626427, "eval_cov_loss": 0.00986302298595746, "eval_glb_loss": 0.002168635799460215, "eval_global_kurtosis": 3.0220293960614835, "eval_global_mean": 0.026287226916448168, "eval_global_var": 1.0453655643550228, "eval_krt_loss": 0.0009509958784072548, "eval_loss": 0.3404404183500978, "eval_mse_loss": 0.9835099409160004, "eval_per_loss": 0.0008347246744861342, "eval_per_var": 1.0266046393407535, "eval_runtime": 152.7613, "eval_samples_per_second": 183.247, "eval_steps_per_second": 2.867, "eval_within_var": 0.993057064541943, "eval_wth_loss": 0.00020235125328104065, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.08437229692935944, "learning_rate": 4.401069561246422e-05, "loss": 0.4198, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.09224370121955872, "learning_rate": 4.337515714516545e-05, "loss": 0.398, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.09789060801267624, "learning_rate": 4.2712673707468434e-05, "loss": 0.3822, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.08814027905464172, "learning_rate": 4.202421675210565e-05, "loss": 0.3652, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_acr_loss": 0.8202883884242681, "eval_across_var": 0.09450281369733755, "eval_bleu": 0.9674697145362666, "eval_ce_loss": 0.0872740181817856, "eval_cos_loss": 0.35029126064145943, "eval_cov": 0.08723331259810217, "eval_cov_loss": 0.012916838626917367, "eval_glb_loss": 0.00597613696093973, "eval_global_kurtosis": 3.024092547969731, "eval_global_mean": 0.04041210157141838, "eval_global_var": 1.0756590682077625, "eval_krt_loss": 0.0019540588812798874, "eval_loss": 0.27823208194209015, "eval_mse_loss": 0.8730145360781177, "eval_per_loss": 0.0027831836378193336, "eval_per_var": 1.0494290007848173, "eval_within_var": 0.9819663109844678, "eval_wth_loss": 0.00048046983986418896, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_acr_loss": 0.8202883884242681, "eval_across_var": 0.09450281369733755, "eval_bleu": 0.9674697145362666, "eval_ce_loss": 0.0872740181817856, "eval_cos_loss": 0.35029126064145943, "eval_cov": 0.08723331259810217, "eval_cov_loss": 0.012916838626917367, "eval_glb_loss": 0.00597613696093973, "eval_global_kurtosis": 3.024092547969731, "eval_global_mean": 0.04041210157141838, "eval_global_var": 1.0756590682077625, "eval_krt_loss": 0.0019540588812798874, "eval_loss": 0.27823208194209015, "eval_mse_loss": 0.8730145360781177, "eval_per_loss": 0.0027831836378193336, "eval_per_var": 1.0494290007848173, "eval_runtime": 152.787, "eval_samples_per_second": 183.216, "eval_steps_per_second": 2.867, "eval_within_var": 0.9819663109844678, "eval_wth_loss": 0.00048046983986418896, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.0873865932226181, "learning_rate": 4.131362984409538e-05, "loss": 0.3471, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.07706636935472488, "learning_rate": 4.0576382427305324e-05, "loss": 0.3341, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.09675078094005585, "learning_rate": 3.981629410852463e-05, "loss": 0.323, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.09393124282360077, "learning_rate": 3.903447946637093e-05, "loss": 0.3086, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_acr_loss": 0.726046212868059, "eval_across_var": 0.14836800615537113, "eval_bleu": 0.9750192561822552, "eval_ce_loss": 0.06533130286389018, "eval_cos_loss": 0.3090691367785136, "eval_cov": 0.0970152415097032, "eval_cov_loss": 0.01665312849988812, "eval_glb_loss": 0.012594995977671723, "eval_global_kurtosis": 3.052053846180711, "eval_global_mean": 0.055559001556814536, "eval_global_var": 1.1099212507134704, "eval_krt_loss": 0.005212522339852683, "eval_loss": 0.23639628832062629, "eval_mse_loss": 0.800027005204327, "eval_per_loss": 0.004818411722575149, "eval_per_var": 1.0654642462186072, "eval_within_var": 0.9631439711405262, "eval_wth_loss": 0.0015324829790224052, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_acr_loss": 0.726046212868059, "eval_across_var": 0.14836800615537113, "eval_bleu": 0.9750192561822552, "eval_ce_loss": 0.06533130286389018, "eval_cos_loss": 0.3090691367785136, "eval_cov": 0.0970152415097032, "eval_cov_loss": 0.01665312849988812, "eval_glb_loss": 0.012594995977671723, "eval_global_kurtosis": 3.052053846180711, "eval_global_mean": 0.055559001556814536, "eval_global_var": 1.1099212507134704, "eval_krt_loss": 0.005212522339852683, "eval_loss": 0.23639628832062629, "eval_mse_loss": 0.800027005204327, "eval_per_loss": 0.004818411722575149, "eval_per_var": 1.0654642462186072, "eval_runtime": 149.7025, "eval_samples_per_second": 186.991, "eval_steps_per_second": 2.926, "eval_within_var": 0.9631439711405262, "eval_wth_loss": 0.0015324829790224052, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.08861193060874939, "learning_rate": 3.823208493851674e-05, "loss": 0.3004, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.10392362624406815, "learning_rate": 3.741028714057574e-05, "loss": 0.2906, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.09276238083839417, "learning_rate": 3.657029114073668e-05, "loss": 0.2807, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.08639795333147049, "learning_rate": 3.571332869267499e-05, "loss": 0.2715, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_acr_loss": 0.6639264498127105, "eval_across_var": 0.18578778622357267, "eval_bleu": 0.9802777908340867, "eval_ce_loss": 0.05099814674778752, "eval_cos_loss": 0.2809805735333325, "eval_cov": 0.10055263292843893, "eval_cov_loss": 0.018623638194326532, "eval_glb_loss": 0.017183700108637005, "eval_global_kurtosis": 3.0215004377713486, "eval_global_mean": 0.06458907802355345, "eval_global_var": 1.1285004637557077, "eval_krt_loss": 0.0026812796775613885, "eval_loss": 0.20853681022037654, "eval_mse_loss": 0.7543587909169394, "eval_per_loss": 0.004951255506576468, "eval_per_var": 1.0664095943921232, "eval_within_var": 0.9453123360191851, "eval_wth_loss": 0.0031645966432090277, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_acr_loss": 0.6639264498127105, "eval_across_var": 0.18578778622357267, "eval_bleu": 0.9802777908340867, "eval_ce_loss": 0.05099814674778752, "eval_cos_loss": 0.2809805735333325, "eval_cov": 0.10055263292843893, "eval_cov_loss": 0.018623638194326532, "eval_glb_loss": 0.017183700108637005, "eval_global_kurtosis": 3.0215004377713486, "eval_global_mean": 0.06458907802355345, "eval_global_var": 1.1285004637557077, "eval_krt_loss": 0.0026812796775613885, "eval_loss": 0.20853681022037654, "eval_mse_loss": 0.7543587909169394, "eval_per_loss": 0.004951255506576468, "eval_per_var": 1.0664095943921232, "eval_runtime": 151.9683, "eval_samples_per_second": 184.203, "eval_steps_per_second": 2.882, "eval_within_var": 0.9453123360191851, "eval_wth_loss": 0.0031645966432090277, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.09597329795360565, "learning_rate": 3.484065642933335e-05, "loss": 0.2653, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.09884477406740189, "learning_rate": 3.395704566041868e-05, "loss": 0.2588, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.12420962005853653, "learning_rate": 3.305686266521002e-05, "loss": 0.251, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.09434681385755539, "learning_rate": 3.214486524445379e-05, "loss": 0.2459, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_acr_loss": 0.5954188222074073, "eval_across_var": 0.22920796821174555, "eval_bleu": 0.9836602165246535, "eval_ce_loss": 0.041640645169470136, "eval_cos_loss": 0.2611509262835054, "eval_cov": 0.1050533486283533, "eval_cov_loss": 0.02121386145875198, "eval_glb_loss": 0.02622999339343206, "eval_global_kurtosis": 3.059722278216114, "eval_global_mean": 0.07356201894751423, "eval_global_var": 1.1591975242579908, "eval_krt_loss": 0.0066943209811956075, "eval_loss": 0.18931484470764795, "eval_mse_loss": 0.7253989612675149, "eval_per_loss": 0.007019040791411378, "eval_per_var": 1.0800580586472603, "eval_within_var": 0.9334883986542758, "eval_wth_loss": 0.004603328326420399, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_acr_loss": 0.5954188222074073, "eval_across_var": 0.22920796821174555, "eval_bleu": 0.9836602165246535, "eval_ce_loss": 0.041640645169470136, "eval_cos_loss": 0.2611509262835054, "eval_cov": 0.1050533486283533, "eval_cov_loss": 0.02121386145875198, "eval_glb_loss": 0.02622999339343206, "eval_global_kurtosis": 3.059722278216114, "eval_global_mean": 0.07356201894751423, "eval_global_var": 1.1591975242579908, "eval_krt_loss": 0.0066943209811956075, "eval_loss": 0.18931484470764795, "eval_mse_loss": 0.7253989612675149, "eval_per_loss": 0.007019040791411378, "eval_per_var": 1.0800580586472603, "eval_runtime": 152.3156, "eval_samples_per_second": 183.783, "eval_steps_per_second": 2.876, "eval_within_var": 0.9334883986542758, "eval_wth_loss": 0.004603328326420399, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.09029538929462433, "learning_rate": 3.122239073329827e-05, "loss": 0.241, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.0906376913189888, "learning_rate": 3.029079183029181e-05, "loss": 0.2373, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.08700253814458847, "learning_rate": 2.935143461381221e-05, "loss": 0.2319, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.12261617183685303, "learning_rate": 2.8405696538876124e-05, "loss": 0.2291, "step": 10240 }, { "epoch": 0.472957369174634, "eval_acr_loss": 0.5550906222024464, "eval_across_var": 0.2559552679867505, "eval_bleu": 0.9864228314384785, "eval_ce_loss": 0.03463799520246855, "eval_cos_loss": 0.2463943555815035, "eval_cov": 0.10881105187821062, "eval_cov_loss": 0.023118084696329894, "eval_glb_loss": 0.031056445483203347, "eval_global_kurtosis": 3.0601126701320145, "eval_global_mean": 0.08328581727258691, "eval_global_var": 1.173308629423516, "eval_krt_loss": 0.0071202065942054926, "eval_loss": 0.17534998668247162, "eval_mse_loss": 0.7060943077144013, "eval_per_loss": 0.007063817760171411, "eval_per_var": 1.0801762271689497, "eval_within_var": 0.9215792903344925, "eval_wth_loss": 0.006330131607865872, "step": 10240 }, { "epoch": 0.472957369174634, "eval_acr_loss": 0.5550906222024464, "eval_across_var": 0.2559552679867505, "eval_bleu": 0.9864228314384785, "eval_ce_loss": 0.03463799520246855, "eval_cos_loss": 0.2463943555815035, "eval_cov": 0.10881105187821062, "eval_cov_loss": 0.023118084696329894, "eval_glb_loss": 0.031056445483203347, "eval_global_kurtosis": 3.0601126701320145, "eval_global_mean": 0.08328581727258691, "eval_global_var": 1.173308629423516, "eval_krt_loss": 0.0071202065942054926, "eval_loss": 0.17534998668247162, "eval_mse_loss": 0.7060943077144013, "eval_per_loss": 0.007063817760171411, "eval_per_var": 1.0801762271689497, "eval_runtime": 149.1447, "eval_samples_per_second": 187.69, "eval_steps_per_second": 2.937, "eval_within_var": 0.9215792903344925, "eval_wth_loss": 0.006330131607865872, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.0960695669054985, "learning_rate": 2.7454964417266016e-05, "loss": 0.2243, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.09179438650608063, "learning_rate": 2.6504365436879463e-05, "loss": 0.2201, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.12633387744426727, "learning_rate": 2.5547838773922217e-05, "loss": 0.2189, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.10988616943359375, "learning_rate": 2.4590508770921138e-05, "loss": 0.2148, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_acr_loss": 0.5188274796558842, "eval_across_var": 0.2808882309353515, "eval_bleu": 0.9881634510768484, "eval_ce_loss": 0.029888733919522806, "eval_cos_loss": 0.2354265242202641, "eval_cov": 0.1133959347798944, "eval_cov_loss": 0.025275619128574383, "eval_glb_loss": 0.03796970898702264, "eval_global_kurtosis": 3.0807342012178953, "eval_global_mean": 0.09449117042158292, "eval_global_var": 1.1918365617865296, "eval_krt_loss": 0.010437782962688104, "eval_loss": 0.16559295880195757, "eval_mse_loss": 0.6934955275222047, "eval_per_loss": 0.008229013991682497, "eval_per_var": 1.0867022509988584, "eval_within_var": 0.9157081242293528, "eval_wth_loss": 0.007290151238135279, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_acr_loss": 0.5188274796558842, "eval_across_var": 0.2808882309353515, "eval_bleu": 0.9881634510768484, "eval_ce_loss": 0.029888733919522806, "eval_cos_loss": 0.2354265242202641, "eval_cov": 0.1133959347798944, "eval_cov_loss": 0.025275619128574383, "eval_glb_loss": 0.03796970898702264, "eval_global_kurtosis": 3.0807342012178953, "eval_global_mean": 0.09449117042158292, "eval_global_var": 1.1918365617865296, "eval_krt_loss": 0.010437782962688104, "eval_loss": 0.16559295880195757, "eval_mse_loss": 0.6934955275222047, "eval_per_loss": 0.008229013991682497, "eval_per_var": 1.0867022509988584, "eval_runtime": 150.6787, "eval_samples_per_second": 185.779, "eval_steps_per_second": 2.907, "eval_within_var": 0.9157081242293528, "eval_wth_loss": 0.007290151238135279, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.10263644903898239, "learning_rate": 2.3633779237834874e-05, "loss": 0.2118, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.0880010798573494, "learning_rate": 2.267905310410478e-05, "loss": 0.2097, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.08736666291952133, "learning_rate": 2.172773036142663e-05, "loss": 0.2077, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.08412963151931763, "learning_rate": 2.0781206010830228e-05, "loss": 0.2063, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_acr_loss": 0.5011714086548923, "eval_across_var": 0.2933282603479956, "eval_bleu": 0.9892267845036895, "eval_ce_loss": 0.026746679018740647, "eval_cos_loss": 0.22736865680103432, "eval_cov": 0.11331831701269977, "eval_cov_loss": 0.02555328495502812, "eval_glb_loss": 0.03884035702709738, "eval_global_kurtosis": 3.0901281877195452, "eval_global_mean": 0.09504469671205842, "eval_global_var": 1.193941299229452, "eval_krt_loss": 0.012473824904427331, "eval_loss": 0.15896797551003766, "eval_mse_loss": 0.6858495211220224, "eval_per_loss": 0.007296311800882697, "eval_per_var": 1.081324468464612, "eval_within_var": 0.9059402656881776, "eval_wth_loss": 0.009029195172984374, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_acr_loss": 0.5011714086548923, "eval_across_var": 0.2933282603479956, "eval_bleu": 0.9892267845036895, "eval_ce_loss": 0.026746679018740647, "eval_cos_loss": 0.22736865680103432, "eval_cov": 0.11331831701269977, "eval_cov_loss": 0.02555328495502812, "eval_glb_loss": 0.03884035702709738, "eval_global_kurtosis": 3.0901281877195452, "eval_global_mean": 0.09504469671205842, "eval_global_var": 1.193941299229452, "eval_krt_loss": 0.012473824904427331, "eval_loss": 0.15896797551003766, "eval_mse_loss": 0.6858495211220224, "eval_per_loss": 0.007296311800882697, "eval_per_var": 1.081324468464612, "eval_runtime": 150.6158, "eval_samples_per_second": 185.857, "eval_steps_per_second": 2.908, "eval_within_var": 0.9059402656881776, "eval_wth_loss": 0.009029195172984374, "step": 12288 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }