| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5675488430095608, | |
| "eval_steps": 1024, | |
| "global_step": 12288, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011823934229365849, | |
| "grad_norm": 1.1181285381317139, | |
| "learning_rate": 1.9615384615384617e-05, | |
| "loss": 10.3794, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.023647868458731697, | |
| "grad_norm": 1.0416101217269897, | |
| "learning_rate": 3.930769230769231e-05, | |
| "loss": 7.9323, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.03547180268809755, | |
| "grad_norm": 0.8387451171875, | |
| "learning_rate": 4.999617095521894e-05, | |
| "loss": 5.6072, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.047295736917463395, | |
| "grad_norm": 0.559028148651123, | |
| "learning_rate": 4.9961092368776736e-05, | |
| "loss": 3.8081, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.047295736917463395, | |
| "eval_acr_loss": 0.9904639322191613, | |
| "eval_across_var": 0.004779557895753107, | |
| "eval_bleu": 0.5734765992441645, | |
| "eval_ce_loss": 2.3428709033417374, | |
| "eval_cos_loss": 0.9217384038450511, | |
| "eval_cov": 0.07704977793236302, | |
| "eval_cov_loss": 0.009401565545225797, | |
| "eval_glb_loss": 0.3516415548106851, | |
| "eval_global_kurtosis": 3.025142035527861, | |
| "eval_global_mean": 0.0031828225747635376, | |
| "eval_global_var": 0.4070052752211758, | |
| "eval_krt_loss": 0.0007855186364893808, | |
| "eval_loss": 2.8181393391465486, | |
| "eval_mse_loss": 1.9018371273937835, | |
| "eval_per_loss": 0.36342586774259944, | |
| "eval_per_var": 0.3971644013984018, | |
| "eval_within_var": 0.4021891167038652, | |
| "eval_wth_loss": 0.35738333345276035, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.047295736917463395, | |
| "eval_acr_loss": 0.9904639322191613, | |
| "eval_across_var": 0.004779557895753107, | |
| "eval_bleu": 0.5734765992441645, | |
| "eval_ce_loss": 2.3428709033417374, | |
| "eval_cos_loss": 0.9217384038450511, | |
| "eval_cov": 0.07704977793236302, | |
| "eval_cov_loss": 0.009401565545225797, | |
| "eval_glb_loss": 0.3516415548106851, | |
| "eval_global_kurtosis": 3.025142035527861, | |
| "eval_global_mean": 0.0031828225747635376, | |
| "eval_global_var": 0.4070052752211758, | |
| "eval_krt_loss": 0.0007855186364893808, | |
| "eval_loss": 2.8181393391465486, | |
| "eval_mse_loss": 1.9018371273937835, | |
| "eval_per_loss": 0.36342586774259944, | |
| "eval_per_var": 0.3971644013984018, | |
| "eval_runtime": 158.2867, | |
| "eval_samples_per_second": 176.85, | |
| "eval_steps_per_second": 2.767, | |
| "eval_within_var": 0.4021891167038652, | |
| "eval_wth_loss": 0.35738333345276035, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.05911967114682925, | |
| "grad_norm": 0.3861701786518097, | |
| "learning_rate": 4.988941132556799e-05, | |
| "loss": 2.7403, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.0709436053761951, | |
| "grad_norm": 0.2994460463523865, | |
| "learning_rate": 4.9781232937269974e-05, | |
| "loss": 2.1074, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.08276753960556095, | |
| "grad_norm": 0.24324144423007965, | |
| "learning_rate": 4.963671583455164e-05, | |
| "loss": 1.697, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.09459147383492679, | |
| "grad_norm": 0.2096453607082367, | |
| "learning_rate": 4.945607193446079e-05, | |
| "loss": 1.4167, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.09459147383492679, | |
| "eval_acr_loss": 0.96901579629885, | |
| "eval_across_var": 0.015615421451509135, | |
| "eval_bleu": 0.8100885045319792, | |
| "eval_ce_loss": 0.6945271475674355, | |
| "eval_cos_loss": 0.7685760948211635, | |
| "eval_cov": 0.09640809498965468, | |
| "eval_cov_loss": 0.014826197332392136, | |
| "eval_glb_loss": 0.00014464876967478017, | |
| "eval_global_kurtosis": 3.0129096426375925, | |
| "eval_global_mean": 0.010630951624482734, | |
| "eval_global_var": 0.9908018068635844, | |
| "eval_krt_loss": 0.0002954138530850955, | |
| "eval_loss": 1.056928829243194, | |
| "eval_mse_loss": 1.6510266498343584, | |
| "eval_per_loss": 0.0006002873590547744, | |
| "eval_per_var": 0.9770318296946348, | |
| "eval_within_var": 0.9752532089954098, | |
| "eval_wth_loss": 0.0006678303594037685, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.09459147383492679, | |
| "eval_acr_loss": 0.96901579629885, | |
| "eval_across_var": 0.015615421451509135, | |
| "eval_bleu": 0.8100885045319792, | |
| "eval_ce_loss": 0.6945271475674355, | |
| "eval_cos_loss": 0.7685760948211635, | |
| "eval_cov": 0.09640809498965468, | |
| "eval_cov_loss": 0.014826197332392136, | |
| "eval_glb_loss": 0.00014464876967478017, | |
| "eval_global_kurtosis": 3.0129096426375925, | |
| "eval_global_mean": 0.010630951624482734, | |
| "eval_global_var": 0.9908018068635844, | |
| "eval_krt_loss": 0.0002954138530850955, | |
| "eval_loss": 1.056928829243194, | |
| "eval_mse_loss": 1.6510266498343584, | |
| "eval_per_loss": 0.0006002873590547744, | |
| "eval_per_var": 0.9770318296946348, | |
| "eval_runtime": 153.1917, | |
| "eval_samples_per_second": 182.732, | |
| "eval_steps_per_second": 2.859, | |
| "eval_within_var": 0.9752532089954098, | |
| "eval_wth_loss": 0.0006678303594037685, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.10641540806429264, | |
| "grad_norm": 0.17944040894508362, | |
| "learning_rate": 4.923956612967301e-05, | |
| "loss": 1.2141, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 0.1182393422936585, | |
| "grad_norm": 0.1666973978281021, | |
| "learning_rate": 4.898751590005826e-05, | |
| "loss": 1.0533, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.13006327652302435, | |
| "grad_norm": 0.151300847530365, | |
| "learning_rate": 4.870029084713462e-05, | |
| "loss": 0.9289, | |
| "step": 2816 | |
| }, | |
| { | |
| "epoch": 0.1418872107523902, | |
| "grad_norm": 0.12945112586021423, | |
| "learning_rate": 4.837831215209188e-05, | |
| "loss": 0.8296, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.1418872107523902, | |
| "eval_acr_loss": 0.9623512120823882, | |
| "eval_across_var": 0.019007292450682077, | |
| "eval_bleu": 0.8976890806937166, | |
| "eval_ce_loss": 0.323182335727291, | |
| "eval_cos_loss": 0.6147451090486082, | |
| "eval_cov": 0.07890243181899258, | |
| "eval_cov_loss": 0.010057339255515163, | |
| "eval_glb_loss": 0.0003083459862835331, | |
| "eval_global_kurtosis": 3.006787543971789, | |
| "eval_global_mean": 0.013840801639643978, | |
| "eval_global_var": 1.0159304544805936, | |
| "eval_krt_loss": 0.0002449526874326567, | |
| "eval_loss": 0.6247616745021245, | |
| "eval_mse_loss": 1.3700275383039153, | |
| "eval_per_loss": 5.431099025081826e-05, | |
| "eval_per_var": 0.9992698077197488, | |
| "eval_within_var": 0.997042422425257, | |
| "eval_wth_loss": 7.23102295562176e-05, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.1418872107523902, | |
| "eval_acr_loss": 0.9623512120823882, | |
| "eval_across_var": 0.019007292450682077, | |
| "eval_bleu": 0.8976890806937166, | |
| "eval_ce_loss": 0.323182335727291, | |
| "eval_cos_loss": 0.6147451090486082, | |
| "eval_cov": 0.07890243181899258, | |
| "eval_cov_loss": 0.010057339255515163, | |
| "eval_glb_loss": 0.0003083459862835331, | |
| "eval_global_kurtosis": 3.006787543971789, | |
| "eval_global_mean": 0.013840801639643978, | |
| "eval_global_var": 1.0159304544805936, | |
| "eval_krt_loss": 0.0002449526874326567, | |
| "eval_loss": 0.6247616745021245, | |
| "eval_mse_loss": 1.3700275383039153, | |
| "eval_per_loss": 5.431099025081826e-05, | |
| "eval_per_var": 0.9992698077197488, | |
| "eval_runtime": 153.8502, | |
| "eval_samples_per_second": 181.95, | |
| "eval_steps_per_second": 2.847, | |
| "eval_within_var": 0.997042422425257, | |
| "eval_wth_loss": 7.23102295562176e-05, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.15371114498175603, | |
| "grad_norm": 0.12589366734027863, | |
| "learning_rate": 4.802205195817963e-05, | |
| "loss": 0.7505, | |
| "step": 3328 | |
| }, | |
| { | |
| "epoch": 0.1655350792111219, | |
| "grad_norm": 0.11897846311330795, | |
| "learning_rate": 4.763203267836576e-05, | |
| "loss": 0.6825, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 0.17735901344048774, | |
| "grad_norm": 0.10772541910409927, | |
| "learning_rate": 4.720882622928019e-05, | |
| "loss": 0.6261, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.18918294766985358, | |
| "grad_norm": 0.10752860456705093, | |
| "learning_rate": 4.675305319256765e-05, | |
| "loss": 0.5794, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.18918294766985358, | |
| "eval_acr_loss": 0.9460189426870651, | |
| "eval_across_var": 0.027373042131974, | |
| "eval_bleu": 0.9357963139785648, | |
| "eval_ce_loss": 0.18706960009929796, | |
| "eval_cos_loss": 0.49577669084888615, | |
| "eval_cov": 0.07429023638163527, | |
| "eval_cov_loss": 0.0088696325406194, | |
| "eval_glb_loss": 0.0008705457051595052, | |
| "eval_global_kurtosis": 3.0126239273646105, | |
| "eval_global_mean": 0.019379994640611624, | |
| "eval_global_var": 1.0280014804509132, | |
| "eval_krt_loss": 0.0004827787981839751, | |
| "eval_loss": 0.44092568850408403, | |
| "eval_mse_loss": 1.145899186395619, | |
| "eval_per_loss": 0.000201601960343313, | |
| "eval_per_var": 1.0113285709189497, | |
| "eval_within_var": 1.0008215422499669, | |
| "eval_wth_loss": 0.00011882184727399328, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.18918294766985358, | |
| "eval_acr_loss": 0.9460189426870651, | |
| "eval_across_var": 0.027373042131974, | |
| "eval_bleu": 0.9357963139785648, | |
| "eval_ce_loss": 0.18706960009929796, | |
| "eval_cos_loss": 0.49577669084888615, | |
| "eval_cov": 0.07429023638163527, | |
| "eval_cov_loss": 0.0088696325406194, | |
| "eval_glb_loss": 0.0008705457051595052, | |
| "eval_global_kurtosis": 3.0126239273646105, | |
| "eval_global_mean": 0.019379994640611624, | |
| "eval_global_var": 1.0280014804509132, | |
| "eval_krt_loss": 0.0004827787981839751, | |
| "eval_loss": 0.44092568850408403, | |
| "eval_mse_loss": 1.145899186395619, | |
| "eval_per_loss": 0.000201601960343313, | |
| "eval_per_var": 1.0113285709189497, | |
| "eval_runtime": 154.1862, | |
| "eval_samples_per_second": 181.553, | |
| "eval_steps_per_second": 2.841, | |
| "eval_within_var": 1.0008215422499669, | |
| "eval_wth_loss": 0.00011882184727399328, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.20100688189921945, | |
| "grad_norm": 0.1053977757692337, | |
| "learning_rate": 4.6265381904878854e-05, | |
| "loss": 0.5371, | |
| "step": 4352 | |
| }, | |
| { | |
| "epoch": 0.2128308161285853, | |
| "grad_norm": 0.09793444722890854, | |
| "learning_rate": 4.57465274778347e-05, | |
| "loss": 0.5051, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 0.22465475035795113, | |
| "grad_norm": 0.09642181545495987, | |
| "learning_rate": 4.519725074940068e-05, | |
| "loss": 0.4712, | |
| "step": 4864 | |
| }, | |
| { | |
| "epoch": 0.236478684587317, | |
| "grad_norm": 0.09021387249231339, | |
| "learning_rate": 4.461835716820895e-05, | |
| "loss": 0.4441, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.236478684587317, | |
| "eval_acr_loss": 0.8976050177121271, | |
| "eval_across_var": 0.05263728992460661, | |
| "eval_bleu": 0.9557004800288741, | |
| "eval_ce_loss": 0.12269850065038629, | |
| "eval_cos_loss": 0.4099505911815112, | |
| "eval_cov": 0.0774190545626427, | |
| "eval_cov_loss": 0.00986302298595746, | |
| "eval_glb_loss": 0.002168635799460215, | |
| "eval_global_kurtosis": 3.0220293960614835, | |
| "eval_global_mean": 0.026287226916448168, | |
| "eval_global_var": 1.0453655643550228, | |
| "eval_krt_loss": 0.0009509958784072548, | |
| "eval_loss": 0.3404404183500978, | |
| "eval_mse_loss": 0.9835099409160004, | |
| "eval_per_loss": 0.0008347246744861342, | |
| "eval_per_var": 1.0266046393407535, | |
| "eval_within_var": 0.993057064541943, | |
| "eval_wth_loss": 0.00020235125328104065, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.236478684587317, | |
| "eval_acr_loss": 0.8976050177121271, | |
| "eval_across_var": 0.05263728992460661, | |
| "eval_bleu": 0.9557004800288741, | |
| "eval_ce_loss": 0.12269850065038629, | |
| "eval_cos_loss": 0.4099505911815112, | |
| "eval_cov": 0.0774190545626427, | |
| "eval_cov_loss": 0.00986302298595746, | |
| "eval_glb_loss": 0.002168635799460215, | |
| "eval_global_kurtosis": 3.0220293960614835, | |
| "eval_global_mean": 0.026287226916448168, | |
| "eval_global_var": 1.0453655643550228, | |
| "eval_krt_loss": 0.0009509958784072548, | |
| "eval_loss": 0.3404404183500978, | |
| "eval_mse_loss": 0.9835099409160004, | |
| "eval_per_loss": 0.0008347246744861342, | |
| "eval_per_var": 1.0266046393407535, | |
| "eval_runtime": 152.7613, | |
| "eval_samples_per_second": 183.247, | |
| "eval_steps_per_second": 2.867, | |
| "eval_within_var": 0.993057064541943, | |
| "eval_wth_loss": 0.00020235125328104065, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.24830261881668284, | |
| "grad_norm": 0.08437229692935944, | |
| "learning_rate": 4.401069561246422e-05, | |
| "loss": 0.4198, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 0.2601265530460487, | |
| "grad_norm": 0.09224370121955872, | |
| "learning_rate": 4.337515714516545e-05, | |
| "loss": 0.398, | |
| "step": 5632 | |
| }, | |
| { | |
| "epoch": 0.27195048727541454, | |
| "grad_norm": 0.09789060801267624, | |
| "learning_rate": 4.2712673707468434e-05, | |
| "loss": 0.3822, | |
| "step": 5888 | |
| }, | |
| { | |
| "epoch": 0.2837744215047804, | |
| "grad_norm": 0.08814027905464172, | |
| "learning_rate": 4.202421675210565e-05, | |
| "loss": 0.3652, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.2837744215047804, | |
| "eval_acr_loss": 0.8202883884242681, | |
| "eval_across_var": 0.09450281369733755, | |
| "eval_bleu": 0.9674697145362666, | |
| "eval_ce_loss": 0.0872740181817856, | |
| "eval_cos_loss": 0.35029126064145943, | |
| "eval_cov": 0.08723331259810217, | |
| "eval_cov_loss": 0.012916838626917367, | |
| "eval_glb_loss": 0.00597613696093973, | |
| "eval_global_kurtosis": 3.024092547969731, | |
| "eval_global_mean": 0.04041210157141838, | |
| "eval_global_var": 1.0756590682077625, | |
| "eval_krt_loss": 0.0019540588812798874, | |
| "eval_loss": 0.27823208194209015, | |
| "eval_mse_loss": 0.8730145360781177, | |
| "eval_per_loss": 0.0027831836378193336, | |
| "eval_per_var": 1.0494290007848173, | |
| "eval_within_var": 0.9819663109844678, | |
| "eval_wth_loss": 0.00048046983986418896, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.2837744215047804, | |
| "eval_acr_loss": 0.8202883884242681, | |
| "eval_across_var": 0.09450281369733755, | |
| "eval_bleu": 0.9674697145362666, | |
| "eval_ce_loss": 0.0872740181817856, | |
| "eval_cos_loss": 0.35029126064145943, | |
| "eval_cov": 0.08723331259810217, | |
| "eval_cov_loss": 0.012916838626917367, | |
| "eval_glb_loss": 0.00597613696093973, | |
| "eval_global_kurtosis": 3.024092547969731, | |
| "eval_global_mean": 0.04041210157141838, | |
| "eval_global_var": 1.0756590682077625, | |
| "eval_krt_loss": 0.0019540588812798874, | |
| "eval_loss": 0.27823208194209015, | |
| "eval_mse_loss": 0.8730145360781177, | |
| "eval_per_loss": 0.0027831836378193336, | |
| "eval_per_var": 1.0494290007848173, | |
| "eval_runtime": 152.787, | |
| "eval_samples_per_second": 183.216, | |
| "eval_steps_per_second": 2.867, | |
| "eval_within_var": 0.9819663109844678, | |
| "eval_wth_loss": 0.00048046983986418896, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.2955983557341462, | |
| "grad_norm": 0.0873865932226181, | |
| "learning_rate": 4.131362984409538e-05, | |
| "loss": 0.3471, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.30742228996351206, | |
| "grad_norm": 0.07706636935472488, | |
| "learning_rate": 4.0576382427305324e-05, | |
| "loss": 0.3341, | |
| "step": 6656 | |
| }, | |
| { | |
| "epoch": 0.3192462241928779, | |
| "grad_norm": 0.09675078094005585, | |
| "learning_rate": 3.981629410852463e-05, | |
| "loss": 0.323, | |
| "step": 6912 | |
| }, | |
| { | |
| "epoch": 0.3310701584222438, | |
| "grad_norm": 0.09393124282360077, | |
| "learning_rate": 3.903447946637093e-05, | |
| "loss": 0.3086, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.3310701584222438, | |
| "eval_acr_loss": 0.726046212868059, | |
| "eval_across_var": 0.14836800615537113, | |
| "eval_bleu": 0.9750192561822552, | |
| "eval_ce_loss": 0.06533130286389018, | |
| "eval_cos_loss": 0.3090691367785136, | |
| "eval_cov": 0.0970152415097032, | |
| "eval_cov_loss": 0.01665312849988812, | |
| "eval_glb_loss": 0.012594995977671723, | |
| "eval_global_kurtosis": 3.052053846180711, | |
| "eval_global_mean": 0.055559001556814536, | |
| "eval_global_var": 1.1099212507134704, | |
| "eval_krt_loss": 0.005212522339852683, | |
| "eval_loss": 0.23639628832062629, | |
| "eval_mse_loss": 0.800027005204327, | |
| "eval_per_loss": 0.004818411722575149, | |
| "eval_per_var": 1.0654642462186072, | |
| "eval_within_var": 0.9631439711405262, | |
| "eval_wth_loss": 0.0015324829790224052, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.3310701584222438, | |
| "eval_acr_loss": 0.726046212868059, | |
| "eval_across_var": 0.14836800615537113, | |
| "eval_bleu": 0.9750192561822552, | |
| "eval_ce_loss": 0.06533130286389018, | |
| "eval_cos_loss": 0.3090691367785136, | |
| "eval_cov": 0.0970152415097032, | |
| "eval_cov_loss": 0.01665312849988812, | |
| "eval_glb_loss": 0.012594995977671723, | |
| "eval_global_kurtosis": 3.052053846180711, | |
| "eval_global_mean": 0.055559001556814536, | |
| "eval_global_var": 1.1099212507134704, | |
| "eval_krt_loss": 0.005212522339852683, | |
| "eval_loss": 0.23639628832062629, | |
| "eval_mse_loss": 0.800027005204327, | |
| "eval_per_loss": 0.004818411722575149, | |
| "eval_per_var": 1.0654642462186072, | |
| "eval_runtime": 149.7025, | |
| "eval_samples_per_second": 186.991, | |
| "eval_steps_per_second": 2.926, | |
| "eval_within_var": 0.9631439711405262, | |
| "eval_wth_loss": 0.0015324829790224052, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.34289409265160964, | |
| "grad_norm": 0.08861193060874939, | |
| "learning_rate": 3.823208493851674e-05, | |
| "loss": 0.3004, | |
| "step": 7424 | |
| }, | |
| { | |
| "epoch": 0.3547180268809755, | |
| "grad_norm": 0.10392362624406815, | |
| "learning_rate": 3.741028714057574e-05, | |
| "loss": 0.2906, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.3665419611103413, | |
| "grad_norm": 0.09276238083839417, | |
| "learning_rate": 3.657029114073668e-05, | |
| "loss": 0.2807, | |
| "step": 7936 | |
| }, | |
| { | |
| "epoch": 0.37836589533970716, | |
| "grad_norm": 0.08639795333147049, | |
| "learning_rate": 3.571332869267499e-05, | |
| "loss": 0.2715, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.37836589533970716, | |
| "eval_acr_loss": 0.6639264498127105, | |
| "eval_across_var": 0.18578778622357267, | |
| "eval_bleu": 0.9802777908340867, | |
| "eval_ce_loss": 0.05099814674778752, | |
| "eval_cos_loss": 0.2809805735333325, | |
| "eval_cov": 0.10055263292843893, | |
| "eval_cov_loss": 0.018623638194326532, | |
| "eval_glb_loss": 0.017183700108637005, | |
| "eval_global_kurtosis": 3.0215004377713486, | |
| "eval_global_mean": 0.06458907802355345, | |
| "eval_global_var": 1.1285004637557077, | |
| "eval_krt_loss": 0.0026812796775613885, | |
| "eval_loss": 0.20853681022037654, | |
| "eval_mse_loss": 0.7543587909169394, | |
| "eval_per_loss": 0.004951255506576468, | |
| "eval_per_var": 1.0664095943921232, | |
| "eval_within_var": 0.9453123360191851, | |
| "eval_wth_loss": 0.0031645966432090277, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.37836589533970716, | |
| "eval_acr_loss": 0.6639264498127105, | |
| "eval_across_var": 0.18578778622357267, | |
| "eval_bleu": 0.9802777908340867, | |
| "eval_ce_loss": 0.05099814674778752, | |
| "eval_cos_loss": 0.2809805735333325, | |
| "eval_cov": 0.10055263292843893, | |
| "eval_cov_loss": 0.018623638194326532, | |
| "eval_glb_loss": 0.017183700108637005, | |
| "eval_global_kurtosis": 3.0215004377713486, | |
| "eval_global_mean": 0.06458907802355345, | |
| "eval_global_var": 1.1285004637557077, | |
| "eval_krt_loss": 0.0026812796775613885, | |
| "eval_loss": 0.20853681022037654, | |
| "eval_mse_loss": 0.7543587909169394, | |
| "eval_per_loss": 0.004951255506576468, | |
| "eval_per_var": 1.0664095943921232, | |
| "eval_runtime": 151.9683, | |
| "eval_samples_per_second": 184.203, | |
| "eval_steps_per_second": 2.882, | |
| "eval_within_var": 0.9453123360191851, | |
| "eval_wth_loss": 0.0031645966432090277, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.390189829569073, | |
| "grad_norm": 0.09597329795360565, | |
| "learning_rate": 3.484065642933335e-05, | |
| "loss": 0.2653, | |
| "step": 8448 | |
| }, | |
| { | |
| "epoch": 0.4020137637984389, | |
| "grad_norm": 0.09884477406740189, | |
| "learning_rate": 3.395704566041868e-05, | |
| "loss": 0.2588, | |
| "step": 8704 | |
| }, | |
| { | |
| "epoch": 0.41383769802780473, | |
| "grad_norm": 0.12420962005853653, | |
| "learning_rate": 3.305686266521002e-05, | |
| "loss": 0.251, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.4256616322571706, | |
| "grad_norm": 0.09434681385755539, | |
| "learning_rate": 3.214486524445379e-05, | |
| "loss": 0.2459, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.4256616322571706, | |
| "eval_acr_loss": 0.5954188222074073, | |
| "eval_across_var": 0.22920796821174555, | |
| "eval_bleu": 0.9836602165246535, | |
| "eval_ce_loss": 0.041640645169470136, | |
| "eval_cos_loss": 0.2611509262835054, | |
| "eval_cov": 0.1050533486283533, | |
| "eval_cov_loss": 0.02121386145875198, | |
| "eval_glb_loss": 0.02622999339343206, | |
| "eval_global_kurtosis": 3.059722278216114, | |
| "eval_global_mean": 0.07356201894751423, | |
| "eval_global_var": 1.1591975242579908, | |
| "eval_krt_loss": 0.0066943209811956075, | |
| "eval_loss": 0.18931484470764795, | |
| "eval_mse_loss": 0.7253989612675149, | |
| "eval_per_loss": 0.007019040791411378, | |
| "eval_per_var": 1.0800580586472603, | |
| "eval_within_var": 0.9334883986542758, | |
| "eval_wth_loss": 0.004603328326420399, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.4256616322571706, | |
| "eval_acr_loss": 0.5954188222074073, | |
| "eval_across_var": 0.22920796821174555, | |
| "eval_bleu": 0.9836602165246535, | |
| "eval_ce_loss": 0.041640645169470136, | |
| "eval_cos_loss": 0.2611509262835054, | |
| "eval_cov": 0.1050533486283533, | |
| "eval_cov_loss": 0.02121386145875198, | |
| "eval_glb_loss": 0.02622999339343206, | |
| "eval_global_kurtosis": 3.059722278216114, | |
| "eval_global_mean": 0.07356201894751423, | |
| "eval_global_var": 1.1591975242579908, | |
| "eval_krt_loss": 0.0066943209811956075, | |
| "eval_loss": 0.18931484470764795, | |
| "eval_mse_loss": 0.7253989612675149, | |
| "eval_per_loss": 0.007019040791411378, | |
| "eval_per_var": 1.0800580586472603, | |
| "eval_runtime": 152.3156, | |
| "eval_samples_per_second": 183.783, | |
| "eval_steps_per_second": 2.876, | |
| "eval_within_var": 0.9334883986542758, | |
| "eval_wth_loss": 0.004603328326420399, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.4374855664865364, | |
| "grad_norm": 0.09029538929462433, | |
| "learning_rate": 3.122239073329827e-05, | |
| "loss": 0.241, | |
| "step": 9472 | |
| }, | |
| { | |
| "epoch": 0.44930950071590225, | |
| "grad_norm": 0.0906376913189888, | |
| "learning_rate": 3.029079183029181e-05, | |
| "loss": 0.2373, | |
| "step": 9728 | |
| }, | |
| { | |
| "epoch": 0.4611334349452681, | |
| "grad_norm": 0.08700253814458847, | |
| "learning_rate": 2.935143461381221e-05, | |
| "loss": 0.2319, | |
| "step": 9984 | |
| }, | |
| { | |
| "epoch": 0.472957369174634, | |
| "grad_norm": 0.12261617183685303, | |
| "learning_rate": 2.8405696538876124e-05, | |
| "loss": 0.2291, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.472957369174634, | |
| "eval_acr_loss": 0.5550906222024464, | |
| "eval_across_var": 0.2559552679867505, | |
| "eval_bleu": 0.9864228314384785, | |
| "eval_ce_loss": 0.03463799520246855, | |
| "eval_cos_loss": 0.2463943555815035, | |
| "eval_cov": 0.10881105187821062, | |
| "eval_cov_loss": 0.023118084696329894, | |
| "eval_glb_loss": 0.031056445483203347, | |
| "eval_global_kurtosis": 3.0601126701320145, | |
| "eval_global_mean": 0.08328581727258691, | |
| "eval_global_var": 1.173308629423516, | |
| "eval_krt_loss": 0.0071202065942054926, | |
| "eval_loss": 0.17534998668247162, | |
| "eval_mse_loss": 0.7060943077144013, | |
| "eval_per_loss": 0.007063817760171411, | |
| "eval_per_var": 1.0801762271689497, | |
| "eval_within_var": 0.9215792903344925, | |
| "eval_wth_loss": 0.006330131607865872, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.472957369174634, | |
| "eval_acr_loss": 0.5550906222024464, | |
| "eval_across_var": 0.2559552679867505, | |
| "eval_bleu": 0.9864228314384785, | |
| "eval_ce_loss": 0.03463799520246855, | |
| "eval_cos_loss": 0.2463943555815035, | |
| "eval_cov": 0.10881105187821062, | |
| "eval_cov_loss": 0.023118084696329894, | |
| "eval_glb_loss": 0.031056445483203347, | |
| "eval_global_kurtosis": 3.0601126701320145, | |
| "eval_global_mean": 0.08328581727258691, | |
| "eval_global_var": 1.173308629423516, | |
| "eval_krt_loss": 0.0071202065942054926, | |
| "eval_loss": 0.17534998668247162, | |
| "eval_mse_loss": 0.7060943077144013, | |
| "eval_per_loss": 0.007063817760171411, | |
| "eval_per_var": 1.0801762271689497, | |
| "eval_runtime": 149.1447, | |
| "eval_samples_per_second": 187.69, | |
| "eval_steps_per_second": 2.937, | |
| "eval_within_var": 0.9215792903344925, | |
| "eval_wth_loss": 0.006330131607865872, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.48478130340399983, | |
| "grad_norm": 0.0960695669054985, | |
| "learning_rate": 2.7454964417266016e-05, | |
| "loss": 0.2243, | |
| "step": 10496 | |
| }, | |
| { | |
| "epoch": 0.49660523763336567, | |
| "grad_norm": 0.09179438650608063, | |
| "learning_rate": 2.6504365436879463e-05, | |
| "loss": 0.2201, | |
| "step": 10752 | |
| }, | |
| { | |
| "epoch": 0.5084291718627315, | |
| "grad_norm": 0.12633387744426727, | |
| "learning_rate": 2.5547838773922217e-05, | |
| "loss": 0.2189, | |
| "step": 11008 | |
| }, | |
| { | |
| "epoch": 0.5202531060920974, | |
| "grad_norm": 0.10988616943359375, | |
| "learning_rate": 2.4590508770921138e-05, | |
| "loss": 0.2148, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.5202531060920974, | |
| "eval_acr_loss": 0.5188274796558842, | |
| "eval_across_var": 0.2808882309353515, | |
| "eval_bleu": 0.9881634510768484, | |
| "eval_ce_loss": 0.029888733919522806, | |
| "eval_cos_loss": 0.2354265242202641, | |
| "eval_cov": 0.1133959347798944, | |
| "eval_cov_loss": 0.025275619128574383, | |
| "eval_glb_loss": 0.03796970898702264, | |
| "eval_global_kurtosis": 3.0807342012178953, | |
| "eval_global_mean": 0.09449117042158292, | |
| "eval_global_var": 1.1918365617865296, | |
| "eval_krt_loss": 0.010437782962688104, | |
| "eval_loss": 0.16559295880195757, | |
| "eval_mse_loss": 0.6934955275222047, | |
| "eval_per_loss": 0.008229013991682497, | |
| "eval_per_var": 1.0867022509988584, | |
| "eval_within_var": 0.9157081242293528, | |
| "eval_wth_loss": 0.007290151238135279, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.5202531060920974, | |
| "eval_acr_loss": 0.5188274796558842, | |
| "eval_across_var": 0.2808882309353515, | |
| "eval_bleu": 0.9881634510768484, | |
| "eval_ce_loss": 0.029888733919522806, | |
| "eval_cos_loss": 0.2354265242202641, | |
| "eval_cov": 0.1133959347798944, | |
| "eval_cov_loss": 0.025275619128574383, | |
| "eval_glb_loss": 0.03796970898702264, | |
| "eval_global_kurtosis": 3.0807342012178953, | |
| "eval_global_mean": 0.09449117042158292, | |
| "eval_global_var": 1.1918365617865296, | |
| "eval_krt_loss": 0.010437782962688104, | |
| "eval_loss": 0.16559295880195757, | |
| "eval_mse_loss": 0.6934955275222047, | |
| "eval_per_loss": 0.008229013991682497, | |
| "eval_per_var": 1.0867022509988584, | |
| "eval_runtime": 150.6787, | |
| "eval_samples_per_second": 185.779, | |
| "eval_steps_per_second": 2.907, | |
| "eval_within_var": 0.9157081242293528, | |
| "eval_wth_loss": 0.007290151238135279, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.5320770403214632, | |
| "grad_norm": 0.10263644903898239, | |
| "learning_rate": 2.3633779237834874e-05, | |
| "loss": 0.2118, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 0.5439009745508291, | |
| "grad_norm": 0.0880010798573494, | |
| "learning_rate": 2.267905310410478e-05, | |
| "loss": 0.2097, | |
| "step": 11776 | |
| }, | |
| { | |
| "epoch": 0.5557249087801949, | |
| "grad_norm": 0.08736666291952133, | |
| "learning_rate": 2.172773036142663e-05, | |
| "loss": 0.2077, | |
| "step": 12032 | |
| }, | |
| { | |
| "epoch": 0.5675488430095608, | |
| "grad_norm": 0.08412963151931763, | |
| "learning_rate": 2.0781206010830228e-05, | |
| "loss": 0.2063, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 0.5675488430095608, | |
| "eval_acr_loss": 0.5011714086548923, | |
| "eval_across_var": 0.2933282603479956, | |
| "eval_bleu": 0.9892267845036895, | |
| "eval_ce_loss": 0.026746679018740647, | |
| "eval_cos_loss": 0.22736865680103432, | |
| "eval_cov": 0.11331831701269977, | |
| "eval_cov_loss": 0.02555328495502812, | |
| "eval_glb_loss": 0.03884035702709738, | |
| "eval_global_kurtosis": 3.0901281877195452, | |
| "eval_global_mean": 0.09504469671205842, | |
| "eval_global_var": 1.193941299229452, | |
| "eval_krt_loss": 0.012473824904427331, | |
| "eval_loss": 0.15896797551003766, | |
| "eval_mse_loss": 0.6858495211220224, | |
| "eval_per_loss": 0.007296311800882697, | |
| "eval_per_var": 1.081324468464612, | |
| "eval_within_var": 0.9059402656881776, | |
| "eval_wth_loss": 0.009029195172984374, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 0.5675488430095608, | |
| "eval_acr_loss": 0.5011714086548923, | |
| "eval_across_var": 0.2933282603479956, | |
| "eval_bleu": 0.9892267845036895, | |
| "eval_ce_loss": 0.026746679018740647, | |
| "eval_cos_loss": 0.22736865680103432, | |
| "eval_cov": 0.11331831701269977, | |
| "eval_cov_loss": 0.02555328495502812, | |
| "eval_glb_loss": 0.03884035702709738, | |
| "eval_global_kurtosis": 3.0901281877195452, | |
| "eval_global_mean": 0.09504469671205842, | |
| "eval_global_var": 1.193941299229452, | |
| "eval_krt_loss": 0.012473824904427331, | |
| "eval_loss": 0.15896797551003766, | |
| "eval_mse_loss": 0.6858495211220224, | |
| "eval_per_loss": 0.007296311800882697, | |
| "eval_per_var": 1.081324468464612, | |
| "eval_runtime": 150.6158, | |
| "eval_samples_per_second": 185.857, | |
| "eval_steps_per_second": 2.908, | |
| "eval_within_var": 0.9059402656881776, | |
| "eval_wth_loss": 0.009029195172984374, | |
| "step": 12288 | |
| } | |
| ], | |
| "logging_steps": 256, | |
| "max_steps": 21651, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1024, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |