{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3310701584222438, "eval_steps": 1024, "global_step": 7168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.1487267017364502, "learning_rate": 0.000498046875, "loss": 11.798027992248535, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.8386015295982361, "learning_rate": 0.000998046875, "loss": 1.7853779792785645, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.7344363331794739, "learning_rate": 0.000999640996023194, "loss": 1.103014588356018, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 1.1188315153121948, "learning_rate": 0.0009985588674043958, "loss": 0.9580796360969543, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.9192375404443994, "eval_ce_loss": 0.26167990953648745, "eval_cos_loss": 0.26406568816127296, "eval_loss": 0.9037111119864738, "eval_mse_loss": 0.6016385473617135, "eval_rec_loss": 0.013986090569199833, "flow/cos_sim": 0.7359343250048215, "flow/improvement_ratio": 0.9760946458605326, "flow/mag_ratio_mean": 0.7269674200717717, "flow/mag_ratio_std": 0.1390784539316343, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.9192375404443994, "eval_ce_loss": 0.26167990953648745, "eval_cos_loss": 0.26406568816127296, "eval_loss": 0.9037111119864738, "eval_mse_loss": 0.6016385473617135, "eval_rec_loss": 0.013986090569199833, "eval_runtime": 144.0156, "eval_samples_per_second": 194.375, "eval_steps_per_second": 3.041, "flow/cos_sim": 0.7359343250048215, "flow/improvement_ratio": 0.9760946458605326, "flow/mag_ratio_mean": 0.7269674200717717, "flow/mag_ratio_std": 0.1390784539316343, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 1.0113043785095215, "learning_rate": 0.0009967551747861387, "loss": 0.8836896419525146, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.9680636525154114, "learning_rate": 0.000994232528651847, "loss": 0.8432819247245789, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 1.166627049446106, "learning_rate": 0.0009909945800260092, "loss": 0.7870283126831055, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.7747617363929749, "learning_rate": 0.0009870460151900522, "loss": 0.7735522389411926, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.9264483676285591, "eval_ce_loss": 0.22021640589690372, "eval_cos_loss": 0.15934634184864557, "eval_loss": 0.751849727815689, "eval_mse_loss": 0.51190055103879, "eval_rec_loss": 0.0037981332031229603, "flow/cos_sim": 0.8406536742432477, "flow/improvement_ratio": 0.9754726998337871, "flow/mag_ratio_mean": 0.8395581131112085, "flow/mag_ratio_std": 0.09706963221096013, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.9264483676285591, "eval_ce_loss": 0.22021640589690372, "eval_cos_loss": 0.15934634184864557, "eval_loss": 0.751849727815689, "eval_mse_loss": 0.51190055103879, "eval_rec_loss": 0.0037981332031229603, "eval_runtime": 139.3758, "eval_samples_per_second": 200.845, "eval_steps_per_second": 3.143, "flow/cos_sim": 0.8406536742432477, "flow/improvement_ratio": 0.9754726998337871, "flow/mag_ratio_mean": 0.8395581131112085, "flow/mag_ratio_std": 0.09706963221096013, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 1.3507742881774902, "learning_rate": 0.0009823925488998885, "loss": 0.7531520128250122, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 1.090326189994812, "learning_rate": 0.0009770409161149525, "loss": 0.7384664416313171, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 1.6648627519607544, "learning_rate": 0.0009709988622506973, "loss": 0.7159472107887268, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.720405638217926, "learning_rate": 0.000964275131968659, "loss": 0.7134207487106323, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.9362046037171927, "eval_ce_loss": 0.18669388993094638, "eval_cos_loss": 0.09599785676829892, "eval_loss": 0.7009708360177741, "eval_mse_loss": 0.5027340843797274, "eval_rec_loss": 0.0019430727923829022, "flow/cos_sim": 0.9040021625555814, "flow/improvement_ratio": 0.9753212502830104, "flow/mag_ratio_mean": 0.906250716208323, "flow/mag_ratio_std": 0.07307879087519428, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.9362046037171927, "eval_ce_loss": 0.18669388993094638, "eval_cos_loss": 0.09599785676829892, "eval_loss": 0.7009708360177741, "eval_mse_loss": 0.5027340843797274, "eval_rec_loss": 0.0019430727923829022, "eval_runtime": 139.2617, "eval_samples_per_second": 201.01, "eval_steps_per_second": 3.145, "flow/cos_sim": 0.9040021625555814, "flow/improvement_ratio": 0.9753212502830104, "flow/mag_ratio_mean": 0.906250716208323, "flow/mag_ratio_std": 0.07307879087519428, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 1.171225905418396, "learning_rate": 0.0009568794565203123, "loss": 0.6967981457710266, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.9184499979019165, "learning_rate": 0.0009488225396630347, "loss": 0.6859588623046875, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 1.0972322225570679, "learning_rate": 0.0009401160421685646, "loss": 0.68483966588974, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 1.2944236993789673, "learning_rate": 0.0009307725649463714, "loss": 0.6722217202186584, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.9367529454304925, "eval_ce_loss": 0.17851220300531687, "eval_cos_loss": 0.0685597131088308, "eval_loss": 0.6638682314522191, "eval_mse_loss": 0.4771829223660029, "eval_rec_loss": 0.0013171346048419428, "flow/cos_sim": 0.9314403127045392, "flow/improvement_ratio": 0.9747336862021929, "flow/mag_ratio_mean": 0.9314602082722807, "flow/mag_ratio_std": 0.05983651272068013, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.9367529454304925, "eval_ce_loss": 0.17851220300531687, "eval_cos_loss": 0.0685597131088308, "eval_loss": 0.6638682314522191, "eval_mse_loss": 0.4771829223660029, "eval_rec_loss": 0.0013171346048419428, "eval_runtime": 143.247, "eval_samples_per_second": 195.418, "eval_steps_per_second": 3.058, "flow/cos_sim": 0.9314403127045392, "flow/improvement_ratio": 0.9747336862021929, "flow/mag_ratio_mean": 0.9314602082722807, "flow/mag_ratio_std": 0.05983651272068013, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 1.0950664281845093, "learning_rate": 0.0009208056308063659, "loss": 0.6635431051254272, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 1.0510311126708984, "learning_rate": 0.0009102296648873445, "loss": 0.652130126953125, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.7107524275779724, "learning_rate": 0.0008990599737794927, "loss": 0.6548014283180237, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 1.119279146194458, "learning_rate": 0.0008873127233711644, "loss": 0.644295871257782, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.9375087809294717, "eval_ce_loss": 0.17866700632629498, "eval_cos_loss": 0.05568918508379699, "eval_loss": 0.6384022356304404, "eval_mse_loss": 0.4531491862856634, "eval_rec_loss": 0.0010171248973892112, "flow/cos_sim": 0.9443108406785417, "flow/improvement_ratio": 0.9754358607612245, "flow/mag_ratio_mean": 0.9432625220790846, "flow/mag_ratio_std": 0.0532344374550532, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.9375087809294717, "eval_ce_loss": 0.17866700632629498, "eval_cos_loss": 0.05568918508379699, "eval_loss": 0.6384022356304404, "eval_mse_loss": 0.4531491862856634, "eval_rec_loss": 0.0010171248973892112, "eval_runtime": 140.7376, "eval_samples_per_second": 198.902, "eval_steps_per_second": 3.112, "flow/cos_sim": 0.9443108406785417, "flow/improvement_ratio": 0.9754358607612245, "flow/mag_ratio_mean": 0.9432625220790846, "flow/mag_ratio_std": 0.0532344374550532, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 1.223329782485962, "learning_rate": 0.0008750049154520011, "loss": 0.6385497450828552, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.7951129078865051, "learning_rate": 0.0008621543631062487, "loss": 0.6350575089454651, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.8830247521400452, "learning_rate": 0.0008487796649318904, "loss": 0.6269800066947937, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 1.0399079322814941, "learning_rate": 0.0008349001781229053, "loss": 0.6236906051635742, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.9406288888554537, "eval_ce_loss": 0.16271350685439018, "eval_cos_loss": 0.04946032597696128, "eval_loss": 0.6158577807962078, "eval_mse_loss": 0.4472781540868489, "eval_rec_loss": 0.0009200886164281037, "flow/cos_sim": 0.9505396935765602, "flow/improvement_ratio": 0.9752184091365501, "flow/mag_ratio_mean": 0.9574754819205907, "flow/mag_ratio_std": 0.04891788842131014, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.9406288888554537, "eval_ce_loss": 0.16271350685439018, "eval_cos_loss": 0.04946032597696128, "eval_loss": 0.6158577807962078, "eval_mse_loss": 0.4472781540868489, "eval_rec_loss": 0.0009200886164281037, "eval_runtime": 138.7881, "eval_samples_per_second": 201.696, "eval_steps_per_second": 3.156, "flow/cos_sim": 0.9505396935765602, "flow/improvement_ratio": 0.9752184091365501, "flow/mag_ratio_mean": 0.9574754819205907, "flow/mag_ratio_std": 0.04891788842131014, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 1.2804330587387085, "learning_rate": 0.0008205359904536107, "loss": 0.6104704737663269, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 1.038807988166809, "learning_rate": 0.0008057078912056363, "loss": 0.6035579442977905, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 1.1162539720535278, "learning_rate": 0.0007904373410796086, "loss": 0.6099694967269897, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.8053554892539978, "learning_rate": 0.0007747464411350876, "loss": 0.6010444760322571, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.9403609007223014, "eval_ce_loss": 0.16489822024130793, "eval_cos_loss": 0.042615741474307293, "eval_loss": 0.5965314044799979, "eval_mse_loss": 0.4266453656839998, "eval_rec_loss": 0.0007262465627901213, "flow/cos_sim": 0.9573842790573155, "flow/improvement_ratio": 0.9754182146564466, "flow/mag_ratio_mean": 0.9583017167435389, "flow/mag_ratio_std": 0.044361623012584096, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.9403609007223014, "eval_ce_loss": 0.16489822024130793, "eval_cos_loss": 0.042615741474307293, "eval_loss": 0.5965314044799979, "eval_mse_loss": 0.4266453656839998, "eval_rec_loss": 0.0007262465627901213, "eval_runtime": 141.0373, "eval_samples_per_second": 198.479, "eval_steps_per_second": 3.106, "flow/cos_sim": 0.9573842790573155, "flow/improvement_ratio": 0.9754182146564466, "flow/mag_ratio_mean": 0.9583017167435389, "flow/mag_ratio_std": 0.044361623012584096, "step": 7168 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }