{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5675488430095608, "eval_steps": 1024, "global_step": 12288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047295736917463395, "grad_norm": 2.106490135192871, "learning_rate": 0.00029970703124999995, "loss": 7.3224077224731445, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.16704781801498278, "eval_ce_clean_loss": 0.8006052018300583, "eval_ce_pred_loss": 5.09323315206728, "eval_flow_cos_loss": 0.25552307487758874, "eval_flow_mse_loss": 0.4737413005741764, "eval_loss": 5.201592297314509, "flow/cos_sim": 0.7444769605381848, "flow/improvement_ratio": 0.9690007240804908, "flow/mag_ratio_mean": 0.5684286470962986, "flow/mag_ratio_std": 0.1943025813187094, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.16704781801498278, "eval_ce_clean_loss": 0.8006052018300583, "eval_ce_pred_loss": 5.09323315206728, "eval_flow_cos_loss": 0.25552307487758874, "eval_flow_mse_loss": 0.4737413005741764, "eval_loss": 5.201592297314509, "eval_runtime": 184.3437, "eval_samples_per_second": 151.852, "eval_steps_per_second": 2.376, "flow/cos_sim": 0.7444769605381848, "flow/improvement_ratio": 0.9690007240804908, "flow/mag_ratio_mean": 0.5684286470962986, "flow/mag_ratio_std": 0.1943025813187094, "step": 1024 }, { "epoch": 0.09459147383492679, "grad_norm": 1.5151523351669312, "learning_rate": 0.00029818297300322, "loss": 5.1176676750183105, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.15959126254389083, "eval_ce_clean_loss": 0.2092574786854117, "eval_ce_pred_loss": 5.118338076491334, "eval_flow_cos_loss": 0.13469866032129554, "eval_flow_mse_loss": 0.3451302863963663, "eval_loss": 5.006234865754707, "flow/cos_sim": 0.865301351841182, "flow/improvement_ratio": 0.993777159822586, "flow/mag_ratio_mean": 0.764228024575264, "flow/mag_ratio_std": 0.1451562752938706, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.15959126254389083, "eval_ce_clean_loss": 0.2092574786854117, "eval_ce_pred_loss": 5.118338076491334, "eval_flow_cos_loss": 0.13469866032129554, "eval_flow_mse_loss": 0.3451302863963663, "eval_loss": 5.006234865754707, "eval_runtime": 179.8087, "eval_samples_per_second": 155.682, "eval_steps_per_second": 2.436, "flow/cos_sim": 0.865301351841182, "flow/improvement_ratio": 0.993777159822586, "flow/mag_ratio_mean": 0.764228024575264, "flow/mag_ratio_std": 0.1451562752938706, "step": 2048 }, { "epoch": 0.1418872107523902, "grad_norm": 2.0519113540649414, "learning_rate": 0.0002927689070858589, "loss": 4.94038724899292, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.19109057338236854, "eval_ce_clean_loss": 0.09927985731291172, "eval_ce_pred_loss": 4.847544043031458, "eval_flow_cos_loss": 0.11591754169905022, "eval_flow_mse_loss": 0.3725693541574696, "eval_loss": 4.7742662386262795, "flow/cos_sim": 0.8840824780671019, "flow/improvement_ratio": 0.9947053511120957, "flow/mag_ratio_mean": 0.8077854450162687, "flow/mag_ratio_std": 0.12759848946033547, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.19109057338236854, "eval_ce_clean_loss": 0.09927985731291172, "eval_ce_pred_loss": 4.847544043031458, "eval_flow_cos_loss": 0.11591754169905022, "eval_flow_mse_loss": 0.3725693541574696, "eval_loss": 4.7742662386262795, "eval_runtime": 179.382, "eval_samples_per_second": 156.052, "eval_steps_per_second": 2.442, "flow/cos_sim": 0.8840824780671019, "flow/improvement_ratio": 0.9947053511120957, "flow/mag_ratio_mean": 0.8077854450162687, "flow/mag_ratio_std": 0.12759848946033547, "step": 3072 }, { "epoch": 0.18918294766985358, "grad_norm": 2.0934338569641113, "learning_rate": 0.0002838892232677901, "loss": 4.741153240203857, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.18968657464122718, "eval_ce_clean_loss": 0.05336422391967278, "eval_ce_pred_loss": 4.809050649268442, "eval_flow_cos_loss": 0.07764796834422029, "eval_flow_mse_loss": 0.3626167481620562, "eval_loss": 4.7155106383371574, "flow/cos_sim": 0.9223520577498222, "flow/improvement_ratio": 0.9944308828817655, "flow/mag_ratio_mean": 0.8769120071848778, "flow/mag_ratio_std": 0.09842952985537651, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.18968657464122718, "eval_ce_clean_loss": 0.05336422391967278, "eval_ce_pred_loss": 4.809050649268442, "eval_flow_cos_loss": 0.07764796834422029, "eval_flow_mse_loss": 0.3626167481620562, "eval_loss": 4.7155106383371574, "eval_runtime": 179.0768, "eval_samples_per_second": 156.318, "eval_steps_per_second": 2.446, "flow/cos_sim": 0.9223520577498222, "flow/improvement_ratio": 0.9944308828817655, "flow/mag_ratio_mean": 0.8769120071848778, "flow/mag_ratio_std": 0.09842952985537651, "step": 4096 }, { "epoch": 0.236478684587317, "grad_norm": 2.3496689796447754, "learning_rate": 0.00027177281107320826, "loss": 4.68638801574707, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.22356960405367263, "eval_ce_clean_loss": 0.0327355340824858, "eval_ce_pred_loss": 4.565925827853756, "eval_flow_cos_loss": 0.08153306739617429, "eval_flow_mse_loss": 0.3821449596860093, "eval_loss": 4.515134906115597, "flow/cos_sim": 0.9184669488913393, "flow/improvement_ratio": 0.9950582473517553, "flow/mag_ratio_mean": 0.8821974423922361, "flow/mag_ratio_std": 0.09247283822937644, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.22356960405367263, "eval_ce_clean_loss": 0.0327355340824858, "eval_ce_pred_loss": 4.565925827853756, "eval_flow_cos_loss": 0.08153306739617429, "eval_flow_mse_loss": 0.3821449596860093, "eval_loss": 4.515134906115597, "eval_runtime": 181.4276, "eval_samples_per_second": 154.293, "eval_steps_per_second": 2.414, "flow/cos_sim": 0.9184669488913393, "flow/improvement_ratio": 0.9950582473517553, "flow/mag_ratio_mean": 0.8821974423922361, "flow/mag_ratio_std": 0.09247283822937644, "step": 5120 }, { "epoch": 0.2837744215047804, "grad_norm": 2.1534059047698975, "learning_rate": 0.00025669014822961186, "loss": 4.549870491027832, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.2167831701158986, "eval_ce_clean_loss": 0.022042056864561285, "eval_ce_pred_loss": 4.496095672045668, "eval_flow_cos_loss": 0.07625704317366423, "eval_flow_mse_loss": 0.4058228160964844, "eval_loss": 4.473577274579435, "flow/cos_sim": 0.9237429750020101, "flow/improvement_ratio": 0.9947892213793106, "flow/mag_ratio_mean": 0.8912639768972789, "flow/mag_ratio_std": 0.08584192658872365, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.2167831701158986, "eval_ce_clean_loss": 0.022042056864561285, "eval_ce_pred_loss": 4.496095672045668, "eval_flow_cos_loss": 0.07625704317366423, "eval_flow_mse_loss": 0.4058228160964844, "eval_loss": 4.473577274579435, "eval_runtime": 180.321, "eval_samples_per_second": 155.24, "eval_steps_per_second": 2.429, "flow/cos_sim": 0.9237429750020101, "flow/improvement_ratio": 0.9947892213793106, "flow/mag_ratio_mean": 0.8912639768972789, "flow/mag_ratio_std": 0.08584192658872365, "step": 6144 }, { "epoch": 0.3310701584222438, "grad_norm": 2.1275017261505127, "learning_rate": 0.0002390360415767374, "loss": 4.411749362945557, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.26130694526226583, "eval_ce_clean_loss": 0.015259190446582355, "eval_ce_pred_loss": 4.084311145081368, "eval_flow_cos_loss": 0.11306819562974586, "eval_flow_mse_loss": 0.5684606986094828, "eval_loss": 4.274133609310133, "flow/cos_sim": 0.8869318397349963, "flow/improvement_ratio": 0.9916486408068165, "flow/mag_ratio_mean": 0.853514781281284, "flow/mag_ratio_std": 0.09556960212449504, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.26130694526226583, "eval_ce_clean_loss": 0.015259190446582355, "eval_ce_pred_loss": 4.084311145081368, "eval_flow_cos_loss": 0.11306819562974586, "eval_flow_mse_loss": 0.5684606986094828, "eval_loss": 4.274133609310133, "eval_runtime": 183.0141, "eval_samples_per_second": 152.955, "eval_steps_per_second": 2.393, "flow/cos_sim": 0.8869318397349963, "flow/improvement_ratio": 0.9916486408068165, "flow/mag_ratio_mean": 0.853514781281284, "flow/mag_ratio_std": 0.09556960212449504, "step": 7168 }, { "epoch": 0.37836589533970716, "grad_norm": 2.679063081741333, "learning_rate": 0.00021920458819314276, "loss": 4.291924476623535, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.26455338282933316, "eval_ce_clean_loss": 0.010562141490759984, "eval_ce_pred_loss": 4.048519775203374, "eval_flow_cos_loss": 0.08972637632598071, "eval_flow_mse_loss": 0.5574726984245048, "eval_loss": 4.224628202447064, "flow/cos_sim": 0.9102736386262118, "flow/improvement_ratio": 0.9913910556601607, "flow/mag_ratio_mean": 0.8761251777002256, "flow/mag_ratio_std": 0.08667415052176067, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.26455338282933316, "eval_ce_clean_loss": 0.010562141490759984, "eval_ce_pred_loss": 4.048519775203374, "eval_flow_cos_loss": 0.08972637632598071, "eval_flow_mse_loss": 0.5574726984245048, "eval_loss": 4.224628202447064, "eval_runtime": 182.7475, "eval_samples_per_second": 153.179, "eval_steps_per_second": 2.397, "flow/cos_sim": 0.9102736386262118, "flow/improvement_ratio": 0.9913910556601607, "flow/mag_ratio_mean": 0.8761251777002256, "flow/mag_ratio_std": 0.08667415052176067, "step": 8192 }, { "epoch": 0.4256616322571706, "grad_norm": 3.4350733757019043, "learning_rate": 0.0001976932406046495, "loss": 4.231810569763184, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.27369821400928834, "eval_ce_clean_loss": 0.007689533357032188, "eval_ce_pred_loss": 3.966488937809043, "eval_flow_cos_loss": 0.08683863282203674, "eval_flow_mse_loss": 0.569647965025684, "eval_loss": 4.161966539409063, "flow/cos_sim": 0.9131613878626802, "flow/improvement_ratio": 0.9904770306800599, "flow/mag_ratio_mean": 0.8949770351795301, "flow/mag_ratio_std": 0.08686588956340807, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.27369821400928834, "eval_ce_clean_loss": 0.007689533357032188, "eval_ce_pred_loss": 3.966488937809043, "eval_flow_cos_loss": 0.08683863282203674, "eval_flow_mse_loss": 0.569647965025684, "eval_loss": 4.161966539409063, "eval_runtime": 191.5369, "eval_samples_per_second": 146.149, "eval_steps_per_second": 2.287, "flow/cos_sim": 0.9131613878626802, "flow/improvement_ratio": 0.9904770306800599, "flow/mag_ratio_mean": 0.8949770351795301, "flow/mag_ratio_std": 0.08686588956340807, "step": 9216 }, { "epoch": 0.472957369174634, "grad_norm": 2.2814252376556396, "learning_rate": 0.00017504669739254724, "loss": 4.164244174957275, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.30777640553962915, "eval_ce_clean_loss": 0.0060966779907408375, "eval_ce_pred_loss": 3.6631897802222264, "eval_flow_cos_loss": 0.09066317927891805, "eval_flow_mse_loss": 0.6576739563245207, "eval_loss": 3.9778201345983706, "flow/cos_sim": 0.9093368449439742, "flow/improvement_ratio": 0.9863403809669355, "flow/mag_ratio_mean": 0.8948755321437365, "flow/mag_ratio_std": 0.0903660870203961, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.30777640553962915, "eval_ce_clean_loss": 0.0060966779907408375, "eval_ce_pred_loss": 3.6631897802222264, "eval_flow_cos_loss": 0.09066317927891805, "eval_flow_mse_loss": 0.6576739563245207, "eval_loss": 3.9778201345983706, "eval_runtime": 189.8476, "eval_samples_per_second": 147.45, "eval_steps_per_second": 2.307, "flow/cos_sim": 0.9093368449439742, "flow/improvement_ratio": 0.9863403809669355, "flow/mag_ratio_mean": 0.8948755321437365, "flow/mag_ratio_std": 0.0903660870203961, "step": 10240 }, { "epoch": 0.5202531060920974, "grad_norm": 2.119964122772217, "learning_rate": 0.00015177050305675404, "loss": 4.065644264221191, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.2925834655122522, "eval_ce_clean_loss": 0.004753688909139986, "eval_ce_pred_loss": 3.7541082803517174, "eval_flow_cos_loss": 0.08232224217220528, "eval_flow_mse_loss": 0.621043747554631, "eval_loss": 4.020797040908849, "flow/cos_sim": 0.9176777691057284, "flow/improvement_ratio": 0.9826103085524416, "flow/mag_ratio_mean": 0.8994121777412554, "flow/mag_ratio_std": 0.09006089084360698, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.2925834655122522, "eval_ce_clean_loss": 0.004753688909139986, "eval_ce_pred_loss": 3.7541082803517174, "eval_flow_cos_loss": 0.08232224217220528, "eval_flow_mse_loss": 0.621043747554631, "eval_loss": 4.020797040908849, "eval_runtime": 189.3251, "eval_samples_per_second": 147.857, "eval_steps_per_second": 2.313, "flow/cos_sim": 0.9176777691057284, "flow/improvement_ratio": 0.9826103085524416, "flow/mag_ratio_mean": 0.8994121777412554, "flow/mag_ratio_std": 0.09006089084360698, "step": 11264 }, { "epoch": 0.5675488430095608, "grad_norm": 2.5909643173217773, "learning_rate": 0.00012845133096612622, "loss": 4.054959774017334, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.28965271334128717, "eval_ce_clean_loss": 0.004024346635361164, "eval_ce_pred_loss": 3.718931865474405, "eval_flow_cos_loss": 0.08415858520696697, "eval_flow_mse_loss": 0.6252854074517341, "eval_loss": 3.9937660802989248, "flow/cos_sim": 0.9158414309699786, "flow/improvement_ratio": 0.9831551257605966, "flow/mag_ratio_mean": 0.8911215496934168, "flow/mag_ratio_std": 0.08892078035974611, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.28965271334128717, "eval_ce_clean_loss": 0.004024346635361164, "eval_ce_pred_loss": 3.718931865474405, "eval_flow_cos_loss": 0.08415858520696697, "eval_flow_mse_loss": 0.6252854074517341, "eval_loss": 3.9937660802989248, "eval_runtime": 188.6951, "eval_samples_per_second": 148.35, "eval_steps_per_second": 2.321, "flow/cos_sim": 0.9158414309699786, "flow/improvement_ratio": 0.9831551257605966, "flow/mag_ratio_mean": 0.8911215496934168, "flow/mag_ratio_std": 0.08892078035974611, "step": 12288 } ], "logging_steps": 1024, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }