{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.44131273299286744, "eval_steps": 1024, "global_step": 10240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011032818324821687, "grad_norm": 0.20388168096542358, "learning_rate": 0.000498046875, "loss": 2.512310028076172, "step": 256 }, { "epoch": 0.022065636649643373, "grad_norm": 0.3460715115070343, "learning_rate": 0.000998046875, "loss": 2.018148422241211, "step": 512 }, { "epoch": 0.03309845497446506, "grad_norm": 0.5453425645828247, "learning_rate": 0.000999688448778502, "loss": 1.8114819526672363, "step": 768 }, { "epoch": 0.04413127329928675, "grad_norm": 0.9051710367202759, "learning_rate": 0.0009987492950653055, "loss": 1.75458824634552, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.9399749239907662, "eval_cos_loss": 0.47715059564566054, "eval_dec_loss": 0.10647435713885055, "eval_loss": 1.73655072051579, "eval_mse2_loss": 0.1675516524389863, "eval_mse_loss": 1.3505616757407117, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.522849405561683, "flow/improvement_ratio": 0.8910323953323527, "flow/mag_ratio_mean": 0.5448383128465112, "flow/mag_ratio_std": 0.23550461588510826, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.9399749239907662, "eval_cos_loss": 0.47715059564566054, "eval_dec_loss": 0.10647435713885055, "eval_loss": 1.73655072051579, "eval_mse2_loss": 0.1675516524389863, "eval_mse_loss": 1.3505616757407117, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.3582, "eval_samples_per_second": 196.904, "eval_steps_per_second": 3.078, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.522849405561683, "flow/improvement_ratio": 0.8910323953323527, "flow/mag_ratio_mean": 0.5448383128465112, "flow/mag_ratio_std": 0.23550461588510826, "step": 1024 }, { "epoch": 0.05516409162410843, "grad_norm": 0.5948837399482727, "learning_rate": 0.0009971837136430763, "loss": 1.732498049736023, "step": 1280 }, { "epoch": 0.06619690994893011, "grad_norm": 0.6182620525360107, "learning_rate": 0.0009949936708776692, "loss": 1.7030788660049438, "step": 1536 }, { "epoch": 0.07722972827375181, "grad_norm": 1.142866611480713, "learning_rate": 0.0009921819174566252, "loss": 1.7001720666885376, "step": 1792 }, { "epoch": 0.0882625465985735, "grad_norm": 0.8506317734718323, "learning_rate": 0.000988751984934317, "loss": 1.6855676174163818, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.9434877818736953, "eval_cos_loss": 0.46131489716613217, "eval_dec_loss": 0.09124524126659388, "eval_loss": 1.6677722422553023, "eval_mse2_loss": 0.15732173794812992, "eval_mse_loss": 1.3088257922800874, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5386851041683002, "flow/improvement_ratio": 0.8871048878250855, "flow/mag_ratio_mean": 0.5622577369848548, "flow/mag_ratio_std": 0.24945266208033573, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.9434877818736953, "eval_cos_loss": 0.46131489716613217, "eval_dec_loss": 0.09124524126659388, "eval_loss": 1.6677722422553023, "eval_mse2_loss": 0.15732173794812992, "eval_mse_loss": 1.3088257922800874, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 150.2408, "eval_samples_per_second": 199.68, "eval_steps_per_second": 3.122, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5386851041683002, "flow/improvement_ratio": 0.8871048878250855, "flow/mag_ratio_mean": 0.5622577369848548, "flow/mag_ratio_std": 0.24945266208033573, "step": 2048 }, { "epoch": 0.09929536492339518, "grad_norm": 0.7440093159675598, "learning_rate": 0.0009847081812963268, "loss": 1.6802997589111328, "step": 2304 }, { "epoch": 0.11032818324821686, "grad_norm": 0.9319222569465637, "learning_rate": 0.0009800555855486275, "loss": 1.6744197607040405, "step": 2560 }, { "epoch": 0.12136100157303854, "grad_norm": 0.8629500865936279, "learning_rate": 0.0009748000413383664, "loss": 1.6740639209747314, "step": 2816 }, { "epoch": 0.13239381989786023, "grad_norm": 0.9893732666969299, "learning_rate": 0.0009689481496142604, "loss": 1.664785623550415, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.9404068449586629, "eval_cos_loss": 0.4534218231243874, "eval_dec_loss": 0.10443712005824614, "eval_loss": 1.6572931651621738, "eval_mse2_loss": 0.1550013455850229, "eval_mse_loss": 1.288264540212749, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5465781726816824, "flow/improvement_ratio": 0.8946911343125138, "flow/mag_ratio_mean": 0.5628405101517878, "flow/mag_ratio_std": 0.24253392000315285, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.9404068449586629, "eval_cos_loss": 0.4534218231243874, "eval_dec_loss": 0.10443712005824614, "eval_loss": 1.6572931651621738, "eval_mse2_loss": 0.1550013455850229, "eval_mse_loss": 1.288264540212749, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 153.5602, "eval_samples_per_second": 195.363, "eval_steps_per_second": 3.054, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5465781726816824, "flow/improvement_ratio": 0.8946911343125138, "flow/mag_ratio_mean": 0.5628405101517878, "flow/mag_ratio_std": 0.24253392000315285, "step": 3072 }, { "epoch": 0.14342663822268192, "grad_norm": 0.9933224320411682, "learning_rate": 0.0009625072603358231, "loss": 1.6605451107025146, "step": 3328 }, { "epoch": 0.15445945654750362, "grad_norm": 1.221793532371521, "learning_rate": 0.0009554854632418371, "loss": 1.6490036249160767, "step": 3584 }, { "epoch": 0.1654922748723253, "grad_norm": 0.8394345045089722, "learning_rate": 0.000947891577689663, "loss": 1.649448275566101, "step": 3840 }, { "epoch": 0.176525093197147, "grad_norm": 1.245514154434204, "learning_rate": 0.0009397351415781539, "loss": 1.6489267349243164, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.942338221101898, "eval_cos_loss": 0.450266804585833, "eval_dec_loss": 0.1002394597608048, "eval_loss": 1.6427197324187517, "eval_mse2_loss": 0.15268841918025697, "eval_mse_loss": 1.280517189741643, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5497331933172018, "flow/improvement_ratio": 0.8926444684010325, "flow/mag_ratio_mean": 0.5659083429175907, "flow/mag_ratio_std": 0.24314757854318314, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.942338221101898, "eval_cos_loss": 0.450266804585833, "eval_dec_loss": 0.1002394597608048, "eval_loss": 1.6427197324187517, "eval_mse2_loss": 0.15268841918025697, "eval_mse_loss": 1.280517189741643, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.0027, "eval_samples_per_second": 197.365, "eval_steps_per_second": 3.085, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5497331933172018, "flow/improvement_ratio": 0.8926444684010325, "flow/mag_ratio_mean": 0.5659083429175907, "flow/mag_ratio_std": 0.24314757854318314, "step": 4096 }, { "epoch": 0.18755791152196866, "grad_norm": 1.0416312217712402, "learning_rate": 0.000931026399368079, "loss": 1.6447768211364746, "step": 4352 }, { "epoch": 0.19859072984679035, "grad_norm": 1.1173036098480225, "learning_rate": 0.0009217762892151117, "loss": 1.6489276885986328, "step": 4608 }, { "epoch": 0.20962354817161205, "grad_norm": 0.930402934551239, "learning_rate": 0.0009119964292315354, "loss": 1.6420283317565918, "step": 4864 }, { "epoch": 0.22065636649643372, "grad_norm": 0.9209682941436768, "learning_rate": 0.0009016991028939279, "loss": 1.6357425451278687, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.9431814239032791, "eval_cos_loss": 0.4463651056991203, "eval_dec_loss": 0.09822412853889755, "eval_loss": 1.628653102846288, "eval_mse2_loss": 0.14985301353529826, "eval_mse_loss": 1.2716914730539708, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5536348958259453, "flow/improvement_ratio": 0.8939987117293547, "flow/mag_ratio_mean": 0.576050937048662, "flow/mag_ratio_std": 0.2491114061397276, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.9431814239032791, "eval_cos_loss": 0.4463651056991203, "eval_dec_loss": 0.09822412853889755, "eval_loss": 1.628653102846288, "eval_mse2_loss": 0.14985301353529826, "eval_mse_loss": 1.2716914730539708, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 150.4792, "eval_samples_per_second": 199.363, "eval_steps_per_second": 3.117, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5536348958259453, "flow/improvement_ratio": 0.8939987117293547, "flow/mag_ratio_mean": 0.576050937048662, "flow/mag_ratio_std": 0.2491114061397276, "step": 5120 }, { "epoch": 0.23168918482125542, "grad_norm": 0.6372450590133667, "learning_rate": 0.0008908972436151494, "loss": 1.6375595331192017, "step": 5376 }, { "epoch": 0.2427220031460771, "grad_norm": 0.833997368812561, "learning_rate": 0.0008796044185000127, "loss": 1.6372840404510498, "step": 5632 }, { "epoch": 0.2537548214708988, "grad_norm": 0.6318811178207397, "learning_rate": 0.0008678348113050368, "loss": 1.628332257270813, "step": 5888 }, { "epoch": 0.26478763979572045, "grad_norm": 0.7464238405227661, "learning_rate": 0.0008556032046236897, "loss": 1.6342945098876953, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.941904927802284, "eval_cos_loss": 0.44474837520737637, "eval_dec_loss": 0.102102437767504, "eval_loss": 1.628202569764306, "eval_mse2_loss": 0.15077992649411343, "eval_mse_loss": 1.266597391954109, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5552516249197124, "flow/improvement_ratio": 0.8912770662988935, "flow/mag_ratio_mean": 0.5855364076364269, "flow/mag_ratio_std": 0.25087345015011364, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.941904927802284, "eval_cos_loss": 0.44474837520737637, "eval_dec_loss": 0.102102437767504, "eval_loss": 1.628202569764306, "eval_mse2_loss": 0.15077992649411343, "eval_mse_loss": 1.266597391954109, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 151.0104, "eval_samples_per_second": 198.662, "eval_steps_per_second": 3.106, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5552516249197124, "flow/improvement_ratio": 0.8912770662988935, "flow/mag_ratio_mean": 0.5855364076364269, "flow/mag_ratio_std": 0.25087345015011364, "step": 6144 }, { "epoch": 0.2758204581205422, "grad_norm": 1.1199694871902466, "learning_rate": 0.000842924961319492, "loss": 1.6266489028930664, "step": 6400 }, { "epoch": 0.28685327644536385, "grad_norm": 0.8668828010559082, "learning_rate": 0.0008298160052303045, "loss": 1.62454092502594, "step": 6656 }, { "epoch": 0.2978860947701855, "grad_norm": 0.8108460307121277, "learning_rate": 0.0008162928011680314, "loss": 1.624453067779541, "step": 6912 }, { "epoch": 0.30891891309500724, "grad_norm": 0.8465085625648499, "learning_rate": 0.000802372334238864, "loss": 1.6209194660186768, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.942268661485615, "eval_cos_loss": 0.4415781778186115, "eval_dec_loss": 0.09969639404813872, "eval_loss": 1.61412800298825, "eval_mse2_loss": 0.14795255090699774, "eval_mse_loss": 1.2580732640935415, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5584218213553114, "flow/improvement_ratio": 0.8916484548338949, "flow/mag_ratio_mean": 0.5767976182864419, "flow/mag_ratio_std": 0.2534670445011623, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.942268661485615, "eval_cos_loss": 0.4415781778186115, "eval_dec_loss": 0.09969639404813872, "eval_loss": 1.61412800298825, "eval_mse2_loss": 0.14795255090699774, "eval_mse_loss": 1.2580732640935415, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 150.313, "eval_samples_per_second": 199.584, "eval_steps_per_second": 3.12, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5584218213553114, "flow/improvement_ratio": 0.8916484548338949, "flow/mag_ratio_mean": 0.5767976182864419, "flow/mag_ratio_std": 0.2534670445011623, "step": 7168 }, { "epoch": 0.3199517314198289, "grad_norm": 1.8415089845657349, "learning_rate": 0.0007880720885100349, "loss": 1.6192532777786255, "step": 7424 }, { "epoch": 0.3309845497446506, "grad_norm": 0.7575666904449463, "learning_rate": 0.0007734100250498788, "loss": 1.6192028522491455, "step": 7680 }, { "epoch": 0.3420173680694723, "grad_norm": 1.108810544013977, "learning_rate": 0.000758404559368781, "loss": 1.614426851272583, "step": 7936 }, { "epoch": 0.353050186394294, "grad_norm": 1.224976897239685, "learning_rate": 0.0007430745382893488, "loss": 1.612691879272461, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.9417854724917855, "eval_cos_loss": 0.4400326657905253, "eval_dec_loss": 0.1005224303852743, "eval_loss": 1.6120708153954446, "eval_mse2_loss": 0.14864133708258429, "eval_mse_loss": 1.254655804715431, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5599673331927643, "flow/improvement_ratio": 0.8927549306771903, "flow/mag_ratio_mean": 0.5825493686488951, "flow/mag_ratio_std": 0.25123427366651196, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.9417854724917855, "eval_cos_loss": 0.4400326657905253, "eval_dec_loss": 0.1005224303852743, "eval_loss": 1.6120708153954446, "eval_mse2_loss": 0.14864133708258429, "eval_mse_loss": 1.254655804715431, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 153.0586, "eval_samples_per_second": 196.003, "eval_steps_per_second": 3.064, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5599673331927643, "flow/improvement_ratio": 0.8927549306771903, "flow/mag_ratio_mean": 0.5825493686488951, "flow/mag_ratio_std": 0.25123427366651196, "step": 8192 }, { "epoch": 0.36408300471911564, "grad_norm": 1.2766138315200806, "learning_rate": 0.0007274392162748551, "loss": 1.6162679195404053, "step": 8448 }, { "epoch": 0.3751158230439373, "grad_norm": 0.862872302532196, "learning_rate": 0.000711518231245687, "loss": 1.6088062524795532, "step": 8704 }, { "epoch": 0.38614864136875904, "grad_norm": 0.7975575923919678, "learning_rate": 0.0006953315799141723, "loss": 1.6033779382705688, "step": 8960 }, { "epoch": 0.3971814596935807, "grad_norm": 1.822509765625, "learning_rate": 0.0006788995926687669, "loss": 1.6062895059585571, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.9410773493346376, "eval_cos_loss": 0.4370425479498499, "eval_dec_loss": 0.10427019556861188, "eval_loss": 1.6075624590997757, "eval_mse2_loss": 0.1481303899272927, "eval_mse_loss": 1.247209641470838, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5629574498896406, "flow/improvement_ratio": 0.8962287469459241, "flow/mag_ratio_mean": 0.5783110863364327, "flow/mag_ratio_std": 0.2480927841432059, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.9410773493346376, "eval_cos_loss": 0.4370425479498499, "eval_dec_loss": 0.10427019556861188, "eval_loss": 1.6075624590997757, "eval_mse2_loss": 0.1481303899272927, "eval_mse_loss": 1.247209641470838, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 159.7751, "eval_samples_per_second": 187.764, "eval_steps_per_second": 2.935, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5629574498896406, "flow/improvement_ratio": 0.8962287469459241, "flow/mag_ratio_mean": 0.5783110863364327, "flow/mag_ratio_std": 0.2480927841432059, "step": 9216 }, { "epoch": 0.4082142780184024, "grad_norm": 1.1795552968978882, "learning_rate": 0.0006622429080391422, "loss": 1.6098705530166626, "step": 9472 }, { "epoch": 0.4192470963432241, "grad_norm": 0.8205899000167847, "learning_rate": 0.0006453824467742515, "loss": 1.6050623655319214, "step": 9728 }, { "epoch": 0.43027991466804577, "grad_norm": 0.6470943093299866, "learning_rate": 0.0006283393855659275, "loss": 1.61065673828125, "step": 9984 }, { "epoch": 0.44131273299286744, "grad_norm": 0.9093553423881531, "learning_rate": 0.0006111351304510173, "loss": 1.6007680892944336, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.9417572640186486, "eval_cos_loss": 0.4365569194242644, "eval_dec_loss": 0.10090916226905927, "eval_loss": 1.6032250524838088, "eval_mse2_loss": 0.14803322787478027, "eval_mse_loss": 1.2463789934288465, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5634430805757356, "flow/improvement_ratio": 0.8962225704304954, "flow/mag_ratio_mean": 0.5846807780042131, "flow/mag_ratio_std": 0.25302369241267125, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.9417572640186486, "eval_cos_loss": 0.4365569194242644, "eval_dec_loss": 0.10090916226905927, "eval_loss": 1.6032250524838088, "eval_mse2_loss": 0.14803322787478027, "eval_mse_loss": 1.2463789934288465, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.6772, "eval_samples_per_second": 196.493, "eval_steps_per_second": 3.072, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5634430805757356, "flow/improvement_ratio": 0.8962225704304954, "flow/mag_ratio_mean": 0.5846807780042131, "flow/mag_ratio_std": 0.25302369241267125, "step": 10240 } ], "logging_steps": 256, "max_steps": 23204, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }