{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1024, "global_step": 23204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011032818324821687, "grad_norm": 0.3007344603538513, "learning_rate": 0.000498046875, "loss": 1.9607043266296387, "step": 256 }, { "epoch": 0.022065636649643373, "grad_norm": 0.38754719495773315, "learning_rate": 0.000998046875, "loss": 1.8510947227478027, "step": 512 }, { "epoch": 0.03309845497446506, "grad_norm": 0.41651925444602966, "learning_rate": 0.000999688448778502, "loss": 1.7883503437042236, "step": 768 }, { "epoch": 0.04413127329928675, "grad_norm": 0.4576423764228821, "learning_rate": 0.0009987492950653055, "loss": 1.7437095642089844, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.7290582309773348, "eval_cos_loss": 0.6748774711257105, "eval_dec_loss": 0.0016125108924325007, "eval_loss": 1.7118930867485909, "eval_mse2_loss": 0.23725909220257294, "eval_mse_loss": 1.7118930867485909, "eval_rec_loss": 0.05790480172861296, "eval_var_loss": 0.029386979561529435, "flow/cos_sim": 0.32512253071707703, "flow/improvement_ratio": 0.773648498536173, "flow/mag_ratio_mean": 0.3315794987083752, "flow/mag_ratio_std": 0.19750540018844198, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.7290582309773348, "eval_cos_loss": 0.6748774711257105, "eval_dec_loss": 0.0016125108924325007, "eval_loss": 1.7118930867485909, "eval_mse2_loss": 0.23725909220257294, "eval_mse_loss": 1.7118930867485909, "eval_rec_loss": 0.05790480172861296, "eval_runtime": 103.0234, "eval_samples_per_second": 291.196, "eval_steps_per_second": 4.552, "eval_var_loss": 0.029386979561529435, "flow/cos_sim": 0.32512253071707703, "flow/improvement_ratio": 0.773648498536173, "flow/mag_ratio_mean": 0.3315794987083752, "flow/mag_ratio_std": 0.19750540018844198, "step": 1024 }, { "epoch": 0.05516409162410843, "grad_norm": 0.46116578578948975, "learning_rate": 0.0009971837136430763, "loss": 1.6916401386260986, "step": 1280 }, { "epoch": 0.06619690994893011, "grad_norm": 0.4674736559391022, "learning_rate": 0.0009949936708776692, "loss": 1.6614705324172974, "step": 1536 }, { "epoch": 0.07722972827375181, "grad_norm": 0.4964284598827362, "learning_rate": 0.0009921819174566252, "loss": 1.6426620483398438, "step": 1792 }, { "epoch": 0.0882625465985735, "grad_norm": 0.5635536313056946, "learning_rate": 0.000988751984934317, "loss": 1.6190364360809326, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.7361472993029746, "eval_cos_loss": 0.6193856069528217, "eval_dec_loss": 0.0013400374704601193, "eval_loss": 1.6149477191062878, "eval_mse2_loss": 0.2126978265959571, "eval_mse_loss": 1.6149477191062878, "eval_rec_loss": 0.055882355892288085, "eval_var_loss": 0.02890209875492526, "flow/cos_sim": 0.3806143929836338, "flow/improvement_ratio": 0.8031272210800318, "flow/mag_ratio_mean": 0.38922329186630655, "flow/mag_ratio_std": 0.2309291490168968, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.7361472993029746, "eval_cos_loss": 0.6193856069528217, "eval_dec_loss": 0.0013400374704601193, "eval_loss": 1.6149477191062878, "eval_mse2_loss": 0.2126978265959571, "eval_mse_loss": 1.6149477191062878, "eval_rec_loss": 0.055882355892288085, "eval_runtime": 102.671, "eval_samples_per_second": 292.196, "eval_steps_per_second": 4.568, "eval_var_loss": 0.02890209875492526, "flow/cos_sim": 0.3806143929836338, "flow/improvement_ratio": 0.8031272210800318, "flow/mag_ratio_mean": 0.38922329186630655, "flow/mag_ratio_std": 0.2309291490168968, "step": 2048 }, { "epoch": 0.09929536492339518, "grad_norm": 0.5123931169509888, "learning_rate": 0.0009847081812963268, "loss": 1.6022895574569702, "step": 2304 }, { "epoch": 0.11032818324821686, "grad_norm": 0.5382006764411926, "learning_rate": 0.0009800555855486275, "loss": 1.5898725986480713, "step": 2560 }, { "epoch": 0.12136100157303854, "grad_norm": 0.5514854192733765, "learning_rate": 0.0009748000413383664, "loss": 1.5826457738876343, "step": 2816 }, { "epoch": 0.13239381989786023, "grad_norm": 0.5678655505180359, "learning_rate": 0.0009689481496142604, "loss": 1.5685003995895386, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.7259279887313845, "eval_cos_loss": 0.5904010080579501, "eval_dec_loss": 0.001531593777309569, "eval_loss": 1.5695596038659752, "eval_mse2_loss": 0.20936787379448857, "eval_mse_loss": 1.5695596038659752, "eval_rec_loss": 0.060287337766082555, "eval_var_loss": 0.02963222060868862, "flow/cos_sim": 0.409598992768127, "flow/improvement_ratio": 0.8170475746268657, "flow/mag_ratio_mean": 0.4326363442294887, "flow/mag_ratio_std": 0.22964929263474845, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.7259279887313845, "eval_cos_loss": 0.5904010080579501, "eval_dec_loss": 0.001531593777309569, "eval_loss": 1.5695596038659752, "eval_mse2_loss": 0.20936787379448857, "eval_mse_loss": 1.5695596038659752, "eval_rec_loss": 0.060287337766082555, "eval_runtime": 103.2177, "eval_samples_per_second": 290.648, "eval_steps_per_second": 4.544, "eval_var_loss": 0.02963222060868862, "flow/cos_sim": 0.409598992768127, "flow/improvement_ratio": 0.8170475746268657, "flow/mag_ratio_mean": 0.4326363442294887, "flow/mag_ratio_std": 0.22964929263474845, "step": 3072 }, { "epoch": 0.14342663822268192, "grad_norm": 0.5561569333076477, "learning_rate": 0.0009625072603358231, "loss": 1.5565699338912964, "step": 3328 }, { "epoch": 0.15445945654750362, "grad_norm": 0.5337810516357422, "learning_rate": 0.0009554854632418371, "loss": 1.556112289428711, "step": 3584 }, { "epoch": 0.1654922748723253, "grad_norm": 0.6065189242362976, "learning_rate": 0.000947891577689663, "loss": 1.5425442457199097, "step": 3840 }, { "epoch": 0.176525093197147, "grad_norm": 0.6436013579368591, "learning_rate": 0.0009397351415781539, "loss": 1.5381078720092773, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.7609616675139879, "eval_cos_loss": 0.5725635654255271, "eval_dec_loss": 0.0013841146930163827, "eval_loss": 1.5326353372541317, "eval_mse2_loss": 0.20037362373459822, "eval_mse_loss": 1.5326353372541317, "eval_rec_loss": 0.05205997703934529, "eval_var_loss": 0.029761007865831288, "flow/cos_sim": 0.42743643495573924, "flow/improvement_ratio": 0.8228500355789656, "flow/mag_ratio_mean": 0.44373360606653095, "flow/mag_ratio_std": 0.2451275099060937, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.7609616675139879, "eval_cos_loss": 0.5725635654255271, "eval_dec_loss": 0.0013841146930163827, "eval_loss": 1.5326353372541317, "eval_mse2_loss": 0.20037362373459822, "eval_mse_loss": 1.5326353372541317, "eval_rec_loss": 0.05205997703934529, "eval_runtime": 103.0729, "eval_samples_per_second": 291.056, "eval_steps_per_second": 4.55, "eval_var_loss": 0.029761007865831288, "flow/cos_sim": 0.42743643495573924, "flow/improvement_ratio": 0.8228500355789656, "flow/mag_ratio_mean": 0.44373360606653095, "flow/mag_ratio_std": 0.2451275099060937, "step": 4096 }, { "epoch": 0.18755791152196866, "grad_norm": 0.5533596873283386, "learning_rate": 0.000931026399368079, "loss": 1.5355464220046997, "step": 4352 }, { "epoch": 0.19859072984679035, "grad_norm": 0.6129039525985718, "learning_rate": 0.0009217762892151117, "loss": 1.526825189590454, "step": 4608 }, { "epoch": 0.20962354817161205, "grad_norm": 0.6128653287887573, "learning_rate": 0.0009119964292315354, "loss": 1.5186046361923218, "step": 4864 }, { "epoch": 0.22065636649643372, "grad_norm": 0.6013854146003723, "learning_rate": 0.0009016991028939279, "loss": 1.5184156894683838, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.7313483153096204, "eval_cos_loss": 0.565770648880554, "eval_dec_loss": 0.0013971831941510986, "eval_loss": 1.5208095035064957, "eval_mse2_loss": 0.19698964767872906, "eval_mse_loss": 1.5208095035064957, "eval_rec_loss": 0.058415787606271724, "eval_var_loss": 0.029480641187508223, "flow/cos_sim": 0.4342293481328594, "flow/improvement_ratio": 0.8310012437387316, "flow/mag_ratio_mean": 0.4485036200170578, "flow/mag_ratio_std": 0.24114183547781476, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.7313483153096204, "eval_cos_loss": 0.565770648880554, "eval_dec_loss": 0.0013971831941510986, "eval_loss": 1.5208095035064957, "eval_mse2_loss": 0.19698964767872906, "eval_mse_loss": 1.5208095035064957, "eval_rec_loss": 0.058415787606271724, "eval_runtime": 104.1181, "eval_samples_per_second": 288.134, "eval_steps_per_second": 4.505, "eval_var_loss": 0.029480641187508223, "flow/cos_sim": 0.4342293481328594, "flow/improvement_ratio": 0.8310012437387316, "flow/mag_ratio_mean": 0.4485036200170578, "flow/mag_ratio_std": 0.24114183547781476, "step": 5120 }, { "epoch": 0.23168918482125542, "grad_norm": 0.5818307995796204, "learning_rate": 0.0008908972436151494, "loss": 1.5146307945251465, "step": 5376 }, { "epoch": 0.2427220031460771, "grad_norm": 0.5968588590621948, "learning_rate": 0.0008796044185000127, "loss": 1.5090495347976685, "step": 5632 }, { "epoch": 0.2537548214708988, "grad_norm": 0.645140528678894, "learning_rate": 0.0008678348113050368, "loss": 1.5024750232696533, "step": 5888 }, { "epoch": 0.26478763979572045, "grad_norm": 0.6232675313949585, "learning_rate": 0.0008556032046236897, "loss": 1.499906301498413, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.7912708006589123, "eval_cos_loss": 0.5567008724598996, "eval_dec_loss": 0.0014503563844457592, "eval_loss": 1.5032868258226146, "eval_mse2_loss": 0.19748503063469808, "eval_mse_loss": 1.5032868258226146, "eval_rec_loss": 0.05662109937145512, "eval_var_loss": 0.029432428198487265, "flow/cos_sim": 0.44329912652339, "flow/improvement_ratio": 0.8293576759061834, "flow/mag_ratio_mean": 0.4766448940803756, "flow/mag_ratio_std": 0.2504093461771255, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.7912708006589123, "eval_cos_loss": 0.5567008724598996, "eval_dec_loss": 0.0014503563844457592, "eval_loss": 1.5032868258226146, "eval_mse2_loss": 0.19748503063469808, "eval_mse_loss": 1.5032868258226146, "eval_rec_loss": 0.05662109937145512, "eval_runtime": 102.8737, "eval_samples_per_second": 291.62, "eval_steps_per_second": 4.559, "eval_var_loss": 0.029432428198487265, "flow/cos_sim": 0.44329912652339, "flow/improvement_ratio": 0.8293576759061834, "flow/mag_ratio_mean": 0.4766448940803756, "flow/mag_ratio_std": 0.2504093461771255, "step": 6144 }, { "epoch": 0.2758204581205422, "grad_norm": 0.6002918481826782, "learning_rate": 0.000842924961319492, "loss": 1.5013189315795898, "step": 6400 }, { "epoch": 0.28685327644536385, "grad_norm": 0.6131093502044678, "learning_rate": 0.0008298160052303045, "loss": 1.491563320159912, "step": 6656 }, { "epoch": 0.2978860947701855, "grad_norm": 0.6153339743614197, "learning_rate": 0.0008162928011680314, "loss": 1.4890822172164917, "step": 6912 }, { "epoch": 0.30891891309500724, "grad_norm": 0.5415698885917664, "learning_rate": 0.000802372334238864, "loss": 1.4869613647460938, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.6991471025766374, "eval_cos_loss": 0.5479137339571646, "eval_dec_loss": 0.0014181479605397324, "eval_loss": 1.4831991663365476, "eval_mse2_loss": 0.19134751513505033, "eval_mse_loss": 1.4831991663365476, "eval_rec_loss": 0.059703294130197086, "eval_var_loss": 0.029254676190330023, "flow/cos_sim": 0.45208626534384705, "flow/improvement_ratio": 0.8373922797154262, "flow/mag_ratio_mean": 0.46522473710686413, "flow/mag_ratio_std": 0.24233753331053232, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.6991471025766374, "eval_cos_loss": 0.5479137339571646, "eval_dec_loss": 0.0014181479605397324, "eval_loss": 1.4831991663365476, "eval_mse2_loss": 0.19134751513505033, "eval_mse_loss": 1.4831991663365476, "eval_rec_loss": 0.059703294130197086, "eval_runtime": 102.4636, "eval_samples_per_second": 292.787, "eval_steps_per_second": 4.577, "eval_var_loss": 0.029254676190330023, "flow/cos_sim": 0.45208626534384705, "flow/improvement_ratio": 0.8373922797154262, "flow/mag_ratio_mean": 0.46522473710686413, "flow/mag_ratio_std": 0.24233753331053232, "step": 7168 }, { "epoch": 0.3199517314198289, "grad_norm": 0.6683939695358276, "learning_rate": 0.0007880720885100349, "loss": 1.476445198059082, "step": 7424 }, { "epoch": 0.3309845497446506, "grad_norm": 0.5962712168693542, "learning_rate": 0.0007734100250498788, "loss": 1.4769901037216187, "step": 7680 }, { "epoch": 0.3420173680694723, "grad_norm": 0.5617682933807373, "learning_rate": 0.000758404559368781, "loss": 1.4828119277954102, "step": 7936 }, { "epoch": 0.353050186394294, "grad_norm": 0.7243582606315613, "learning_rate": 0.0007430745382893488, "loss": 1.4768471717834473, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.8304965060986523, "eval_cos_loss": 0.540344935744556, "eval_dec_loss": 0.001444703027248449, "eval_loss": 1.4680257085035604, "eval_mse2_loss": 0.1896642409979916, "eval_mse_loss": 1.4680257085035604, "eval_rec_loss": 0.056618061303885886, "eval_var_loss": 0.02941279357144319, "flow/cos_sim": 0.45965506501797676, "flow/improvement_ratio": 0.8406627575980067, "flow/mag_ratio_mean": 0.478700284002178, "flow/mag_ratio_std": 0.25183968741629426, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.8304965060986523, "eval_cos_loss": 0.540344935744556, "eval_dec_loss": 0.001444703027248449, "eval_loss": 1.4680257085035604, "eval_mse2_loss": 0.1896642409979916, "eval_mse_loss": 1.4680257085035604, "eval_rec_loss": 0.056618061303885886, "eval_runtime": 102.5949, "eval_samples_per_second": 292.412, "eval_steps_per_second": 4.571, "eval_var_loss": 0.02941279357144319, "flow/cos_sim": 0.45965506501797676, "flow/improvement_ratio": 0.8406627575980067, "flow/mag_ratio_mean": 0.478700284002178, "flow/mag_ratio_std": 0.25183968741629426, "step": 8192 }, { "epoch": 0.36408300471911564, "grad_norm": 0.6018216013908386, "learning_rate": 0.0007274392162748551, "loss": 1.4694677591323853, "step": 8448 }, { "epoch": 0.3751158230439373, "grad_norm": 0.5599421262741089, "learning_rate": 0.000711518231245687, "loss": 1.4721711874008179, "step": 8704 }, { "epoch": 0.38614864136875904, "grad_norm": 0.6241788864135742, "learning_rate": 0.0006953315799141723, "loss": 1.459176778793335, "step": 8960 }, { "epoch": 0.3971814596935807, "grad_norm": 0.6998386383056641, "learning_rate": 0.0006788995926687669, "loss": 1.4632288217544556, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.7369729060948697, "eval_cos_loss": 0.5354188728942546, "eval_dec_loss": 0.0013972995771112035, "eval_loss": 1.456240051336634, "eval_mse2_loss": 0.18758021689045912, "eval_mse_loss": 1.456240051336634, "eval_rec_loss": 0.05933690067730161, "eval_var_loss": 0.029272472025203045, "flow/cos_sim": 0.4645811278047338, "flow/improvement_ratio": 0.841912091286706, "flow/mag_ratio_mean": 0.4744996659791292, "flow/mag_ratio_std": 0.25510632248321324, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.7369729060948697, "eval_cos_loss": 0.5354188728942546, "eval_dec_loss": 0.0013972995771112035, "eval_loss": 1.456240051336634, "eval_mse2_loss": 0.18758021689045912, "eval_mse_loss": 1.456240051336634, "eval_rec_loss": 0.05933690067730161, "eval_runtime": 102.7869, "eval_samples_per_second": 291.866, "eval_steps_per_second": 4.563, "eval_var_loss": 0.029272472025203045, "flow/cos_sim": 0.4645811278047338, "flow/improvement_ratio": 0.841912091286706, "flow/mag_ratio_mean": 0.4744996659791292, "flow/mag_ratio_std": 0.25510632248321324, "step": 9216 }, { "epoch": 0.4082142780184024, "grad_norm": 0.5962811708450317, "learning_rate": 0.0006622429080391422, "loss": 1.4640510082244873, "step": 9472 }, { "epoch": 0.4192470963432241, "grad_norm": 0.588157594203949, "learning_rate": 0.0006453824467742515, "loss": 1.4573228359222412, "step": 9728 }, { "epoch": 0.43027991466804577, "grad_norm": 0.5932533740997314, "learning_rate": 0.0006283393855659275, "loss": 1.45904541015625, "step": 9984 }, { "epoch": 0.44131273299286744, "grad_norm": 0.6125295162200928, "learning_rate": 0.0006111351304510173, "loss": 1.455463171005249, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.782062866367082, "eval_cos_loss": 0.5326331170128861, "eval_dec_loss": 0.0014520329028074289, "eval_loss": 1.453022389777942, "eval_mse2_loss": 0.18840382176675777, "eval_mse_loss": 1.453022389777942, "eval_rec_loss": 0.05694365586195864, "eval_var_loss": 0.030047652452612227, "flow/cos_sim": 0.4673668822881255, "flow/improvement_ratio": 0.8451325959488273, "flow/mag_ratio_mean": 0.4745017219581075, "flow/mag_ratio_std": 0.2538460113092272, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.782062866367082, "eval_cos_loss": 0.5326331170128861, "eval_dec_loss": 0.0014520329028074289, "eval_loss": 1.453022389777942, "eval_mse2_loss": 0.18840382176675777, "eval_mse_loss": 1.453022389777942, "eval_rec_loss": 0.05694365586195864, "eval_runtime": 102.6396, "eval_samples_per_second": 292.285, "eval_steps_per_second": 4.569, "eval_var_loss": 0.030047652452612227, "flow/cos_sim": 0.4673668822881255, "flow/improvement_ratio": 0.8451325959488273, "flow/mag_ratio_mean": 0.4745017219581075, "flow/mag_ratio_std": 0.2538460113092272, "step": 10240 }, { "epoch": 0.45234555131768917, "grad_norm": 0.6636393666267395, "learning_rate": 0.0005937912899254605, "loss": 1.449182152748108, "step": 10496 }, { "epoch": 0.46337836964251083, "grad_norm": 0.5821182727813721, "learning_rate": 0.0005763296478040787, "loss": 1.4548357725143433, "step": 10752 }, { "epoch": 0.4744111879673325, "grad_norm": 0.6481524109840393, "learning_rate": 0.0005587721358601663, "loss": 1.4508562088012695, "step": 11008 }, { "epoch": 0.4854440062921542, "grad_norm": 0.653151273727417, "learning_rate": 0.0005411408062792448, "loss": 1.4442917108535767, "step": 11264 }, { "epoch": 0.4854440062921542, "eval_bleu": 0.7210051310934674, "eval_cos_loss": 0.5317811018495417, "eval_dec_loss": 0.001381080663882877, "eval_loss": 1.4509358108679116, "eval_mse2_loss": 0.18439998461811274, "eval_mse_loss": 1.4509358108679116, "eval_rec_loss": 0.06008440565301983, "eval_var_loss": 0.02928201055952481, "flow/cos_sim": 0.46821889872235783, "flow/improvement_ratio": 0.8448605187920365, "flow/mag_ratio_mean": 0.4716693379604486, "flow/mag_ratio_std": 0.25824843223161026, "step": 11264 }, { "epoch": 0.4854440062921542, "eval_bleu": 0.7210051310934674, "eval_cos_loss": 0.5317811018495417, "eval_dec_loss": 0.001381080663882877, "eval_loss": 1.4509358108679116, "eval_mse2_loss": 0.18439998461811274, "eval_mse_loss": 1.4509358108679116, "eval_rec_loss": 0.06008440565301983, "eval_runtime": 102.2168, "eval_samples_per_second": 293.494, "eval_steps_per_second": 4.588, "eval_var_loss": 0.02928201055952481, "flow/cos_sim": 0.46821889872235783, "flow/improvement_ratio": 0.8448605187920365, "flow/mag_ratio_mean": 0.4716693379604486, "flow/mag_ratio_std": 0.25824843223161026, "step": 11264 }, { "epoch": 0.4964768246169759, "grad_norm": 0.6343415379524231, "learning_rate": 0.0005234578039615789, "loss": 1.439915418624878, "step": 11520 }, { "epoch": 0.5075096429417976, "grad_norm": 0.7004493474960327, "learning_rate": 0.0005057453387082458, "loss": 1.4451959133148193, "step": 11776 }, { "epoch": 0.5185424612666193, "grad_norm": 0.7312789559364319, "learning_rate": 0.0004880256573256866, "loss": 1.4458304643630981, "step": 12032 }, { "epoch": 0.5295752795914409, "grad_norm": 0.6173807382583618, "learning_rate": 0.0004703210156837805, "loss": 1.4372222423553467, "step": 12288 }, { "epoch": 0.5295752795914409, "eval_bleu": 0.7895861883551821, "eval_cos_loss": 0.5291873634751163, "eval_dec_loss": 0.0014323489154225587, "eval_loss": 1.4454485322875001, "eval_mse2_loss": 0.18589616244408622, "eval_mse_loss": 1.4454485322875001, "eval_rec_loss": 0.05557121256036736, "eval_var_loss": 0.02942733407052341, "flow/cos_sim": 0.4708126370967832, "flow/improvement_ratio": 0.8446106520542966, "flow/mag_ratio_mean": 0.4888702236385996, "flow/mag_ratio_std": 0.2535232830403456, "step": 12288 }, { "epoch": 0.5295752795914409, "eval_bleu": 0.7895861883551821, "eval_cos_loss": 0.5291873634751163, "eval_dec_loss": 0.0014323489154225587, "eval_loss": 1.4454485322875001, "eval_mse2_loss": 0.18589616244408622, "eval_mse_loss": 1.4454485322875001, "eval_rec_loss": 0.05557121256036736, "eval_runtime": 103.43, "eval_samples_per_second": 290.051, "eval_steps_per_second": 4.534, "eval_var_loss": 0.02942733407052341, "flow/cos_sim": 0.4708126370967832, "flow/improvement_ratio": 0.8446106520542966, "flow/mag_ratio_mean": 0.4888702236385996, "flow/mag_ratio_std": 0.2535232830403456, "step": 12288 }, { "epoch": 0.5406080979162626, "grad_norm": 0.6357247233390808, "learning_rate": 0.0004526536507625343, "loss": 1.4381682872772217, "step": 12544 }, { "epoch": 0.5516409162410844, "grad_norm": 0.6554076671600342, "learning_rate": 0.00043504575272249973, "loss": 1.433600664138794, "step": 12800 }, { "epoch": 0.562673734565906, "grad_norm": 0.6298866271972656, "learning_rate": 0.0004175194370339921, "loss": 1.4380649328231812, "step": 13056 }, { "epoch": 0.5737065528907277, "grad_norm": 0.6736286282539368, "learning_rate": 0.0004000967167001243, "loss": 1.4344258308410645, "step": 13312 }, { "epoch": 0.5737065528907277, "eval_bleu": 0.754198615923969, "eval_cos_loss": 0.5237328007912585, "eval_dec_loss": 0.0013661543356946239, "eval_loss": 1.4330016496593256, "eval_mse2_loss": 0.18149238913806517, "eval_mse_loss": 1.4330016496593256, "eval_rec_loss": 0.05589268211104564, "eval_var_loss": 0.029215975571225194, "flow/cos_sim": 0.47626719946291907, "flow/improvement_ratio": 0.8467817164179104, "flow/mag_ratio_mean": 0.48472079412261054, "flow/mag_ratio_std": 0.25520913404569445, "step": 13312 }, { "epoch": 0.5737065528907277, "eval_bleu": 0.754198615923969, "eval_cos_loss": 0.5237328007912585, "eval_dec_loss": 0.0013661543356946239, "eval_loss": 1.4330016496593256, "eval_mse2_loss": 0.18149238913806517, "eval_mse_loss": 1.4330016496593256, "eval_rec_loss": 0.05589268211104564, "eval_runtime": 104.5844, "eval_samples_per_second": 286.85, "eval_steps_per_second": 4.484, "eval_var_loss": 0.029215975571225194, "flow/cos_sim": 0.47626719946291907, "flow/improvement_ratio": 0.8467817164179104, "flow/mag_ratio_mean": 0.48472079412261054, "flow/mag_ratio_std": 0.25520913404569445, "step": 13312 }, { "epoch": 0.5847393712155494, "grad_norm": 0.6239475011825562, "learning_rate": 0.00038279947460853446, "loss": 1.4331660270690918, "step": 13568 }, { "epoch": 0.595772189540371, "grad_norm": 0.6627410054206848, "learning_rate": 0.00036564943604654345, "loss": 1.4354665279388428, "step": 13824 }, { "epoch": 0.6068050078651928, "grad_norm": 0.6042789816856384, "learning_rate": 0.00034866814141425254, "loss": 1.4358711242675781, "step": 14080 }, { "epoch": 0.6178378261900145, "grad_norm": 0.6111028790473938, "learning_rate": 0.0003318769191698637, "loss": 1.4299204349517822, "step": 14336 }, { "epoch": 0.6178378261900145, "eval_bleu": 0.7007889817819709, "eval_cos_loss": 0.5228769168543663, "eval_dec_loss": 0.001358627397164917, "eval_loss": 1.4326896403135776, "eval_mse2_loss": 0.18112752599312043, "eval_mse_loss": 1.4326896403135776, "eval_rec_loss": 0.05488209239939954, "eval_var_loss": 0.02930486012401103, "flow/cos_sim": 0.4771230810486686, "flow/improvement_ratio": 0.8480699183081767, "flow/mag_ratio_mean": 0.48936520539112943, "flow/mag_ratio_std": 0.2627385834386862, "step": 14336 }, { "epoch": 0.6178378261900145, "eval_bleu": 0.7007889817819709, "eval_cos_loss": 0.5228769168543663, "eval_dec_loss": 0.001358627397164917, "eval_loss": 1.4326896403135776, "eval_mse2_loss": 0.18112752599312043, "eval_mse_loss": 1.4326896403135776, "eval_rec_loss": 0.05488209239939954, "eval_runtime": 103.0204, "eval_samples_per_second": 291.205, "eval_steps_per_second": 4.552, "eval_var_loss": 0.02930486012401103, "flow/cos_sim": 0.4771230810486686, "flow/improvement_ratio": 0.8480699183081767, "flow/mag_ratio_mean": 0.48936520539112943, "flow/mag_ratio_std": 0.2627385834386862, "step": 14336 }, { "epoch": 0.6288706445148361, "grad_norm": 0.6931398510932922, "learning_rate": 0.00031529685904119485, "loss": 1.4271036386489868, "step": 14592 }, { "epoch": 0.6399034628396578, "grad_norm": 0.616621196269989, "learning_rate": 0.0002989487855370421, "loss": 1.4223978519439697, "step": 14848 }, { "epoch": 0.6509362811644795, "grad_norm": 0.7069717645645142, "learning_rate": 0.00028285323179165424, "loss": 1.4210408926010132, "step": 15104 }, { "epoch": 0.6619690994893012, "grad_norm": 0.5767509937286377, "learning_rate": 0.0002670304137751759, "loss": 1.4249491691589355, "step": 15360 }, { "epoch": 0.6619690994893012, "eval_bleu": 0.7712555700160785, "eval_cos_loss": 0.520311662700893, "eval_dec_loss": 0.0013948907095809597, "eval_loss": 1.4238692244995377, "eval_mse2_loss": 0.1801110237900382, "eval_mse_loss": 1.4238692244995377, "eval_rec_loss": 0.05672604351370002, "eval_var_loss": 0.029106232196664507, "flow/cos_sim": 0.4796883367907518, "flow/improvement_ratio": 0.8454879620181981, "flow/mag_ratio_mean": 0.4905342829507043, "flow/mag_ratio_std": 0.2601209406786636, "step": 15360 }, { "epoch": 0.6619690994893012, "eval_bleu": 0.7712555700160785, "eval_cos_loss": 0.520311662700893, "eval_dec_loss": 0.0013948907095809597, "eval_loss": 1.4238692244995377, "eval_mse2_loss": 0.1801110237900382, "eval_mse_loss": 1.4238692244995377, "eval_rec_loss": 0.05672604351370002, "eval_runtime": 102.9224, "eval_samples_per_second": 291.482, "eval_steps_per_second": 4.557, "eval_var_loss": 0.029106232196664507, "flow/cos_sim": 0.4796883367907518, "flow/improvement_ratio": 0.8454879620181981, "flow/mag_ratio_mean": 0.4905342829507043, "flow/mag_ratio_std": 0.2601209406786636, "step": 15360 }, { "epoch": 0.6730019178141229, "grad_norm": 0.7135971784591675, "learning_rate": 0.0002515002049024435, "loss": 1.4220284223556519, "step": 15616 }, { "epoch": 0.6840347361389446, "grad_norm": 0.6657771468162537, "learning_rate": 0.00023628211107203429, "loss": 1.421180248260498, "step": 15872 }, { "epoch": 0.6950675544637662, "grad_norm": 0.6840319037437439, "learning_rate": 0.00022139524616691188, "loss": 1.4254897832870483, "step": 16128 }, { "epoch": 0.706100372788588, "grad_norm": 0.6978499889373779, "learning_rate": 0.000206858308047443, "loss": 1.4185926914215088, "step": 16384 }, { "epoch": 0.706100372788588, "eval_bleu": 0.7788876579155211, "eval_cos_loss": 0.5166550292643403, "eval_dec_loss": 0.0013616397724124983, "eval_loss": 1.4177445305435896, "eval_mse2_loss": 0.17684134553426872, "eval_mse_loss": 1.4177445305435896, "eval_rec_loss": 0.05370217473951103, "eval_var_loss": 0.02986719635233823, "flow/cos_sim": 0.48334496971894936, "flow/improvement_ratio": 0.851445895522388, "flow/mag_ratio_mean": 0.49312538899846675, "flow/mag_ratio_std": 0.2614598782586136, "step": 16384 }, { "epoch": 0.706100372788588, "eval_bleu": 0.7788876579155211, "eval_cos_loss": 0.5166550292643403, "eval_dec_loss": 0.0013616397724124983, "eval_loss": 1.4177445305435896, "eval_mse2_loss": 0.17684134553426872, "eval_mse_loss": 1.4177445305435896, "eval_rec_loss": 0.05370217473951103, "eval_runtime": 103.4098, "eval_samples_per_second": 290.108, "eval_steps_per_second": 4.535, "eval_var_loss": 0.02986719635233823, "flow/cos_sim": 0.48334496971894936, "flow/improvement_ratio": 0.851445895522388, "flow/mag_ratio_mean": 0.49312538899846675, "flow/mag_ratio_std": 0.2614598782586136, "step": 16384 }, { "epoch": 0.7171331911134096, "grad_norm": 0.7240028977394104, "learning_rate": 0.00019268955506693798, "loss": 1.4189178943634033, "step": 16640 }, { "epoch": 0.7281660094382313, "grad_norm": 0.6644338369369507, "learning_rate": 0.00017890678313921, "loss": 1.4202007055282593, "step": 16896 }, { "epoch": 0.739198827763053, "grad_norm": 0.8413478136062622, "learning_rate": 0.00016552730338695792, "loss": 1.419106364250183, "step": 17152 }, { "epoch": 0.7502316460878746, "grad_norm": 0.741065263748169, "learning_rate": 0.00015256792039904465, "loss": 1.415405511856079, "step": 17408 }, { "epoch": 0.7502316460878746, "eval_bleu": 0.7637354358631164, "eval_cos_loss": 0.5136227607727051, "eval_dec_loss": 0.0013235103740173923, "eval_loss": 1.4103716327183282, "eval_mse2_loss": 0.17683548507278662, "eval_mse_loss": 1.4103716327183282, "eval_rec_loss": 0.05761792201366125, "eval_var_loss": 0.03023185586926144, "flow/cos_sim": 0.4863772399898277, "flow/improvement_ratio": 0.8529339908028463, "flow/mag_ratio_mean": 0.4943711748128253, "flow/mag_ratio_std": 0.2643810258046396, "step": 17408 }, { "epoch": 0.7502316460878746, "eval_bleu": 0.7637354358631164, "eval_cos_loss": 0.5136227607727051, "eval_dec_loss": 0.0013235103740173923, "eval_loss": 1.4103716327183282, "eval_mse2_loss": 0.17683548507278662, "eval_mse_loss": 1.4103716327183282, "eval_rec_loss": 0.05761792201366125, "eval_runtime": 103.2171, "eval_samples_per_second": 290.65, "eval_steps_per_second": 4.544, "eval_var_loss": 0.03023185586926144, "flow/cos_sim": 0.4863772399898277, "flow/improvement_ratio": 0.8529339908028463, "flow/mag_ratio_mean": 0.4943711748128253, "flow/mag_ratio_std": 0.2643810258046396, "step": 17408 }, { "epoch": 0.7612644644126964, "grad_norm": 0.5421018600463867, "learning_rate": 0.00014004491112398103, "loss": 1.4142208099365234, "step": 17664 }, { "epoch": 0.7722972827375181, "grad_norm": 0.665582537651062, "learning_rate": 0.00012797400442612433, "loss": 1.411756992340088, "step": 17920 }, { "epoch": 0.7833301010623397, "grad_norm": 0.6837579607963562, "learning_rate": 0.00011637036133026895, "loss": 1.4075802564620972, "step": 18176 }, { "epoch": 0.7943629193871614, "grad_norm": 0.7160040736198425, "learning_rate": 0.00010524855597944216, "loss": 1.4070231914520264, "step": 18432 }, { "epoch": 0.7943629193871614, "eval_bleu": 0.8024029342579875, "eval_cos_loss": 0.5127464083593283, "eval_dec_loss": 0.0013179335473900858, "eval_loss": 1.4091586799763922, "eval_mse2_loss": 0.17562630394501472, "eval_mse_loss": 1.4091586799763922, "eval_rec_loss": 0.059627406716124334, "eval_var_loss": 0.029311500787576123, "flow/cos_sim": 0.4872535904333281, "flow/improvement_ratio": 0.8548329780096693, "flow/mag_ratio_mean": 0.49533584078491877, "flow/mag_ratio_std": 0.2655049035988891, "step": 18432 }, { "epoch": 0.7943629193871614, "eval_bleu": 0.8024029342579875, "eval_cos_loss": 0.5127464083593283, "eval_dec_loss": 0.0013179335473900858, "eval_loss": 1.4091586799763922, "eval_mse2_loss": 0.17562630394501472, "eval_mse_loss": 1.4091586799763922, "eval_rec_loss": 0.059627406716124334, "eval_runtime": 103.4418, "eval_samples_per_second": 290.018, "eval_steps_per_second": 4.534, "eval_var_loss": 0.029311500787576123, "flow/cos_sim": 0.4872535904333281, "flow/improvement_ratio": 0.8548329780096693, "flow/mag_ratio_mean": 0.49533584078491877, "flow/mag_ratio_std": 0.2655049035988891, "step": 18432 }, { "epoch": 0.8053957377119831, "grad_norm": 0.727080762386322, "learning_rate": 9.462255732982089e-05, "loss": 1.406097650527954, "step": 18688 }, { "epoch": 0.8164285560368048, "grad_norm": 0.6209878921508789, "learning_rate": 8.450571160576348e-05, "loss": 1.4059816598892212, "step": 18944 }, { "epoch": 0.8274613743616265, "grad_norm": 0.659706175327301, "learning_rate": 7.491072553698764e-05, "loss": 1.410292148590088, "step": 19200 }, { "epoch": 0.8384941926864482, "grad_norm": 0.5520651340484619, "learning_rate": 6.584965039895586e-05, "loss": 1.402584195137024, "step": 19456 }, { "epoch": 0.8384941926864482, "eval_bleu": 0.7435766156577157, "eval_cos_loss": 0.5144387822923884, "eval_dec_loss": 0.0013450082680801236, "eval_loss": 1.4127296161041585, "eval_mse2_loss": 0.17700788906134013, "eval_mse_loss": 1.4127296161041585, "eval_rec_loss": 0.058054142113306374, "eval_var_loss": 0.0291894421593022, "flow/cos_sim": 0.4855612163731793, "flow/improvement_ratio": 0.8498689588199038, "flow/mag_ratio_mean": 0.4951269815344292, "flow/mag_ratio_std": 0.26389562489508567, "step": 19456 }, { "epoch": 0.8384941926864482, "eval_bleu": 0.7435766156577157, "eval_cos_loss": 0.5144387822923884, "eval_dec_loss": 0.0013450082680801236, "eval_loss": 1.4127296161041585, "eval_mse2_loss": 0.17700788906134013, "eval_mse_loss": 1.4127296161041585, "eval_rec_loss": 0.058054142113306374, "eval_runtime": 103.8996, "eval_samples_per_second": 288.74, "eval_steps_per_second": 4.514, "eval_var_loss": 0.0291894421593022, "flow/cos_sim": 0.4855612163731793, "flow/improvement_ratio": 0.8498689588199038, "flow/mag_ratio_mean": 0.4951269815344292, "flow/mag_ratio_std": 0.26389562489508567, "step": 19456 }, { "epoch": 0.8495270110112698, "grad_norm": 0.7207916378974915, "learning_rate": 5.73338668765051e-05, "loss": 1.408148169517517, "step": 19712 }, { "epoch": 0.8605598293360915, "grad_norm": 0.6444937586784363, "learning_rate": 4.9374070769740984e-05, "loss": 1.4169082641601562, "step": 19968 }, { "epoch": 0.8715926476609133, "grad_norm": 0.6508966088294983, "learning_rate": 4.198025956014095e-05, "loss": 1.412489891052246, "step": 20224 }, { "epoch": 0.8826254659857349, "grad_norm": 0.8207064270973206, "learning_rate": 3.516171985374755e-05, "loss": 1.4014993906021118, "step": 20480 }, { "epoch": 0.8826254659857349, "eval_bleu": 0.7371724072330055, "eval_cos_loss": 0.5137777864805925, "eval_dec_loss": 0.0013706799051735545, "eval_loss": 1.409610672546094, "eval_mse2_loss": 0.17626210351361396, "eval_mse_loss": 1.409610672546094, "eval_rec_loss": 0.054663843655986574, "eval_var_loss": 0.029133995291965604, "flow/cos_sim": 0.48622221402776267, "flow/improvement_ratio": 0.8532393833975802, "flow/mag_ratio_mean": 0.4940188680249237, "flow/mag_ratio_std": 0.2655889735674299, "step": 20480 }, { "epoch": 0.8826254659857349, "eval_bleu": 0.7371724072330055, "eval_cos_loss": 0.5137777864805925, "eval_dec_loss": 0.0013706799051735545, "eval_loss": 1.409610672546094, "eval_mse2_loss": 0.17626210351361396, "eval_mse_loss": 1.409610672546094, "eval_rec_loss": 0.054663843655986574, "eval_runtime": 104.0379, "eval_samples_per_second": 288.356, "eval_steps_per_second": 4.508, "eval_var_loss": 0.029133995291965604, "flow/cos_sim": 0.48622221402776267, "flow/improvement_ratio": 0.8532393833975802, "flow/mag_ratio_mean": 0.4940188680249237, "flow/mag_ratio_std": 0.2655889735674299, "step": 20480 }, { "epoch": 0.8936582843105566, "grad_norm": 0.5996214151382446, "learning_rate": 2.8927015717215733e-05, "loss": 1.4027345180511475, "step": 20736 }, { "epoch": 0.9046911026353783, "grad_norm": 0.6789088845252991, "learning_rate": 2.3283977921370547e-05, "loss": 1.4052367210388184, "step": 20992 }, { "epoch": 0.9157239209601999, "grad_norm": 0.6676909327507019, "learning_rate": 1.8239694105780413e-05, "loss": 1.406872034072876, "step": 21248 }, { "epoch": 0.9267567392850217, "grad_norm": 0.5955349802970886, "learning_rate": 1.3800499876701955e-05, "loss": 1.4064586162567139, "step": 21504 }, { "epoch": 0.9267567392850217, "eval_bleu": 0.7426715244464189, "eval_cos_loss": 0.5071186275878695, "eval_dec_loss": 0.001350255208637894, "eval_loss": 1.3937010752366805, "eval_mse2_loss": 0.17541809607225695, "eval_mse_loss": 1.3937010752366805, "eval_rec_loss": 0.05103444970691445, "eval_var_loss": 0.02931836185091213, "flow/cos_sim": 0.49288137139542015, "flow/improvement_ratio": 0.8528173863252343, "flow/mag_ratio_mean": 0.4987420951252553, "flow/mag_ratio_std": 0.26658764935886936, "step": 21504 }, { "epoch": 0.9267567392850217, "eval_bleu": 0.7426715244464189, "eval_cos_loss": 0.5071186275878695, "eval_dec_loss": 0.001350255208637894, "eval_loss": 1.3937010752366805, "eval_mse2_loss": 0.17541809607225695, "eval_mse_loss": 1.3937010752366805, "eval_rec_loss": 0.05103444970691445, "eval_runtime": 104.142, "eval_samples_per_second": 288.068, "eval_steps_per_second": 4.503, "eval_var_loss": 0.02931836185091213, "flow/cos_sim": 0.49288137139542015, "flow/improvement_ratio": 0.8528173863252343, "flow/mag_ratio_mean": 0.4987420951252553, "flow/mag_ratio_std": 0.26658764935886936, "step": 21504 }, { "epoch": 0.9377895576098434, "grad_norm": 0.8665277361869812, "learning_rate": 9.971970849576406e-06, "loss": 1.4001104831695557, "step": 21760 }, { "epoch": 0.948822375934665, "grad_norm": 0.6160731315612793, "learning_rate": 6.758915646072339e-06, "loss": 1.4023921489715576, "step": 22016 }, { "epoch": 0.9598551942594867, "grad_norm": 0.6823092103004456, "learning_rate": 4.1653698544703575e-06, "loss": 1.4057680368423462, "step": 22272 }, { "epoch": 0.9708880125843083, "grad_norm": 0.7474303841590881, "learning_rate": 2.1945909609756286e-06, "loss": 1.402069330215454, "step": 22528 }, { "epoch": 0.9708880125843083, "eval_bleu": 0.7359243412878435, "eval_cos_loss": 0.5119307249593836, "eval_dec_loss": 0.0013969406839550735, "eval_loss": 1.4065255351158092, "eval_mse2_loss": 0.17711426552806073, "eval_mse_loss": 1.4065255351158092, "eval_rec_loss": 0.056425910651572604, "eval_var_loss": 0.02955100304091663, "flow/cos_sim": 0.4880692758031491, "flow/improvement_ratio": 0.8548107675906184, "flow/mag_ratio_mean": 0.49543472253945847, "flow/mag_ratio_std": 0.263321697140045, "step": 22528 }, { "epoch": 0.9708880125843083, "eval_bleu": 0.7359243412878435, "eval_cos_loss": 0.5119307249593836, "eval_dec_loss": 0.0013969406839550735, "eval_loss": 1.4065255351158092, "eval_mse2_loss": 0.17711426552806073, "eval_mse_loss": 1.4065255351158092, "eval_rec_loss": 0.056425910651572604, "eval_runtime": 103.1789, "eval_samples_per_second": 290.757, "eval_steps_per_second": 4.546, "eval_var_loss": 0.02955100304091663, "flow/cos_sim": 0.4880692758031491, "flow/improvement_ratio": 0.8548107675906184, "flow/mag_ratio_mean": 0.49543472253945847, "flow/mag_ratio_std": 0.263321697140045, "step": 22528 }, { "epoch": 0.9819208309091301, "grad_norm": 0.6597904562950134, "learning_rate": 8.490542583243222e-07, "loss": 1.4066376686096191, "step": 22784 }, { "epoch": 0.9929536492339518, "grad_norm": 0.7082860469818115, "learning_rate": 1.3044973682302396e-07, "loss": 1.4058468341827393, "step": 23040 } ], "logging_steps": 256, "max_steps": 23204, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }