{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8826254659857349, "eval_steps": 1024, "global_step": 20480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011032818324821687, "grad_norm": 10.797319412231445, "learning_rate": 0.000498046875, "loss": 105.79659271240234, "step": 256 }, { "epoch": 0.022065636649643373, "grad_norm": 25.741575241088867, "learning_rate": 0.000998046875, "loss": 2.3060808181762695, "step": 512 }, { "epoch": 0.03309845497446506, "grad_norm": 14.934273719787598, "learning_rate": 0.000999688448778502, "loss": 2.2731690406799316, "step": 768 }, { "epoch": 0.04413127329928675, "grad_norm": 28.308002471923828, "learning_rate": 0.0009987492950653055, "loss": 2.175210475921631, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.9214307463191996, "eval_cos_loss": 0.7069334270857545, "eval_dec_loss": 0.14960542684599662, "eval_loss": 2.0735362056475966, "eval_mse2_loss": 0.2511829924164042, "eval_mse_loss": 1.8223532110389107, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.29306657075373604, "flow/improvement_ratio": 0.7837164053784759, "flow/mag_ratio_mean": 0.434395147856873, "flow/mag_ratio_std": 0.1415387201728597, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.9214307463191996, "eval_cos_loss": 0.7069334270857545, "eval_dec_loss": 0.14960542684599662, "eval_loss": 2.0735362056475966, "eval_mse2_loss": 0.2511829924164042, "eval_mse_loss": 1.8223532110389107, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 171.0537, "eval_samples_per_second": 175.384, "eval_steps_per_second": 2.742, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.29306657075373604, "flow/improvement_ratio": 0.7837164053784759, "flow/mag_ratio_mean": 0.434395147856873, "flow/mag_ratio_std": 0.1415387201728597, "step": 1024 }, { "epoch": 0.05516409162410843, "grad_norm": 20.717588424682617, "learning_rate": 0.0009971837136430763, "loss": 1.9890590906143188, "step": 1280 }, { "epoch": 0.06619690994893011, "grad_norm": 16.1776123046875, "learning_rate": 0.0009949936708776692, "loss": 1.8445225954055786, "step": 1536 }, { "epoch": 0.07722972827375181, "grad_norm": 15.698042869567871, "learning_rate": 0.0009921819174566252, "loss": 1.7741947174072266, "step": 1792 }, { "epoch": 0.0882625465985735, "grad_norm": 12.729333877563477, "learning_rate": 0.000988751984934317, "loss": 1.714084267616272, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.9356201248298646, "eval_cos_loss": 0.5489801322854658, "eval_dec_loss": 0.11870824870889757, "eval_loss": 1.7085340877077473, "eval_mse2_loss": 0.1958959426865903, "eval_mse_loss": 1.5126381456724871, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.4510198644102255, "flow/improvement_ratio": 0.8669772742907884, "flow/mag_ratio_mean": 0.5195024066261137, "flow/mag_ratio_std": 0.21416059253947822, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.9356201248298646, "eval_cos_loss": 0.5489801322854658, "eval_dec_loss": 0.11870824870889757, "eval_loss": 1.7085340877077473, "eval_mse2_loss": 0.1958959426865903, "eval_mse_loss": 1.5126381456724871, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 172.0919, "eval_samples_per_second": 174.325, "eval_steps_per_second": 2.725, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.4510198644102255, "flow/improvement_ratio": 0.8669772742907884, "flow/mag_ratio_mean": 0.5195024066261137, "flow/mag_ratio_std": 0.21416059253947822, "step": 2048 }, { "epoch": 0.09929536492339518, "grad_norm": 9.853597640991211, "learning_rate": 0.0009847081812963268, "loss": 1.6746653318405151, "step": 2304 }, { "epoch": 0.11032818324821686, "grad_norm": 7.131998062133789, "learning_rate": 0.0009800555855486275, "loss": 1.646998405456543, "step": 2560 }, { "epoch": 0.12136100157303854, "grad_norm": 7.610518932342529, "learning_rate": 0.0009748000413383664, "loss": 1.62706458568573, "step": 2816 }, { "epoch": 0.13239381989786023, "grad_norm": 5.741150856018066, "learning_rate": 0.0009689481496142604, "loss": 1.6093950271606445, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.9338383707066459, "eval_cos_loss": 0.5054820228868456, "eval_dec_loss": 0.12964063795851366, "eval_loss": 1.5915681512625233, "eval_mse2_loss": 0.1819949251438763, "eval_mse_loss": 1.4095732276119404, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.4945179769860656, "flow/improvement_ratio": 0.8849164608444995, "flow/mag_ratio_mean": 0.5184873009541395, "flow/mag_ratio_std": 0.229987337486322, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.9338383707066459, "eval_cos_loss": 0.5054820228868456, "eval_dec_loss": 0.12964063795851366, "eval_loss": 1.5915681512625233, "eval_mse2_loss": 0.1819949251438763, "eval_mse_loss": 1.4095732276119404, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 176.4575, "eval_samples_per_second": 170.013, "eval_steps_per_second": 2.658, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.4945179769860656, "flow/improvement_ratio": 0.8849164608444995, "flow/mag_ratio_mean": 0.5184873009541395, "flow/mag_ratio_std": 0.229987337486322, "step": 3072 }, { "epoch": 0.14342663822268192, "grad_norm": 5.7491865158081055, "learning_rate": 0.0009625072603358231, "loss": 1.5970690250396729, "step": 3328 }, { "epoch": 0.15445945654750362, "grad_norm": 6.157455921173096, "learning_rate": 0.0009554854632418371, "loss": 1.5825296640396118, "step": 3584 }, { "epoch": 0.1654922748723253, "grad_norm": 4.907317638397217, "learning_rate": 0.000947891577689663, "loss": 1.5728044509887695, "step": 3840 }, { "epoch": 0.176525093197147, "grad_norm": 3.5090701580047607, "learning_rate": 0.0009397351415781539, "loss": 1.565865159034729, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.9365961616358232, "eval_cos_loss": 0.49659409953841266, "eval_dec_loss": 0.12140450885753705, "eval_loss": 1.564894216655414, "eval_mse2_loss": 0.17669188814249628, "eval_mse_loss": 1.3882023289259562, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5034058995719658, "flow/improvement_ratio": 0.8819394332767804, "flow/mag_ratio_mean": 0.5346324385356293, "flow/mag_ratio_std": 0.23142226367616958, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.9365961616358232, "eval_cos_loss": 0.49659409953841266, "eval_dec_loss": 0.12140450885753705, "eval_loss": 1.564894216655414, "eval_mse2_loss": 0.17669188814249628, "eval_mse_loss": 1.3882023289259562, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 160.4591, "eval_samples_per_second": 186.964, "eval_steps_per_second": 2.923, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5034058995719658, "flow/improvement_ratio": 0.8819394332767804, "flow/mag_ratio_mean": 0.5346324385356293, "flow/mag_ratio_std": 0.23142226367616958, "step": 4096 }, { "epoch": 0.18755791152196866, "grad_norm": 4.376228332519531, "learning_rate": 0.000931026399368079, "loss": 1.5588833093643188, "step": 4352 }, { "epoch": 0.19859072984679035, "grad_norm": 4.248619556427002, "learning_rate": 0.0009217762892151117, "loss": 1.5539987087249756, "step": 4608 }, { "epoch": 0.20962354817161205, "grad_norm": 4.112342357635498, "learning_rate": 0.0009119964292315354, "loss": 1.547034502029419, "step": 4864 }, { "epoch": 0.22065636649643372, "grad_norm": 4.957840919494629, "learning_rate": 0.0009016991028939279, "loss": 1.539028525352478, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.937221428849788, "eval_cos_loss": 0.48556288504905537, "eval_dec_loss": 0.11899168453395748, "eval_loss": 1.5318274358188166, "eval_mse2_loss": 0.17039804826222504, "eval_mse_loss": 1.3614293873183, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5144371166666434, "flow/improvement_ratio": 0.8873141655789764, "flow/mag_ratio_mean": 0.5353354320470204, "flow/mag_ratio_std": 0.2365191124204888, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.937221428849788, "eval_cos_loss": 0.48556288504905537, "eval_dec_loss": 0.11899168453395748, "eval_loss": 1.5318274358188166, "eval_mse2_loss": 0.17039804826222504, "eval_mse_loss": 1.3614293873183, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 169.2874, "eval_samples_per_second": 177.213, "eval_steps_per_second": 2.77, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5144371166666434, "flow/improvement_ratio": 0.8873141655789764, "flow/mag_ratio_mean": 0.5353354320470204, "flow/mag_ratio_std": 0.2365191124204888, "step": 5120 }, { "epoch": 0.23168918482125542, "grad_norm": 3.4134156703948975, "learning_rate": 0.0008908972436151494, "loss": 1.532220721244812, "step": 5376 }, { "epoch": 0.2427220031460771, "grad_norm": 3.1423563957214355, "learning_rate": 0.0008796044185000127, "loss": 1.525950312614441, "step": 5632 }, { "epoch": 0.2537548214708988, "grad_norm": 2.3490381240844727, "learning_rate": 0.0008678348113050368, "loss": 1.5172430276870728, "step": 5888 }, { "epoch": 0.26478763979572045, "grad_norm": 6.032445430755615, "learning_rate": 0.0008556032046236897, "loss": 1.5190675258636475, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.9354061397916119, "eval_cos_loss": 0.4837510161308337, "eval_dec_loss": 0.1238921330338205, "eval_loss": 1.5238950331328012, "eval_mse2_loss": 0.17081616409043512, "eval_mse_loss": 1.353078869360088, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5162489819628343, "flow/improvement_ratio": 0.8831684786373618, "flow/mag_ratio_mean": 0.5513551437905603, "flow/mag_ratio_std": 0.24864270314097658, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.9354061397916119, "eval_cos_loss": 0.4837510161308337, "eval_dec_loss": 0.1238921330338205, "eval_loss": 1.5238950331328012, "eval_mse2_loss": 0.17081616409043512, "eval_mse_loss": 1.353078869360088, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 172.5467, "eval_samples_per_second": 173.866, "eval_steps_per_second": 2.718, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5162489819628343, "flow/improvement_ratio": 0.8831684786373618, "flow/mag_ratio_mean": 0.5513551437905603, "flow/mag_ratio_std": 0.24864270314097658, "step": 6144 }, { "epoch": 0.2758204581205422, "grad_norm": 4.0988078117370605, "learning_rate": 0.000842924961319492, "loss": 1.5116130113601685, "step": 6400 }, { "epoch": 0.28685327644536385, "grad_norm": 3.4181602001190186, "learning_rate": 0.0008298160052303045, "loss": 1.506291389465332, "step": 6656 }, { "epoch": 0.2978860947701855, "grad_norm": 3.047140598297119, "learning_rate": 0.0008162928011680314, "loss": 1.5031940937042236, "step": 6912 }, { "epoch": 0.30891891309500724, "grad_norm": 3.206284761428833, "learning_rate": 0.000802372334238864, "loss": 1.5017902851104736, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.9368851499593083, "eval_cos_loss": 0.47378275395710584, "eval_dec_loss": 0.11915739709888694, "eval_loss": 1.4968281847073326, "eval_mse2_loss": 0.1652413869876343, "eval_mse_loss": 1.3315867993877386, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5262172466783381, "flow/improvement_ratio": 0.8891915549347396, "flow/mag_ratio_mean": 0.5413357251361489, "flow/mag_ratio_std": 0.24727411558633167, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.9368851499593083, "eval_cos_loss": 0.47378275395710584, "eval_dec_loss": 0.11915739709888694, "eval_loss": 1.4968281847073326, "eval_mse2_loss": 0.1652413869876343, "eval_mse_loss": 1.3315867993877386, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 176.7103, "eval_samples_per_second": 169.769, "eval_steps_per_second": 2.654, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5262172466783381, "flow/improvement_ratio": 0.8891915549347396, "flow/mag_ratio_mean": 0.5413357251361489, "flow/mag_ratio_std": 0.24727411558633167, "step": 7168 }, { "epoch": 0.3199517314198289, "grad_norm": 3.409893274307251, "learning_rate": 0.0007880720885100349, "loss": 1.4971449375152588, "step": 7424 }, { "epoch": 0.3309845497446506, "grad_norm": 3.073834180831909, "learning_rate": 0.0007734100250498788, "loss": 1.49185311794281, "step": 7680 }, { "epoch": 0.3420173680694723, "grad_norm": 4.364046096801758, "learning_rate": 0.000758404559368781, "loss": 1.4903632402420044, "step": 7936 }, { "epoch": 0.353050186394294, "grad_norm": 2.848015785217285, "learning_rate": 0.0007430745382893488, "loss": 1.4838106632232666, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.9353698110567827, "eval_cos_loss": 0.4692605264913807, "eval_dec_loss": 0.12152907604983113, "eval_loss": 1.4847185436342316, "eval_mse2_loss": 0.16447527768578865, "eval_mse_loss": 1.3202432660914178, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5307394738898856, "flow/improvement_ratio": 0.8924712222268079, "flow/mag_ratio_mean": 0.5445197708825312, "flow/mag_ratio_std": 0.24787098121668485, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.9353698110567827, "eval_cos_loss": 0.4692605264913807, "eval_dec_loss": 0.12152907604983113, "eval_loss": 1.4847185436342316, "eval_mse2_loss": 0.16447527768578865, "eval_mse_loss": 1.3202432660914178, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 173.5643, "eval_samples_per_second": 172.847, "eval_steps_per_second": 2.702, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5307394738898856, "flow/improvement_ratio": 0.8924712222268079, "flow/mag_ratio_mean": 0.5445197708825312, "flow/mag_ratio_std": 0.24787098121668485, "step": 8192 }, { "epoch": 0.36408300471911564, "grad_norm": 2.5942811965942383, "learning_rate": 0.0007274392162748551, "loss": 1.4843038320541382, "step": 8448 }, { "epoch": 0.3751158230439373, "grad_norm": 3.0958163738250732, "learning_rate": 0.000711518231245687, "loss": 1.4775118827819824, "step": 8704 }, { "epoch": 0.38614864136875904, "grad_norm": 3.219830274581909, "learning_rate": 0.0006953315799141723, "loss": 1.4766325950622559, "step": 8960 }, { "epoch": 0.3971814596935807, "grad_norm": 2.296929359436035, "learning_rate": 0.0006788995926687669, "loss": 1.4715560674667358, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.9356008220185699, "eval_cos_loss": 0.4650544371050812, "eval_dec_loss": 0.1218992033397465, "eval_loss": 1.4720673169662704, "eval_mse2_loss": 0.1628535425167348, "eval_mse_loss": 1.3092137732739642, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.534945561814664, "flow/improvement_ratio": 0.8920619619934798, "flow/mag_ratio_mean": 0.5498689174143745, "flow/mag_ratio_std": 0.2565161931489322, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.9356008220185699, "eval_cos_loss": 0.4650544371050812, "eval_dec_loss": 0.1218992033397465, "eval_loss": 1.4720673169662704, "eval_mse2_loss": 0.1628535425167348, "eval_mse_loss": 1.3092137732739642, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 165.3408, "eval_samples_per_second": 181.443, "eval_steps_per_second": 2.837, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.534945561814664, "flow/improvement_ratio": 0.8920619619934798, "flow/mag_ratio_mean": 0.5498689174143745, "flow/mag_ratio_std": 0.2565161931489322, "step": 9216 }, { "epoch": 0.4082142780184024, "grad_norm": 2.3320987224578857, "learning_rate": 0.0006622429080391422, "loss": 1.4737770557403564, "step": 9472 }, { "epoch": 0.4192470963432241, "grad_norm": 2.76210355758667, "learning_rate": 0.0006453824467742515, "loss": 1.467403531074524, "step": 9728 }, { "epoch": 0.43027991466804577, "grad_norm": 2.2834606170654297, "learning_rate": 0.0006283393855659275, "loss": 1.466551661491394, "step": 9984 }, { "epoch": 0.44131273299286744, "grad_norm": 2.1904284954071045, "learning_rate": 0.0006111351304510173, "loss": 1.46049165725708, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.9362346289783122, "eval_cos_loss": 0.46082611143716107, "eval_dec_loss": 0.1203397757717287, "eval_loss": 1.4601926084266288, "eval_mse2_loss": 0.1609408200613217, "eval_mse_loss": 1.2992517866813806, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5391738893253717, "flow/improvement_ratio": 0.8964146429033422, "flow/mag_ratio_mean": 0.5516184565863376, "flow/mag_ratio_std": 0.25244328309732206, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.9362346289783122, "eval_cos_loss": 0.46082611143716107, "eval_dec_loss": 0.1203397757717287, "eval_loss": 1.4601926084266288, "eval_mse2_loss": 0.1609408200613217, "eval_mse_loss": 1.2992517866813806, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 179.4357, "eval_samples_per_second": 167.191, "eval_steps_per_second": 2.614, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5391738893253717, "flow/improvement_ratio": 0.8964146429033422, "flow/mag_ratio_mean": 0.5516184565863376, "flow/mag_ratio_std": 0.25244328309732206, "step": 10240 }, { "epoch": 0.45234555131768917, "grad_norm": 2.3216679096221924, "learning_rate": 0.0005937912899254605, "loss": 1.4583410024642944, "step": 10496 }, { "epoch": 0.46337836964251083, "grad_norm": 2.843108892440796, "learning_rate": 0.0005763296478040787, "loss": 1.4574695825576782, "step": 10752 }, { "epoch": 0.4744111879673325, "grad_norm": 2.320223808288574, "learning_rate": 0.0005587721358601663, "loss": 1.4572159051895142, "step": 11008 }, { "epoch": 0.4854440062921542, "grad_norm": 2.517941474914551, "learning_rate": 0.0005411408062792448, "loss": 1.4533445835113525, "step": 11264 }, { "epoch": 0.4854440062921542, "eval_bleu": 0.9349888888103645, "eval_cos_loss": 0.4606767813407028, "eval_dec_loss": 0.12063868982848455, "eval_loss": 1.4557419222301002, "eval_mse2_loss": 0.15818865597248077, "eval_mse_loss": 1.2975532645101486, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5393232155456218, "flow/improvement_ratio": 0.892127673127758, "flow/mag_ratio_mean": 0.550250798717999, "flow/mag_ratio_std": 0.2556351939879501, "step": 11264 }, { "epoch": 0.4854440062921542, "eval_bleu": 0.9349888888103645, "eval_cos_loss": 0.4606767813407028, "eval_dec_loss": 0.12063868982848455, "eval_loss": 1.4557419222301002, "eval_mse2_loss": 0.15818865597248077, "eval_mse_loss": 1.2975532645101486, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 169.0185, "eval_samples_per_second": 177.495, "eval_steps_per_second": 2.775, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5393232155456218, "flow/improvement_ratio": 0.892127673127758, "flow/mag_ratio_mean": 0.550250798717999, "flow/mag_ratio_std": 0.2556351939879501, "step": 11264 }, { "epoch": 0.4964768246169759, "grad_norm": 2.285221815109253, "learning_rate": 0.0005234578039615789, "loss": 1.4499876499176025, "step": 11520 }, { "epoch": 0.5075096429417976, "grad_norm": 2.029639482498169, "learning_rate": 0.0005057453387082458, "loss": 1.4481278657913208, "step": 11776 }, { "epoch": 0.5185424612666193, "grad_norm": 2.0235965251922607, "learning_rate": 0.0004880256573256866, "loss": 1.4484679698944092, "step": 12032 }, { "epoch": 0.5295752795914409, "grad_norm": 1.6718403100967407, "learning_rate": 0.0004703210156837805, "loss": 1.447327733039856, "step": 12288 }, { "epoch": 0.5295752795914409, "eval_bleu": 0.9364176777903095, "eval_cos_loss": 0.4588198344082212, "eval_dec_loss": 0.11679136855746193, "eval_loss": 1.4490999984842883, "eval_mse2_loss": 0.15755183694522773, "eval_mse_loss": 1.2915481588225375, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5411801628593697, "flow/improvement_ratio": 0.8935188584998727, "flow/mag_ratio_mean": 0.5548275625273618, "flow/mag_ratio_std": 0.25878340491989277, "step": 12288 }, { "epoch": 0.5295752795914409, "eval_bleu": 0.9364176777903095, "eval_cos_loss": 0.4588198344082212, "eval_dec_loss": 0.11679136855746193, "eval_loss": 1.4490999984842883, "eval_mse2_loss": 0.15755183694522773, "eval_mse_loss": 1.2915481588225375, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 167.4569, "eval_samples_per_second": 179.151, "eval_steps_per_second": 2.801, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5411801628593697, "flow/improvement_ratio": 0.8935188584998727, "flow/mag_ratio_mean": 0.5548275625273618, "flow/mag_ratio_std": 0.25878340491989277, "step": 12288 }, { "epoch": 0.5406080979162626, "grad_norm": 1.9634923934936523, "learning_rate": 0.0004526536507625343, "loss": 1.443847894668579, "step": 12544 }, { "epoch": 0.5516409162410844, "grad_norm": 2.0251376628875732, "learning_rate": 0.00043504575272249973, "loss": 1.4448539018630981, "step": 12800 }, { "epoch": 0.562673734565906, "grad_norm": 1.487449049949646, "learning_rate": 0.0004175194370339921, "loss": 1.44254732131958, "step": 13056 }, { "epoch": 0.5737065528907277, "grad_norm": 2.1053073406219482, "learning_rate": 0.0004000967167001243, "loss": 1.4406143426895142, "step": 13312 }, { "epoch": 0.5737065528907277, "eval_bleu": 0.9372209976548388, "eval_cos_loss": 0.45495398104317913, "eval_dec_loss": 0.11334308082345071, "eval_loss": 1.438905306970641, "eval_mse2_loss": 0.15505272262830977, "eval_mse_loss": 1.2838525835639123, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5450460188932764, "flow/improvement_ratio": 0.8936982345479384, "flow/mag_ratio_mean": 0.5579607989996481, "flow/mag_ratio_std": 0.2575365855876825, "step": 13312 }, { "epoch": 0.5737065528907277, "eval_bleu": 0.9372209976548388, "eval_cos_loss": 0.45495398104317913, "eval_dec_loss": 0.11334308082345071, "eval_loss": 1.438905306970641, "eval_mse2_loss": 0.15505272262830977, "eval_mse_loss": 1.2838525835639123, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 177.5771, "eval_samples_per_second": 168.941, "eval_steps_per_second": 2.641, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5450460188932764, "flow/improvement_ratio": 0.8936982345479384, "flow/mag_ratio_mean": 0.5579607989996481, "flow/mag_ratio_std": 0.2575365855876825, "step": 13312 }, { "epoch": 0.5847393712155494, "grad_norm": 2.0249550342559814, "learning_rate": 0.00038279947460853446, "loss": 1.4377583265304565, "step": 13568 }, { "epoch": 0.595772189540371, "grad_norm": 1.6271026134490967, "learning_rate": 0.00036564943604654345, "loss": 1.4369451999664307, "step": 13824 }, { "epoch": 0.6068050078651928, "grad_norm": 1.2092267274856567, "learning_rate": 0.00034866814141425254, "loss": 1.437900424003601, "step": 14080 }, { "epoch": 0.6178378261900145, "grad_norm": 1.7238409519195557, "learning_rate": 0.0003318769191698637, "loss": 1.4345003366470337, "step": 14336 }, { "epoch": 0.6178378261900145, "eval_bleu": 0.9375344922384511, "eval_cos_loss": 0.45383678259117516, "eval_dec_loss": 0.11429015738961062, "eval_loss": 1.4351127869538916, "eval_mse2_loss": 0.15503559347345378, "eval_mse_loss": 1.280077194608351, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5461632174723692, "flow/improvement_ratio": 0.8951520059408664, "flow/mag_ratio_mean": 0.5569842827599695, "flow/mag_ratio_std": 0.26188866851299303, "step": 14336 }, { "epoch": 0.6178378261900145, "eval_bleu": 0.9375344922384511, "eval_cos_loss": 0.45383678259117516, "eval_dec_loss": 0.11429015738961062, "eval_loss": 1.4351127869538916, "eval_mse2_loss": 0.15503559347345378, "eval_mse_loss": 1.280077194608351, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 176.4023, "eval_samples_per_second": 170.066, "eval_steps_per_second": 2.659, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5461632174723692, "flow/improvement_ratio": 0.8951520059408664, "flow/mag_ratio_mean": 0.5569842827599695, "flow/mag_ratio_std": 0.26188866851299303, "step": 14336 }, { "epoch": 0.6288706445148361, "grad_norm": 1.5894383192062378, "learning_rate": 0.00031529685904119485, "loss": 1.4325922727584839, "step": 14592 }, { "epoch": 0.6399034628396578, "grad_norm": 1.9905190467834473, "learning_rate": 0.0002989487855370421, "loss": 1.4348605871200562, "step": 14848 }, { "epoch": 0.6509362811644795, "grad_norm": 1.5708714723587036, "learning_rate": 0.00028285323179165424, "loss": 1.4332606792449951, "step": 15104 }, { "epoch": 0.6619690994893012, "grad_norm": 1.5457080602645874, "learning_rate": 0.0002670304137751759, "loss": 1.435610294342041, "step": 15360 }, { "epoch": 0.6619690994893012, "eval_bleu": 0.9355922222002754, "eval_cos_loss": 0.4520330929171556, "eval_dec_loss": 0.12299507638332305, "eval_loss": 1.4329845295277739, "eval_mse2_loss": 0.15654502583465088, "eval_mse_loss": 1.2764395058536326, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5479669079724659, "flow/improvement_ratio": 0.8930472631190123, "flow/mag_ratio_mean": 0.5571114101897933, "flow/mag_ratio_std": 0.256343261551247, "step": 15360 }, { "epoch": 0.6619690994893012, "eval_bleu": 0.9355922222002754, "eval_cos_loss": 0.4520330929171556, "eval_dec_loss": 0.12299507638332305, "eval_loss": 1.4329845295277739, "eval_mse2_loss": 0.15654502583465088, "eval_mse_loss": 1.2764395058536326, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 174.468, "eval_samples_per_second": 171.951, "eval_steps_per_second": 2.688, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5479669079724659, "flow/improvement_ratio": 0.8930472631190123, "flow/mag_ratio_mean": 0.5571114101897933, "flow/mag_ratio_std": 0.256343261551247, "step": 15360 }, { "epoch": 0.6730019178141229, "grad_norm": 1.6477590799331665, "learning_rate": 0.0002515002049024435, "loss": 1.4318206310272217, "step": 15616 }, { "epoch": 0.6840347361389446, "grad_norm": 1.5333731174468994, "learning_rate": 0.00023628211107203429, "loss": 1.4313552379608154, "step": 15872 }, { "epoch": 0.6950675544637662, "grad_norm": 1.147505283355713, "learning_rate": 0.00022139524616691188, "loss": 1.4301337003707886, "step": 16128 }, { "epoch": 0.706100372788588, "grad_norm": 1.1893175840377808, "learning_rate": 0.000206858308047443, "loss": 1.427048921585083, "step": 16384 }, { "epoch": 0.706100372788588, "eval_bleu": 0.9378969893045919, "eval_cos_loss": 0.45077633311245235, "eval_dec_loss": 0.11347989061736126, "eval_loss": 1.426147088567331, "eval_mse2_loss": 0.1526812995071096, "eval_mse_loss": 1.2734657897115516, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5492236670146364, "flow/improvement_ratio": 0.8943654500853533, "flow/mag_ratio_mean": 0.560582883195328, "flow/mag_ratio_std": 0.25867489536306754, "step": 16384 }, { "epoch": 0.706100372788588, "eval_bleu": 0.9378969893045919, "eval_cos_loss": 0.45077633311245235, "eval_dec_loss": 0.11347989061736126, "eval_loss": 1.426147088567331, "eval_mse2_loss": 0.1526812995071096, "eval_mse_loss": 1.2734657897115516, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 174.3978, "eval_samples_per_second": 172.021, "eval_steps_per_second": 2.689, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5492236670146364, "flow/improvement_ratio": 0.8943654500853533, "flow/mag_ratio_mean": 0.560582883195328, "flow/mag_ratio_std": 0.25867489536306754, "step": 16384 }, { "epoch": 0.7171331911134096, "grad_norm": 0.9107947945594788, "learning_rate": 0.00019268955506693798, "loss": 1.42864990234375, "step": 16640 }, { "epoch": 0.7281660094382313, "grad_norm": 1.4668571949005127, "learning_rate": 0.00017890678313921, "loss": 1.426837682723999, "step": 16896 }, { "epoch": 0.739198827763053, "grad_norm": 1.6071405410766602, "learning_rate": 0.00016552730338695792, "loss": 1.4257097244262695, "step": 17152 }, { "epoch": 0.7502316460878746, "grad_norm": 1.0349161624908447, "learning_rate": 0.00015256792039904465, "loss": 1.424682855606079, "step": 17408 }, { "epoch": 0.7502316460878746, "eval_bleu": 0.9392870027928614, "eval_cos_loss": 0.4492604330277392, "eval_dec_loss": 0.11262787379888392, "eval_loss": 1.4202917224562752, "eval_mse2_loss": 0.15203414787488706, "eval_mse_loss": 1.2682575782987355, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5507395660190948, "flow/improvement_ratio": 0.8961705508262618, "flow/mag_ratio_mean": 0.5607809571823331, "flow/mag_ratio_std": 0.2602766063739496, "step": 17408 }, { "epoch": 0.7502316460878746, "eval_bleu": 0.9392870027928614, "eval_cos_loss": 0.4492604330277392, "eval_dec_loss": 0.11262787379888392, "eval_loss": 1.4202917224562752, "eval_mse2_loss": 0.15203414787488706, "eval_mse_loss": 1.2682575782987355, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 188.3193, "eval_samples_per_second": 159.304, "eval_steps_per_second": 2.49, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5507395660190948, "flow/improvement_ratio": 0.8961705508262618, "flow/mag_ratio_mean": 0.5607809571823331, "flow/mag_ratio_std": 0.2602766063739496, "step": 17408 }, { "epoch": 0.7612644644126964, "grad_norm": 1.1615740060806274, "learning_rate": 0.00014004491112398103, "loss": 1.4255716800689697, "step": 17664 }, { "epoch": 0.7722972827375181, "grad_norm": 0.9076006412506104, "learning_rate": 0.00012797400442612433, "loss": 1.4207247495651245, "step": 17920 }, { "epoch": 0.7833301010623397, "grad_norm": 1.1816908121109009, "learning_rate": 0.00011637036133026895, "loss": 1.4235727787017822, "step": 18176 }, { "epoch": 0.7943629193871614, "grad_norm": 1.0465947389602661, "learning_rate": 0.00010524855597944216, "loss": 1.421402096748352, "step": 18432 }, { "epoch": 0.7943629193871614, "eval_bleu": 0.9381946887096906, "eval_cos_loss": 0.4503502440986349, "eval_dec_loss": 0.11471369324811995, "eval_loss": 1.4244044193072614, "eval_mse2_loss": 0.1525979548184348, "eval_mse_loss": 1.2718064660456643, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5496497546940216, "flow/improvement_ratio": 0.8959581610490518, "flow/mag_ratio_mean": 0.5600753023680339, "flow/mag_ratio_std": 0.2604317919277687, "step": 18432 }, { "epoch": 0.7943629193871614, "eval_bleu": 0.9381946887096906, "eval_cos_loss": 0.4503502440986349, "eval_dec_loss": 0.11471369324811995, "eval_loss": 1.4244044193072614, "eval_mse2_loss": 0.1525979548184348, "eval_mse_loss": 1.2718064660456643, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 180.9403, "eval_samples_per_second": 165.801, "eval_steps_per_second": 2.592, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5496497546940216, "flow/improvement_ratio": 0.8959581610490518, "flow/mag_ratio_mean": 0.5600753023680339, "flow/mag_ratio_std": 0.2604317919277687, "step": 18432 }, { "epoch": 0.8053957377119831, "grad_norm": 1.0948517322540283, "learning_rate": 9.462255732982089e-05, "loss": 1.4208451509475708, "step": 18688 }, { "epoch": 0.8164285560368048, "grad_norm": 0.8038628101348877, "learning_rate": 8.450571160576348e-05, "loss": 1.4227826595306396, "step": 18944 }, { "epoch": 0.8274613743616265, "grad_norm": 0.7290875315666199, "learning_rate": 7.491072553698764e-05, "loss": 1.4202244281768799, "step": 19200 }, { "epoch": 0.8384941926864482, "grad_norm": 0.7819137573242188, "learning_rate": 6.584965039895586e-05, "loss": 1.4174209833145142, "step": 19456 }, { "epoch": 0.8384941926864482, "eval_bleu": 0.9372464164495187, "eval_cos_loss": 0.4498716953720874, "eval_dec_loss": 0.11378647393183604, "eval_loss": 1.422363788588469, "eval_mse2_loss": 0.15286639037289854, "eval_mse_loss": 1.2694973953243003, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5501283043101907, "flow/improvement_ratio": 0.8944210361824361, "flow/mag_ratio_mean": 0.5602786459648279, "flow/mag_ratio_std": 0.26204406249243567, "step": 19456 }, { "epoch": 0.8384941926864482, "eval_bleu": 0.9372464164495187, "eval_cos_loss": 0.4498716953720874, "eval_dec_loss": 0.11378647393183604, "eval_loss": 1.422363788588469, "eval_mse2_loss": 0.15286639037289854, "eval_mse_loss": 1.2694973953243003, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 175.4483, "eval_samples_per_second": 170.991, "eval_steps_per_second": 2.673, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5501283043101907, "flow/improvement_ratio": 0.8944210361824361, "flow/mag_ratio_mean": 0.5602786459648279, "flow/mag_ratio_std": 0.26204406249243567, "step": 19456 }, { "epoch": 0.8495270110112698, "grad_norm": 0.7724987268447876, "learning_rate": 5.73338668765051e-05, "loss": 1.4236266613006592, "step": 19712 }, { "epoch": 0.8605598293360915, "grad_norm": 0.6551377177238464, "learning_rate": 4.9374070769740984e-05, "loss": 1.4212990999221802, "step": 19968 }, { "epoch": 0.8715926476609133, "grad_norm": 0.8115680813789368, "learning_rate": 4.198025956014095e-05, "loss": 1.4218370914459229, "step": 20224 }, { "epoch": 0.8826254659857349, "grad_norm": 1.0555589199066162, "learning_rate": 3.516171985374755e-05, "loss": 1.4221084117889404, "step": 20480 }, { "epoch": 0.8826254659857349, "eval_bleu": 0.937209637932583, "eval_cos_loss": 0.4497852080158079, "eval_dec_loss": 0.11631931441583868, "eval_loss": 1.4226631736958713, "eval_mse2_loss": 0.15323443922089108, "eval_mse_loss": 1.2694287335694725, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5502147936363464, "flow/improvement_ratio": 0.8968729432711977, "flow/mag_ratio_mean": 0.5599655887719664, "flow/mag_ratio_std": 0.2619784128055898, "step": 20480 }, { "epoch": 0.8826254659857349, "eval_bleu": 0.937209637932583, "eval_cos_loss": 0.4497852080158079, "eval_dec_loss": 0.11631931441583868, "eval_loss": 1.4226631736958713, "eval_mse2_loss": 0.15323443922089108, "eval_mse_loss": 1.2694287335694725, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 172.9963, "eval_samples_per_second": 173.414, "eval_steps_per_second": 2.711, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5502147936363464, "flow/improvement_ratio": 0.8968729432711977, "flow/mag_ratio_mean": 0.5599655887719664, "flow/mag_ratio_std": 0.2619784128055898, "step": 20480 } ], "logging_steps": 256, "max_steps": 23204, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }