{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4854440062921542, "eval_steps": 1024, "global_step": 11264, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011032818324821687, "grad_norm": 0.10309942811727524, "learning_rate": 0.000498046875, "loss": 1.9074174165725708, "step": 256 }, { "epoch": 0.022065636649643373, "grad_norm": 0.2910110056400299, "learning_rate": 0.000998046875, "loss": 1.5273144245147705, "step": 512 }, { "epoch": 0.03309845497446506, "grad_norm": 0.3859289586544037, "learning_rate": 0.000999688448778502, "loss": 1.3800736665725708, "step": 768 }, { "epoch": 0.04413127329928675, "grad_norm": 0.5722110867500305, "learning_rate": 0.0009987492950653055, "loss": 1.342606544494629, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.9366650964401493, "eval_cos_loss": 0.4710617309440174, "eval_dec_loss": 0.11786629736169314, "eval_loss": 1.3323029561845987, "eval_mse2_loss": 0.1665979178824913, "eval_mse_loss": 1.3323029561845987, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5289382661329404, "flow/improvement_ratio": 0.8936813888010948, "flow/mag_ratio_mean": 0.5435932263382462, "flow/mag_ratio_std": 0.2489985737210906, "step": 1024 }, { "epoch": 0.04413127329928675, "eval_bleu": 0.9366650964401493, "eval_cos_loss": 0.4710617309440174, "eval_dec_loss": 0.11786629736169314, "eval_loss": 1.3323029561845987, "eval_mse2_loss": 0.1665979178824913, "eval_mse_loss": 1.3323029561845987, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 157.3375, "eval_samples_per_second": 190.673, "eval_steps_per_second": 2.981, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5289382661329404, "flow/improvement_ratio": 0.8936813888010948, "flow/mag_ratio_mean": 0.5435932263382462, "flow/mag_ratio_std": 0.2489985737210906, "step": 1024 }, { "epoch": 0.05516409162410843, "grad_norm": 0.6506242752075195, "learning_rate": 0.0009971837136430763, "loss": 1.3261979818344116, "step": 1280 }, { "epoch": 0.06619690994893011, "grad_norm": 0.6324401497840881, "learning_rate": 0.0009949936708776692, "loss": 1.3123514652252197, "step": 1536 }, { "epoch": 0.07722972827375181, "grad_norm": 1.1031574010849, "learning_rate": 0.0009921819174566252, "loss": 1.3050185441970825, "step": 1792 }, { "epoch": 0.0882625465985735, "grad_norm": 0.762417733669281, "learning_rate": 0.000988751984934317, "loss": 1.3001574277877808, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.938925796606621, "eval_cos_loss": 0.4579503086330032, "eval_dec_loss": 0.10506504188690867, "eval_loss": 1.2970875999820766, "eval_mse2_loss": 0.15707123614768229, "eval_mse_loss": 1.2970875999820766, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5420496905409197, "flow/improvement_ratio": 0.8918823948038667, "flow/mag_ratio_mean": 0.5503126610316702, "flow/mag_ratio_std": 0.25175602854823254, "step": 2048 }, { "epoch": 0.0882625465985735, "eval_bleu": 0.938925796606621, "eval_cos_loss": 0.4579503086330032, "eval_dec_loss": 0.10506504188690867, "eval_loss": 1.2970875999820766, "eval_mse2_loss": 0.15707123614768229, "eval_mse_loss": 1.2970875999820766, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 151.9416, "eval_samples_per_second": 197.444, "eval_steps_per_second": 3.087, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5420496905409197, "flow/improvement_ratio": 0.8918823948038667, "flow/mag_ratio_mean": 0.5503126610316702, "flow/mag_ratio_std": 0.25175602854823254, "step": 2048 }, { "epoch": 0.09929536492339518, "grad_norm": 0.39165085554122925, "learning_rate": 0.0009847081812963268, "loss": 1.2909460067749023, "step": 2304 }, { "epoch": 0.11032818324821686, "grad_norm": 0.6050369739532471, "learning_rate": 0.0009800555855486275, "loss": 1.291382908821106, "step": 2560 }, { "epoch": 0.12136100157303854, "grad_norm": 0.6340572237968445, "learning_rate": 0.0009748000413383664, "loss": 1.2860350608825684, "step": 2816 }, { "epoch": 0.13239381989786023, "grad_norm": 0.8046131134033203, "learning_rate": 0.0009689481496142604, "loss": 1.2806360721588135, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.9365596012238808, "eval_cos_loss": 0.4510079253075728, "eval_dec_loss": 0.1170106883853801, "eval_loss": 1.2785198518208094, "eval_mse2_loss": 0.15482012001372603, "eval_mse_loss": 1.2785198518208094, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5489920710703966, "flow/improvement_ratio": 0.895310169598187, "flow/mag_ratio_mean": 0.5600611698398712, "flow/mag_ratio_std": 0.2589119763326035, "step": 3072 }, { "epoch": 0.13239381989786023, "eval_bleu": 0.9365596012238808, "eval_cos_loss": 0.4510079253075728, "eval_dec_loss": 0.1170106883853801, "eval_loss": 1.2785198518208094, "eval_mse2_loss": 0.15482012001372603, "eval_mse_loss": 1.2785198518208094, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 150.2303, "eval_samples_per_second": 199.693, "eval_steps_per_second": 3.122, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5489920710703966, "flow/improvement_ratio": 0.895310169598187, "flow/mag_ratio_mean": 0.5600611698398712, "flow/mag_ratio_std": 0.2589119763326035, "step": 3072 }, { "epoch": 0.14342663822268192, "grad_norm": 0.7344346046447754, "learning_rate": 0.0009625072603358231, "loss": 1.277908444404602, "step": 3328 }, { "epoch": 0.15445945654750362, "grad_norm": 0.7456739544868469, "learning_rate": 0.0009554854632418371, "loss": 1.274967074394226, "step": 3584 }, { "epoch": 0.1654922748723253, "grad_norm": 0.528167724609375, "learning_rate": 0.000947891577689663, "loss": 1.2722811698913574, "step": 3840 }, { "epoch": 0.176525093197147, "grad_norm": 0.7374073266983032, "learning_rate": 0.0009397351415781539, "loss": 1.2716022729873657, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.9383145863088955, "eval_cos_loss": 0.44795799712890755, "eval_dec_loss": 0.11301154795406597, "eval_loss": 1.2707049117159488, "eval_mse2_loss": 0.15204078735890927, "eval_mse_loss": 1.2707049117159488, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.552042003760714, "flow/improvement_ratio": 0.8948889724227157, "flow/mag_ratio_mean": 0.5576132778674047, "flow/mag_ratio_std": 0.25525683488672984, "step": 4096 }, { "epoch": 0.176525093197147, "eval_bleu": 0.9383145863088955, "eval_cos_loss": 0.44795799712890755, "eval_dec_loss": 0.11301154795406597, "eval_loss": 1.2707049117159488, "eval_mse2_loss": 0.15204078735890927, "eval_mse_loss": 1.2707049117159488, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 149.5476, "eval_samples_per_second": 200.605, "eval_steps_per_second": 3.136, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.552042003760714, "flow/improvement_ratio": 0.8948889724227157, "flow/mag_ratio_mean": 0.5576132778674047, "flow/mag_ratio_std": 0.25525683488672984, "step": 4096 }, { "epoch": 0.18755791152196866, "grad_norm": 1.123129963874817, "learning_rate": 0.000931026399368079, "loss": 1.2691912651062012, "step": 4352 }, { "epoch": 0.19859072984679035, "grad_norm": 0.49173882603645325, "learning_rate": 0.0009217762892151117, "loss": 1.26752769947052, "step": 4608 }, { "epoch": 0.20962354817161205, "grad_norm": 0.5665431618690491, "learning_rate": 0.0009119964292315354, "loss": 1.2669333219528198, "step": 4864 }, { "epoch": 0.22065636649643372, "grad_norm": 0.4946308732032776, "learning_rate": 0.0009016991028939279, "loss": 1.2646225690841675, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.9396675860722136, "eval_cos_loss": 0.44516199083724767, "eval_dec_loss": 0.10893038547893705, "eval_loss": 1.264682760879175, "eval_mse2_loss": 0.1498125367073108, "eval_mse_loss": 1.264682760879175, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5548380070657872, "flow/improvement_ratio": 0.8946977740665997, "flow/mag_ratio_mean": 0.5694006043456511, "flow/mag_ratio_std": 0.2655116878211625, "step": 5120 }, { "epoch": 0.22065636649643372, "eval_bleu": 0.9396675860722136, "eval_cos_loss": 0.44516199083724767, "eval_dec_loss": 0.10893038547893705, "eval_loss": 1.264682760879175, "eval_mse2_loss": 0.1498125367073108, "eval_mse_loss": 1.264682760879175, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 151.8799, "eval_samples_per_second": 197.524, "eval_steps_per_second": 3.088, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5548380070657872, "flow/improvement_ratio": 0.8946977740665997, "flow/mag_ratio_mean": 0.5694006043456511, "flow/mag_ratio_std": 0.2655116878211625, "step": 5120 }, { "epoch": 0.23168918482125542, "grad_norm": 0.5147830843925476, "learning_rate": 0.0008908972436151494, "loss": 1.261371374130249, "step": 5376 }, { "epoch": 0.2427220031460771, "grad_norm": 0.7221893668174744, "learning_rate": 0.0008796044185000127, "loss": 1.259010672569275, "step": 5632 }, { "epoch": 0.2537548214708988, "grad_norm": 0.6270182132720947, "learning_rate": 0.0008678348113050368, "loss": 1.2565613985061646, "step": 5888 }, { "epoch": 0.26478763979572045, "grad_norm": 0.3954711854457855, "learning_rate": 0.0008556032046236897, "loss": 1.258548378944397, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.9381239377332383, "eval_cos_loss": 0.4434889930524806, "eval_dec_loss": 0.11391587999226378, "eval_loss": 1.2588644528439812, "eval_mse2_loss": 0.15056055846181252, "eval_mse_loss": 1.2588644528439812, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5565110067568863, "flow/improvement_ratio": 0.8946461940625074, "flow/mag_ratio_mean": 0.5628604918146438, "flow/mag_ratio_std": 0.2606462057528974, "step": 6144 }, { "epoch": 0.26478763979572045, "eval_bleu": 0.9381239377332383, "eval_cos_loss": 0.4434889930524806, "eval_dec_loss": 0.11391587999226378, "eval_loss": 1.2588644528439812, "eval_mse2_loss": 0.15056055846181252, "eval_mse_loss": 1.2588644528439812, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 153.8457, "eval_samples_per_second": 195.001, "eval_steps_per_second": 3.049, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5565110067568863, "flow/improvement_ratio": 0.8946461940625074, "flow/mag_ratio_mean": 0.5628604918146438, "flow/mag_ratio_std": 0.2606462057528974, "step": 6144 }, { "epoch": 0.2758204581205422, "grad_norm": 0.8126729130744934, "learning_rate": 0.000842924961319492, "loss": 1.2565950155258179, "step": 6400 }, { "epoch": 0.28685327644536385, "grad_norm": 0.84797203540802, "learning_rate": 0.0008298160052303045, "loss": 1.2548315525054932, "step": 6656 }, { "epoch": 0.2978860947701855, "grad_norm": 0.561568021774292, "learning_rate": 0.0008162928011680314, "loss": 1.2526129484176636, "step": 6912 }, { "epoch": 0.30891891309500724, "grad_norm": 0.45474377274513245, "learning_rate": 0.000802372334238864, "loss": 1.2513761520385742, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.9385536520845816, "eval_cos_loss": 0.4402598062557961, "eval_dec_loss": 0.11249503215500858, "eval_loss": 1.2510530173397267, "eval_mse2_loss": 0.1480516226116274, "eval_mse_loss": 1.2510530173397267, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5597401952692694, "flow/improvement_ratio": 0.895444744939743, "flow/mag_ratio_mean": 0.5710282248220464, "flow/mag_ratio_std": 0.26387540328858505, "step": 7168 }, { "epoch": 0.30891891309500724, "eval_bleu": 0.9385536520845816, "eval_cos_loss": 0.4402598062557961, "eval_dec_loss": 0.11249503215500858, "eval_loss": 1.2510530173397267, "eval_mse2_loss": 0.1480516226116274, "eval_mse_loss": 1.2510530173397267, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.7181, "eval_samples_per_second": 196.44, "eval_steps_per_second": 3.071, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5597401952692694, "flow/improvement_ratio": 0.895444744939743, "flow/mag_ratio_mean": 0.5710282248220464, "flow/mag_ratio_std": 0.26387540328858505, "step": 7168 }, { "epoch": 0.3199517314198289, "grad_norm": 1.3543585538864136, "learning_rate": 0.0007880720885100349, "loss": 1.2521653175354004, "step": 7424 }, { "epoch": 0.3309845497446506, "grad_norm": 0.4370076358318329, "learning_rate": 0.0007734100250498788, "loss": 1.249273419380188, "step": 7680 }, { "epoch": 0.3420173680694723, "grad_norm": 1.0196475982666016, "learning_rate": 0.000758404559368781, "loss": 1.2500712871551514, "step": 7936 }, { "epoch": 0.353050186394294, "grad_norm": 0.733001708984375, "learning_rate": 0.0007430745382893488, "loss": 1.245364785194397, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.9376793187397806, "eval_cos_loss": 0.4385024095013706, "eval_dec_loss": 0.11364057421017049, "eval_loss": 1.2459661925016945, "eval_mse2_loss": 0.148339767350571, "eval_mse_loss": 1.2459661925016945, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5614975882745755, "flow/improvement_ratio": 0.8961417695352518, "flow/mag_ratio_mean": 0.5688313084370547, "flow/mag_ratio_std": 0.26494109700483554, "step": 8192 }, { "epoch": 0.353050186394294, "eval_bleu": 0.9376793187397806, "eval_cos_loss": 0.4385024095013706, "eval_dec_loss": 0.11364057421017049, "eval_loss": 1.2459661925016945, "eval_mse2_loss": 0.148339767350571, "eval_mse_loss": 1.2459661925016945, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.8054, "eval_samples_per_second": 196.328, "eval_steps_per_second": 3.069, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5614975882745755, "flow/improvement_ratio": 0.8961417695352518, "flow/mag_ratio_mean": 0.5688313084370547, "flow/mag_ratio_std": 0.26494109700483554, "step": 8192 }, { "epoch": 0.36408300471911564, "grad_norm": 0.676328718662262, "learning_rate": 0.0007274392162748551, "loss": 1.2448910474777222, "step": 8448 }, { "epoch": 0.3751158230439373, "grad_norm": 0.6379961967468262, "learning_rate": 0.000711518231245687, "loss": 1.2442706823349, "step": 8704 }, { "epoch": 0.38614864136875904, "grad_norm": 0.5386805534362793, "learning_rate": 0.0006953315799141723, "loss": 1.2446835041046143, "step": 8960 }, { "epoch": 0.3971814596935807, "grad_norm": 0.8263258934020996, "learning_rate": 0.0006788995926687669, "loss": 1.2411766052246094, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.9372486918854673, "eval_cos_loss": 0.43675092898452206, "eval_dec_loss": 0.11516488874867273, "eval_loss": 1.241364901762273, "eval_mse2_loss": 0.1478570194196091, "eval_mse_loss": 1.241364901762273, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5632490722863659, "flow/improvement_ratio": 0.8974738620491679, "flow/mag_ratio_mean": 0.5655419154207844, "flow/mag_ratio_std": 0.2603240320041998, "step": 9216 }, { "epoch": 0.3971814596935807, "eval_bleu": 0.9372486918854673, "eval_cos_loss": 0.43675092898452206, "eval_dec_loss": 0.11516488874867273, "eval_loss": 1.241364901762273, "eval_mse2_loss": 0.1478570194196091, "eval_mse_loss": 1.241364901762273, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.8433, "eval_samples_per_second": 196.28, "eval_steps_per_second": 3.069, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5632490722863659, "flow/improvement_ratio": 0.8974738620491679, "flow/mag_ratio_mean": 0.5655419154207844, "flow/mag_ratio_std": 0.2603240320041998, "step": 9216 }, { "epoch": 0.4082142780184024, "grad_norm": 0.7855456471443176, "learning_rate": 0.0006622429080391422, "loss": 1.2460049390792847, "step": 9472 }, { "epoch": 0.4192470963432241, "grad_norm": 0.4608207941055298, "learning_rate": 0.0006453824467742515, "loss": 1.2414920330047607, "step": 9728 }, { "epoch": 0.43027991466804577, "grad_norm": 0.5247617959976196, "learning_rate": 0.0006283393855659275, "loss": 1.2424880266189575, "step": 9984 }, { "epoch": 0.44131273299286744, "grad_norm": 0.8765453100204468, "learning_rate": 0.0006111351304510173, "loss": 1.237776517868042, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.937646836000478, "eval_cos_loss": 0.4353823194752878, "eval_dec_loss": 0.11402556833737632, "eval_loss": 1.2377641976260936, "eval_mse2_loss": 0.1474350707204357, "eval_mse_loss": 1.2377641976260936, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.564617679063191, "flow/improvement_ratio": 0.899760089830549, "flow/mag_ratio_mean": 0.5730336795229394, "flow/mag_ratio_std": 0.26344449729172154, "step": 10240 }, { "epoch": 0.44131273299286744, "eval_bleu": 0.937646836000478, "eval_cos_loss": 0.4353823194752878, "eval_dec_loss": 0.11402556833737632, "eval_loss": 1.2377641976260936, "eval_mse2_loss": 0.1474350707204357, "eval_mse_loss": 1.2377641976260936, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 151.9737, "eval_samples_per_second": 197.403, "eval_steps_per_second": 3.086, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.564617679063191, "flow/improvement_ratio": 0.899760089830549, "flow/mag_ratio_mean": 0.5730336795229394, "flow/mag_ratio_std": 0.26344449729172154, "step": 10240 }, { "epoch": 0.45234555131768917, "grad_norm": 0.6895334124565125, "learning_rate": 0.0005937912899254605, "loss": 1.2384426593780518, "step": 10496 }, { "epoch": 0.46337836964251083, "grad_norm": 0.6421330571174622, "learning_rate": 0.0005763296478040787, "loss": 1.240878939628601, "step": 10752 }, { "epoch": 0.4744111879673325, "grad_norm": 0.7770284414291382, "learning_rate": 0.0005587721358601663, "loss": 1.2393468618392944, "step": 11008 }, { "epoch": 0.4854440062921542, "grad_norm": 1.0520166158676147, "learning_rate": 0.0005411408062792448, "loss": 1.237922191619873, "step": 11264 }, { "epoch": 0.4854440062921542, "eval_bleu": 0.93652744913201, "eval_cos_loss": 0.4366011674851497, "eval_dec_loss": 0.11468809016390459, "eval_loss": 1.2409222840246108, "eval_mse2_loss": 0.14564816977804912, "eval_mse_loss": 1.2409222840246108, "eval_rec_loss": 0.047009017791098624, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5633988297824413, "flow/improvement_ratio": 0.897065937773251, "flow/mag_ratio_mean": 0.5639294942558956, "flow/mag_ratio_std": 0.25510865748564066, "step": 11264 }, { "epoch": 0.4854440062921542, "eval_bleu": 0.93652744913201, "eval_cos_loss": 0.4366011674851497, "eval_dec_loss": 0.11468809016390459, "eval_loss": 1.2409222840246108, "eval_mse2_loss": 0.14564816977804912, "eval_mse_loss": 1.2409222840246108, "eval_rec_loss": 0.047009017791098624, "eval_runtime": 152.4483, "eval_samples_per_second": 196.788, "eval_steps_per_second": 3.076, "eval_var_loss": 0.01723895594080501, "flow/cos_sim": 0.5633988297824413, "flow/improvement_ratio": 0.897065937773251, "flow/mag_ratio_mean": 0.5639294942558956, "flow/mag_ratio_std": 0.25510865748564066, "step": 11264 } ], "logging_steps": 256, "max_steps": 23204, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }