{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9932104752667313, "eval_steps": 1024, "global_step": 21504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.9140732288360596, "learning_rate": 0.000498046875, "loss": 6.9310712814331055, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.966057538986206, "learning_rate": 0.000998046875, "loss": 1.017655372619629, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.6036155223846436, "learning_rate": 0.000999640996023194, "loss": 0.5299859046936035, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.6692569851875305, "learning_rate": 0.0009985588674043958, "loss": 0.43251049518585205, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.6267380536600159, "eval_ce_loss": 1.563563620935292, "eval_cos_loss": 0.11390362637432198, "eval_loss": 0.40128198402113024, "eval_mse_loss": 0.2176408769061032, "eval_rec_loss": 0.015894381031734215, "flow/cos_sim": 0.8860963983078526, "flow/improvement_ratio": 0.9765386980146034, "flow/mag_ratio_mean": 0.8838755380889597, "flow/mag_ratio_std": 0.1646749794006892, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.6267380536600159, "eval_ce_loss": 1.563563620935292, "eval_cos_loss": 0.11390362637432198, "eval_loss": 0.40128198402113024, "eval_mse_loss": 0.2176408769061032, "eval_rec_loss": 0.015894381031734215, "eval_runtime": 145.0392, "eval_samples_per_second": 193.003, "eval_steps_per_second": 3.02, "flow/cos_sim": 0.8860963983078526, "flow/improvement_ratio": 0.9765386980146034, "flow/mag_ratio_mean": 0.8838755380889597, "flow/mag_ratio_std": 0.1646749794006892, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.46750983595848083, "learning_rate": 0.0009967551747861387, "loss": 0.3926387429237366, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.5673767328262329, "learning_rate": 0.000994232528651847, "loss": 0.3741423189640045, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.5806387066841125, "learning_rate": 0.0009909945800260092, "loss": 0.35307449102401733, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.6838217973709106, "learning_rate": 0.0009870460151900522, "loss": 0.3399304151535034, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.6478349738418474, "eval_ce_loss": 1.3434572242982856, "eval_cos_loss": 0.09738318450306649, "eval_loss": 0.3373560005943525, "eval_mse_loss": 0.18800275102597938, "eval_rec_loss": 0.005269206022433341, "flow/cos_sim": 0.9026168323542973, "flow/improvement_ratio": 0.9768092384621433, "flow/mag_ratio_mean": 0.9078535956062682, "flow/mag_ratio_std": 0.12002966954419601, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.6478349738418474, "eval_ce_loss": 1.3434572242982856, "eval_cos_loss": 0.09738318450306649, "eval_loss": 0.3373560005943525, "eval_mse_loss": 0.18800275102597938, "eval_rec_loss": 0.005269206022433341, "eval_runtime": 143.5996, "eval_samples_per_second": 194.938, "eval_steps_per_second": 3.05, "flow/cos_sim": 0.9026168323542973, "flow/improvement_ratio": 0.9768092384621433, "flow/mag_ratio_mean": 0.9078535956062682, "flow/mag_ratio_std": 0.12002966954419601, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.6462650299072266, "learning_rate": 0.0009823925488998885, "loss": 0.3380688428878784, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.6545562744140625, "learning_rate": 0.0009770409161149525, "loss": 0.3214532732963562, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.6667947173118591, "learning_rate": 0.0009709988622506973, "loss": 0.31182003021240234, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.600439727306366, "learning_rate": 0.000964275131968659, "loss": 0.3082171380519867, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.6794402302458825, "eval_ce_loss": 1.18399681810919, "eval_cos_loss": 0.08983433964453875, "eval_loss": 0.3044066195414491, "eval_mse_loss": 0.17457320809908652, "eval_rec_loss": 0.0024502936500277726, "flow/cos_sim": 0.9101656942062726, "flow/improvement_ratio": 0.9734727941691603, "flow/mag_ratio_mean": 0.9100713347463303, "flow/mag_ratio_std": 0.09228039255629392, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.6794402302458825, "eval_ce_loss": 1.18399681810919, "eval_cos_loss": 0.08983433964453875, "eval_loss": 0.3044066195414491, "eval_mse_loss": 0.17457320809908652, "eval_rec_loss": 0.0024502936500277726, "eval_runtime": 142.616, "eval_samples_per_second": 196.282, "eval_steps_per_second": 3.071, "flow/cos_sim": 0.9101656942062726, "flow/improvement_ratio": 0.9734727941691603, "flow/mag_ratio_mean": 0.9100713347463303, "flow/mag_ratio_std": 0.09228039255629392, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.5188813209533691, "learning_rate": 0.0009568794565203123, "loss": 0.3006798326969147, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.3702296316623688, "learning_rate": 0.0009488225396630347, "loss": 0.29647037386894226, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.49310481548309326, "learning_rate": 0.0009401160421685646, "loss": 0.3130263388156891, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.5644649267196655, "learning_rate": 0.0009307725649463714, "loss": 0.2926834523677826, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.6839715035857402, "eval_ce_loss": 1.1360810732460458, "eval_cos_loss": 0.08503438343536364, "eval_loss": 0.2889118402788084, "eval_mse_loss": 0.16517186124030858, "eval_rec_loss": 0.0016284317239192974, "flow/cos_sim": 0.9149656362457362, "flow/improvement_ratio": 0.9750959768415042, "flow/mag_ratio_mean": 0.9128544130553938, "flow/mag_ratio_std": 0.09354445787325297, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.6839715035857402, "eval_ce_loss": 1.1360810732460458, "eval_cos_loss": 0.08503438343536364, "eval_loss": 0.2889118402788084, "eval_mse_loss": 0.16517186124030858, "eval_rec_loss": 0.0016284317239192974, "eval_runtime": 143.3522, "eval_samples_per_second": 195.274, "eval_steps_per_second": 3.055, "flow/cos_sim": 0.9149656362457362, "flow/improvement_ratio": 0.9750959768415042, "flow/mag_ratio_mean": 0.9128544130553938, "flow/mag_ratio_std": 0.09354445787325297, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.3648587763309479, "learning_rate": 0.0009208056308063659, "loss": 0.28915518522262573, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.5004331469535828, "learning_rate": 0.0009102296648873445, "loss": 0.28424444794654846, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.27222171425819397, "learning_rate": 0.0008990599737794927, "loss": 0.2838270366191864, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.39166298508644104, "learning_rate": 0.0008873127233711644, "loss": 0.2802566885948181, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.6966429437264656, "eval_ce_loss": 1.0860335096784923, "eval_cos_loss": 0.08069509963534738, "eval_loss": 0.27587767567944854, "eval_mse_loss": 0.15802378180228413, "eval_rec_loss": 0.0011810324893420196, "flow/cos_sim": 0.9193049165756191, "flow/improvement_ratio": 0.9735453040360316, "flow/mag_ratio_mean": 0.9165048223652251, "flow/mag_ratio_std": 0.08227625193252955, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.6966429437264656, "eval_ce_loss": 1.0860335096784923, "eval_cos_loss": 0.08069509963534738, "eval_loss": 0.27587767567944854, "eval_mse_loss": 0.15802378180228413, "eval_rec_loss": 0.0011810324893420196, "eval_runtime": 144.8851, "eval_samples_per_second": 193.208, "eval_steps_per_second": 3.023, "flow/cos_sim": 0.9193049165756191, "flow/improvement_ratio": 0.9735453040360316, "flow/mag_ratio_mean": 0.9165048223652251, "flow/mag_ratio_std": 0.08227625193252955, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.4552461802959442, "learning_rate": 0.0008750049154520011, "loss": 0.27747780084609985, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.3430848717689514, "learning_rate": 0.0008621543631062487, "loss": 0.2891384959220886, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.4337022304534912, "learning_rate": 0.0008487796649318904, "loss": 0.274080365896225, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.40342026948928833, "learning_rate": 0.0008349001781229053, "loss": 0.2737179100513458, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.7005510372485733, "eval_ce_loss": 1.067978764981984, "eval_cos_loss": 0.0791517910994079, "eval_loss": 0.2710464134744313, "eval_mse_loss": 0.15530247285485813, "eval_rec_loss": 0.00103088276002016, "flow/cos_sim": 0.9208482295682986, "flow/improvement_ratio": 0.9731569932476026, "flow/mag_ratio_mean": 0.9284612283314744, "flow/mag_ratio_std": 0.07813496222574962, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.7005510372485733, "eval_ce_loss": 1.067978764981984, "eval_cos_loss": 0.0791517910994079, "eval_loss": 0.2710464134744313, "eval_mse_loss": 0.15530247285485813, "eval_rec_loss": 0.00103088276002016, "eval_runtime": 142.9519, "eval_samples_per_second": 195.821, "eval_steps_per_second": 3.064, "flow/cos_sim": 0.9208482295682986, "flow/improvement_ratio": 0.9731569932476026, "flow/mag_ratio_mean": 0.9284612283314744, "flow/mag_ratio_std": 0.07813496222574962, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.36669906973838806, "learning_rate": 0.0008205359904536107, "loss": 0.2686825394630432, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.2844366729259491, "learning_rate": 0.0008057078912056363, "loss": 0.26841098070144653, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.341339647769928, "learning_rate": 0.0007904373410796086, "loss": 0.2661970555782318, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.35041218996047974, "learning_rate": 0.0007747464411350876, "loss": 0.26556745171546936, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.710653856755971, "eval_ce_loss": 1.0037036097213015, "eval_cos_loss": 0.0774884378733156, "eval_loss": 0.2617515635817018, "eval_mse_loss": 0.15282062886784611, "eval_rec_loss": 0.0008117282769037305, "flow/cos_sim": 0.9225115811443765, "flow/improvement_ratio": 0.9752856880834658, "flow/mag_ratio_mean": 0.9207794453999768, "flow/mag_ratio_std": 0.08132727078447059, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.710653856755971, "eval_ce_loss": 1.0037036097213015, "eval_cos_loss": 0.0774884378733156, "eval_loss": 0.2617515635817018, "eval_mse_loss": 0.15282062886784611, "eval_rec_loss": 0.0008117282769037305, "eval_runtime": 142.6116, "eval_samples_per_second": 196.288, "eval_steps_per_second": 3.071, "flow/cos_sim": 0.9225115811443765, "flow/improvement_ratio": 0.9752856880834658, "flow/mag_ratio_mean": 0.9207794453999768, "flow/mag_ratio_std": 0.08132727078447059, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.328535258769989, "learning_rate": 0.000758657900803716, "loss": 0.27869582176208496, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.27767524123191833, "learning_rate": 0.000742195005021869, "loss": 0.26837044954299927, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.40343427658081055, "learning_rate": 0.0007253815805303786, "loss": 0.26287224888801575, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.3494727611541748, "learning_rate": 0.0007082419613901028, "loss": 0.2608710527420044, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.7258561394806508, "eval_ce_loss": 0.954990051584701, "eval_cos_loss": 0.07798272708099183, "eval_loss": 0.2584831827239359, "eval_mse_loss": 0.15443320324023566, "eval_rec_loss": 0.0007526992644982298, "flow/cos_sim": 0.9220172950666244, "flow/improvement_ratio": 0.9747015749482804, "flow/mag_ratio_mean": 0.9233354187175019, "flow/mag_ratio_std": 0.07788482179108276, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.7258561394806508, "eval_ce_loss": 0.954990051584701, "eval_cos_loss": 0.07798272708099183, "eval_loss": 0.2584831827239359, "eval_mse_loss": 0.15443320324023566, "eval_rec_loss": 0.0007526992644982298, "eval_runtime": 144.1923, "eval_samples_per_second": 194.137, "eval_steps_per_second": 3.038, "flow/cos_sim": 0.9220172950666244, "flow/improvement_ratio": 0.9747015749482804, "flow/mag_ratio_mean": 0.9233354187175019, "flow/mag_ratio_std": 0.07788482179108276, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.4279082417488098, "learning_rate": 0.0006908009537632514, "loss": 0.2599097788333893, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.43606042861938477, "learning_rate": 0.0006730838000114403, "loss": 0.26111704111099243, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.21560809016227722, "learning_rate": 0.0006551161421624341, "loss": 0.268413245677948, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.5756855607032776, "learning_rate": 0.0006369239847984517, "loss": 0.2562841475009918, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.7095258647762687, "eval_ce_loss": 1.0158189947474492, "eval_cos_loss": 0.0742807937743457, "eval_loss": 0.256784121705789, "eval_mse_loss": 0.14713338680871546, "eval_rec_loss": 0.0006407552403591884, "flow/cos_sim": 0.9257192251072627, "flow/improvement_ratio": 0.9743275895510635, "flow/mag_ratio_mean": 0.9202466026016566, "flow/mag_ratio_std": 0.0764387545588354, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.7095258647762687, "eval_ce_loss": 1.0158189947474492, "eval_cos_loss": 0.0742807937743457, "eval_loss": 0.256784121705789, "eval_mse_loss": 0.14713338680871546, "eval_rec_loss": 0.0006407552403591884, "eval_runtime": 145.076, "eval_samples_per_second": 192.954, "eval_steps_per_second": 3.019, "flow/cos_sim": 0.9257192251072627, "flow/improvement_ratio": 0.9743275895510635, "flow/mag_ratio_mean": 0.9202466026016566, "flow/mag_ratio_std": 0.0764387545588354, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.376579225063324, "learning_rate": 0.0006185336574197479, "loss": 0.25477761030197144, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.4893060624599457, "learning_rate": 0.0005999717763379407, "loss": 0.2554967701435089, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.5277368426322937, "learning_rate": 0.0005812652061542363, "loss": 0.2537921667098999, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.3493780791759491, "learning_rate": 0.0005624410208783071, "loss": 0.2518165707588196, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.7175600139987302, "eval_ce_loss": 0.9684542848231041, "eval_cos_loss": 0.07330035061902923, "eval_loss": 0.25065779049782994, "eval_mse_loss": 0.1458930650292194, "eval_rec_loss": 0.000589260474122746, "flow/cos_sim": 0.9266996725236989, "flow/improvement_ratio": 0.9765301047394809, "flow/mag_ratio_mean": 0.9194769399351181, "flow/mag_ratio_std": 0.07280441308946914, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.7175600139987302, "eval_ce_loss": 0.9684542848231041, "eval_cos_loss": 0.07330035061902923, "eval_loss": 0.25065779049782994, "eval_mse_loss": 0.1458930650292194, "eval_rec_loss": 0.000589260474122746, "eval_runtime": 144.907, "eval_samples_per_second": 193.179, "eval_steps_per_second": 3.023, "flow/cos_sim": 0.9266996725236989, "flow/improvement_ratio": 0.9765301047394809, "flow/mag_ratio_mean": 0.9194769399351181, "flow/mag_ratio_std": 0.07280441308946914, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.5341387391090393, "learning_rate": 0.0005435264647440881, "loss": 0.2537766695022583, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.37015679478645325, "learning_rate": 0.000524548912779213, "loss": 0.2499093860387802, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.6591759920120239, "learning_rate": 0.0005055358311851499, "loss": 0.25186970829963684, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.31277748942375183, "learning_rate": 0.0004865147375853812, "loss": 0.25062766671180725, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.7260803484693739, "eval_ce_loss": 0.9400546782092961, "eval_cos_loss": 0.07318596699228298, "eval_loss": 0.2483225218169221, "eval_mse_loss": 0.14649179712210073, "eval_rec_loss": 0.0005066586955975142, "flow/cos_sim": 0.9268140568308634, "flow/improvement_ratio": 0.9762515691045213, "flow/mag_ratio_mean": 0.9157658136326429, "flow/mag_ratio_std": 0.07639177793373256, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.7260803484693739, "eval_ce_loss": 0.9400546782092961, "eval_cos_loss": 0.07318596699228298, "eval_loss": 0.2483225218169221, "eval_mse_loss": 0.14649179712210073, "eval_rec_loss": 0.0005066586955975142, "eval_runtime": 143.4087, "eval_samples_per_second": 195.197, "eval_steps_per_second": 3.054, "flow/cos_sim": 0.9268140568308634, "flow/improvement_ratio": 0.9762515691045213, "flow/mag_ratio_mean": 0.9157658136326429, "flow/mag_ratio_std": 0.07639177793373256, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.7305059432983398, "learning_rate": 0.0004675131611991607, "loss": 0.2499980330467224, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.2771352231502533, "learning_rate": 0.0004485586029984899, "loss": 0.24800576269626617, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.2686781883239746, "learning_rate": 0.00042967849590597266, "loss": 0.24677802622318268, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.37717506289482117, "learning_rate": 0.0004109001650911621, "loss": 0.24666211009025574, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.724567852062454, "eval_ce_loss": 0.9509265820185343, "eval_cos_loss": 0.07338333341663014, "eval_loss": 0.24947158049911125, "eval_mse_loss": 0.1465663725518745, "eval_rec_loss": 0.00047421499268288965, "flow/cos_sim": 0.9266166863920482, "flow/improvement_ratio": 0.9748387396607769, "flow/mag_ratio_mean": 0.9328112143631939, "flow/mag_ratio_std": 0.07196474395113993, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.724567852062454, "eval_ce_loss": 0.9509265820185343, "eval_cos_loss": 0.07338333341663014, "eval_loss": 0.24947158049911125, "eval_mse_loss": 0.1465663725518745, "eval_rec_loss": 0.00047421499268288965, "eval_runtime": 144.4458, "eval_samples_per_second": 193.796, "eval_steps_per_second": 3.032, "flow/cos_sim": 0.9266166863920482, "flow/improvement_ratio": 0.9748387396607769, "flow/mag_ratio_mean": 0.9328112143631939, "flow/mag_ratio_std": 0.07196474395113993, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.7573108673095703, "learning_rate": 0.0003922507884228551, "loss": 0.24700023233890533, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.2122378647327423, "learning_rate": 0.00037375735713457723, "loss": 0.2433316856622696, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.22209292650222778, "learning_rate": 0.00035544663676018276, "loss": 0.246763676404953, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.309989333152771, "learning_rate": 0.00033734512839611255, "loss": 0.24262374639511108, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.7300156788258321, "eval_ce_loss": 0.933146977737614, "eval_cos_loss": 0.07192521648626077, "eval_loss": 0.24540512995224564, "eval_mse_loss": 0.14446957482526837, "eval_rec_loss": 0.00042833433703537476, "flow/cos_sim": 0.9280747922315989, "flow/improvement_ratio": 0.9743193072271129, "flow/mag_ratio_mean": 0.9250430587219866, "flow/mag_ratio_std": 0.07064938510355612, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.7300156788258321, "eval_ce_loss": 0.933146977737614, "eval_cos_loss": 0.07192521648626077, "eval_loss": 0.24540512995224564, "eval_mse_loss": 0.14446957482526837, "eval_rec_loss": 0.00042833433703537476, "eval_runtime": 144.2347, "eval_samples_per_second": 194.079, "eval_steps_per_second": 3.037, "flow/cos_sim": 0.9280747922315989, "flow/improvement_ratio": 0.9743193072271129, "flow/mag_ratio_mean": 0.9250430587219866, "flow/mag_ratio_std": 0.07064938510355612, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.2954619824886322, "learning_rate": 0.0003194790303463687, "loss": 0.24288520216941833, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 0.6633331775665283, "learning_rate": 0.00030187420020572406, "loss": 0.24463221430778503, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.2086828500032425, "learning_rate": 0.00028455611743603626, "loss": 0.2416921854019165, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.19425669312477112, "learning_rate": 0.0002675498464898373, "loss": 0.2426968514919281, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.7358716031963392, "eval_ce_loss": 0.8970746389126669, "eval_cos_loss": 0.07039773773793216, "eval_loss": 0.23921880884665878, "eval_mse_loss": 0.14204559988899318, "eval_rec_loss": 0.0004259694892114143, "flow/cos_sim": 0.9296022757003296, "flow/improvement_ratio": 0.9752817895586632, "flow/mag_ratio_mean": 0.9346142843978046, "flow/mag_ratio_std": 0.06967301141963975, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.7358716031963392, "eval_ce_loss": 0.8970746389126669, "eval_cos_loss": 0.07039773773793216, "eval_loss": 0.23921880884665878, "eval_mse_loss": 0.14204559988899318, "eval_rec_loss": 0.0004259694892114143, "eval_runtime": 146.9688, "eval_samples_per_second": 190.469, "eval_steps_per_second": 2.98, "flow/cos_sim": 0.9296022757003296, "flow/improvement_ratio": 0.9752817895586632, "flow/mag_ratio_mean": 0.9346142843978046, "flow/mag_ratio_std": 0.06967301141963975, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 0.5798227787017822, "learning_rate": 0.0002508800005345623, "loss": 0.24217651784420013, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 0.8549785017967224, "learning_rate": 0.00023457070582992562, "loss": 0.24345579743385315, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.4544009566307068, "learning_rate": 0.00021864556680999692, "loss": 0.23991872370243073, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.5695450305938721, "learning_rate": 0.0002031276319205152, "loss": 0.2396196871995926, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.7367086437042604, "eval_ce_loss": 0.8948814071476732, "eval_cos_loss": 0.07030742877422402, "eval_loss": 0.23915054540247677, "eval_mse_loss": 0.14226854150289814, "eval_rec_loss": 0.00036311902771741107, "flow/cos_sim": 0.9296925847661005, "flow/improvement_ratio": 0.9746487660495113, "flow/mag_ratio_mean": 0.9314249843767245, "flow/mag_ratio_std": 0.0696038415476884, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.7367086437042604, "eval_ce_loss": 0.8948814071476732, "eval_cos_loss": 0.07030742877422402, "eval_loss": 0.23915054540247677, "eval_mse_loss": 0.14226854150289814, "eval_rec_loss": 0.00036311902771741107, "eval_runtime": 143.8829, "eval_samples_per_second": 194.554, "eval_steps_per_second": 3.044, "flow/cos_sim": 0.9296925847661005, "flow/improvement_ratio": 0.9746487660495113, "flow/mag_ratio_mean": 0.9314249843767245, "flow/mag_ratio_std": 0.0696038415476884, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 0.24349141120910645, "learning_rate": 0.00018803936026088542, "loss": 0.23957179486751556, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 0.2847910225391388, "learning_rate": 0.00017340258907913464, "loss": 0.23970942199230194, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 0.5366371870040894, "learning_rate": 0.0001592385021668743, "loss": 0.23960672318935394, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.16883717477321625, "learning_rate": 0.0001455675992000087, "loss": 0.238803431391716, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.734623877372401, "eval_ce_loss": 0.8963056962647938, "eval_cos_loss": 0.06929709047045066, "eval_loss": 0.23715336164927373, "eval_mse_loss": 0.14023235147674334, "eval_rec_loss": 0.00036073007059260167, "flow/cos_sim": 0.9307029282665689, "flow/improvement_ratio": 0.9735523003693585, "flow/mag_ratio_mean": 0.9296198963574623, "flow/mag_ratio_std": 0.07116366107203917, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.734623877372401, "eval_ce_loss": 0.8963056962647938, "eval_cos_loss": 0.06929709047045066, "eval_loss": 0.23715336164927373, "eval_mse_loss": 0.14023235147674334, "eval_rec_loss": 0.00036073007059260167, "eval_runtime": 144.8569, "eval_samples_per_second": 193.246, "eval_steps_per_second": 3.024, "flow/cos_sim": 0.9307029282665689, "flow/improvement_ratio": 0.9735523003693585, "flow/mag_ratio_mean": 0.9296198963574623, "flow/mag_ratio_std": 0.07116366107203917, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 0.18049044907093048, "learning_rate": 0.000132409666069565, "loss": 0.23929160833358765, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.42043963074684143, "learning_rate": 0.0001197837462455823, "loss": 0.23627738654613495, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.19926320016384125, "learning_rate": 0.00010770811321550749, "loss": 0.23734550178050995, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 0.2398398518562317, "learning_rate": 9.620024403698591e-05, "loss": 0.23703321814537048, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_bleu": 0.7436939325306295, "eval_ce_loss": 0.8676520106999297, "eval_cos_loss": 0.06962325224933559, "eval_loss": 0.23564744887014502, "eval_mse_loss": 0.14157898780213643, "eval_rec_loss": 0.00034093371078227947, "flow/cos_sim": 0.9303767629410034, "flow/improvement_ratio": 0.9734724662075304, "flow/mag_ratio_mean": 0.9335192646065803, "flow/mag_ratio_std": 0.06777094987531503, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_bleu": 0.7436939325306295, "eval_ce_loss": 0.8676520106999297, "eval_cos_loss": 0.06962325224933559, "eval_loss": 0.23564744887014502, "eval_mse_loss": 0.14157898780213643, "eval_rec_loss": 0.00034093371078227947, "eval_runtime": 144.4529, "eval_samples_per_second": 193.786, "eval_steps_per_second": 3.032, "flow/cos_sim": 0.9303767629410034, "flow/improvement_ratio": 0.9734724662075304, "flow/mag_ratio_mean": 0.9335192646065803, "flow/mag_ratio_std": 0.06777094987531503, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 0.781293511390686, "learning_rate": 8.527679404332429e-05, "loss": 0.23650652170181274, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 0.14831966161727905, "learning_rate": 7.495357273823544e-05, "loss": 0.2379007786512375, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 0.15485990047454834, "learning_rate": 6.524552091475183e-05, "loss": 0.2373773455619812, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 0.13972342014312744, "learning_rate": 5.6166689031422024e-05, "loss": 0.23547400534152985, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_bleu": 0.7404975173841, "eval_ce_loss": 0.8763441998664647, "eval_cos_loss": 0.06848628710121869, "eval_loss": 0.2341211859698165, "eval_mse_loss": 0.1393070275040522, "eval_rec_loss": 0.00033110864400998134, "flow/cos_sim": 0.9315137378701336, "flow/improvement_ratio": 0.9746690928663837, "flow/mag_ratio_mean": 0.9297302695986343, "flow/mag_ratio_std": 0.06708622893922406, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_bleu": 0.7404975173841, "eval_ce_loss": 0.8763441998664647, "eval_cos_loss": 0.06848628710121869, "eval_loss": 0.2341211859698165, "eval_mse_loss": 0.1393070275040522, "eval_rec_loss": 0.00033110864400998134, "eval_runtime": 144.9916, "eval_samples_per_second": 193.066, "eval_steps_per_second": 3.021, "flow/cos_sim": 0.9315137378701336, "flow/improvement_ratio": 0.9746690928663837, "flow/mag_ratio_mean": 0.9297302695986343, "flow/mag_ratio_std": 0.06708622893922406, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 0.13306647539138794, "learning_rate": 4.773021687709067e-05, "loss": 0.23608988523483276, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 0.16064585745334625, "learning_rate": 3.994831455368719e-05, "loss": 0.23599746823310852, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 0.19070403277873993, "learning_rate": 3.283224480455282e-05, "loss": 0.23425938189029694, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 1.2540472745895386, "learning_rate": 2.639230671387627e-05, "loss": 0.23733244836330414, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_bleu": 0.7411587696578733, "eval_ce_loss": 0.8768332646862013, "eval_cos_loss": 0.06816527606349558, "eval_loss": 0.23368886810595588, "eval_mse_loss": 0.138866622085985, "eval_rec_loss": 0.00032239026382972687, "flow/cos_sim": 0.9318347360989819, "flow/improvement_ratio": 0.974329279574085, "flow/mag_ratio_mean": 0.9297914792141414, "flow/mag_ratio_std": 0.06777094615852997, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_bleu": 0.7411587696578733, "eval_ce_loss": 0.8768332646862013, "eval_cos_loss": 0.06816527606349558, "eval_loss": 0.23368886810595588, "eval_mse_loss": 0.138866622085985, "eval_rec_loss": 0.00032239026382972687, "eval_runtime": 144.7333, "eval_samples_per_second": 193.411, "eval_steps_per_second": 3.026, "flow/cos_sim": 0.9318347360989819, "flow/improvement_ratio": 0.974329279574085, "flow/mag_ratio_mean": 0.9297914792141414, "flow/mag_ratio_std": 0.06777094615852997, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 0.1263592690229416, "learning_rate": 2.063782080083576e-05, "loss": 0.23605787754058838, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.12157848477363586, "learning_rate": 1.557711553001523e-05, "loss": 0.23632152378559113, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 0.1504155695438385, "learning_rate": 1.1217515257622269e-05, "loss": 0.2339780181646347, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 0.22383330762386322, "learning_rate": 7.565329630950746e-06, "loss": 0.23464307188987732, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.7380424224705183, "eval_ce_loss": 0.887897579365125, "eval_cos_loss": 0.06765534596934439, "eval_loss": 0.23374415763981266, "eval_mse_loss": 0.13786488870099256, "eval_rec_loss": 0.0003239751909095337, "flow/cos_sim": 0.9323446643134775, "flow/improvement_ratio": 0.9756706645499625, "flow/mag_ratio_mean": 0.9308563253106592, "flow/mag_ratio_std": 0.06712424952417748, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.7380424224705183, "eval_ce_loss": 0.887897579365125, "eval_cos_loss": 0.06765534596934439, "eval_loss": 0.23374415763981266, "eval_mse_loss": 0.13786488870099256, "eval_rec_loss": 0.0003239751909095337, "eval_runtime": 145.0128, "eval_samples_per_second": 193.038, "eval_steps_per_second": 3.02, "flow/cos_sim": 0.9323446643134775, "flow/improvement_ratio": 0.9756706645499625, "flow/mag_ratio_mean": 0.9308563253106592, "flow/mag_ratio_std": 0.06712424952417748, "step": 20480 }, { "epoch": 0.9577386725786338, "grad_norm": 1.0795046091079712, "learning_rate": 4.62584445643166e-06, "loss": 0.23464855551719666, "step": 20736 }, { "epoch": 0.9695626068079997, "grad_norm": 0.14688174426555634, "learning_rate": 2.40331404948807e-06, "loss": 0.23495320975780487, "step": 20992 }, { "epoch": 0.9813865410373654, "grad_norm": 0.5085982084274292, "learning_rate": 9.009550772663965e-07, "loss": 0.23527467250823975, "step": 21248 }, { "epoch": 0.9932104752667313, "grad_norm": 0.1799926906824112, "learning_rate": 1.2094190315575791e-07, "loss": 0.23269928991794586, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_bleu": 0.7394941421899379, "eval_ce_loss": 0.8877883761032531, "eval_cos_loss": 0.06756893683173885, "eval_loss": 0.23357277302301094, "eval_mse_loss": 0.13771423821647963, "eval_rec_loss": 0.00032280229498863754, "flow/cos_sim": 0.932431082475131, "flow/improvement_ratio": 0.97383240123862, "flow/mag_ratio_mean": 0.9317929653544405, "flow/mag_ratio_std": 0.06708553743008609, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_bleu": 0.7394941421899379, "eval_ce_loss": 0.8877883761032531, "eval_cos_loss": 0.06756893683173885, "eval_loss": 0.23357277302301094, "eval_mse_loss": 0.13771423821647963, "eval_rec_loss": 0.00032280229498863754, "eval_runtime": 146.0954, "eval_samples_per_second": 191.608, "eval_steps_per_second": 2.998, "flow/cos_sim": 0.932431082475131, "flow/improvement_ratio": 0.97383240123862, "flow/mag_ratio_mean": 0.9317929653544405, "flow/mag_ratio_std": 0.06708553743008609, "step": 21504 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }