{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.709436053761951, "eval_steps": 1024, "global_step": 15360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.1487267017364502, "learning_rate": 0.000498046875, "loss": 11.798027992248535, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.8386015295982361, "learning_rate": 0.000998046875, "loss": 1.7853779792785645, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.7344363331794739, "learning_rate": 0.000999640996023194, "loss": 1.103014588356018, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 1.1188315153121948, "learning_rate": 0.0009985588674043958, "loss": 0.9580796360969543, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.9192375404443994, "eval_ce_loss": 0.26167990953648745, "eval_cos_loss": 0.26406568816127296, "eval_loss": 0.9037111119864738, "eval_mse_loss": 0.6016385473617135, "eval_rec_loss": 0.013986090569199833, "flow/cos_sim": 0.7359343250048215, "flow/improvement_ratio": 0.9760946458605326, "flow/mag_ratio_mean": 0.7269674200717717, "flow/mag_ratio_std": 0.1390784539316343, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.9192375404443994, "eval_ce_loss": 0.26167990953648745, "eval_cos_loss": 0.26406568816127296, "eval_loss": 0.9037111119864738, "eval_mse_loss": 0.6016385473617135, "eval_rec_loss": 0.013986090569199833, "eval_runtime": 144.0156, "eval_samples_per_second": 194.375, "eval_steps_per_second": 3.041, "flow/cos_sim": 0.7359343250048215, "flow/improvement_ratio": 0.9760946458605326, "flow/mag_ratio_mean": 0.7269674200717717, "flow/mag_ratio_std": 0.1390784539316343, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 1.0113043785095215, "learning_rate": 0.0009967551747861387, "loss": 0.8836896419525146, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.9680636525154114, "learning_rate": 0.000994232528651847, "loss": 0.8432819247245789, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 1.166627049446106, "learning_rate": 0.0009909945800260092, "loss": 0.7870283126831055, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.7747617363929749, "learning_rate": 0.0009870460151900522, "loss": 0.7735522389411926, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.9264483676285591, "eval_ce_loss": 0.22021640589690372, "eval_cos_loss": 0.15934634184864557, "eval_loss": 0.751849727815689, "eval_mse_loss": 0.51190055103879, "eval_rec_loss": 0.0037981332031229603, "flow/cos_sim": 0.8406536742432477, "flow/improvement_ratio": 0.9754726998337871, "flow/mag_ratio_mean": 0.8395581131112085, "flow/mag_ratio_std": 0.09706963221096013, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.9264483676285591, "eval_ce_loss": 0.22021640589690372, "eval_cos_loss": 0.15934634184864557, "eval_loss": 0.751849727815689, "eval_mse_loss": 0.51190055103879, "eval_rec_loss": 0.0037981332031229603, "eval_runtime": 139.3758, "eval_samples_per_second": 200.845, "eval_steps_per_second": 3.143, "flow/cos_sim": 0.8406536742432477, "flow/improvement_ratio": 0.9754726998337871, "flow/mag_ratio_mean": 0.8395581131112085, "flow/mag_ratio_std": 0.09706963221096013, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 1.3507742881774902, "learning_rate": 0.0009823925488998885, "loss": 0.7531520128250122, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 1.090326189994812, "learning_rate": 0.0009770409161149525, "loss": 0.7384664416313171, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 1.6648627519607544, "learning_rate": 0.0009709988622506973, "loss": 0.7159472107887268, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.720405638217926, "learning_rate": 0.000964275131968659, "loss": 0.7134207487106323, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.9362046037171927, "eval_ce_loss": 0.18669388993094638, "eval_cos_loss": 0.09599785676829892, "eval_loss": 0.7009708360177741, "eval_mse_loss": 0.5027340843797274, "eval_rec_loss": 0.0019430727923829022, "flow/cos_sim": 0.9040021625555814, "flow/improvement_ratio": 0.9753212502830104, "flow/mag_ratio_mean": 0.906250716208323, "flow/mag_ratio_std": 0.07307879087519428, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.9362046037171927, "eval_ce_loss": 0.18669388993094638, "eval_cos_loss": 0.09599785676829892, "eval_loss": 0.7009708360177741, "eval_mse_loss": 0.5027340843797274, "eval_rec_loss": 0.0019430727923829022, "eval_runtime": 139.2617, "eval_samples_per_second": 201.01, "eval_steps_per_second": 3.145, "flow/cos_sim": 0.9040021625555814, "flow/improvement_ratio": 0.9753212502830104, "flow/mag_ratio_mean": 0.906250716208323, "flow/mag_ratio_std": 0.07307879087519428, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 1.171225905418396, "learning_rate": 0.0009568794565203123, "loss": 0.6967981457710266, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.9184499979019165, "learning_rate": 0.0009488225396630347, "loss": 0.6859588623046875, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 1.0972322225570679, "learning_rate": 0.0009401160421685646, "loss": 0.68483966588974, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 1.2944236993789673, "learning_rate": 0.0009307725649463714, "loss": 0.6722217202186584, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.9367529454304925, "eval_ce_loss": 0.17851220300531687, "eval_cos_loss": 0.0685597131088308, "eval_loss": 0.6638682314522191, "eval_mse_loss": 0.4771829223660029, "eval_rec_loss": 0.0013171346048419428, "flow/cos_sim": 0.9314403127045392, "flow/improvement_ratio": 0.9747336862021929, "flow/mag_ratio_mean": 0.9314602082722807, "flow/mag_ratio_std": 0.05983651272068013, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.9367529454304925, "eval_ce_loss": 0.17851220300531687, "eval_cos_loss": 0.0685597131088308, "eval_loss": 0.6638682314522191, "eval_mse_loss": 0.4771829223660029, "eval_rec_loss": 0.0013171346048419428, "eval_runtime": 143.247, "eval_samples_per_second": 195.418, "eval_steps_per_second": 3.058, "flow/cos_sim": 0.9314403127045392, "flow/improvement_ratio": 0.9747336862021929, "flow/mag_ratio_mean": 0.9314602082722807, "flow/mag_ratio_std": 0.05983651272068013, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 1.0950664281845093, "learning_rate": 0.0009208056308063659, "loss": 0.6635431051254272, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 1.0510311126708984, "learning_rate": 0.0009102296648873445, "loss": 0.652130126953125, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.7107524275779724, "learning_rate": 0.0008990599737794927, "loss": 0.6548014283180237, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 1.119279146194458, "learning_rate": 0.0008873127233711644, "loss": 0.644295871257782, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.9375087809294717, "eval_ce_loss": 0.17866700632629498, "eval_cos_loss": 0.05568918508379699, "eval_loss": 0.6384022356304404, "eval_mse_loss": 0.4531491862856634, "eval_rec_loss": 0.0010171248973892112, "flow/cos_sim": 0.9443108406785417, "flow/improvement_ratio": 0.9754358607612245, "flow/mag_ratio_mean": 0.9432625220790846, "flow/mag_ratio_std": 0.0532344374550532, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.9375087809294717, "eval_ce_loss": 0.17866700632629498, "eval_cos_loss": 0.05568918508379699, "eval_loss": 0.6384022356304404, "eval_mse_loss": 0.4531491862856634, "eval_rec_loss": 0.0010171248973892112, "eval_runtime": 140.7376, "eval_samples_per_second": 198.902, "eval_steps_per_second": 3.112, "flow/cos_sim": 0.9443108406785417, "flow/improvement_ratio": 0.9754358607612245, "flow/mag_ratio_mean": 0.9432625220790846, "flow/mag_ratio_std": 0.0532344374550532, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 1.223329782485962, "learning_rate": 0.0008750049154520011, "loss": 0.6385497450828552, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.7951129078865051, "learning_rate": 0.0008621543631062487, "loss": 0.6350575089454651, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.8830247521400452, "learning_rate": 0.0008487796649318904, "loss": 0.6269800066947937, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 1.0399079322814941, "learning_rate": 0.0008349001781229053, "loss": 0.6236906051635742, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.9406288888554537, "eval_ce_loss": 0.16271350685439018, "eval_cos_loss": 0.04946032597696128, "eval_loss": 0.6158577807962078, "eval_mse_loss": 0.4472781540868489, "eval_rec_loss": 0.0009200886164281037, "flow/cos_sim": 0.9505396935765602, "flow/improvement_ratio": 0.9752184091365501, "flow/mag_ratio_mean": 0.9574754819205907, "flow/mag_ratio_std": 0.04891788842131014, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.9406288888554537, "eval_ce_loss": 0.16271350685439018, "eval_cos_loss": 0.04946032597696128, "eval_loss": 0.6158577807962078, "eval_mse_loss": 0.4472781540868489, "eval_rec_loss": 0.0009200886164281037, "eval_runtime": 138.7881, "eval_samples_per_second": 201.696, "eval_steps_per_second": 3.156, "flow/cos_sim": 0.9505396935765602, "flow/improvement_ratio": 0.9752184091365501, "flow/mag_ratio_mean": 0.9574754819205907, "flow/mag_ratio_std": 0.04891788842131014, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 1.2804330587387085, "learning_rate": 0.0008205359904536107, "loss": 0.6104704737663269, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 1.038807988166809, "learning_rate": 0.0008057078912056363, "loss": 0.6035579442977905, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 1.1162539720535278, "learning_rate": 0.0007904373410796086, "loss": 0.6099694967269897, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.8053554892539978, "learning_rate": 0.0007747464411350876, "loss": 0.6010444760322571, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.9403609007223014, "eval_ce_loss": 0.16489822024130793, "eval_cos_loss": 0.042615741474307293, "eval_loss": 0.5965314044799979, "eval_mse_loss": 0.4266453656839998, "eval_rec_loss": 0.0007262465627901213, "flow/cos_sim": 0.9573842790573155, "flow/improvement_ratio": 0.9754182146564466, "flow/mag_ratio_mean": 0.9583017167435389, "flow/mag_ratio_std": 0.044361623012584096, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.9403609007223014, "eval_ce_loss": 0.16489822024130793, "eval_cos_loss": 0.042615741474307293, "eval_loss": 0.5965314044799979, "eval_mse_loss": 0.4266453656839998, "eval_rec_loss": 0.0007262465627901213, "eval_runtime": 141.0373, "eval_samples_per_second": 198.479, "eval_steps_per_second": 3.106, "flow/cos_sim": 0.9573842790573155, "flow/improvement_ratio": 0.9754182146564466, "flow/mag_ratio_mean": 0.9583017167435389, "flow/mag_ratio_std": 0.044361623012584096, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.926774799823761, "learning_rate": 0.000758657900803716, "loss": 0.6045265793800354, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.6091651320457458, "learning_rate": 0.000742195005021869, "loss": 0.6046400666236877, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.9995866417884827, "learning_rate": 0.0007253815805303786, "loss": 0.5923656225204468, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.8947123885154724, "learning_rate": 0.0007082419613901028, "loss": 0.5886037349700928, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.9428308469149022, "eval_ce_loss": 0.15756705103943883, "eval_cos_loss": 0.03951380795641849, "eval_loss": 0.5839528071281572, "eval_mse_loss": 0.42179430878325685, "eval_rec_loss": 0.0006400666907252946, "flow/cos_sim": 0.9604862126857723, "flow/improvement_ratio": 0.9746591260988419, "flow/mag_ratio_mean": 0.9592139723638421, "flow/mag_ratio_std": 0.04008094550505893, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.9428308469149022, "eval_ce_loss": 0.15756705103943883, "eval_cos_loss": 0.03951380795641849, "eval_loss": 0.5839528071281572, "eval_mse_loss": 0.42179430878325685, "eval_rec_loss": 0.0006400666907252946, "eval_runtime": 142.8618, "eval_samples_per_second": 195.945, "eval_steps_per_second": 3.066, "flow/cos_sim": 0.9604862126857723, "flow/improvement_ratio": 0.9746591260988419, "flow/mag_ratio_mean": 0.9592139723638421, "flow/mag_ratio_std": 0.04008094550505893, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.9762691259384155, "learning_rate": 0.0006908009537632514, "loss": 0.591374397277832, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.9119466543197632, "learning_rate": 0.0006730838000114403, "loss": 0.5883693099021912, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.7088457942008972, "learning_rate": 0.0006551161421624341, "loss": 0.591505229473114, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.962149441242218, "learning_rate": 0.0006369239847984517, "loss": 0.5791484117507935, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.9397317595032229, "eval_ce_loss": 0.1654567693755643, "eval_cos_loss": 0.036672262636493876, "eval_loss": 0.5844078893394775, "eval_mse_loss": 0.41475171422305174, "eval_rec_loss": 0.0005321793435803258, "flow/cos_sim": 0.9633277582523485, "flow/improvement_ratio": 0.9749675198504913, "flow/mag_ratio_mean": 0.9673667468436776, "flow/mag_ratio_std": 0.03610456871829893, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.9397317595032229, "eval_ce_loss": 0.1654567693755643, "eval_cos_loss": 0.036672262636493876, "eval_loss": 0.5844078893394775, "eval_mse_loss": 0.41475171422305174, "eval_rec_loss": 0.0005321793435803258, "eval_runtime": 140.1978, "eval_samples_per_second": 199.668, "eval_steps_per_second": 3.124, "flow/cos_sim": 0.9633277582523485, "flow/improvement_ratio": 0.9749675198504913, "flow/mag_ratio_mean": 0.9673667468436776, "flow/mag_ratio_std": 0.03610456871829893, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.853702187538147, "learning_rate": 0.0006185336574197479, "loss": 0.5742002725601196, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.9982457160949707, "learning_rate": 0.0005999717763379407, "loss": 0.5812740921974182, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 1.16475510597229, "learning_rate": 0.0005812652061542363, "loss": 0.5733819603919983, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 1.0706837177276611, "learning_rate": 0.0005624410208783071, "loss": 0.5738887190818787, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.9459802582441368, "eval_ce_loss": 0.14516126815459296, "eval_cos_loss": 0.033954130285780995, "eval_loss": 0.5643789048336413, "eval_mse_loss": 0.4153396790702593, "eval_rec_loss": 0.0004825431066599768, "flow/cos_sim": 0.9660458944431723, "flow/improvement_ratio": 0.9768404536051293, "flow/mag_ratio_mean": 0.964692687198996, "flow/mag_ratio_std": 0.033606582049059266, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.9459802582441368, "eval_ce_loss": 0.14516126815459296, "eval_cos_loss": 0.033954130285780995, "eval_loss": 0.5643789048336413, "eval_mse_loss": 0.4153396790702593, "eval_rec_loss": 0.0004825431066599768, "eval_runtime": 138.7003, "eval_samples_per_second": 201.824, "eval_steps_per_second": 3.158, "flow/cos_sim": 0.9660458944431723, "flow/improvement_ratio": 0.9768404536051293, "flow/mag_ratio_mean": 0.964692687198996, "flow/mag_ratio_std": 0.033606582049059266, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 1.2897191047668457, "learning_rate": 0.0005435264647440881, "loss": 0.5754253268241882, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.8889420628547668, "learning_rate": 0.000524548912779213, "loss": 0.5645499229431152, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 1.0012623071670532, "learning_rate": 0.0005055358311851499, "loss": 0.570586085319519, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.7268548011779785, "learning_rate": 0.0004865147375853812, "loss": 0.5649828910827637, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.9438391043268173, "eval_ce_loss": 0.15168385392003883, "eval_cos_loss": 0.031372046842320596, "eval_loss": 0.5559442507349737, "eval_mse_loss": 0.40068320514948946, "eval_rec_loss": 0.0004399877304908156, "flow/cos_sim": 0.9686279725538541, "flow/improvement_ratio": 0.9761811211773249, "flow/mag_ratio_mean": 0.9699762897676529, "flow/mag_ratio_std": 0.031497096553546926, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.9438391043268173, "eval_ce_loss": 0.15168385392003883, "eval_cos_loss": 0.031372046842320596, "eval_loss": 0.5559442507349737, "eval_mse_loss": 0.40068320514948946, "eval_rec_loss": 0.0004399877304908156, "eval_runtime": 142.7289, "eval_samples_per_second": 196.127, "eval_steps_per_second": 3.069, "flow/cos_sim": 0.9686279725538541, "flow/improvement_ratio": 0.9761811211773249, "flow/mag_ratio_mean": 0.9699762897676529, "flow/mag_ratio_std": 0.031497096553546926, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 1.215234637260437, "learning_rate": 0.0004675131611991607, "loss": 0.5652971863746643, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.622643768787384, "learning_rate": 0.0004485586029984899, "loss": 0.5565246343612671, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 1.0961378812789917, "learning_rate": 0.00042967849590597266, "loss": 0.5489715933799744, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.9782881140708923, "learning_rate": 0.0004109001650911621, "loss": 0.5525087118148804, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.9412862694661508, "eval_ce_loss": 0.1635204538911343, "eval_cos_loss": 0.030069323277874895, "eval_loss": 0.5582960859689539, "eval_mse_loss": 0.39135666776737665, "eval_rec_loss": 0.00041203244906421176, "flow/cos_sim": 0.969930695344324, "flow/improvement_ratio": 0.9753037164472553, "flow/mag_ratio_mean": 0.9802452540833112, "flow/mag_ratio_std": 0.030909983262623827, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.9412862694661508, "eval_ce_loss": 0.1635204538911343, "eval_cos_loss": 0.030069323277874895, "eval_loss": 0.5582960859689539, "eval_mse_loss": 0.39135666776737665, "eval_rec_loss": 0.00041203244906421176, "eval_runtime": 141.5788, "eval_samples_per_second": 197.72, "eval_steps_per_second": 3.094, "flow/cos_sim": 0.969930695344324, "flow/improvement_ratio": 0.9753037164472553, "flow/mag_ratio_mean": 0.9802452540833112, "flow/mag_ratio_std": 0.030909983262623827, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 1.1210603713989258, "learning_rate": 0.0003922507884228551, "loss": 0.5510575175285339, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.7037401795387268, "learning_rate": 0.00037375735713457723, "loss": 0.5481619238853455, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.7474592924118042, "learning_rate": 0.00035544663676018276, "loss": 0.5506067872047424, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.7188865542411804, "learning_rate": 0.00033734512839611255, "loss": 0.5451632738113403, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.9428834105382213, "eval_ce_loss": 0.1552433374967239, "eval_cos_loss": 0.02893113123771807, "eval_loss": 0.5456167366951024, "eval_mse_loss": 0.3870963177316265, "eval_rec_loss": 0.0003839662346667844, "flow/cos_sim": 0.9710688916243375, "flow/improvement_ratio": 0.9738836331998921, "flow/mag_ratio_mean": 0.9706440245451993, "flow/mag_ratio_std": 0.029164127312328446, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.9428834105382213, "eval_ce_loss": 0.1552433374967239, "eval_cos_loss": 0.02893113123771807, "eval_loss": 0.5456167366951024, "eval_mse_loss": 0.3870963177316265, "eval_rec_loss": 0.0003839662346667844, "eval_runtime": 141.1326, "eval_samples_per_second": 198.345, "eval_steps_per_second": 3.103, "flow/cos_sim": 0.9710688916243375, "flow/improvement_ratio": 0.9738836331998921, "flow/mag_ratio_mean": 0.9706440245451993, "flow/mag_ratio_std": 0.029164127312328446, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.7716706991195679, "learning_rate": 0.0003194790303463687, "loss": 0.537281334400177, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 1.332189917564392, "learning_rate": 0.00030187420020572406, "loss": 0.5493588447570801, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.7042582035064697, "learning_rate": 0.00028455611743603626, "loss": 0.5357389450073242, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.9820289611816406, "learning_rate": 0.0002675498464898373, "loss": 0.5378322601318359, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.9466625254289972, "eval_ce_loss": 0.14409655002562421, "eval_cos_loss": 0.028247294284097137, "eval_loss": 0.5304203147757544, "eval_mse_loss": 0.38314900074375274, "eval_rec_loss": 0.0003500353127845417, "flow/cos_sim": 0.9717527268684074, "flow/improvement_ratio": 0.9754393091212669, "flow/mag_ratio_mean": 0.973318548915593, "flow/mag_ratio_std": 0.028512526988200674, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.9466625254289972, "eval_ce_loss": 0.14409655002562421, "eval_cos_loss": 0.028247294284097137, "eval_loss": 0.5304203147757544, "eval_mse_loss": 0.38314900074375274, "eval_rec_loss": 0.0003500353127845417, "eval_runtime": 142.684, "eval_samples_per_second": 196.189, "eval_steps_per_second": 3.07, "flow/cos_sim": 0.9717527268684074, "flow/improvement_ratio": 0.9754393091212669, "flow/mag_ratio_mean": 0.973318548915593, "flow/mag_ratio_std": 0.028512526988200674, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 0.9396564364433289, "learning_rate": 0.0002508800005345623, "loss": 0.5384619832038879, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 1.3034203052520752, "learning_rate": 0.00023457070582992562, "loss": 0.5381016135215759, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.7913278341293335, "learning_rate": 0.00021864556680999692, "loss": 0.5290021896362305, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.9535462856292725, "learning_rate": 0.0002031276319205152, "loss": 0.534302830696106, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.9467887438059129, "eval_ce_loss": 0.14362057716241233, "eval_cos_loss": 0.02781058812607505, "eval_loss": 0.5298489093372266, "eval_mse_loss": 0.38311818289702343, "eval_rec_loss": 0.00032909089600148805, "flow/cos_sim": 0.9721894303685454, "flow/improvement_ratio": 0.974593520844908, "flow/mag_ratio_mean": 0.9698141316572825, "flow/mag_ratio_std": 0.02832854554465372, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.9467887438059129, "eval_ce_loss": 0.14362057716241233, "eval_cos_loss": 0.02781058812607505, "eval_loss": 0.5298489093372266, "eval_mse_loss": 0.38311818289702343, "eval_rec_loss": 0.00032909089600148805, "eval_runtime": 141.9779, "eval_samples_per_second": 197.164, "eval_steps_per_second": 3.085, "flow/cos_sim": 0.9721894303685454, "flow/improvement_ratio": 0.974593520844908, "flow/mag_ratio_mean": 0.9698141316572825, "flow/mag_ratio_std": 0.02832854554465372, "step": 15360 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }