{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9932104752667313, "eval_steps": 1024, "global_step": 21504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 0.8926580548286438, "learning_rate": 0.000498046875, "loss": 7.760124206542969, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 1.5264979600906372, "learning_rate": 0.000998046875, "loss": 1.7502448558807373, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 1.034440517425537, "learning_rate": 0.000999640996023194, "loss": 1.0943012237548828, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 1.2952566146850586, "learning_rate": 0.0009985588674043958, "loss": 0.9299185276031494, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.8108942681968176, "eval_cos_loss": 0.18627598218313635, "eval_dec_loss": 0.22058332814371476, "eval_loss": 0.8581492928065122, "eval_mse2_loss": 0.0729683047781388, "eval_mse_loss": 0.5191411869302732, "eval_rec_loss": 0.024248257404848185, "eval_var_loss": 0.0025806165721318493, "flow/cos_sim": 0.8137240362221791, "flow/improvement_ratio": 0.953395878479361, "flow/mag_ratio_mean": 0.8106630417309939, "flow/mag_ratio_std": 0.14785969192714996, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.8108942681968176, "eval_cos_loss": 0.18627598218313635, "eval_dec_loss": 0.22058332814371476, "eval_loss": 0.8581492928065122, "eval_mse2_loss": 0.0729683047781388, "eval_mse_loss": 0.5191411869302732, "eval_rec_loss": 0.024248257404848185, "eval_runtime": 153.8108, "eval_samples_per_second": 181.996, "eval_steps_per_second": 2.848, "eval_var_loss": 0.0025806165721318493, "flow/cos_sim": 0.8137240362221791, "flow/improvement_ratio": 0.953395878479361, "flow/mag_ratio_mean": 0.8106630417309939, "flow/mag_ratio_std": 0.14785969192714996, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 1.6030019521713257, "learning_rate": 0.0009967551747861387, "loss": 0.8445956110954285, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 1.6468690633773804, "learning_rate": 0.000994232528651847, "loss": 0.7937943339347839, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 1.9545633792877197, "learning_rate": 0.0009909945800260092, "loss": 0.7469877600669861, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 1.2975454330444336, "learning_rate": 0.0009870460151900522, "loss": 0.7277378439903259, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.8332413753983823, "eval_cos_loss": 0.12301718736212003, "eval_dec_loss": 0.18318539169791354, "eval_loss": 0.6979760440517234, "eval_mse2_loss": 0.056748984509134945, "eval_mse_loss": 0.43819310152095203, "eval_rec_loss": 0.006146695995894708, "eval_var_loss": 0.0014001506648651542, "flow/cos_sim": 0.8769828294782334, "flow/improvement_ratio": 0.9491692741984101, "flow/mag_ratio_mean": 0.8741380685268472, "flow/mag_ratio_std": 0.1099956316456675, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.8332413753983823, "eval_cos_loss": 0.12301718736212003, "eval_dec_loss": 0.18318539169791354, "eval_loss": 0.6979760440517234, "eval_mse2_loss": 0.056748984509134945, "eval_mse_loss": 0.43819310152095203, "eval_rec_loss": 0.006146695995894708, "eval_runtime": 147.7361, "eval_samples_per_second": 189.48, "eval_steps_per_second": 2.965, "eval_var_loss": 0.0014001506648651542, "flow/cos_sim": 0.8769828294782334, "flow/improvement_ratio": 0.9491692741984101, "flow/mag_ratio_mean": 0.8741380685268472, "flow/mag_ratio_std": 0.1099956316456675, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 1.0802249908447266, "learning_rate": 0.0009823925488998885, "loss": 0.7090811133384705, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 1.3514765501022339, "learning_rate": 0.0009770409161149525, "loss": 0.6976003646850586, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 1.3863739967346191, "learning_rate": 0.0009709988622506973, "loss": 0.6783488392829895, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 1.2354881763458252, "learning_rate": 0.000964275131968659, "loss": 0.6734734773635864, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.8490467382834135, "eval_cos_loss": 0.09506861453867393, "eval_dec_loss": 0.16530320855903707, "eval_loss": 0.6531072377342068, "eval_mse2_loss": 0.052244682587921344, "eval_mse_loss": 0.4211963676426509, "eval_rec_loss": 0.003093622522146896, "eval_var_loss": 0.0017624946489726027, "flow/cos_sim": 0.9049313971985421, "flow/improvement_ratio": 0.950364352905587, "flow/mag_ratio_mean": 0.8911371738671168, "flow/mag_ratio_std": 0.10241742183764775, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.8490467382834135, "eval_cos_loss": 0.09506861453867393, "eval_dec_loss": 0.16530320855903707, "eval_loss": 0.6531072377342068, "eval_mse2_loss": 0.052244682587921344, "eval_mse_loss": 0.4211963676426509, "eval_rec_loss": 0.003093622522146896, "eval_runtime": 144.2539, "eval_samples_per_second": 194.054, "eval_steps_per_second": 3.036, "eval_var_loss": 0.0017624946489726027, "flow/cos_sim": 0.9049313971985421, "flow/improvement_ratio": 0.950364352905587, "flow/mag_ratio_mean": 0.8911371738671168, "flow/mag_ratio_std": 0.10241742183764775, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 1.2442104816436768, "learning_rate": 0.0009568794565203123, "loss": 0.6588751673698425, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 1.032291054725647, "learning_rate": 0.0009488225396630347, "loss": 0.647803008556366, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 1.2258528470993042, "learning_rate": 0.0009401160421685646, "loss": 0.6433600783348083, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 1.1139024496078491, "learning_rate": 0.0009307725649463714, "loss": 0.6353314518928528, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.8504080018321996, "eval_cos_loss": 0.07373750197901029, "eval_dec_loss": 0.15640341749066086, "eval_loss": 0.6174904828610486, "eval_mse2_loss": 0.04752455838024616, "eval_mse_loss": 0.4026154523979039, "eval_rec_loss": 0.002115985062733488, "eval_var_loss": 0.0014573188677226027, "flow/cos_sim": 0.9262625174979641, "flow/improvement_ratio": 0.9461281280539352, "flow/mag_ratio_mean": 0.9282138170716969, "flow/mag_ratio_std": 0.0990274522844787, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.8504080018321996, "eval_cos_loss": 0.07373750197901029, "eval_dec_loss": 0.15640341749066086, "eval_loss": 0.6174904828610486, "eval_mse2_loss": 0.04752455838024616, "eval_mse_loss": 0.4026154523979039, "eval_rec_loss": 0.002115985062733488, "eval_runtime": 144.4539, "eval_samples_per_second": 193.785, "eval_steps_per_second": 3.032, "eval_var_loss": 0.0014573188677226027, "flow/cos_sim": 0.9262625174979641, "flow/improvement_ratio": 0.9461281280539352, "flow/mag_ratio_mean": 0.9282138170716969, "flow/mag_ratio_std": 0.0990274522844787, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 1.0958069562911987, "learning_rate": 0.0009208056308063659, "loss": 0.6263965368270874, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 1.2000305652618408, "learning_rate": 0.0009102296648873445, "loss": 0.618091344833374, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.8440291285514832, "learning_rate": 0.0008990599737794927, "loss": 0.6174848079681396, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.9181346893310547, "learning_rate": 0.0008873127233711644, "loss": 0.606370747089386, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.854693698336936, "eval_cos_loss": 0.058853609810613064, "eval_dec_loss": 0.15538246511188272, "eval_loss": 0.5900675073334071, "eval_mse2_loss": 0.04474926833488625, "eval_mse_loss": 0.3815204039828418, "eval_rec_loss": 0.0014702541673647402, "eval_var_loss": 0.0010597542540667808, "flow/cos_sim": 0.9411464126687071, "flow/improvement_ratio": 0.9471585357298046, "flow/mag_ratio_mean": 0.9485053326169105, "flow/mag_ratio_std": 0.0952613887347315, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.854693698336936, "eval_cos_loss": 0.058853609810613064, "eval_dec_loss": 0.15538246511188272, "eval_loss": 0.5900675073334071, "eval_mse2_loss": 0.04474926833488625, "eval_mse_loss": 0.3815204039828418, "eval_rec_loss": 0.0014702541673647402, "eval_runtime": 144.8825, "eval_samples_per_second": 193.212, "eval_steps_per_second": 3.023, "eval_var_loss": 0.0010597542540667808, "flow/cos_sim": 0.9411464126687071, "flow/improvement_ratio": 0.9471585357298046, "flow/mag_ratio_mean": 0.9485053326169105, "flow/mag_ratio_std": 0.0952613887347315, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 1.0105012655258179, "learning_rate": 0.0008750049154520011, "loss": 0.5982246398925781, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 1.7041122913360596, "learning_rate": 0.0008621543631062487, "loss": 0.6011320948600769, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 1.2530128955841064, "learning_rate": 0.0008487796649318904, "loss": 0.5908714532852173, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.8214261531829834, "learning_rate": 0.0008349001781229053, "loss": 0.5860975980758667, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.8588448853802904, "eval_cos_loss": 0.050389231161371756, "eval_dec_loss": 0.14640251302147564, "eval_loss": 0.5701193273339642, "eval_mse2_loss": 0.04257897145569869, "eval_mse_loss": 0.3735292319837771, "eval_rec_loss": 0.0011887007649628058, "eval_var_loss": 0.001380990084992152, "flow/cos_sim": 0.9496107875756478, "flow/improvement_ratio": 0.9471838724667623, "flow/mag_ratio_mean": 0.9603897756365336, "flow/mag_ratio_std": 0.09184158112020253, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.8588448853802904, "eval_cos_loss": 0.050389231161371756, "eval_dec_loss": 0.14640251302147564, "eval_loss": 0.5701193273339642, "eval_mse2_loss": 0.04257897145569869, "eval_mse_loss": 0.3735292319837771, "eval_rec_loss": 0.0011887007649628058, "eval_runtime": 145.6643, "eval_samples_per_second": 192.175, "eval_steps_per_second": 3.007, "eval_var_loss": 0.001380990084992152, "flow/cos_sim": 0.9496107875756478, "flow/improvement_ratio": 0.9471838724667623, "flow/mag_ratio_mean": 0.9603897756365336, "flow/mag_ratio_std": 0.09184158112020253, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.9015347361564636, "learning_rate": 0.0008205359904536107, "loss": 0.5818743705749512, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.8061195611953735, "learning_rate": 0.0008057078912056363, "loss": 0.5712096691131592, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.9496876001358032, "learning_rate": 0.0007904373410796086, "loss": 0.5827493667602539, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.9318349957466125, "learning_rate": 0.0007747464411350876, "loss": 0.5713083744049072, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.8655904613504272, "eval_cos_loss": 0.04345733123793177, "eval_dec_loss": 0.1383160431550382, "eval_loss": 0.560146918988119, "eval_mse2_loss": 0.04110090016093973, "eval_mse_loss": 0.37529142364247203, "eval_rec_loss": 0.0010027640903040077, "eval_var_loss": 9.005472540311074e-05, "flow/cos_sim": 0.956542692227995, "flow/improvement_ratio": 0.9472023688222719, "flow/mag_ratio_mean": 0.9649291216782784, "flow/mag_ratio_std": 0.08745992025488043, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.8655904613504272, "eval_cos_loss": 0.04345733123793177, "eval_dec_loss": 0.1383160431550382, "eval_loss": 0.560146918988119, "eval_mse2_loss": 0.04110090016093973, "eval_mse_loss": 0.37529142364247203, "eval_rec_loss": 0.0010027640903040077, "eval_runtime": 147.3557, "eval_samples_per_second": 189.969, "eval_steps_per_second": 2.972, "eval_var_loss": 9.005472540311074e-05, "flow/cos_sim": 0.956542692227995, "flow/improvement_ratio": 0.9472023688222719, "flow/mag_ratio_mean": 0.9649291216782784, "flow/mag_ratio_std": 0.08745992025488043, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 1.0284937620162964, "learning_rate": 0.000758657900803716, "loss": 0.5740544199943542, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.7206848859786987, "learning_rate": 0.000742195005021869, "loss": 0.5760706067085266, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 1.221917748451233, "learning_rate": 0.0007253815805303786, "loss": 0.566294252872467, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.8819605708122253, "learning_rate": 0.0007082419613901028, "loss": 0.5628067851066589, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.871946148436791, "eval_cos_loss": 0.03890436059331785, "eval_dec_loss": 0.1332924633104031, "eval_loss": 0.5495835452183196, "eval_mse2_loss": 0.040371610187674496, "eval_mse_loss": 0.3710057751773155, "eval_rec_loss": 0.0008419993847885518, "eval_var_loss": 0.0001812590855986016, "flow/cos_sim": 0.9610956558898159, "flow/improvement_ratio": 0.9478674083267717, "flow/mag_ratio_mean": 0.9691984739205609, "flow/mag_ratio_std": 0.08465842693431737, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.871946148436791, "eval_cos_loss": 0.03890436059331785, "eval_dec_loss": 0.1332924633104031, "eval_loss": 0.5495835452183196, "eval_mse2_loss": 0.040371610187674496, "eval_mse_loss": 0.3710057751773155, "eval_rec_loss": 0.0008419993847885518, "eval_runtime": 146.7514, "eval_samples_per_second": 190.751, "eval_steps_per_second": 2.985, "eval_var_loss": 0.0001812590855986016, "flow/cos_sim": 0.9610956558898159, "flow/improvement_ratio": 0.9478674083267717, "flow/mag_ratio_mean": 0.9691984739205609, "flow/mag_ratio_std": 0.08465842693431737, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 1.2066078186035156, "learning_rate": 0.0006908009537632514, "loss": 0.5644704699516296, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 1.2743791341781616, "learning_rate": 0.0006730838000114403, "loss": 0.5624759197235107, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.6424040198326111, "learning_rate": 0.0006551161421624341, "loss": 0.5654159188270569, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 1.0390995740890503, "learning_rate": 0.0006369239847984517, "loss": 0.5563592910766602, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.8641208937990813, "eval_cos_loss": 0.034672807719235275, "eval_dec_loss": 0.14222022919962396, "eval_loss": 0.5460346293503835, "eval_mse2_loss": 0.03956130868223704, "eval_mse_loss": 0.3584019422667212, "eval_rec_loss": 0.0007127871282784625, "eval_var_loss": 0.0016710812642694063, "flow/cos_sim": 0.965327212663546, "flow/improvement_ratio": 0.946786184561307, "flow/mag_ratio_mean": 0.9734560983366074, "flow/mag_ratio_std": 0.08033725547872178, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.8641208937990813, "eval_cos_loss": 0.034672807719235275, "eval_dec_loss": 0.14222022919962396, "eval_loss": 0.5460346293503835, "eval_mse2_loss": 0.03956130868223704, "eval_mse_loss": 0.3584019422667212, "eval_rec_loss": 0.0007127871282784625, "eval_runtime": 146.177, "eval_samples_per_second": 191.501, "eval_steps_per_second": 2.996, "eval_var_loss": 0.0016710812642694063, "flow/cos_sim": 0.965327212663546, "flow/improvement_ratio": 0.946786184561307, "flow/mag_ratio_mean": 0.9734560983366074, "flow/mag_ratio_std": 0.08033725547872178, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 1.0673410892486572, "learning_rate": 0.0006185336574197479, "loss": 0.55131596326828, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 1.1385674476623535, "learning_rate": 0.0005999717763379407, "loss": 0.5542811155319214, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 1.3084577322006226, "learning_rate": 0.0005812652061542363, "loss": 0.5522482395172119, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.9078991413116455, "learning_rate": 0.0005624410208783071, "loss": 0.5514112114906311, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.8714027910618674, "eval_cos_loss": 0.031159216023505278, "eval_dec_loss": 0.13065314148744084, "eval_loss": 0.5320979358126584, "eval_mse2_loss": 0.038692950696331455, "eval_mse_loss": 0.3575415439121255, "eval_rec_loss": 0.0006259792361515585, "eval_var_loss": 0.001468397166630993, "flow/cos_sim": 0.968840807404148, "flow/improvement_ratio": 0.9490241132098246, "flow/mag_ratio_mean": 0.9646270977307673, "flow/mag_ratio_std": 0.07568225521407171, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.8714027910618674, "eval_cos_loss": 0.031159216023505278, "eval_dec_loss": 0.13065314148744084, "eval_loss": 0.5320979358126584, "eval_mse2_loss": 0.038692950696331455, "eval_mse_loss": 0.3575415439121255, "eval_rec_loss": 0.0006259792361515585, "eval_runtime": 147.3748, "eval_samples_per_second": 189.944, "eval_steps_per_second": 2.972, "eval_var_loss": 0.001468397166630993, "flow/cos_sim": 0.968840807404148, "flow/improvement_ratio": 0.9490241132098246, "flow/mag_ratio_mean": 0.9646270977307673, "flow/mag_ratio_std": 0.07568225521407171, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 1.3058608770370483, "learning_rate": 0.0005435264647440881, "loss": 0.547296941280365, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 1.0200841426849365, "learning_rate": 0.000524548912779213, "loss": 0.544040322303772, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 1.1076935529708862, "learning_rate": 0.0005055358311851499, "loss": 0.5454155206680298, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.8338369727134705, "learning_rate": 0.0004865147375853812, "loss": 0.5434398651123047, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.8728636919662941, "eval_cos_loss": 0.029134724701682456, "eval_dec_loss": 0.130046037353201, "eval_loss": 0.5287471506829675, "eval_mse2_loss": 0.037880827574969425, "eval_mse_loss": 0.3566115467243543, "eval_rec_loss": 0.0005072098316288007, "eval_var_loss": 0.0007880572314676084, "flow/cos_sim": 0.9708652933166452, "flow/improvement_ratio": 0.9513337348149792, "flow/mag_ratio_mean": 0.9743897437232815, "flow/mag_ratio_std": 0.07394225260914733, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.8728636919662941, "eval_cos_loss": 0.029134724701682456, "eval_dec_loss": 0.130046037353201, "eval_loss": 0.5287471506829675, "eval_mse2_loss": 0.037880827574969425, "eval_mse_loss": 0.3566115467243543, "eval_rec_loss": 0.0005072098316288007, "eval_runtime": 147.0551, "eval_samples_per_second": 190.357, "eval_steps_per_second": 2.978, "eval_var_loss": 0.0007880572314676084, "flow/cos_sim": 0.9708652933166452, "flow/improvement_ratio": 0.9513337348149792, "flow/mag_ratio_mean": 0.9743897437232815, "flow/mag_ratio_std": 0.07394225260914733, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 1.2937544584274292, "learning_rate": 0.0004675131611991607, "loss": 0.5423741936683655, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.9950433373451233, "learning_rate": 0.0004485586029984899, "loss": 0.536012589931488, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.9091536402702332, "learning_rate": 0.00042967849590597266, "loss": 0.5319453477859497, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 1.1773775815963745, "learning_rate": 0.0004109001650911621, "loss": 0.5343883037567139, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.8703424178205617, "eval_cos_loss": 0.026759936142559736, "eval_dec_loss": 0.1364484317855884, "eval_loss": 0.5287964605305293, "eval_mse2_loss": 0.03801968024796955, "eval_mse_loss": 0.34965787868793696, "eval_rec_loss": 0.00048410188376538874, "eval_var_loss": 0.0015103762552618437, "flow/cos_sim": 0.9732400857966784, "flow/improvement_ratio": 0.9476525126254722, "flow/mag_ratio_mean": 0.9758574831975649, "flow/mag_ratio_std": 0.06964536890659702, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.8703424178205617, "eval_cos_loss": 0.026759936142559736, "eval_dec_loss": 0.1364484317855884, "eval_loss": 0.5287964605305293, "eval_mse2_loss": 0.03801968024796955, "eval_mse_loss": 0.34965787868793696, "eval_rec_loss": 0.00048410188376538874, "eval_runtime": 146.8515, "eval_samples_per_second": 190.621, "eval_steps_per_second": 2.983, "eval_var_loss": 0.0015103762552618437, "flow/cos_sim": 0.9732400857966784, "flow/improvement_ratio": 0.9476525126254722, "flow/mag_ratio_mean": 0.9758574831975649, "flow/mag_ratio_std": 0.06964536890659702, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.8995824456214905, "learning_rate": 0.0003922507884228551, "loss": 0.5339757800102234, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.7776889801025391, "learning_rate": 0.00037375735713457723, "loss": 0.5342178344726562, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.7428849935531616, "learning_rate": 0.00035544663676018276, "loss": 0.5334670543670654, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 1.0038323402404785, "learning_rate": 0.00033734512839611255, "loss": 0.5338785648345947, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.8742863450888384, "eval_cos_loss": 0.02497413170153964, "eval_dec_loss": 0.13080026295682476, "eval_loss": 0.5225390315600181, "eval_mse2_loss": 0.03739905954853177, "eval_mse_loss": 0.35115713799652987, "eval_rec_loss": 0.0004604576855995448, "eval_var_loss": 0.00022470134578339042, "flow/cos_sim": 0.9750258914411885, "flow/improvement_ratio": 0.9472176692529356, "flow/mag_ratio_mean": 0.9779115002177078, "flow/mag_ratio_std": 0.06864877739181258, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.8742863450888384, "eval_cos_loss": 0.02497413170153964, "eval_dec_loss": 0.13080026295682476, "eval_loss": 0.5225390315600181, "eval_mse2_loss": 0.03739905954853177, "eval_mse_loss": 0.35115713799652987, "eval_rec_loss": 0.0004604576855995448, "eval_runtime": 147.1381, "eval_samples_per_second": 190.25, "eval_steps_per_second": 2.977, "eval_var_loss": 0.00022470134578339042, "flow/cos_sim": 0.9750258914411885, "flow/improvement_ratio": 0.9472176692529356, "flow/mag_ratio_mean": 0.9779115002177078, "flow/mag_ratio_std": 0.06864877739181258, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.8821900486946106, "learning_rate": 0.0003194790303463687, "loss": 0.5262507200241089, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 1.29916512966156, "learning_rate": 0.00030187420020572406, "loss": 0.5345848798751831, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.8431739211082458, "learning_rate": 0.00028455611743603626, "loss": 0.5232061147689819, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.7449212670326233, "learning_rate": 0.0002675498464898373, "loss": 0.5273455381393433, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.878961824779555, "eval_cos_loss": 0.023681414896222556, "eval_dec_loss": 0.12285568658687752, "eval_loss": 0.5115440155817493, "eval_mse2_loss": 0.036460889154645404, "eval_mse_loss": 0.34879117816278377, "eval_rec_loss": 0.00043564565198285477, "eval_var_loss": 0.0006324733228988299, "flow/cos_sim": 0.9763186070472682, "flow/improvement_ratio": 0.9486979719710676, "flow/mag_ratio_mean": 0.9747681165939053, "flow/mag_ratio_std": 0.06543600253046375, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.878961824779555, "eval_cos_loss": 0.023681414896222556, "eval_dec_loss": 0.12285568658687752, "eval_loss": 0.5115440155817493, "eval_mse2_loss": 0.036460889154645404, "eval_mse_loss": 0.34879117816278377, "eval_rec_loss": 0.00043564565198285477, "eval_runtime": 145.1315, "eval_samples_per_second": 192.88, "eval_steps_per_second": 3.018, "eval_var_loss": 0.0006324733228988299, "flow/cos_sim": 0.9763186070472682, "flow/improvement_ratio": 0.9486979719710676, "flow/mag_ratio_mean": 0.9747681165939053, "flow/mag_ratio_std": 0.06543600253046375, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 1.0061371326446533, "learning_rate": 0.0002508800005345623, "loss": 0.5260058045387268, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 1.3661426305770874, "learning_rate": 0.00023457070582992562, "loss": 0.5261355638504028, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.8166645765304565, "learning_rate": 0.00021864556680999692, "loss": 0.5197286009788513, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.9908707141876221, "learning_rate": 0.0002031276319205152, "loss": 0.5179936289787292, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.8783678378600588, "eval_cos_loss": 0.02282264728745505, "eval_dec_loss": 0.12575754155000868, "eval_loss": 0.5103318595314679, "eval_mse2_loss": 0.03646752246353588, "eval_mse_loss": 0.3436338434752808, "eval_rec_loss": 0.0003914046699129902, "eval_var_loss": 0.0017992829623287672, "flow/cos_sim": 0.9771773755278217, "flow/improvement_ratio": 0.946518616589237, "flow/mag_ratio_mean": 0.9811247203448047, "flow/mag_ratio_std": 0.06391868588076607, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.8783678378600588, "eval_cos_loss": 0.02282264728745505, "eval_dec_loss": 0.12575754155000868, "eval_loss": 0.5103318595314679, "eval_mse2_loss": 0.03646752246353588, "eval_mse_loss": 0.3436338434752808, "eval_rec_loss": 0.0003914046699129902, "eval_runtime": 145.6453, "eval_samples_per_second": 192.2, "eval_steps_per_second": 3.007, "eval_var_loss": 0.0017992829623287672, "flow/cos_sim": 0.9771773755278217, "flow/improvement_ratio": 0.946518616589237, "flow/mag_ratio_mean": 0.9811247203448047, "flow/mag_ratio_std": 0.06391868588076607, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 1.3719778060913086, "learning_rate": 0.00018803936026088542, "loss": 0.5231001973152161, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 1.1663880348205566, "learning_rate": 0.00017340258907913464, "loss": 0.5187292695045471, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 1.2364073991775513, "learning_rate": 0.0001592385021668743, "loss": 0.5198975801467896, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.7590259313583374, "learning_rate": 0.0001455675992000087, "loss": 0.5145216584205627, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.8783302939786105, "eval_cos_loss": 0.02200101312998359, "eval_dec_loss": 0.12474109981311103, "eval_loss": 0.5037317795840572, "eval_mse2_loss": 0.03571532880164445, "eval_mse_loss": 0.3386642832870353, "eval_rec_loss": 0.00036252345826848117, "eval_var_loss": 0.0020484401755136985, "flow/cos_sim": 0.9779990089538435, "flow/improvement_ratio": 0.9449677000579224, "flow/mag_ratio_mean": 0.9812718658414605, "flow/mag_ratio_std": 0.062364814267174835, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.8783302939786105, "eval_cos_loss": 0.02200101312998359, "eval_dec_loss": 0.12474109981311103, "eval_loss": 0.5037317795840572, "eval_mse2_loss": 0.03571532880164445, "eval_mse_loss": 0.3386642832870353, "eval_rec_loss": 0.00036252345826848117, "eval_runtime": 146.8615, "eval_samples_per_second": 190.608, "eval_steps_per_second": 2.982, "eval_var_loss": 0.0020484401755136985, "flow/cos_sim": 0.9779990089538435, "flow/improvement_ratio": 0.9449677000579224, "flow/mag_ratio_mean": 0.9812718658414605, "flow/mag_ratio_std": 0.062364814267174835, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 0.5260149836540222, "learning_rate": 0.000132409666069565, "loss": 0.5210624933242798, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.9234253168106079, "learning_rate": 0.0001197837462455823, "loss": 0.5090124011039734, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.6521694660186768, "learning_rate": 0.00010770811321550749, "loss": 0.518164336681366, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 1.1480274200439453, "learning_rate": 9.620024403698591e-05, "loss": 0.5141870379447937, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_bleu": 0.8834884178752624, "eval_cos_loss": 0.0215921389347274, "eval_dec_loss": 0.12070586444750496, "eval_loss": 0.503621021021991, "eval_mse2_loss": 0.03600667195488193, "eval_mse_loss": 0.3440090203530168, "eval_rec_loss": 0.00033989991171857754, "eval_var_loss": 0.000400351607091895, "flow/cos_sim": 0.9784078830725527, "flow/improvement_ratio": 0.9481515654418022, "flow/mag_ratio_mean": 0.9792273077246261, "flow/mag_ratio_std": 0.061406915386517845, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_bleu": 0.8834884178752624, "eval_cos_loss": 0.0215921389347274, "eval_dec_loss": 0.12070586444750496, "eval_loss": 0.503621021021991, "eval_mse2_loss": 0.03600667195488193, "eval_mse_loss": 0.3440090203530168, "eval_rec_loss": 0.00033989991171857754, "eval_runtime": 151.6926, "eval_samples_per_second": 184.538, "eval_steps_per_second": 2.887, "eval_var_loss": 0.000400351607091895, "flow/cos_sim": 0.9784078830725527, "flow/improvement_ratio": 0.9481515654418022, "flow/mag_ratio_mean": 0.9792273077246261, "flow/mag_ratio_std": 0.061406915386517845, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 0.9857662320137024, "learning_rate": 8.527679404332429e-05, "loss": 0.5135464668273926, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 0.4826514422893524, "learning_rate": 7.495357273823544e-05, "loss": 0.5152989029884338, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 0.5454884171485901, "learning_rate": 6.524552091475183e-05, "loss": 0.5149614810943604, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 0.518525242805481, "learning_rate": 5.6166689031422024e-05, "loss": 0.5079946517944336, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_bleu": 0.8796902491543857, "eval_cos_loss": 0.020958454849955427, "eval_dec_loss": 0.12389364775661464, "eval_loss": 0.498462415517193, "eval_mse2_loss": 0.03562375955436736, "eval_mse_loss": 0.3354686865387442, "eval_rec_loss": 0.0003364354677980215, "eval_var_loss": 0.001044042578570919, "flow/cos_sim": 0.9790415661792232, "flow/improvement_ratio": 0.946279785676634, "flow/mag_ratio_mean": 0.9808841357220254, "flow/mag_ratio_std": 0.059541590231126304, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_bleu": 0.8796902491543857, "eval_cos_loss": 0.020958454849955427, "eval_dec_loss": 0.12389364775661464, "eval_loss": 0.498462415517193, "eval_mse2_loss": 0.03562375955436736, "eval_mse_loss": 0.3354686865387442, "eval_rec_loss": 0.0003364354677980215, "eval_runtime": 146.5651, "eval_samples_per_second": 190.994, "eval_steps_per_second": 2.988, "eval_var_loss": 0.001044042578570919, "flow/cos_sim": 0.9790415661792232, "flow/improvement_ratio": 0.946279785676634, "flow/mag_ratio_mean": 0.9808841357220254, "flow/mag_ratio_std": 0.059541590231126304, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 0.6128277778625488, "learning_rate": 4.773021687709067e-05, "loss": 0.5122405290603638, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 0.5204885005950928, "learning_rate": 3.994831455368719e-05, "loss": 0.5127500891685486, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 0.602541446685791, "learning_rate": 3.283224480455282e-05, "loss": 0.5094860196113586, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 1.1697250604629517, "learning_rate": 2.639230671387627e-05, "loss": 0.5139985084533691, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_bleu": 0.8812131229893231, "eval_cos_loss": 0.021022337101842172, "eval_dec_loss": 0.12231827581292826, "eval_loss": 0.49907271203385095, "eval_mse2_loss": 0.03566523693229782, "eval_mse_loss": 0.33820473304077914, "eval_rec_loss": 0.00033781120677735894, "eval_var_loss": 0.0004444209407998002, "flow/cos_sim": 0.9789776832001394, "flow/improvement_ratio": 0.9461018956143018, "flow/mag_ratio_mean": 0.9799266838591937, "flow/mag_ratio_std": 0.06017088978530065, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_bleu": 0.8812131229893231, "eval_cos_loss": 0.021022337101842172, "eval_dec_loss": 0.12231827581292826, "eval_loss": 0.49907271203385095, "eval_mse2_loss": 0.03566523693229782, "eval_mse_loss": 0.33820473304077914, "eval_rec_loss": 0.00033781120677735894, "eval_runtime": 148.1279, "eval_samples_per_second": 188.979, "eval_steps_per_second": 2.957, "eval_var_loss": 0.0004444209407998002, "flow/cos_sim": 0.9789776832001394, "flow/improvement_ratio": 0.9461018956143018, "flow/mag_ratio_mean": 0.9799266838591937, "flow/mag_ratio_std": 0.06017088978530065, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 0.6937961578369141, "learning_rate": 2.063782080083576e-05, "loss": 0.5091792941093445, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.6024012565612793, "learning_rate": 1.557711553001523e-05, "loss": 0.5097566246986389, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 1.2192533016204834, "learning_rate": 1.1217515257622269e-05, "loss": 0.5056952238082886, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 1.0494381189346313, "learning_rate": 7.565329630950746e-06, "loss": 0.5079949498176575, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.8789524110812165, "eval_cos_loss": 0.020721596508333672, "eval_dec_loss": 0.12240356183929803, "eval_loss": 0.4938700148095823, "eval_mse2_loss": 0.035587785259467555, "eval_mse_loss": 0.3327356912365787, "eval_rec_loss": 0.0003278744384406143, "eval_var_loss": 0.0007429427752211758, "flow/cos_sim": 0.9792784298663815, "flow/improvement_ratio": 0.9488933942361509, "flow/mag_ratio_mean": 0.9814422096563801, "flow/mag_ratio_std": 0.05891708919106553, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.8789524110812165, "eval_cos_loss": 0.020721596508333672, "eval_dec_loss": 0.12240356183929803, "eval_loss": 0.4938700148095823, "eval_mse2_loss": 0.035587785259467555, "eval_mse_loss": 0.3327356912365787, "eval_rec_loss": 0.0003278744384406143, "eval_runtime": 147.5662, "eval_samples_per_second": 189.698, "eval_steps_per_second": 2.968, "eval_var_loss": 0.0007429427752211758, "flow/cos_sim": 0.9792784298663815, "flow/improvement_ratio": 0.9488933942361509, "flow/mag_ratio_mean": 0.9814422096563801, "flow/mag_ratio_std": 0.05891708919106553, "step": 20480 }, { "epoch": 0.9577386725786338, "grad_norm": 1.0610053539276123, "learning_rate": 4.62584445643166e-06, "loss": 0.5107224583625793, "step": 20736 }, { "epoch": 0.9695626068079997, "grad_norm": 0.3382054269313812, "learning_rate": 2.40331404948807e-06, "loss": 0.5061094760894775, "step": 20992 }, { "epoch": 0.9813865410373654, "grad_norm": 0.8232002854347229, "learning_rate": 9.009550772663965e-07, "loss": 0.5084012150764465, "step": 21248 }, { "epoch": 0.9932104752667313, "grad_norm": 0.4803735911846161, "learning_rate": 1.2094190315575791e-07, "loss": 0.502625048160553, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_bleu": 0.878898171590065, "eval_cos_loss": 0.020868751262931248, "eval_dec_loss": 0.12675162426463954, "eval_loss": 0.5008763382423959, "eval_mse2_loss": 0.03563899073545655, "eval_mse_loss": 0.33525587856497396, "eval_rec_loss": 0.00033000375075045425, "eval_var_loss": 0.0008129659853025114, "flow/cos_sim": 0.9791312738912835, "flow/improvement_ratio": 0.9457402734179475, "flow/mag_ratio_mean": 0.9810643102208229, "flow/mag_ratio_std": 0.059360814905942305, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_bleu": 0.878898171590065, "eval_cos_loss": 0.020868751262931248, "eval_dec_loss": 0.12675162426463954, "eval_loss": 0.5008763382423959, "eval_mse2_loss": 0.03563899073545655, "eval_mse_loss": 0.33525587856497396, "eval_rec_loss": 0.00033000375075045425, "eval_runtime": 146.8291, "eval_samples_per_second": 190.65, "eval_steps_per_second": 2.983, "eval_var_loss": 0.0008129659853025114, "flow/cos_sim": 0.9791312738912835, "flow/improvement_ratio": 0.9457402734179475, "flow/mag_ratio_mean": 0.9810643102208229, "flow/mag_ratio_std": 0.059360814905942305, "step": 21504 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }