{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.596543209876543, "eval_steps": 256, "global_step": 17408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12641975308641976, "grad_norm": 0.49028417468070984, "learning_rate": 0.000498046875, "loss": 1.2244819402694702, "step": 256 }, { "epoch": 0.12641975308641976, "eval_bleu": 0.023714342494794382, "eval_loss": 1.1823350772625063, "eval_mse_loss": 1.1823350772625063, "step": 256 }, { "epoch": 0.12641975308641976, "eval_bleu": 0.023714342494794382, "eval_loss": 1.1823350772625063, "eval_mse_loss": 1.1823350772625063, "eval_runtime": 7.3875, "eval_samples_per_second": 354.383, "eval_steps_per_second": 5.55, "step": 256 }, { "epoch": 0.2528395061728395, "grad_norm": 0.578209638595581, "learning_rate": 0.000998046875, "loss": 1.1352403163909912, "step": 512 }, { "epoch": 0.2528395061728395, "eval_bleu": 0.022467213421973074, "eval_loss": 1.0758944168323423, "eval_mse_loss": 1.0758944168323423, "step": 512 }, { "epoch": 0.2528395061728395, "eval_bleu": 0.022467213421973074, "eval_loss": 1.0758944168323423, "eval_mse_loss": 1.0758944168323423, "eval_runtime": 7.512, "eval_samples_per_second": 348.51, "eval_steps_per_second": 5.458, "step": 512 }, { "epoch": 0.37925925925925924, "grad_norm": 1.1504969596862793, "learning_rate": 0.0009995882304697813, "loss": 1.0155020952224731, "step": 768 }, { "epoch": 0.37925925925925924, "eval_bleu": 0.03303487719486685, "eval_loss": 0.9521724465416699, "eval_mse_loss": 0.9521724465416699, "step": 768 }, { "epoch": 0.37925925925925924, "eval_bleu": 0.03303487719486685, "eval_loss": 0.9521724465416699, "eval_mse_loss": 0.9521724465416699, "eval_runtime": 7.3269, "eval_samples_per_second": 357.313, "eval_steps_per_second": 5.596, "step": 768 }, { "epoch": 0.505679012345679, "grad_norm": 0.9992234706878662, "learning_rate": 0.0009983471408586747, "loss": 0.9070967435836792, "step": 1024 }, { "epoch": 0.505679012345679, "eval_bleu": 0.05283670615976002, "eval_loss": 0.8704100134896069, "eval_mse_loss": 0.8704100134896069, "step": 1024 }, { "epoch": 0.505679012345679, "eval_bleu": 0.05283670615976002, "eval_loss": 0.8704100134896069, "eval_mse_loss": 0.8704100134896069, "eval_runtime": 7.9933, "eval_samples_per_second": 327.523, "eval_steps_per_second": 5.129, "step": 1024 }, { "epoch": 0.6320987654320988, "grad_norm": 0.931308925151825, "learning_rate": 0.000996278785066807, "loss": 0.8445582389831543, "step": 1280 }, { "epoch": 0.6320987654320988, "eval_bleu": 0.062080457666779255, "eval_loss": 0.8229870461836094, "eval_mse_loss": 0.8229870461836094, "step": 1280 }, { "epoch": 0.6320987654320988, "eval_bleu": 0.062080457666779255, "eval_loss": 0.8229870461836094, "eval_mse_loss": 0.8229870461836094, "eval_runtime": 8.2397, "eval_samples_per_second": 317.731, "eval_steps_per_second": 4.976, "step": 1280 }, { "epoch": 0.7585185185185185, "grad_norm": 1.1758294105529785, "learning_rate": 0.0009933865966059944, "loss": 0.7974789142608643, "step": 1536 }, { "epoch": 0.7585185185185185, "eval_bleu": 0.08761458974204005, "eval_loss": 0.7755879759788513, "eval_mse_loss": 0.7755879759788513, "step": 1536 }, { "epoch": 0.7585185185185185, "eval_bleu": 0.08761458974204005, "eval_loss": 0.7755879759788513, "eval_mse_loss": 0.7755879759788513, "eval_runtime": 7.2349, "eval_samples_per_second": 361.856, "eval_steps_per_second": 5.667, "step": 1536 }, { "epoch": 0.8849382716049383, "grad_norm": 1.6121838092803955, "learning_rate": 0.0009896753765666925, "loss": 0.7649396657943726, "step": 1792 }, { "epoch": 0.8849382716049383, "eval_bleu": 0.09849079764244906, "eval_loss": 0.7508636625801645, "eval_mse_loss": 0.7508636625801645, "step": 1792 }, { "epoch": 0.8849382716049383, "eval_bleu": 0.09849079764244906, "eval_loss": 0.7508636625801645, "eval_mse_loss": 0.7508636625801645, "eval_runtime": 7.7694, "eval_samples_per_second": 336.963, "eval_steps_per_second": 5.277, "step": 1792 }, { "epoch": 1.011358024691358, "grad_norm": 1.966202974319458, "learning_rate": 0.0009851512856480936, "loss": 0.7395287156105042, "step": 2048 }, { "epoch": 1.011358024691358, "eval_bleu": 0.11072005270411464, "eval_loss": 0.7292228166649981, "eval_mse_loss": 0.7292228166649981, "step": 2048 }, { "epoch": 1.011358024691358, "eval_bleu": 0.11072005270411464, "eval_loss": 0.7292228166649981, "eval_mse_loss": 0.7292228166649981, "eval_runtime": 7.9736, "eval_samples_per_second": 328.334, "eval_steps_per_second": 5.142, "step": 2048 }, { "epoch": 1.1377777777777778, "grad_norm": 1.720017433166504, "learning_rate": 0.0009798218339312412, "loss": 0.7159179449081421, "step": 2304 }, { "epoch": 1.1377777777777778, "eval_bleu": 0.11064736251702126, "eval_loss": 0.7017144793417396, "eval_mse_loss": 0.7017144793417396, "step": 2304 }, { "epoch": 1.1377777777777778, "eval_bleu": 0.11064736251702126, "eval_loss": 0.7017144793417396, "eval_mse_loss": 0.7017144793417396, "eval_runtime": 7.2741, "eval_samples_per_second": 359.908, "eval_steps_per_second": 5.636, "step": 2304 }, { "epoch": 1.2641975308641975, "grad_norm": 1.0777727365493774, "learning_rate": 0.0009736958684121462, "loss": 0.6964433789253235, "step": 2560 }, { "epoch": 1.2641975308641975, "eval_bleu": 0.1276517391343738, "eval_loss": 0.6854300062830855, "eval_mse_loss": 0.6854300062830855, "step": 2560 }, { "epoch": 1.2641975308641975, "eval_bleu": 0.1276517391343738, "eval_loss": 0.6854300062830855, "eval_mse_loss": 0.6854300062830855, "eval_runtime": 7.5038, "eval_samples_per_second": 348.888, "eval_steps_per_second": 5.464, "step": 2560 }, { "epoch": 1.3906172839506172, "grad_norm": 2.31036639213562, "learning_rate": 0.0009667835583155982, "loss": 0.6787664890289307, "step": 2816 }, { "epoch": 1.3906172839506172, "eval_bleu": 0.14606448546092324, "eval_loss": 0.6666799565640892, "eval_mse_loss": 0.6666799565640892, "step": 2816 }, { "epoch": 1.3906172839506172, "eval_bleu": 0.14606448546092324, "eval_loss": 0.6666799565640892, "eval_mse_loss": 0.6666799565640892, "eval_runtime": 8.1414, "eval_samples_per_second": 321.565, "eval_steps_per_second": 5.036, "step": 2816 }, { "epoch": 1.5170370370370372, "grad_norm": 1.8440918922424316, "learning_rate": 0.0009590963782140483, "loss": 0.6634169220924377, "step": 3072 }, { "epoch": 1.5170370370370372, "eval_bleu": 0.14900832500200872, "eval_loss": 0.6537455029603911, "eval_mse_loss": 0.6537455029603911, "step": 3072 }, { "epoch": 1.5170370370370372, "eval_bleu": 0.14900832500200872, "eval_loss": 0.6537455029603911, "eval_mse_loss": 0.6537455029603911, "eval_runtime": 8.5409, "eval_samples_per_second": 306.525, "eval_steps_per_second": 4.8, "step": 3072 }, { "epoch": 1.643456790123457, "grad_norm": 2.082489013671875, "learning_rate": 0.0009506470889795917, "loss": 0.6479848027229309, "step": 3328 }, { "epoch": 1.643456790123457, "eval_bleu": 0.15870707580244742, "eval_loss": 0.6412253728727015, "eval_mse_loss": 0.6412253728727015, "step": 3328 }, { "epoch": 1.643456790123457, "eval_bleu": 0.15870707580244742, "eval_loss": 0.6412253728727015, "eval_mse_loss": 0.6412253728727015, "eval_runtime": 7.43, "eval_samples_per_second": 352.355, "eval_steps_per_second": 5.518, "step": 3328 }, { "epoch": 1.7698765432098766, "grad_norm": 2.5578348636627197, "learning_rate": 0.0009414497166006652, "loss": 0.6374112963676453, "step": 3584 }, { "epoch": 1.7698765432098766, "eval_bleu": 0.16517790474378305, "eval_loss": 0.6340285481476202, "eval_mse_loss": 0.6340285481476202, "step": 3584 }, { "epoch": 1.7698765432098766, "eval_bleu": 0.16517790474378305, "eval_loss": 0.6340285481476202, "eval_mse_loss": 0.6340285481476202, "eval_runtime": 7.8092, "eval_samples_per_second": 335.246, "eval_steps_per_second": 5.25, "step": 3584 }, { "epoch": 1.8962962962962964, "grad_norm": 1.9383974075317383, "learning_rate": 0.0009315195288986269, "loss": 0.623887836933136, "step": 3840 }, { "epoch": 1.8962962962962964, "eval_bleu": 0.17847564897785617, "eval_loss": 0.6204298661976326, "eval_mse_loss": 0.6204298661976326, "step": 3840 }, { "epoch": 1.8962962962962964, "eval_bleu": 0.17847564897785617, "eval_loss": 0.6204298661976326, "eval_mse_loss": 0.6204298661976326, "eval_runtime": 7.8808, "eval_samples_per_second": 332.2, "eval_steps_per_second": 5.203, "step": 3840 }, { "epoch": 2.022716049382716, "grad_norm": 2.074370861053467, "learning_rate": 0.0009208730101828687, "loss": 0.612588107585907, "step": 4096 }, { "epoch": 2.022716049382716, "eval_bleu": 0.20019494813992078, "eval_loss": 0.6014744639396667, "eval_mse_loss": 0.6014744639396667, "step": 4096 }, { "epoch": 2.022716049382716, "eval_bleu": 0.20019494813992078, "eval_loss": 0.6014744639396667, "eval_mse_loss": 0.6014744639396667, "eval_runtime": 8.3023, "eval_samples_per_second": 315.333, "eval_steps_per_second": 4.938, "step": 4096 }, { "epoch": 2.149135802469136, "grad_norm": 2.1671674251556396, "learning_rate": 0.0009095278338865343, "loss": 0.5997830629348755, "step": 4352 }, { "epoch": 2.149135802469136, "eval_bleu": 0.1987893921535938, "eval_loss": 0.5986893453249117, "eval_mse_loss": 0.5986893453249117, "step": 4352 }, { "epoch": 2.149135802469136, "eval_bleu": 0.1987893921535938, "eval_loss": 0.5986893453249117, "eval_mse_loss": 0.5986893453249117, "eval_runtime": 7.4027, "eval_samples_per_second": 353.653, "eval_steps_per_second": 5.538, "step": 4352 }, { "epoch": 2.2755555555555556, "grad_norm": 3.125169038772583, "learning_rate": 0.0008975028332282684, "loss": 0.5991740822792053, "step": 4608 }, { "epoch": 2.2755555555555556, "eval_bleu": 0.20658330468877353, "eval_loss": 0.6155260525098661, "eval_mse_loss": 0.6155260525098661, "step": 4608 }, { "epoch": 2.2755555555555556, "eval_bleu": 0.20658330468877353, "eval_loss": 0.6155260525098661, "eval_mse_loss": 0.6155260525098661, "eval_runtime": 7.4818, "eval_samples_per_second": 349.915, "eval_steps_per_second": 5.48, "step": 4608 }, { "epoch": 2.4019753086419753, "grad_norm": 4.6324992179870605, "learning_rate": 0.0008848179699486985, "loss": 0.6436169147491455, "step": 4864 }, { "epoch": 2.4019753086419753, "eval_bleu": 0.19786110402686427, "eval_loss": 0.6434223041301821, "eval_mse_loss": 0.6434223041301821, "step": 4864 }, { "epoch": 2.4019753086419753, "eval_bleu": 0.19786110402686427, "eval_loss": 0.6434223041301821, "eval_mse_loss": 0.6434223041301821, "eval_runtime": 8.06, "eval_samples_per_second": 324.813, "eval_steps_per_second": 5.087, "step": 4864 }, { "epoch": 2.528395061728395, "grad_norm": 4.647606372833252, "learning_rate": 0.0008714943011735476, "loss": 0.6646981239318848, "step": 5120 }, { "epoch": 2.528395061728395, "eval_bleu": 0.17652369495305426, "eval_loss": 0.6581660363732315, "eval_mse_loss": 0.6581660363732315, "step": 5120 }, { "epoch": 2.528395061728395, "eval_bleu": 0.17652369495305426, "eval_loss": 0.6581660363732315, "eval_mse_loss": 0.6581660363732315, "eval_runtime": 8.4642, "eval_samples_per_second": 309.304, "eval_steps_per_second": 4.844, "step": 5120 }, { "epoch": 2.6548148148148147, "grad_norm": 4.655036926269531, "learning_rate": 0.000857553944458386, "loss": 0.670133650302887, "step": 5376 }, { "epoch": 2.6548148148148147, "eval_bleu": 0.16665801459747773, "eval_loss": 0.6649525950594646, "eval_mse_loss": 0.6649525950594646, "step": 5376 }, { "epoch": 2.6548148148148147, "eval_bleu": 0.16665801459747773, "eval_loss": 0.6649525950594646, "eval_mse_loss": 0.6649525950594646, "eval_runtime": 7.3082, "eval_samples_per_second": 358.23, "eval_steps_per_second": 5.61, "step": 5376 }, { "epoch": 2.7812345679012345, "grad_norm": 6.748809814453125, "learning_rate": 0.000843020041073049, "loss": 0.6625139117240906, "step": 5632 }, { "epoch": 2.7812345679012345, "eval_bleu": 0.173997037607852, "eval_loss": 0.6807852968937014, "eval_mse_loss": 0.6807852968937014, "step": 5632 }, { "epoch": 2.7812345679012345, "eval_bleu": 0.173997037607852, "eval_loss": 0.6807852968937014, "eval_mse_loss": 0.6807852968937014, "eval_runtime": 7.5532, "eval_samples_per_second": 346.607, "eval_steps_per_second": 5.428, "step": 5632 }, { "epoch": 2.907654320987654, "grad_norm": 4.406154632568359, "learning_rate": 0.0008279167175866678, "loss": 0.6519719958305359, "step": 5888 }, { "epoch": 2.907654320987654, "eval_bleu": 0.19570266564584513, "eval_loss": 0.6394063074414323, "eval_mse_loss": 0.6394063074414323, "step": 5888 }, { "epoch": 2.907654320987654, "eval_bleu": 0.19570266564584513, "eval_loss": 0.6394063074414323, "eval_mse_loss": 0.6394063074414323, "eval_runtime": 7.9229, "eval_samples_per_second": 330.434, "eval_steps_per_second": 5.175, "step": 5888 }, { "epoch": 3.034074074074074, "grad_norm": 4.702131271362305, "learning_rate": 0.0008122690458170833, "loss": 0.6394258737564087, "step": 6144 }, { "epoch": 3.034074074074074, "eval_bleu": 0.18025210346558498, "eval_loss": 0.6386287488588472, "eval_mse_loss": 0.6386287488588472, "step": 6144 }, { "epoch": 3.034074074074074, "eval_bleu": 0.18025210346558498, "eval_loss": 0.6386287488588472, "eval_mse_loss": 0.6386287488588472, "eval_runtime": 8.212, "eval_samples_per_second": 318.804, "eval_steps_per_second": 4.993, "step": 6144 }, { "epoch": 3.1604938271604937, "grad_norm": 3.3306374549865723, "learning_rate": 0.0007961030012111305, "loss": 0.6313198208808899, "step": 6400 }, { "epoch": 3.1604938271604937, "eval_bleu": 0.19600803362588348, "eval_loss": 0.6227354814366597, "eval_mse_loss": 0.6227354814366597, "step": 6400 }, { "epoch": 3.1604938271604937, "eval_bleu": 0.19600803362588348, "eval_loss": 0.6227354814366597, "eval_mse_loss": 0.6227354814366597, "eval_runtime": 7.9462, "eval_samples_per_second": 329.467, "eval_steps_per_second": 5.16, "step": 6400 }, { "epoch": 3.286913580246914, "grad_norm": 4.519600868225098, "learning_rate": 0.0007794454197248784, "loss": 0.6270281672477722, "step": 6656 }, { "epoch": 3.286913580246914, "eval_bleu": 0.21406015637359221, "eval_loss": 0.6298785456796971, "eval_mse_loss": 0.6298785456796971, "step": 6656 }, { "epoch": 3.286913580246914, "eval_bleu": 0.21406015637359221, "eval_loss": 0.6298785456796971, "eval_mse_loss": 0.6298785456796971, "eval_runtime": 7.1729, "eval_samples_per_second": 364.985, "eval_steps_per_second": 5.716, "step": 6656 }, { "epoch": 3.413333333333333, "grad_norm": 4.557300090789795, "learning_rate": 0.0007623239532754083, "loss": 0.6133501529693604, "step": 6912 }, { "epoch": 3.413333333333333, "eval_bleu": 0.21008851882230098, "eval_loss": 0.6231076019566234, "eval_mse_loss": 0.6231076019566234, "step": 6912 }, { "epoch": 3.413333333333333, "eval_bleu": 0.21008851882230098, "eval_loss": 0.6231076019566234, "eval_mse_loss": 0.6231076019566234, "eval_runtime": 7.9356, "eval_samples_per_second": 329.904, "eval_steps_per_second": 5.167, "step": 6912 }, { "epoch": 3.5397530864197533, "grad_norm": 4.322099208831787, "learning_rate": 0.0007447670238380815, "loss": 0.6112697720527649, "step": 7168 }, { "epoch": 3.5397530864197533, "eval_bleu": 0.2060799623946906, "eval_loss": 0.6111486045325675, "eval_mse_loss": 0.6111486045325675, "step": 7168 }, { "epoch": 3.5397530864197533, "eval_bleu": 0.2060799623946906, "eval_loss": 0.6111486045325675, "eval_mse_loss": 0.6111486045325675, "eval_runtime": 8.1866, "eval_samples_per_second": 319.793, "eval_steps_per_second": 5.008, "step": 7168 }, { "epoch": 3.6661728395061726, "grad_norm": 4.115531921386719, "learning_rate": 0.0007268037762654929, "loss": 0.6039376854896545, "step": 7424 }, { "epoch": 3.6661728395061726, "eval_bleu": 0.21503632161601938, "eval_loss": 0.6128283637325939, "eval_mse_loss": 0.6128283637325939, "step": 7424 }, { "epoch": 3.6661728395061726, "eval_bleu": 0.21503632161601938, "eval_loss": 0.6128283637325939, "eval_mse_loss": 0.6128283637325939, "eval_runtime": 8.2856, "eval_samples_per_second": 315.97, "eval_steps_per_second": 4.948, "step": 7424 }, { "epoch": 3.7925925925925927, "grad_norm": 4.654777526855469, "learning_rate": 0.0007084640299064357, "loss": 0.5996431708335876, "step": 7680 }, { "epoch": 3.7925925925925927, "eval_bleu": 0.22869408936536856, "eval_loss": 0.5915597095722105, "eval_mse_loss": 0.5915597095722105, "step": 7680 }, { "epoch": 3.7925925925925927, "eval_bleu": 0.22869408936536856, "eval_loss": 0.5915597095722105, "eval_mse_loss": 0.5915597095722105, "eval_runtime": 8.2255, "eval_samples_per_second": 318.278, "eval_steps_per_second": 4.984, "step": 7680 }, { "epoch": 3.9190123456790125, "grad_norm": 3.9882354736328125, "learning_rate": 0.0006897782291051889, "loss": 0.5931000709533691, "step": 7936 }, { "epoch": 3.9190123456790125, "eval_bleu": 0.2349496066220062, "eval_loss": 0.5926558040991062, "eval_mse_loss": 0.5926558040991062, "step": 7936 }, { "epoch": 3.9190123456790125, "eval_bleu": 0.2349496066220062, "eval_loss": 0.5926558040991062, "eval_mse_loss": 0.5926558040991062, "eval_runtime": 8.1758, "eval_samples_per_second": 320.212, "eval_steps_per_second": 5.015, "step": 7936 }, { "epoch": 4.045432098765432, "grad_norm": 4.266123294830322, "learning_rate": 0.000670777392663298, "loss": 0.5860158205032349, "step": 8192 }, { "epoch": 4.045432098765432, "eval_bleu": 0.25634671575676854, "eval_loss": 0.5731269004868298, "eval_mse_loss": 0.5731269004868298, "step": 8192 }, { "epoch": 4.045432098765432, "eval_bleu": 0.25634671575676854, "eval_loss": 0.5731269004868298, "eval_mse_loss": 0.5731269004868298, "eval_runtime": 7.4662, "eval_samples_per_second": 350.648, "eval_steps_per_second": 5.491, "step": 8192 }, { "epoch": 4.1718518518518515, "grad_norm": 5.411952018737793, "learning_rate": 0.0006514930623477486, "loss": 0.5818273425102234, "step": 8448 }, { "epoch": 4.1718518518518515, "eval_bleu": 0.2520441554433453, "eval_loss": 0.5782059794519006, "eval_mse_loss": 0.5782059794519006, "step": 8448 }, { "epoch": 4.1718518518518515, "eval_bleu": 0.2520441554433453, "eval_loss": 0.5782059794519006, "eval_mse_loss": 0.5782059794519006, "eval_runtime": 7.4057, "eval_samples_per_second": 353.512, "eval_steps_per_second": 5.536, "step": 8448 }, { "epoch": 4.298271604938272, "grad_norm": 3.5435891151428223, "learning_rate": 0.0006319572505310022, "loss": 0.5773241519927979, "step": 8704 }, { "epoch": 4.298271604938272, "eval_bleu": 0.2534190472706376, "eval_loss": 0.5589999033183586, "eval_mse_loss": 0.5589999033183586, "step": 8704 }, { "epoch": 4.298271604938272, "eval_bleu": 0.2534190472706376, "eval_loss": 0.5589999033183586, "eval_mse_loss": 0.5589999033183586, "eval_runtime": 7.7623, "eval_samples_per_second": 337.273, "eval_steps_per_second": 5.282, "step": 8704 }, { "epoch": 4.424691358024692, "grad_norm": 3.660297155380249, "learning_rate": 0.000612202387049823, "loss": 0.5752817988395691, "step": 8960 }, { "epoch": 4.424691358024692, "eval_bleu": 0.25405581690850026, "eval_loss": 0.5717670982930718, "eval_mse_loss": 0.5717670982930718, "step": 8960 }, { "epoch": 4.424691358024692, "eval_bleu": 0.25405581690850026, "eval_loss": 0.5717670982930718, "eval_mse_loss": 0.5717670982930718, "eval_runtime": 8.0671, "eval_samples_per_second": 324.527, "eval_steps_per_second": 5.082, "step": 8960 }, { "epoch": 4.551111111111111, "grad_norm": 4.61627197265625, "learning_rate": 0.0005922612653711009, "loss": 0.5677421689033508, "step": 9216 }, { "epoch": 4.551111111111111, "eval_bleu": 0.2616680879763428, "eval_loss": 0.5714971786592065, "eval_mse_loss": 0.5714971786592065, "step": 9216 }, { "epoch": 4.551111111111111, "eval_bleu": 0.2616680879763428, "eval_loss": 0.5714971786592065, "eval_mse_loss": 0.5714971786592065, "eval_runtime": 8.2747, "eval_samples_per_second": 316.384, "eval_steps_per_second": 4.955, "step": 9216 }, { "epoch": 4.67753086419753, "grad_norm": 4.289632797241211, "learning_rate": 0.0005721669881540442, "loss": 0.5655048489570618, "step": 9472 }, { "epoch": 4.67753086419753, "eval_bleu": 0.27326453853008387, "eval_loss": 0.5655288245619797, "eval_mse_loss": 0.5655288245619797, "step": 9472 }, { "epoch": 4.67753086419753, "eval_bleu": 0.27326453853008387, "eval_loss": 0.5655288245619797, "eval_mse_loss": 0.5655288245619797, "eval_runtime": 8.1079, "eval_samples_per_second": 322.895, "eval_steps_per_second": 5.057, "step": 9472 }, { "epoch": 4.803950617283951, "grad_norm": 4.95919132232666, "learning_rate": 0.0005519529122991056, "loss": 0.5713540315628052, "step": 9728 }, { "epoch": 4.803950617283951, "eval_bleu": 0.2627683773991363, "eval_loss": 0.5628622283295888, "eval_mse_loss": 0.5628622283295888, "step": 9728 }, { "epoch": 4.803950617283951, "eval_bleu": 0.2627683773991363, "eval_loss": 0.5628622283295888, "eval_mse_loss": 0.5628622283295888, "eval_runtime": 7.9584, "eval_samples_per_second": 328.962, "eval_steps_per_second": 5.152, "step": 9728 }, { "epoch": 4.930370370370371, "grad_norm": 6.370813369750977, "learning_rate": 0.0005316525935748631, "loss": 0.5834425687789917, "step": 9984 }, { "epoch": 4.930370370370371, "eval_bleu": 0.26641828019601, "eval_loss": 0.5816229349229394, "eval_mse_loss": 0.5816229349229394, "step": 9984 }, { "epoch": 4.930370370370371, "eval_bleu": 0.26641828019601, "eval_loss": 0.5816229349229394, "eval_mse_loss": 0.5816229349229394, "eval_runtime": 7.1012, "eval_samples_per_second": 368.672, "eval_steps_per_second": 5.774, "step": 9984 }, { "epoch": 5.05679012345679, "grad_norm": 6.872233867645264, "learning_rate": 0.0005112997309147753, "loss": 0.5829513669013977, "step": 10240 }, { "epoch": 5.05679012345679, "eval_bleu": 0.2556546592309415, "eval_loss": 0.5965675159198481, "eval_mse_loss": 0.5965675159198481, "step": 10240 }, { "epoch": 5.05679012345679, "eval_bleu": 0.2556546592309415, "eval_loss": 0.5965675159198481, "eval_mse_loss": 0.5965675159198481, "eval_runtime": 7.6184, "eval_samples_per_second": 343.643, "eval_steps_per_second": 5.382, "step": 10240 }, { "epoch": 5.18320987654321, "grad_norm": 6.554806232452393, "learning_rate": 0.0004909281104762817, "loss": 0.5979596376419067, "step": 10496 }, { "epoch": 5.18320987654321, "eval_bleu": 0.24496216459175849, "eval_loss": 0.5925210903330547, "eval_mse_loss": 0.5925210903330547, "step": 10496 }, { "epoch": 5.18320987654321, "eval_bleu": 0.24496216459175849, "eval_loss": 0.5925210903330547, "eval_mse_loss": 0.5925210903330547, "eval_runtime": 8.1186, "eval_samples_per_second": 322.468, "eval_steps_per_second": 5.05, "step": 10496 }, { "epoch": 5.3096296296296295, "grad_norm": 8.099956512451172, "learning_rate": 0.0004705715495551068, "loss": 0.5981637835502625, "step": 10752 }, { "epoch": 5.3096296296296295, "eval_bleu": 0.24892557597714382, "eval_loss": 0.6025734194895116, "eval_mse_loss": 0.6025734194895116, "step": 10752 }, { "epoch": 5.3096296296296295, "eval_bleu": 0.24892557597714382, "eval_loss": 0.6025734194895116, "eval_mse_loss": 0.6025734194895116, "eval_runtime": 8.3839, "eval_samples_per_second": 312.265, "eval_steps_per_second": 4.89, "step": 10752 }, { "epoch": 5.43604938271605, "grad_norm": 6.063875675201416, "learning_rate": 0.00045026384044787715, "loss": 0.600553572177887, "step": 11008 }, { "epoch": 5.43604938271605, "eval_bleu": 0.2584749312819124, "eval_loss": 0.5978762725504433, "eval_mse_loss": 0.5978762725504433, "step": 11008 }, { "epoch": 5.43604938271605, "eval_bleu": 0.2584749312819124, "eval_loss": 0.5978762725504433, "eval_mse_loss": 0.5978762725504433, "eval_runtime": 8.4473, "eval_samples_per_second": 309.921, "eval_steps_per_second": 4.854, "step": 11008 }, { "epoch": 5.562469135802469, "grad_norm": 6.819189071655273, "learning_rate": 0.0004300386943562342, "loss": 0.6050165891647339, "step": 11264 }, { "epoch": 5.562469135802469, "eval_bleu": 0.23936172268263006, "eval_loss": 0.6197842156014791, "eval_mse_loss": 0.6197842156014791, "step": 11264 }, { "epoch": 5.562469135802469, "eval_bleu": 0.23936172268263006, "eval_loss": 0.6197842156014791, "eval_mse_loss": 0.6197842156014791, "eval_runtime": 7.275, "eval_samples_per_second": 359.864, "eval_steps_per_second": 5.636, "step": 11264 }, { "epoch": 5.688888888888889, "grad_norm": 8.074504852294922, "learning_rate": 0.0004099296854255696, "loss": 0.6032764315605164, "step": 11520 }, { "epoch": 5.688888888888889, "eval_bleu": 0.2563038856939558, "eval_loss": 0.5922774854229718, "eval_mse_loss": 0.5922774854229718, "step": 11520 }, { "epoch": 5.688888888888889, "eval_bleu": 0.2563038856939558, "eval_loss": 0.5922774854229718, "eval_mse_loss": 0.5922774854229718, "eval_runtime": 7.7562, "eval_samples_per_second": 337.535, "eval_steps_per_second": 5.286, "step": 11520 }, { "epoch": 5.815308641975308, "grad_norm": 7.949519157409668, "learning_rate": 0.00038997019501127406, "loss": 0.5983560681343079, "step": 11776 }, { "epoch": 5.815308641975308, "eval_bleu": 0.25476493381302434, "eval_loss": 0.6024370585999838, "eval_mse_loss": 0.6024370585999838, "step": 11776 }, { "epoch": 5.815308641975308, "eval_bleu": 0.25476493381302434, "eval_loss": 0.6024370585999838, "eval_mse_loss": 0.6024370585999838, "eval_runtime": 7.9866, "eval_samples_per_second": 327.799, "eval_steps_per_second": 5.134, "step": 11776 }, { "epoch": 5.941728395061729, "grad_norm": 7.21051025390625, "learning_rate": 0.00037019335626502263, "loss": 0.5926205515861511, "step": 12032 }, { "epoch": 5.941728395061729, "eval_bleu": 0.2579463063130564, "eval_loss": 0.5958762241572868, "eval_mse_loss": 0.5958762241572868, "step": 12032 }, { "epoch": 5.941728395061729, "eval_bleu": 0.2579463063130564, "eval_loss": 0.5958762241572868, "eval_mse_loss": 0.5958762241572868, "eval_runtime": 8.1465, "eval_samples_per_second": 321.364, "eval_steps_per_second": 5.033, "step": 12032 }, { "epoch": 6.068148148148148, "grad_norm": 6.125677108764648, "learning_rate": 0.0003506319991330833, "loss": 0.5896713137626648, "step": 12288 }, { "epoch": 6.068148148148148, "eval_bleu": 0.2633499094850468, "eval_loss": 0.5901360758920995, "eval_mse_loss": 0.5901360758920995, "step": 12288 }, { "epoch": 6.068148148148148, "eval_bleu": 0.2633499094850468, "eval_loss": 0.5901360758920995, "eval_mse_loss": 0.5901360758920995, "eval_runtime": 7.4076, "eval_samples_per_second": 353.421, "eval_steps_per_second": 5.535, "step": 12288 }, { "epoch": 6.194567901234568, "grad_norm": 8.257464408874512, "learning_rate": 0.00033131859585795183, "loss": 0.5833750367164612, "step": 12544 }, { "epoch": 6.194567901234568, "eval_bleu": 0.2632131293368136, "eval_loss": 0.5901335754045626, "eval_mse_loss": 0.5901335754045626, "step": 12544 }, { "epoch": 6.194567901234568, "eval_bleu": 0.2632131293368136, "eval_loss": 0.5901335754045626, "eval_mse_loss": 0.5901335754045626, "eval_runtime": 7.4243, "eval_samples_per_second": 352.625, "eval_steps_per_second": 5.522, "step": 12544 }, { "epoch": 6.320987654320987, "grad_norm": 4.863134384155273, "learning_rate": 0.0003122852070737825, "loss": 0.581442654132843, "step": 12800 }, { "epoch": 6.320987654320987, "eval_bleu": 0.26764936587915455, "eval_loss": 0.5768222677998427, "eval_mse_loss": 0.5768222677998427, "step": 12800 }, { "epoch": 6.320987654320987, "eval_bleu": 0.26764936587915455, "eval_loss": 0.5768222677998427, "eval_mse_loss": 0.5768222677998427, "eval_runtime": 7.9069, "eval_samples_per_second": 331.105, "eval_steps_per_second": 5.185, "step": 12800 }, { "epoch": 6.4474074074074075, "grad_norm": 8.697264671325684, "learning_rate": 0.00029356342858509677, "loss": 0.5772220492362976, "step": 13056 }, { "epoch": 6.4474074074074075, "eval_bleu": 0.27568777700132663, "eval_loss": 0.584543146738192, "eval_mse_loss": 0.584543146738192, "step": 13056 }, { "epoch": 6.4474074074074075, "eval_bleu": 0.27568777700132663, "eval_loss": 0.584543146738192, "eval_mse_loss": 0.584543146738192, "eval_runtime": 8.0062, "eval_samples_per_second": 326.995, "eval_steps_per_second": 5.121, "step": 13056 }, { "epoch": 6.573827160493828, "grad_norm": 6.309488773345947, "learning_rate": 0.0002751843389171185, "loss": 0.5722501277923584, "step": 13312 }, { "epoch": 6.573827160493828, "eval_bleu": 0.27306378384191476, "eval_loss": 0.5755636982801484, "eval_mse_loss": 0.5755636982801484, "step": 13312 }, { "epoch": 6.573827160493828, "eval_bleu": 0.27306378384191476, "eval_loss": 0.5755636982801484, "eval_mse_loss": 0.5755636982801484, "eval_runtime": 8.591, "eval_samples_per_second": 304.738, "eval_steps_per_second": 4.772, "step": 13312 }, { "epoch": 6.700246913580247, "grad_norm": 5.390190601348877, "learning_rate": 0.0002571784477248029, "loss": 0.5714833736419678, "step": 13568 }, { "epoch": 6.700246913580247, "eval_bleu": 0.2848300030499236, "eval_loss": 0.570485861563101, "eval_mse_loss": 0.570485861563101, "step": 13568 }, { "epoch": 6.700246913580247, "eval_bleu": 0.2848300030499236, "eval_loss": 0.570485861563101, "eval_mse_loss": 0.570485861563101, "eval_runtime": 7.2761, "eval_samples_per_second": 359.808, "eval_steps_per_second": 5.635, "step": 13568 }, { "epoch": 6.826666666666666, "grad_norm": 7.215004920959473, "learning_rate": 0.0002395756451462014, "loss": 0.5674658417701721, "step": 13824 }, { "epoch": 6.826666666666666, "eval_bleu": 0.2756130503152812, "eval_loss": 0.5792907963438731, "eval_mse_loss": 0.5792907963438731, "step": 13824 }, { "epoch": 6.826666666666666, "eval_bleu": 0.2756130503152812, "eval_loss": 0.5792907963438731, "eval_mse_loss": 0.5792907963438731, "eval_runtime": 7.3879, "eval_samples_per_second": 354.362, "eval_steps_per_second": 5.55, "step": 13824 }, { "epoch": 6.953086419753086, "grad_norm": 6.6079816818237305, "learning_rate": 0.00022240515218423758, "loss": 0.5638896226882935, "step": 14080 }, { "epoch": 6.953086419753086, "eval_bleu": 0.2958995495334816, "eval_loss": 0.5594088427904176, "eval_mse_loss": 0.5594088427904176, "step": 14080 }, { "epoch": 6.953086419753086, "eval_bleu": 0.2958995495334816, "eval_loss": 0.5594088427904176, "eval_mse_loss": 0.5594088427904176, "eval_runtime": 7.9094, "eval_samples_per_second": 330.998, "eval_steps_per_second": 5.184, "step": 14080 }, { "epoch": 7.079506172839507, "grad_norm": 6.562555313110352, "learning_rate": 0.00020569547219925934, "loss": 0.5631716251373291, "step": 14336 }, { "epoch": 7.079506172839507, "eval_bleu": 0.2905136606672008, "eval_loss": 0.5631622737500726, "eval_mse_loss": 0.5631622737500726, "step": 14336 }, { "epoch": 7.079506172839507, "eval_bleu": 0.2905136606672008, "eval_loss": 0.5631622737500726, "eval_mse_loss": 0.5631622737500726, "eval_runtime": 8.3002, "eval_samples_per_second": 315.415, "eval_steps_per_second": 4.94, "step": 14336 }, { "epoch": 7.205925925925926, "grad_norm": 5.195821285247803, "learning_rate": 0.00018947434359289434, "loss": 0.5603777170181274, "step": 14592 }, { "epoch": 7.205925925925926, "eval_bleu": 0.29437709495493225, "eval_loss": 0.5465006559360318, "eval_mse_loss": 0.5465006559360318, "step": 14592 }, { "epoch": 7.205925925925926, "eval_bleu": 0.29437709495493225, "eval_loss": 0.5465006559360318, "eval_mse_loss": 0.5465006559360318, "eval_runtime": 7.1426, "eval_samples_per_second": 366.535, "eval_steps_per_second": 5.74, "step": 14592 }, { "epoch": 7.332345679012346, "grad_norm": 8.156927108764648, "learning_rate": 0.0001737686937617491, "loss": 0.5557982325553894, "step": 14848 }, { "epoch": 7.332345679012346, "eval_bleu": 0.29449963629268144, "eval_loss": 0.557792792959911, "eval_mse_loss": 0.557792792959911, "step": 14848 }, { "epoch": 7.332345679012346, "eval_bleu": 0.29449963629268144, "eval_loss": 0.557792792959911, "eval_mse_loss": 0.557792792959911, "eval_runtime": 7.7275, "eval_samples_per_second": 338.789, "eval_steps_per_second": 5.306, "step": 14848 }, { "epoch": 7.458765432098765, "grad_norm": 5.172349452972412, "learning_rate": 0.00015860459439739582, "loss": 0.5558417439460754, "step": 15104 }, { "epoch": 7.458765432098765, "eval_bleu": 0.31102631980865114, "eval_loss": 0.5469118814642836, "eval_mse_loss": 0.5469118814642836, "step": 15104 }, { "epoch": 7.458765432098765, "eval_bleu": 0.31102631980865114, "eval_loss": 0.5469118814642836, "eval_mse_loss": 0.5469118814642836, "eval_runtime": 7.9676, "eval_samples_per_second": 328.58, "eval_steps_per_second": 5.146, "step": 15104 }, { "epoch": 7.5851851851851855, "grad_norm": 5.335220813751221, "learning_rate": 0.0001440072182068436, "loss": 0.5527888536453247, "step": 15360 }, { "epoch": 7.5851851851851855, "eval_bleu": 0.3040853321881768, "eval_loss": 0.5505978399660529, "eval_mse_loss": 0.5505978399660529, "step": 15360 }, { "epoch": 7.5851851851851855, "eval_bleu": 0.3040853321881768, "eval_loss": 0.5505978399660529, "eval_mse_loss": 0.5505978399660529, "eval_runtime": 8.0279, "eval_samples_per_second": 326.113, "eval_steps_per_second": 5.107, "step": 15360 }, { "epoch": 7.711604938271605, "grad_norm": 3.120297908782959, "learning_rate": 0.00013000079712534475, "loss": 0.5498585104942322, "step": 15616 }, { "epoch": 7.711604938271605, "eval_bleu": 0.30098409224195205, "eval_loss": 0.54582195819878, "eval_mse_loss": 0.54582195819878, "step": 15616 }, { "epoch": 7.711604938271605, "eval_bleu": 0.30098409224195205, "eval_loss": 0.54582195819878, "eval_mse_loss": 0.54582195819878, "eval_runtime": 8.0703, "eval_samples_per_second": 324.401, "eval_steps_per_second": 5.08, "step": 15616 }, { "epoch": 7.838024691358025, "grad_norm": 4.802690029144287, "learning_rate": 0.00011660858209089819, "loss": 0.5511511564254761, "step": 15872 }, { "epoch": 7.838024691358025, "eval_bleu": 0.3068134406523234, "eval_loss": 0.5499871998298459, "eval_mse_loss": 0.5499871998298459, "step": 15872 }, { "epoch": 7.838024691358025, "eval_bleu": 0.3068134406523234, "eval_loss": 0.5499871998298459, "eval_mse_loss": 0.5499871998298459, "eval_runtime": 7.2713, "eval_samples_per_second": 360.046, "eval_steps_per_second": 5.639, "step": 15872 }, { "epoch": 7.964444444444444, "grad_norm": 3.471189260482788, "learning_rate": 0.00010385280444723056, "loss": 0.5520000457763672, "step": 16128 }, { "epoch": 7.964444444444444, "eval_bleu": 0.3052295052303889, "eval_loss": 0.5441080797009352, "eval_mse_loss": 0.5441080797009352, "step": 16128 }, { "epoch": 7.964444444444444, "eval_bleu": 0.3052295052303889, "eval_loss": 0.5441080797009352, "eval_mse_loss": 0.5441080797009352, "eval_runtime": 7.7809, "eval_samples_per_second": 336.463, "eval_steps_per_second": 5.269, "step": 16128 }, { "epoch": 8.090864197530864, "grad_norm": 5.1973466873168945, "learning_rate": 9.175463903932168e-05, "loss": 0.5478553771972656, "step": 16384 }, { "epoch": 8.090864197530864, "eval_bleu": 0.2907079883758542, "eval_loss": 0.5581492403658425, "eval_mse_loss": 0.5581492403658425, "step": 16384 }, { "epoch": 8.090864197530864, "eval_bleu": 0.2907079883758542, "eval_loss": 0.5581492403658425, "eval_mse_loss": 0.5581492403658425, "eval_runtime": 8.0444, "eval_samples_per_second": 325.444, "eval_steps_per_second": 5.097, "step": 16384 }, { "epoch": 8.217283950617285, "grad_norm": 3.1891634464263916, "learning_rate": 8.033416906274093e-05, "loss": 0.5465660095214844, "step": 16640 }, { "epoch": 8.217283950617285, "eval_bleu": 0.3151093389207819, "eval_loss": 0.5382588927338763, "eval_mse_loss": 0.5382588927338763, "step": 16640 }, { "epoch": 8.217283950617285, "eval_bleu": 0.3151093389207819, "eval_loss": 0.5382588927338763, "eval_mse_loss": 0.5382588927338763, "eval_runtime": 8.1703, "eval_samples_per_second": 320.43, "eval_steps_per_second": 5.018, "step": 16640 }, { "epoch": 8.343703703703703, "grad_norm": 3.5024101734161377, "learning_rate": 6.961035272514177e-05, "loss": 0.5455322265625, "step": 16896 }, { "epoch": 8.343703703703703, "eval_bleu": 0.31440992086262937, "eval_loss": 0.5380844590140552, "eval_mse_loss": 0.5380844590140552, "step": 16896 }, { "epoch": 8.343703703703703, "eval_bleu": 0.31440992086262937, "eval_loss": 0.5380844590140552, "eval_mse_loss": 0.5380844590140552, "eval_runtime": 7.1273, "eval_samples_per_second": 367.32, "eval_steps_per_second": 5.753, "step": 16896 }, { "epoch": 8.470123456790123, "grad_norm": 3.242410659790039, "learning_rate": 5.960099177526024e-05, "loss": 0.5413248538970947, "step": 17152 }, { "epoch": 8.470123456790123, "eval_bleu": 0.3030533280572146, "eval_loss": 0.5433335965726434, "eval_mse_loss": 0.5433335965726434, "step": 17152 }, { "epoch": 8.470123456790123, "eval_bleu": 0.3030533280572146, "eval_loss": 0.5433335965726434, "eval_mse_loss": 0.5433335965726434, "eval_runtime": 8.049, "eval_samples_per_second": 325.26, "eval_steps_per_second": 5.094, "step": 17152 }, { "epoch": 8.596543209876543, "grad_norm": 3.3194503784179688, "learning_rate": 5.032270195165667e-05, "loss": 0.5402602553367615, "step": 17408 }, { "epoch": 8.596543209876543, "eval_bleu": 0.3284021854773732, "eval_loss": 0.5290337989969951, "eval_mse_loss": 0.5290337989969951, "step": 17408 }, { "epoch": 8.596543209876543, "eval_bleu": 0.3284021854773732, "eval_loss": 0.5290337989969951, "eval_mse_loss": 0.5290337989969951, "eval_runtime": 7.8967, "eval_samples_per_second": 331.532, "eval_steps_per_second": 5.192, "step": 17408 } ], "logging_steps": 256, "max_steps": 20250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 256, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }