{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 25.31645569620253,
  "eval_steps": 500,
  "global_step": 42000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.3,
      "learning_rate": 4.969861362266426e-05,
      "loss": 3.7006,
      "step": 500
    },
    {
      "epoch": 0.6,
      "learning_rate": 4.939722724532851e-05,
      "loss": 3.5872,
      "step": 1000
    },
    {
      "epoch": 0.9,
      "learning_rate": 4.909584086799277e-05,
      "loss": 3.4617,
      "step": 1500
    },
    {
      "epoch": 1.0,
      "eval_bleu": 12.3227,
      "eval_gen_len": 54.6075,
      "eval_loss": 3.1113245487213135,
      "eval_runtime": 122.2496,
      "eval_samples_per_second": 3.272,
      "eval_steps_per_second": 0.409,
      "step": 1659
    },
    {
      "epoch": 1.21,
      "learning_rate": 4.8794454490657024e-05,
      "loss": 3.298,
      "step": 2000
    },
    {
      "epoch": 1.51,
      "learning_rate": 4.849306811332128e-05,
      "loss": 3.2018,
      "step": 2500
    },
    {
      "epoch": 1.81,
      "learning_rate": 4.8191681735985535e-05,
      "loss": 3.1014,
      "step": 3000
    },
    {
      "epoch": 2.0,
      "eval_bleu": 15.8487,
      "eval_gen_len": 50.1125,
      "eval_loss": 2.8111488819122314,
      "eval_runtime": 92.4044,
      "eval_samples_per_second": 4.329,
      "eval_steps_per_second": 0.541,
      "step": 3318
    },
    {
      "epoch": 2.11,
      "learning_rate": 4.789029535864979e-05,
      "loss": 2.9998,
      "step": 3500
    },
    {
      "epoch": 2.41,
      "learning_rate": 4.7588908981314046e-05,
      "loss": 2.883,
      "step": 4000
    },
    {
      "epoch": 2.71,
      "learning_rate": 4.7287522603978304e-05,
      "loss": 2.8409,
      "step": 4500
    },
    {
      "epoch": 3.0,
      "eval_bleu": 20.5509,
      "eval_gen_len": 43.98,
      "eval_loss": 2.617112398147583,
      "eval_runtime": 70.0674,
      "eval_samples_per_second": 5.709,
      "eval_steps_per_second": 0.714,
      "step": 4977
    },
    {
      "epoch": 3.01,
      "learning_rate": 4.6986136226642556e-05,
      "loss": 2.8043,
      "step": 5000
    },
    {
      "epoch": 3.32,
      "learning_rate": 4.6684749849306815e-05,
      "loss": 2.6486,
      "step": 5500
    },
    {
      "epoch": 3.62,
      "learning_rate": 4.638336347197107e-05,
      "loss": 2.6127,
      "step": 6000
    },
    {
      "epoch": 3.92,
      "learning_rate": 4.6081977094635326e-05,
      "loss": 2.5718,
      "step": 6500
    },
    {
      "epoch": 4.0,
      "eval_bleu": 21.5273,
      "eval_gen_len": 40.8575,
      "eval_loss": 2.4335193634033203,
      "eval_runtime": 62.0368,
      "eval_samples_per_second": 6.448,
      "eval_steps_per_second": 0.806,
      "step": 6636
    },
    {
      "epoch": 4.22,
      "learning_rate": 4.5780590717299585e-05,
      "loss": 2.4535,
      "step": 7000
    },
    {
      "epoch": 4.52,
      "learning_rate": 4.547920433996384e-05,
      "loss": 2.4269,
      "step": 7500
    },
    {
      "epoch": 4.82,
      "learning_rate": 4.5177817962628096e-05,
      "loss": 2.3852,
      "step": 8000
    },
    {
      "epoch": 5.0,
      "eval_bleu": 24.0185,
      "eval_gen_len": 38.945,
      "eval_loss": 2.2908990383148193,
      "eval_runtime": 53.5509,
      "eval_samples_per_second": 7.47,
      "eval_steps_per_second": 0.934,
      "step": 8295
    },
    {
      "epoch": 5.12,
      "learning_rate": 4.487643158529235e-05,
      "loss": 2.3305,
      "step": 8500
    },
    {
      "epoch": 5.42,
      "learning_rate": 4.45750452079566e-05,
      "loss": 2.2361,
      "step": 9000
    },
    {
      "epoch": 5.73,
      "learning_rate": 4.427365883062086e-05,
      "loss": 2.2201,
      "step": 9500
    },
    {
      "epoch": 6.0,
      "eval_bleu": 25.0722,
      "eval_gen_len": 38.4525,
      "eval_loss": 2.2150681018829346,
      "eval_runtime": 47.2306,
      "eval_samples_per_second": 8.469,
      "eval_steps_per_second": 1.059,
      "step": 9954
    },
    {
      "epoch": 6.03,
      "learning_rate": 4.397227245328511e-05,
      "loss": 2.1955,
      "step": 10000
    },
    {
      "epoch": 6.33,
      "learning_rate": 4.367088607594937e-05,
      "loss": 2.0928,
      "step": 10500
    },
    {
      "epoch": 6.63,
      "learning_rate": 4.336949969861363e-05,
      "loss": 2.0947,
      "step": 11000
    },
    {
      "epoch": 6.93,
      "learning_rate": 4.306811332127788e-05,
      "loss": 2.0583,
      "step": 11500
    },
    {
      "epoch": 7.0,
      "eval_bleu": 26.051,
      "eval_gen_len": 40.0775,
      "eval_loss": 2.1219234466552734,
      "eval_runtime": 55.6386,
      "eval_samples_per_second": 7.189,
      "eval_steps_per_second": 0.899,
      "step": 11613
    },
    {
      "epoch": 7.23,
      "learning_rate": 4.276672694394214e-05,
      "loss": 1.9657,
      "step": 12000
    },
    {
      "epoch": 7.53,
      "learning_rate": 4.246534056660639e-05,
      "loss": 1.9594,
      "step": 12500
    },
    {
      "epoch": 7.84,
      "learning_rate": 4.216395418927065e-05,
      "loss": 1.9464,
      "step": 13000
    },
    {
      "epoch": 8.0,
      "eval_bleu": 27.8486,
      "eval_gen_len": 39.54,
      "eval_loss": 2.0415802001953125,
      "eval_runtime": 50.0785,
      "eval_samples_per_second": 7.987,
      "eval_steps_per_second": 0.998,
      "step": 13272
    },
    {
      "epoch": 8.14,
      "learning_rate": 4.186256781193491e-05,
      "loss": 1.8901,
      "step": 13500
    },
    {
      "epoch": 8.44,
      "learning_rate": 4.1561181434599153e-05,
      "loss": 1.8331,
      "step": 14000
    },
    {
      "epoch": 8.74,
      "learning_rate": 4.125979505726341e-05,
      "loss": 1.8273,
      "step": 14500
    },
    {
      "epoch": 9.0,
      "eval_bleu": 28.6882,
      "eval_gen_len": 38.97,
      "eval_loss": 1.9714975357055664,
      "eval_runtime": 47.8353,
      "eval_samples_per_second": 8.362,
      "eval_steps_per_second": 1.045,
      "step": 14931
    },
    {
      "epoch": 9.04,
      "learning_rate": 4.095840867992767e-05,
      "loss": 1.8071,
      "step": 15000
    },
    {
      "epoch": 9.34,
      "learning_rate": 4.065702230259192e-05,
      "loss": 1.724,
      "step": 15500
    },
    {
      "epoch": 9.64,
      "learning_rate": 4.035563592525618e-05,
      "loss": 1.7173,
      "step": 16000
    },
    {
      "epoch": 9.95,
      "learning_rate": 4.0054249547920434e-05,
      "loss": 1.7341,
      "step": 16500
    },
    {
      "epoch": 10.0,
      "eval_bleu": 29.4158,
      "eval_gen_len": 39.27,
      "eval_loss": 1.922670602798462,
      "eval_runtime": 48.3901,
      "eval_samples_per_second": 8.266,
      "eval_steps_per_second": 1.033,
      "step": 16590
    },
    {
      "epoch": 10.25,
      "learning_rate": 3.975286317058469e-05,
      "loss": 1.6432,
      "step": 17000
    },
    {
      "epoch": 10.55,
      "learning_rate": 3.945147679324895e-05,
      "loss": 1.6414,
      "step": 17500
    },
    {
      "epoch": 10.85,
      "learning_rate": 3.9150090415913203e-05,
      "loss": 1.6285,
      "step": 18000
    },
    {
      "epoch": 11.0,
      "eval_bleu": 29.6336,
      "eval_gen_len": 39.7025,
      "eval_loss": 1.8723887205123901,
      "eval_runtime": 49.1746,
      "eval_samples_per_second": 8.134,
      "eval_steps_per_second": 1.017,
      "step": 18249
    },
    {
      "epoch": 11.15,
      "learning_rate": 3.884870403857746e-05,
      "loss": 1.5753,
      "step": 18500
    },
    {
      "epoch": 11.45,
      "learning_rate": 3.8547317661241714e-05,
      "loss": 1.5525,
      "step": 19000
    },
    {
      "epoch": 11.75,
      "learning_rate": 3.8245931283905966e-05,
      "loss": 1.5466,
      "step": 19500
    },
    {
      "epoch": 12.0,
      "eval_bleu": 31.3296,
      "eval_gen_len": 39.8675,
      "eval_loss": 1.816349744796753,
      "eval_runtime": 49.6256,
      "eval_samples_per_second": 8.06,
      "eval_steps_per_second": 1.008,
      "step": 19908
    },
    {
      "epoch": 12.06,
      "learning_rate": 3.7944544906570225e-05,
      "loss": 1.5254,
      "step": 20000
    },
    {
      "epoch": 12.36,
      "learning_rate": 3.764315852923448e-05,
      "loss": 1.4676,
      "step": 20500
    },
    {
      "epoch": 12.66,
      "learning_rate": 3.7341772151898736e-05,
      "loss": 1.4678,
      "step": 21000
    },
    {
      "epoch": 12.96,
      "learning_rate": 3.7040385774562995e-05,
      "loss": 1.4607,
      "step": 21500
    },
    {
      "epoch": 13.0,
      "eval_bleu": 31.7515,
      "eval_gen_len": 38.405,
      "eval_loss": 1.7929939031600952,
      "eval_runtime": 44.5172,
      "eval_samples_per_second": 8.985,
      "eval_steps_per_second": 1.123,
      "step": 21567
    },
    {
      "epoch": 13.26,
      "learning_rate": 3.6738999397227247e-05,
      "loss": 1.3787,
      "step": 22000
    },
    {
      "epoch": 13.56,
      "learning_rate": 3.6437613019891505e-05,
      "loss": 1.4049,
      "step": 22500
    },
    {
      "epoch": 13.86,
      "learning_rate": 3.613622664255576e-05,
      "loss": 1.385,
      "step": 23000
    },
    {
      "epoch": 14.0,
      "eval_bleu": 32.458,
      "eval_gen_len": 39.4675,
      "eval_loss": 1.7518789768218994,
      "eval_runtime": 49.1331,
      "eval_samples_per_second": 8.141,
      "eval_steps_per_second": 1.018,
      "step": 23226
    },
    {
      "epoch": 14.17,
      "learning_rate": 3.5834840265220016e-05,
      "loss": 1.3403,
      "step": 23500
    },
    {
      "epoch": 14.47,
      "learning_rate": 3.553345388788427e-05,
      "loss": 1.3166,
      "step": 24000
    },
    {
      "epoch": 14.77,
      "learning_rate": 3.523206751054853e-05,
      "loss": 1.321,
      "step": 24500
    },
    {
      "epoch": 15.0,
      "eval_bleu": 32.9411,
      "eval_gen_len": 38.8025,
      "eval_loss": 1.7194263935089111,
      "eval_runtime": 45.6686,
      "eval_samples_per_second": 8.759,
      "eval_steps_per_second": 1.095,
      "step": 24885
    },
    {
      "epoch": 15.07,
      "learning_rate": 3.493068113321278e-05,
      "loss": 1.2976,
      "step": 25000
    },
    {
      "epoch": 15.37,
      "learning_rate": 3.462929475587703e-05,
      "loss": 1.2358,
      "step": 25500
    },
    {
      "epoch": 15.67,
      "learning_rate": 3.432790837854129e-05,
      "loss": 1.2592,
      "step": 26000
    },
    {
      "epoch": 15.97,
      "learning_rate": 3.402652200120555e-05,
      "loss": 1.2662,
      "step": 26500
    },
    {
      "epoch": 16.0,
      "eval_bleu": 33.8478,
      "eval_gen_len": 39.1275,
      "eval_loss": 1.6950603723526,
      "eval_runtime": 49.9911,
      "eval_samples_per_second": 8.001,
      "eval_steps_per_second": 1.0,
      "step": 26544
    },
    {
      "epoch": 16.27,
      "learning_rate": 3.37251356238698e-05,
      "loss": 1.1963,
      "step": 27000
    },
    {
      "epoch": 16.58,
      "learning_rate": 3.342374924653406e-05,
      "loss": 1.2002,
      "step": 27500
    },
    {
      "epoch": 16.88,
      "learning_rate": 3.312236286919831e-05,
      "loss": 1.1939,
      "step": 28000
    },
    {
      "epoch": 17.0,
      "eval_bleu": 34.5277,
      "eval_gen_len": 39.0225,
      "eval_loss": 1.685713529586792,
      "eval_runtime": 49.4943,
      "eval_samples_per_second": 8.082,
      "eval_steps_per_second": 1.01,
      "step": 28203
    },
    {
      "epoch": 17.18,
      "learning_rate": 3.282097649186257e-05,
      "loss": 1.1459,
      "step": 28500
    },
    {
      "epoch": 17.48,
      "learning_rate": 3.251959011452683e-05,
      "loss": 1.1326,
      "step": 29000
    },
    {
      "epoch": 17.78,
      "learning_rate": 3.221820373719108e-05,
      "loss": 1.1406,
      "step": 29500
    },
    {
      "epoch": 18.0,
      "eval_bleu": 35.8691,
      "eval_gen_len": 38.76,
      "eval_loss": 1.6470690965652466,
      "eval_runtime": 45.2962,
      "eval_samples_per_second": 8.831,
      "eval_steps_per_second": 1.104,
      "step": 29862
    },
    {
      "epoch": 18.08,
      "learning_rate": 3.191681735985534e-05,
      "loss": 1.1292,
      "step": 30000
    },
    {
      "epoch": 18.38,
      "learning_rate": 3.161543098251959e-05,
      "loss": 1.071,
      "step": 30500
    },
    {
      "epoch": 18.69,
      "learning_rate": 3.1314044605183844e-05,
      "loss": 1.0918,
      "step": 31000
    },
    {
      "epoch": 18.99,
      "learning_rate": 3.10126582278481e-05,
      "loss": 1.0759,
      "step": 31500
    },
    {
      "epoch": 19.0,
      "eval_bleu": 36.4448,
      "eval_gen_len": 38.6925,
      "eval_loss": 1.6456927061080933,
      "eval_runtime": 46.4772,
      "eval_samples_per_second": 8.606,
      "eval_steps_per_second": 1.076,
      "step": 31521
    },
    {
      "epoch": 19.29,
      "learning_rate": 3.0711271850512355e-05,
      "loss": 1.0193,
      "step": 32000
    },
    {
      "epoch": 19.59,
      "learning_rate": 3.0409885473176613e-05,
      "loss": 1.0248,
      "step": 32500
    },
    {
      "epoch": 19.89,
      "learning_rate": 3.010849909584087e-05,
      "loss": 1.0378,
      "step": 33000
    },
    {
      "epoch": 20.0,
      "eval_bleu": 37.2905,
      "eval_gen_len": 38.945,
      "eval_loss": 1.6285927295684814,
      "eval_runtime": 49.741,
      "eval_samples_per_second": 8.042,
      "eval_steps_per_second": 1.005,
      "step": 33180
    },
    {
      "epoch": 20.19,
      "learning_rate": 2.9807112718505124e-05,
      "loss": 0.9915,
      "step": 33500
    },
    {
      "epoch": 20.49,
      "learning_rate": 2.9505726341169383e-05,
      "loss": 0.9848,
      "step": 34000
    },
    {
      "epoch": 20.8,
      "learning_rate": 2.9204339963833638e-05,
      "loss": 0.9851,
      "step": 34500
    },
    {
      "epoch": 21.0,
      "eval_bleu": 38.4264,
      "eval_gen_len": 38.7175,
      "eval_loss": 1.5997543334960938,
      "eval_runtime": 44.5032,
      "eval_samples_per_second": 8.988,
      "eval_steps_per_second": 1.124,
      "step": 34839
    },
    {
      "epoch": 21.1,
      "learning_rate": 2.8902953586497894e-05,
      "loss": 0.97,
      "step": 35000
    },
    {
      "epoch": 21.4,
      "learning_rate": 2.8601567209162146e-05,
      "loss": 0.9436,
      "step": 35500
    },
    {
      "epoch": 21.7,
      "learning_rate": 2.83001808318264e-05,
      "loss": 0.9372,
      "step": 36000
    },
    {
      "epoch": 22.0,
      "eval_bleu": 37.9614,
      "eval_gen_len": 38.9425,
      "eval_loss": 1.607030987739563,
      "eval_runtime": 47.0014,
      "eval_samples_per_second": 8.51,
      "eval_steps_per_second": 1.064,
      "step": 36498
    },
    {
      "epoch": 22.0,
      "learning_rate": 2.7998794454490656e-05,
      "loss": 0.9437,
      "step": 36500
    },
    {
      "epoch": 22.3,
      "learning_rate": 2.7697408077154912e-05,
      "loss": 0.8917,
      "step": 37000
    },
    {
      "epoch": 22.6,
      "learning_rate": 2.7396021699819167e-05,
      "loss": 0.8692,
      "step": 37500
    },
    {
      "epoch": 22.91,
      "learning_rate": 2.7094635322483426e-05,
      "loss": 0.9191,
      "step": 38000
    },
    {
      "epoch": 23.0,
      "eval_bleu": 38.8655,
      "eval_gen_len": 38.8825,
      "eval_loss": 1.5746939182281494,
      "eval_runtime": 50.4993,
      "eval_samples_per_second": 7.921,
      "eval_steps_per_second": 0.99,
      "step": 38157
    },
    {
      "epoch": 23.21,
      "learning_rate": 2.679324894514768e-05,
      "loss": 0.8555,
      "step": 38500
    },
    {
      "epoch": 23.51,
      "learning_rate": 2.6491862567811937e-05,
      "loss": 0.8533,
      "step": 39000
    },
    {
      "epoch": 23.81,
      "learning_rate": 2.6190476190476192e-05,
      "loss": 0.8673,
      "step": 39500
    },
    {
      "epoch": 24.0,
      "eval_bleu": 39.4605,
      "eval_gen_len": 39.0175,
      "eval_loss": 1.5650146007537842,
      "eval_runtime": 50.8092,
      "eval_samples_per_second": 7.873,
      "eval_steps_per_second": 0.984,
      "step": 39816
    },
    {
      "epoch": 24.11,
      "learning_rate": 2.5889089813140448e-05,
      "loss": 0.841,
      "step": 40000
    },
    {
      "epoch": 24.41,
      "learning_rate": 2.5587703435804706e-05,
      "loss": 0.8155,
      "step": 40500
    },
    {
      "epoch": 24.71,
      "learning_rate": 2.5286317058468955e-05,
      "loss": 0.811,
      "step": 41000
    },
    {
      "epoch": 25.0,
      "eval_bleu": 39.6804,
      "eval_gen_len": 38.77,
      "eval_loss": 1.5603779554367065,
      "eval_runtime": 45.7389,
      "eval_samples_per_second": 8.745,
      "eval_steps_per_second": 1.093,
      "step": 41475
    },
    {
      "epoch": 25.02,
      "learning_rate": 2.4984930681133214e-05,
      "loss": 0.8335,
      "step": 41500
    },
    {
      "epoch": 25.32,
      "learning_rate": 2.468354430379747e-05,
      "loss": 0.7688,
      "step": 42000
    }
  ],
  "logging_steps": 500,
  "max_steps": 82950,
  "num_train_epochs": 50,
  "save_steps": 500,
  "total_flos": 1.0127259403812864e+16,
  "trial_name": null,
  "trial_params": null
}