| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 13.44, | |
| "global_step": 42000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.16, | |
| "learning_rate": 0.011603174731135368, | |
| "loss": 1.5357, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "learning_rate": 0.023230696097016335, | |
| "loss": 0.0593, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "learning_rate": 0.034898791462183, | |
| "loss": 0.0531, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "learning_rate": 0.04661125689744949, | |
| "loss": 0.044, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 0.05841406062245369, | |
| "loss": 0.0461, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "learning_rate": 0.07035055011510849, | |
| "loss": 0.0433, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_avg_length": 17.2926, | |
| "eval_bleu": 0.0241, | |
| "eval_loss": 0.06959044933319092, | |
| "eval_rouge1": 0.5901, | |
| "eval_rouge2": 0.5715, | |
| "eval_runtime": 56.7394, | |
| "eval_samples_per_second": 88.122, | |
| "eval_steps_per_second": 2.767, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "learning_rate": 0.08241234719753265, | |
| "loss": 0.0408, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "learning_rate": 0.09467849135398865, | |
| "loss": 0.0406, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "learning_rate": 0.10735532641410828, | |
| "loss": 0.0445, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "learning_rate": 0.12028851360082626, | |
| "loss": 0.0406, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "learning_rate": 0.13352635502815247, | |
| "loss": 0.0405, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "learning_rate": 0.1470595896244049, | |
| "loss": 0.0412, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_avg_length": 17.3172, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07180308550596237, | |
| "eval_rouge1": 0.5879, | |
| "eval_rouge2": 0.5692, | |
| "eval_runtime": 55.7642, | |
| "eval_samples_per_second": 89.663, | |
| "eval_steps_per_second": 2.815, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "learning_rate": 0.16096152365207672, | |
| "loss": 0.0395, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "learning_rate": 0.17507727444171906, | |
| "loss": 0.0407, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "learning_rate": 0.18983061611652374, | |
| "loss": 0.0393, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "learning_rate": 0.2049846649169922, | |
| "loss": 0.0401, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "learning_rate": 0.22153085470199585, | |
| "loss": 0.0414, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "learning_rate": 0.23806197941303253, | |
| "loss": 0.0425, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_avg_length": 17.3112, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07267959415912628, | |
| "eval_rouge1": 0.5871, | |
| "eval_rouge2": 0.5681, | |
| "eval_runtime": 54.2458, | |
| "eval_samples_per_second": 92.173, | |
| "eval_steps_per_second": 2.894, | |
| "step": 9375 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "learning_rate": 0.2547408640384674, | |
| "loss": 0.0393, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "learning_rate": 0.27236127853393555, | |
| "loss": 0.0377, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "learning_rate": 0.26978379487991333, | |
| "loss": 0.0372, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "learning_rate": 0.26798078417778015, | |
| "loss": 0.0383, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "learning_rate": 0.2658974826335907, | |
| "loss": 0.0391, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "learning_rate": 0.2626633942127228, | |
| "loss": 0.0392, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "learning_rate": 0.26069551706314087, | |
| "loss": 0.0381, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_avg_length": 17.321, | |
| "eval_bleu": 0.0238, | |
| "eval_loss": 0.07149858772754669, | |
| "eval_rouge1": 0.5866, | |
| "eval_rouge2": 0.5677, | |
| "eval_runtime": 52.6009, | |
| "eval_samples_per_second": 95.055, | |
| "eval_steps_per_second": 2.985, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "learning_rate": 0.25828537344932556, | |
| "loss": 0.0341, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "learning_rate": 0.2559824585914612, | |
| "loss": 0.0323, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "learning_rate": 0.2536338269710541, | |
| "loss": 0.0316, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "learning_rate": 0.2517462372779846, | |
| "loss": 0.0314, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "learning_rate": 0.25038695335388184, | |
| "loss": 0.0317, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "learning_rate": 0.24828433990478516, | |
| "loss": 0.0334, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_avg_length": 17.3102, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07320648431777954, | |
| "eval_rouge1": 0.5873, | |
| "eval_rouge2": 0.5682, | |
| "eval_runtime": 52.0328, | |
| "eval_samples_per_second": 96.093, | |
| "eval_steps_per_second": 3.017, | |
| "step": 15625 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "learning_rate": 0.24564550817012787, | |
| "loss": 0.028, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "learning_rate": 0.24398992955684662, | |
| "loss": 0.0279, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "learning_rate": 0.24214749038219452, | |
| "loss": 0.0277, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "learning_rate": 0.24071773886680603, | |
| "loss": 0.0302, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "learning_rate": 0.23937062919139862, | |
| "loss": 0.0268, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "learning_rate": 0.23846393823623657, | |
| "loss": 0.0293, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_avg_length": 17.306, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07230091094970703, | |
| "eval_rouge1": 0.5874, | |
| "eval_rouge2": 0.5673, | |
| "eval_runtime": 51.9475, | |
| "eval_samples_per_second": 96.251, | |
| "eval_steps_per_second": 3.022, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "learning_rate": 0.23700089752674103, | |
| "loss": 0.0268, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "learning_rate": 0.23510468006134033, | |
| "loss": 0.025, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "learning_rate": 0.23372133076190948, | |
| "loss": 0.0234, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 6.56, | |
| "learning_rate": 0.23236095905303955, | |
| "loss": 0.0242, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "learning_rate": 0.23128560185432434, | |
| "loss": 0.0243, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "learning_rate": 0.23010540008544922, | |
| "loss": 0.0251, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_avg_length": 17.3118, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07389520108699799, | |
| "eval_rouge1": 0.5872, | |
| "eval_rouge2": 0.5672, | |
| "eval_runtime": 52.1854, | |
| "eval_samples_per_second": 95.812, | |
| "eval_steps_per_second": 3.009, | |
| "step": 21875 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "learning_rate": 0.22910726070404053, | |
| "loss": 0.0244, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "learning_rate": 0.22765885293483734, | |
| "loss": 0.0194, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "learning_rate": 0.22604411840438843, | |
| "loss": 0.0211, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "learning_rate": 0.22470736503601074, | |
| "loss": 0.0205, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "learning_rate": 0.22348329424858093, | |
| "loss": 0.0223, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "learning_rate": 0.22219397127628326, | |
| "loss": 0.0226, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "learning_rate": 0.2213691771030426, | |
| "loss": 0.0222, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_avg_length": 17.307, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07857740670442581, | |
| "eval_rouge1": 0.5869, | |
| "eval_rouge2": 0.5671, | |
| "eval_runtime": 52.6845, | |
| "eval_samples_per_second": 94.905, | |
| "eval_steps_per_second": 2.98, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "learning_rate": 0.22002732753753662, | |
| "loss": 0.0166, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "learning_rate": 0.21867167949676514, | |
| "loss": 0.018, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "learning_rate": 0.21748971939086914, | |
| "loss": 0.0193, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "learning_rate": 0.2165619432926178, | |
| "loss": 0.0194, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "learning_rate": 0.2155638188123703, | |
| "loss": 0.019, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "learning_rate": 0.2147785723209381, | |
| "loss": 0.0183, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_avg_length": 17.3154, | |
| "eval_bleu": 0.0237, | |
| "eval_loss": 0.07949095219373703, | |
| "eval_rouge1": 0.5873, | |
| "eval_rouge2": 0.5677, | |
| "eval_runtime": 52.9652, | |
| "eval_samples_per_second": 94.402, | |
| "eval_steps_per_second": 2.964, | |
| "step": 28125 | |
| }, | |
| { | |
| "epoch": 9.12, | |
| "learning_rate": 0.21370729804039001, | |
| "loss": 0.0163, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 9.28, | |
| "learning_rate": 0.2124933898448944, | |
| "loss": 0.0155, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "learning_rate": 0.21130582690238953, | |
| "loss": 0.0152, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "learning_rate": 0.21038685739040375, | |
| "loss": 0.0158, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 9.76, | |
| "learning_rate": 0.20950140058994293, | |
| "loss": 0.0165, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 9.92, | |
| "learning_rate": 0.20879273116588593, | |
| "loss": 0.0183, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_avg_length": 17.308, | |
| "eval_bleu": 0.0236, | |
| "eval_loss": 0.08381666988134384, | |
| "eval_rouge1": 0.5869, | |
| "eval_rouge2": 0.5668, | |
| "eval_runtime": 53.7004, | |
| "eval_samples_per_second": 93.109, | |
| "eval_steps_per_second": 2.924, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 10.08, | |
| "learning_rate": 0.20795859396457672, | |
| "loss": 0.0153, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 10.24, | |
| "learning_rate": 0.20694369077682495, | |
| "loss": 0.0132, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 10.4, | |
| "learning_rate": 0.20593030750751495, | |
| "loss": 0.0145, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 10.56, | |
| "learning_rate": 0.20512063801288605, | |
| "loss": 0.0142, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 10.72, | |
| "learning_rate": 0.204327791929245, | |
| "loss": 0.0136, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 10.88, | |
| "learning_rate": 0.2036626935005188, | |
| "loss": 0.0147, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_avg_length": 17.3118, | |
| "eval_bleu": 0.0236, | |
| "eval_loss": 0.08516956865787506, | |
| "eval_rouge1": 0.5867, | |
| "eval_rouge2": 0.567, | |
| "eval_runtime": 53.2317, | |
| "eval_samples_per_second": 93.929, | |
| "eval_steps_per_second": 2.949, | |
| "step": 34375 | |
| }, | |
| { | |
| "epoch": 11.04, | |
| "learning_rate": 0.20300111174583435, | |
| "loss": 0.0145, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "learning_rate": 0.20208755135536194, | |
| "loss": 0.0112, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 11.36, | |
| "learning_rate": 0.20132538676261902, | |
| "loss": 0.0123, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 11.52, | |
| "learning_rate": 0.2005048245191574, | |
| "loss": 0.012, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 11.68, | |
| "learning_rate": 0.1997680366039276, | |
| "loss": 0.013, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 11.84, | |
| "learning_rate": 0.19915533065795898, | |
| "loss": 0.0131, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "learning_rate": 0.19845974445343018, | |
| "loss": 0.0132, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_avg_length": 17.3078, | |
| "eval_bleu": 0.0236, | |
| "eval_loss": 0.08685878664255142, | |
| "eval_rouge1": 0.5872, | |
| "eval_rouge2": 0.5671, | |
| "eval_runtime": 53.7846, | |
| "eval_samples_per_second": 92.963, | |
| "eval_steps_per_second": 2.919, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 12.16, | |
| "learning_rate": 0.19766050577163696, | |
| "loss": 0.0098, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 12.32, | |
| "learning_rate": 0.19691884517669678, | |
| "loss": 0.0105, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 12.48, | |
| "learning_rate": 0.19617730379104614, | |
| "loss": 0.0101, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 12.64, | |
| "learning_rate": 0.19556531310081482, | |
| "loss": 0.0118, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "learning_rate": 0.1950163096189499, | |
| "loss": 0.0112, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 12.96, | |
| "learning_rate": 0.19457842409610748, | |
| "loss": 0.0128, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_avg_length": 17.313, | |
| "eval_bleu": 0.0236, | |
| "eval_loss": 0.09262268990278244, | |
| "eval_rouge1": 0.5859, | |
| "eval_rouge2": 0.5654, | |
| "eval_runtime": 54.0715, | |
| "eval_samples_per_second": 92.47, | |
| "eval_steps_per_second": 2.904, | |
| "step": 40625 | |
| }, | |
| { | |
| "epoch": 13.12, | |
| "learning_rate": 0.19387850165367126, | |
| "loss": 0.0086, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 13.28, | |
| "learning_rate": 0.19318543374538422, | |
| "loss": 0.0087, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 13.44, | |
| "learning_rate": 0.19256973266601562, | |
| "loss": 0.0095, | |
| "step": 42000 | |
| } | |
| ], | |
| "max_steps": 312500, | |
| "num_train_epochs": 100, | |
| "total_flos": 9.0949690589184e+16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |