| { | |
| "best_metric": 0.645846426486969, | |
| "best_model_checkpoint": "models/checkpoints/checkpoint-9000", | |
| "epoch": 16.2748643761302, | |
| "global_step": 9000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.18, | |
| "learning_rate": 1.9819168173598556e-05, | |
| "loss": 1.9232, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "learning_rate": 1.9638336347197107e-05, | |
| "loss": 1.3768, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "learning_rate": 1.9457504520795662e-05, | |
| "loss": 1.269, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "learning_rate": 1.9276672694394213e-05, | |
| "loss": 1.2114, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "learning_rate": 1.9095840867992768e-05, | |
| "loss": 1.1765, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_loss": 1.0532047748565674, | |
| "eval_runtime": 12.8374, | |
| "eval_samples_per_second": 612.197, | |
| "eval_steps_per_second": 4.83, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "learning_rate": 1.8915009041591322e-05, | |
| "loss": 1.1195, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "learning_rate": 1.8734177215189874e-05, | |
| "loss": 1.1068, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "learning_rate": 1.8553345388788428e-05, | |
| "loss": 1.0338, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "learning_rate": 1.8372513562386983e-05, | |
| "loss": 1.0417, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "learning_rate": 1.8191681735985537e-05, | |
| "loss": 1.0382, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "eval_loss": 0.964276909828186, | |
| "eval_runtime": 12.8331, | |
| "eval_samples_per_second": 612.399, | |
| "eval_steps_per_second": 4.831, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "learning_rate": 1.801084990958409e-05, | |
| "loss": 1.0084, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "learning_rate": 1.783001808318264e-05, | |
| "loss": 0.9976, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "learning_rate": 1.7649186256781194e-05, | |
| "loss": 0.9962, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "learning_rate": 1.746835443037975e-05, | |
| "loss": 0.953, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "learning_rate": 1.72875226039783e-05, | |
| "loss": 0.9687, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "eval_loss": 0.9130759239196777, | |
| "eval_runtime": 12.8023, | |
| "eval_samples_per_second": 613.873, | |
| "eval_steps_per_second": 4.843, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "learning_rate": 1.7106690777576855e-05, | |
| "loss": 0.9551, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "learning_rate": 1.692585895117541e-05, | |
| "loss": 0.9156, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "learning_rate": 1.6745027124773964e-05, | |
| "loss": 0.9072, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "learning_rate": 1.6564195298372515e-05, | |
| "loss": 0.9038, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "learning_rate": 1.6383363471971066e-05, | |
| "loss": 0.9157, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "eval_loss": 0.8786566853523254, | |
| "eval_runtime": 12.8067, | |
| "eval_samples_per_second": 613.662, | |
| "eval_steps_per_second": 4.841, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "learning_rate": 1.620253164556962e-05, | |
| "loss": 0.8994, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "learning_rate": 1.6021699819168176e-05, | |
| "loss": 0.8963, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "learning_rate": 1.584086799276673e-05, | |
| "loss": 0.8878, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "learning_rate": 1.566003616636528e-05, | |
| "loss": 0.8739, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "learning_rate": 1.5479204339963836e-05, | |
| "loss": 0.8671, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "eval_loss": 0.8628956079483032, | |
| "eval_runtime": 12.8156, | |
| "eval_samples_per_second": 613.239, | |
| "eval_steps_per_second": 4.838, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "learning_rate": 1.5298372513562387e-05, | |
| "loss": 0.8676, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "learning_rate": 1.5117540687160942e-05, | |
| "loss": 0.8329, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 5.06, | |
| "learning_rate": 1.4936708860759495e-05, | |
| "loss": 0.8438, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 5.24, | |
| "learning_rate": 1.4755877034358048e-05, | |
| "loss": 0.8294, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 5.42, | |
| "learning_rate": 1.4575045207956602e-05, | |
| "loss": 0.8259, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.42, | |
| "eval_loss": 0.825766921043396, | |
| "eval_runtime": 12.7989, | |
| "eval_samples_per_second": 614.036, | |
| "eval_steps_per_second": 4.844, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.61, | |
| "learning_rate": 1.4394213381555155e-05, | |
| "loss": 0.8065, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "learning_rate": 1.421338155515371e-05, | |
| "loss": 0.8193, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 5.97, | |
| "learning_rate": 1.403254972875226e-05, | |
| "loss": 0.8152, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 6.15, | |
| "learning_rate": 1.3851717902350814e-05, | |
| "loss": 0.784, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 6.33, | |
| "learning_rate": 1.3670886075949368e-05, | |
| "loss": 0.7941, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 6.33, | |
| "eval_loss": 0.8114328384399414, | |
| "eval_runtime": 12.8036, | |
| "eval_samples_per_second": 613.81, | |
| "eval_steps_per_second": 4.842, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 6.51, | |
| "learning_rate": 1.3490054249547921e-05, | |
| "loss": 0.7952, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 6.69, | |
| "learning_rate": 1.3309222423146476e-05, | |
| "loss": 0.8168, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 6.87, | |
| "learning_rate": 1.3128390596745029e-05, | |
| "loss": 0.7917, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 7.05, | |
| "learning_rate": 1.2947558770343582e-05, | |
| "loss": 0.7582, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 7.23, | |
| "learning_rate": 1.2766726943942136e-05, | |
| "loss": 0.7605, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 7.23, | |
| "eval_loss": 0.7806207537651062, | |
| "eval_runtime": 12.8016, | |
| "eval_samples_per_second": 613.906, | |
| "eval_steps_per_second": 4.843, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 7.41, | |
| "learning_rate": 1.2585895117540687e-05, | |
| "loss": 0.7651, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 7.59, | |
| "learning_rate": 1.240506329113924e-05, | |
| "loss": 0.7716, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 7.78, | |
| "learning_rate": 1.2224231464737795e-05, | |
| "loss": 0.7704, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "learning_rate": 1.2043399638336348e-05, | |
| "loss": 0.7483, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 8.14, | |
| "learning_rate": 1.1862567811934902e-05, | |
| "loss": 0.7323, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 8.14, | |
| "eval_loss": 0.7683702707290649, | |
| "eval_runtime": 12.8031, | |
| "eval_samples_per_second": 613.836, | |
| "eval_steps_per_second": 4.843, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "learning_rate": 1.1681735985533455e-05, | |
| "loss": 0.7432, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "learning_rate": 1.150090415913201e-05, | |
| "loss": 0.7486, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 8.68, | |
| "learning_rate": 1.1320072332730561e-05, | |
| "loss": 0.7479, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 8.86, | |
| "learning_rate": 1.1139240506329114e-05, | |
| "loss": 0.7192, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "learning_rate": 1.0958408679927669e-05, | |
| "loss": 0.7292, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "eval_loss": 0.7645160555839539, | |
| "eval_runtime": 12.802, | |
| "eval_samples_per_second": 613.888, | |
| "eval_steps_per_second": 4.843, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 9.22, | |
| "learning_rate": 1.0777576853526221e-05, | |
| "loss": 0.7055, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 9.4, | |
| "learning_rate": 1.0596745027124774e-05, | |
| "loss": 0.7094, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 9.58, | |
| "learning_rate": 1.0415913200723329e-05, | |
| "loss": 0.7297, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 9.76, | |
| "learning_rate": 1.0235081374321882e-05, | |
| "loss": 0.7096, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 9.95, | |
| "learning_rate": 1.0054249547920433e-05, | |
| "loss": 0.7259, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 9.95, | |
| "eval_loss": 0.7333822846412659, | |
| "eval_runtime": 12.806, | |
| "eval_samples_per_second": 613.699, | |
| "eval_steps_per_second": 4.841, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 10.13, | |
| "learning_rate": 9.87341772151899e-06, | |
| "loss": 0.7138, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 10.31, | |
| "learning_rate": 9.69258589511754e-06, | |
| "loss": 0.7118, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 10.49, | |
| "learning_rate": 9.511754068716095e-06, | |
| "loss": 0.7091, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 10.67, | |
| "learning_rate": 9.330922242314648e-06, | |
| "loss": 0.6839, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 10.85, | |
| "learning_rate": 9.150090415913203e-06, | |
| "loss": 0.7045, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 10.85, | |
| "eval_loss": 0.7454974055290222, | |
| "eval_runtime": 12.7967, | |
| "eval_samples_per_second": 614.144, | |
| "eval_steps_per_second": 4.845, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 11.03, | |
| "learning_rate": 8.969258589511754e-06, | |
| "loss": 0.7059, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 11.21, | |
| "learning_rate": 8.788426763110308e-06, | |
| "loss": 0.6759, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 11.39, | |
| "learning_rate": 8.607594936708861e-06, | |
| "loss": 0.6806, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 11.57, | |
| "learning_rate": 8.426763110307414e-06, | |
| "loss": 0.6734, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 11.75, | |
| "learning_rate": 8.245931283905967e-06, | |
| "loss": 0.677, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 11.75, | |
| "eval_loss": 0.7277078032493591, | |
| "eval_runtime": 12.8036, | |
| "eval_samples_per_second": 613.809, | |
| "eval_steps_per_second": 4.842, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 11.93, | |
| "learning_rate": 8.065099457504522e-06, | |
| "loss": 0.6811, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 12.12, | |
| "learning_rate": 7.884267631103075e-06, | |
| "loss": 0.669, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 12.3, | |
| "learning_rate": 7.703435804701628e-06, | |
| "loss": 0.6586, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 12.48, | |
| "learning_rate": 7.522603978300181e-06, | |
| "loss": 0.6428, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 12.66, | |
| "learning_rate": 7.341772151898735e-06, | |
| "loss": 0.6618, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 12.66, | |
| "eval_loss": 0.7127390503883362, | |
| "eval_runtime": 12.8054, | |
| "eval_samples_per_second": 613.727, | |
| "eval_steps_per_second": 4.842, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 12.84, | |
| "learning_rate": 7.160940325497288e-06, | |
| "loss": 0.6703, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 13.02, | |
| "learning_rate": 6.980108499095841e-06, | |
| "loss": 0.6577, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 13.2, | |
| "learning_rate": 6.799276672694395e-06, | |
| "loss": 0.6431, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 13.38, | |
| "learning_rate": 6.618444846292948e-06, | |
| "loss": 0.6525, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 13.56, | |
| "learning_rate": 6.437613019891501e-06, | |
| "loss": 0.6817, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 13.56, | |
| "eval_loss": 0.7023378014564514, | |
| "eval_runtime": 12.8161, | |
| "eval_samples_per_second": 613.215, | |
| "eval_steps_per_second": 4.838, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 13.74, | |
| "learning_rate": 6.256781193490055e-06, | |
| "loss": 0.6523, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 13.92, | |
| "learning_rate": 6.075949367088608e-06, | |
| "loss": 0.6679, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 14.1, | |
| "learning_rate": 5.895117540687162e-06, | |
| "loss": 0.6368, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 14.29, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": 0.641, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 14.47, | |
| "learning_rate": 5.533453887884268e-06, | |
| "loss": 0.6337, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 14.47, | |
| "eval_loss": 0.7140629887580872, | |
| "eval_runtime": 12.8021, | |
| "eval_samples_per_second": 613.882, | |
| "eval_steps_per_second": 4.843, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 14.65, | |
| "learning_rate": 5.352622061482822e-06, | |
| "loss": 0.6359, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 14.83, | |
| "learning_rate": 5.171790235081374e-06, | |
| "loss": 0.6298, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 15.01, | |
| "learning_rate": 4.990958408679928e-06, | |
| "loss": 0.6444, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 15.19, | |
| "learning_rate": 4.8101265822784815e-06, | |
| "loss": 0.639, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 15.37, | |
| "learning_rate": 4.6292947558770344e-06, | |
| "loss": 0.6182, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 15.37, | |
| "eval_loss": 0.7226015329360962, | |
| "eval_runtime": 12.7692, | |
| "eval_samples_per_second": 615.466, | |
| "eval_steps_per_second": 4.855, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 15.55, | |
| "learning_rate": 4.448462929475588e-06, | |
| "loss": 0.6425, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 15.73, | |
| "learning_rate": 4.267631103074141e-06, | |
| "loss": 0.6255, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 15.91, | |
| "learning_rate": 4.086799276672695e-06, | |
| "loss": 0.6223, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 16.09, | |
| "learning_rate": 3.905967450271248e-06, | |
| "loss": 0.6303, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 16.27, | |
| "learning_rate": 3.7251356238698015e-06, | |
| "loss": 0.6367, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 16.27, | |
| "eval_loss": 0.645846426486969, | |
| "eval_runtime": 12.785, | |
| "eval_samples_per_second": 614.705, | |
| "eval_steps_per_second": 4.849, | |
| "step": 9000 | |
| } | |
| ], | |
| "max_steps": 11060, | |
| "num_train_epochs": 20, | |
| "total_flos": 5.947714605600499e+16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |