{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 397,
  "global_step": 9536,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17,
      "eval_gen_len": 83.5593,
      "eval_loss": 1.6528608798980713,
      "eval_rouge1": 50.4957,
      "eval_rouge2": 32.5323,
      "eval_rougeL": 40.7567,
      "eval_rougeLsum": 40.5733,
      "eval_runtime": 41.984,
      "eval_samples_per_second": 1.405,
      "eval_steps_per_second": 0.715,
      "step": 397
    },
    {
      "epoch": 0.21,
      "grad_norm": 8.36017894744873,
      "learning_rate": 1.9404362416107384e-05,
      "loss": 2.7418,
      "step": 500
    },
    {
      "epoch": 0.33,
      "eval_gen_len": 83.3729,
      "eval_loss": 1.3848459720611572,
      "eval_rouge1": 49.9993,
      "eval_rouge2": 31.5422,
      "eval_rougeL": 40.7043,
      "eval_rougeLsum": 40.6632,
      "eval_runtime": 41.848,
      "eval_samples_per_second": 1.41,
      "eval_steps_per_second": 0.717,
      "step": 794
    },
    {
      "epoch": 0.42,
      "grad_norm": 10.013521194458008,
      "learning_rate": 1.880512943432407e-05,
      "loss": 1.6117,
      "step": 1000
    },
    {
      "epoch": 0.5,
      "eval_gen_len": 84.8983,
      "eval_loss": 1.3274192810058594,
      "eval_rouge1": 50.0655,
      "eval_rouge2": 31.4638,
      "eval_rougeL": 40.2184,
      "eval_rougeLsum": 39.9987,
      "eval_runtime": 42.1731,
      "eval_samples_per_second": 1.399,
      "eval_steps_per_second": 0.711,
      "step": 1191
    },
    {
      "epoch": 0.63,
      "grad_norm": 9.272841453552246,
      "learning_rate": 1.820589645254075e-05,
      "loss": 1.4861,
      "step": 1500
    },
    {
      "epoch": 0.67,
      "eval_gen_len": 87.1864,
      "eval_loss": 1.3262691497802734,
      "eval_rouge1": 51.2154,
      "eval_rouge2": 33.6289,
      "eval_rougeL": 41.9642,
      "eval_rougeLsum": 41.7649,
      "eval_runtime": 43.0575,
      "eval_samples_per_second": 1.37,
      "eval_steps_per_second": 0.697,
      "step": 1588
    },
    {
      "epoch": 0.83,
      "eval_gen_len": 85.9661,
      "eval_loss": 1.2881355285644531,
      "eval_rouge1": 52.2072,
      "eval_rouge2": 34.2681,
      "eval_rougeL": 42.7582,
      "eval_rougeLsum": 42.5683,
      "eval_runtime": 42.4338,
      "eval_samples_per_second": 1.39,
      "eval_steps_per_second": 0.707,
      "step": 1985
    },
    {
      "epoch": 0.84,
      "grad_norm": 10.048806190490723,
      "learning_rate": 1.7609060402684567e-05,
      "loss": 1.495,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_gen_len": 80.1864,
      "eval_loss": 1.2640005350112915,
      "eval_rouge1": 52.1344,
      "eval_rouge2": 34.3518,
      "eval_rougeL": 42.9145,
      "eval_rougeLsum": 42.7837,
      "eval_runtime": 40.643,
      "eval_samples_per_second": 1.452,
      "eval_steps_per_second": 0.738,
      "step": 2382
    },
    {
      "epoch": 1.05,
      "grad_norm": 9.145220756530762,
      "learning_rate": 1.7009827420901247e-05,
      "loss": 1.4292,
      "step": 2500
    },
    {
      "epoch": 1.17,
      "eval_gen_len": 83.5593,
      "eval_loss": 1.2814366817474365,
      "eval_rouge1": 51.9388,
      "eval_rouge2": 33.6073,
      "eval_rougeL": 41.9771,
      "eval_rougeLsum": 41.8638,
      "eval_runtime": 41.9785,
      "eval_samples_per_second": 1.405,
      "eval_steps_per_second": 0.715,
      "step": 2779
    },
    {
      "epoch": 1.26,
      "grad_norm": 9.626166343688965,
      "learning_rate": 1.641059443911793e-05,
      "loss": 1.2572,
      "step": 3000
    },
    {
      "epoch": 1.33,
      "eval_gen_len": 81.7458,
      "eval_loss": 1.3041572570800781,
      "eval_rouge1": 52.685,
      "eval_rouge2": 34.8664,
      "eval_rougeL": 43.247,
      "eval_rougeLsum": 43.2174,
      "eval_runtime": 40.4041,
      "eval_samples_per_second": 1.46,
      "eval_steps_per_second": 0.742,
      "step": 3176
    },
    {
      "epoch": 1.47,
      "grad_norm": 8.85732364654541,
      "learning_rate": 1.5811361457334612e-05,
      "loss": 1.2858,
      "step": 3500
    },
    {
      "epoch": 1.5,
      "eval_gen_len": 83.4915,
      "eval_loss": 1.250982403755188,
      "eval_rouge1": 53.1395,
      "eval_rouge2": 35.0366,
      "eval_rougeL": 44.0336,
      "eval_rougeLsum": 43.8277,
      "eval_runtime": 41.066,
      "eval_samples_per_second": 1.437,
      "eval_steps_per_second": 0.731,
      "step": 3573
    },
    {
      "epoch": 1.67,
      "eval_gen_len": 85.7797,
      "eval_loss": 1.2450958490371704,
      "eval_rouge1": 53.2435,
      "eval_rouge2": 34.0265,
      "eval_rougeL": 43.1606,
      "eval_rougeLsum": 42.9125,
      "eval_runtime": 42.4863,
      "eval_samples_per_second": 1.389,
      "eval_steps_per_second": 0.706,
      "step": 3970
    },
    {
      "epoch": 1.68,
      "grad_norm": 9.060718536376953,
      "learning_rate": 1.5212128475551296e-05,
      "loss": 1.2632,
      "step": 4000
    },
    {
      "epoch": 1.83,
      "eval_gen_len": 84.0678,
      "eval_loss": 1.2505569458007812,
      "eval_rouge1": 52.9033,
      "eval_rouge2": 34.6637,
      "eval_rougeL": 43.0146,
      "eval_rougeLsum": 42.8985,
      "eval_runtime": 42.1089,
      "eval_samples_per_second": 1.401,
      "eval_steps_per_second": 0.712,
      "step": 4367
    },
    {
      "epoch": 1.89,
      "grad_norm": 7.555502414703369,
      "learning_rate": 1.4612895493767978e-05,
      "loss": 1.2367,
      "step": 4500
    },
    {
      "epoch": 2.0,
      "eval_gen_len": 82.322,
      "eval_loss": 1.2485252618789673,
      "eval_rouge1": 50.1387,
      "eval_rouge2": 31.1201,
      "eval_rougeL": 40.0786,
      "eval_rougeLsum": 40.1657,
      "eval_runtime": 41.189,
      "eval_samples_per_second": 1.432,
      "eval_steps_per_second": 0.728,
      "step": 4764
    },
    {
      "epoch": 2.1,
      "grad_norm": 7.1890788078308105,
      "learning_rate": 1.401366251198466e-05,
      "loss": 1.1512,
      "step": 5000
    },
    {
      "epoch": 2.16,
      "eval_gen_len": 82.9322,
      "eval_loss": 1.261144757270813,
      "eval_rouge1": 52.7072,
      "eval_rouge2": 34.6442,
      "eval_rougeL": 43.2377,
      "eval_rougeLsum": 43.1384,
      "eval_runtime": 41.2591,
      "eval_samples_per_second": 1.43,
      "eval_steps_per_second": 0.727,
      "step": 5161
    },
    {
      "epoch": 2.31,
      "grad_norm": 7.86561918258667,
      "learning_rate": 1.341562799616491e-05,
      "loss": 1.0728,
      "step": 5500
    },
    {
      "epoch": 2.33,
      "eval_gen_len": 86.4237,
      "eval_loss": 1.2699768543243408,
      "eval_rouge1": 52.3383,
      "eval_rouge2": 34.7756,
      "eval_rougeL": 42.9406,
      "eval_rougeLsum": 42.7658,
      "eval_runtime": 42.4715,
      "eval_samples_per_second": 1.389,
      "eval_steps_per_second": 0.706,
      "step": 5558
    },
    {
      "epoch": 2.5,
      "eval_gen_len": 84.3051,
      "eval_loss": 1.2631828784942627,
      "eval_rouge1": 52.8233,
      "eval_rouge2": 35.1768,
      "eval_rougeL": 43.8642,
      "eval_rougeLsum": 43.7259,
      "eval_runtime": 41.727,
      "eval_samples_per_second": 1.414,
      "eval_steps_per_second": 0.719,
      "step": 5955
    },
    {
      "epoch": 2.52,
      "grad_norm": 7.868692398071289,
      "learning_rate": 1.2816395014381592e-05,
      "loss": 1.0826,
      "step": 6000
    },
    {
      "epoch": 2.66,
      "eval_gen_len": 82.8644,
      "eval_loss": 1.2638760805130005,
      "eval_rouge1": 53.9367,
      "eval_rouge2": 36.2676,
      "eval_rougeL": 44.9414,
      "eval_rougeLsum": 44.7603,
      "eval_runtime": 42.7574,
      "eval_samples_per_second": 1.38,
      "eval_steps_per_second": 0.702,
      "step": 6352
    },
    {
      "epoch": 2.73,
      "grad_norm": 7.449892997741699,
      "learning_rate": 1.2217162032598275e-05,
      "loss": 1.0921,
      "step": 6500
    },
    {
      "epoch": 2.83,
      "eval_gen_len": 82.4237,
      "eval_loss": 1.2491506338119507,
      "eval_rouge1": 52.8146,
      "eval_rouge2": 34.6392,
      "eval_rougeL": 43.5323,
      "eval_rougeLsum": 43.4647,
      "eval_runtime": 42.5398,
      "eval_samples_per_second": 1.387,
      "eval_steps_per_second": 0.705,
      "step": 6749
    },
    {
      "epoch": 2.94,
      "grad_norm": 7.139917850494385,
      "learning_rate": 1.1617929050814957e-05,
      "loss": 1.1129,
      "step": 7000
    },
    {
      "epoch": 3.0,
      "eval_gen_len": 83.1356,
      "eval_loss": 1.2625495195388794,
      "eval_rouge1": 53.6493,
      "eval_rouge2": 35.0396,
      "eval_rougeL": 43.501,
      "eval_rougeLsum": 43.4039,
      "eval_runtime": 43.1051,
      "eval_samples_per_second": 1.369,
      "eval_steps_per_second": 0.696,
      "step": 7146
    },
    {
      "epoch": 3.15,
      "grad_norm": 5.8409600257873535,
      "learning_rate": 1.1018696069031641e-05,
      "loss": 0.9783,
      "step": 7500
    },
    {
      "epoch": 3.16,
      "eval_gen_len": 84.7797,
      "eval_loss": 1.293487787246704,
      "eval_rouge1": 53.245,
      "eval_rouge2": 35.655,
      "eval_rougeL": 44.4306,
      "eval_rougeLsum": 44.482,
      "eval_runtime": 41.7791,
      "eval_samples_per_second": 1.412,
      "eval_steps_per_second": 0.718,
      "step": 7543
    },
    {
      "epoch": 3.33,
      "eval_gen_len": 84.1186,
      "eval_loss": 1.266953706741333,
      "eval_rouge1": 52.146,
      "eval_rouge2": 33.0632,
      "eval_rougeL": 41.4382,
      "eval_rougeLsum": 41.5159,
      "eval_runtime": 41.1238,
      "eval_samples_per_second": 1.435,
      "eval_steps_per_second": 0.73,
      "step": 7940
    },
    {
      "epoch": 3.36,
      "grad_norm": 8.737879753112793,
      "learning_rate": 1.0419463087248323e-05,
      "loss": 0.9771,
      "step": 8000
    },
    {
      "epoch": 3.5,
      "eval_gen_len": 82.8475,
      "eval_loss": 1.275550127029419,
      "eval_rouge1": 51.7108,
      "eval_rouge2": 33.5352,
      "eval_rougeL": 42.4153,
      "eval_rougeLsum": 42.4572,
      "eval_runtime": 41.088,
      "eval_samples_per_second": 1.436,
      "eval_steps_per_second": 0.73,
      "step": 8337
    },
    {
      "epoch": 3.57,
      "grad_norm": 8.45171070098877,
      "learning_rate": 9.820230105465006e-06,
      "loss": 0.9841,
      "step": 8500
    },
    {
      "epoch": 3.66,
      "eval_gen_len": 84.322,
      "eval_loss": 1.260237455368042,
      "eval_rouge1": 53.2394,
      "eval_rouge2": 34.9695,
      "eval_rougeL": 43.2182,
      "eval_rougeLsum": 43.1333,
      "eval_runtime": 41.5567,
      "eval_samples_per_second": 1.42,
      "eval_steps_per_second": 0.722,
      "step": 8734
    },
    {
      "epoch": 3.78,
      "grad_norm": 47.454078674316406,
      "learning_rate": 9.220997123681688e-06,
      "loss": 0.9643,
      "step": 9000
    },
    {
      "epoch": 3.83,
      "eval_gen_len": 81.4915,
      "eval_loss": 1.27409827709198,
      "eval_rouge1": 53.5588,
      "eval_rouge2": 36.0425,
      "eval_rougeL": 44.2044,
      "eval_rougeLsum": 44.2287,
      "eval_runtime": 40.5566,
      "eval_samples_per_second": 1.455,
      "eval_steps_per_second": 0.74,
      "step": 9131
    },
    {
      "epoch": 3.98,
      "grad_norm": 7.34140157699585,
      "learning_rate": 8.62176414189837e-06,
      "loss": 0.9439,
      "step": 9500
    },
    {
      "epoch": 4.0,
      "eval_gen_len": 86.1864,
      "eval_loss": 1.2641756534576416,
      "eval_rouge1": 53.7305,
      "eval_rouge2": 35.3844,
      "eval_rougeL": 43.8211,
      "eval_rougeLsum": 43.7597,
      "eval_runtime": 42.1877,
      "eval_samples_per_second": 1.399,
      "eval_steps_per_second": 0.711,
      "step": 9528
    }
  ],
  "logging_steps": 500,
  "max_steps": 16688,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 1192,
  "total_flos": 2.071244574793728e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}