{
  "best_metric": 23.6596,
  "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14/checkpoint-100000",
  "epoch": 2.7777777777777777,
  "eval_steps": 10000,
  "global_step": 100000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1388888888888889,
      "grad_norm": 1.066943645477295,
      "learning_rate": 0.000475,
      "loss": 1.9627,
      "step": 5000
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 0.9774492383003235,
      "learning_rate": 0.00045000000000000004,
      "loss": 1.7738,
      "step": 10000
    },
    {
      "epoch": 0.2777777777777778,
      "eval_bleu": 20.1598,
      "eval_gen_len": 28.1563,
      "eval_loss": 1.914583444595337,
      "eval_runtime": 241.8013,
      "eval_samples_per_second": 12.407,
      "eval_steps_per_second": 1.551,
      "step": 10000
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 1.4306731224060059,
      "learning_rate": 0.000425,
      "loss": 1.6951,
      "step": 15000
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 1.1782424449920654,
      "learning_rate": 0.0004,
      "loss": 1.6498,
      "step": 20000
    },
    {
      "epoch": 0.5555555555555556,
      "eval_bleu": 21.4167,
      "eval_gen_len": 27.853,
      "eval_loss": 1.855008840560913,
      "eval_runtime": 242.3949,
      "eval_samples_per_second": 12.376,
      "eval_steps_per_second": 1.547,
      "step": 20000
    },
    {
      "epoch": 0.6944444444444444,
      "grad_norm": 1.219376802444458,
      "learning_rate": 0.000375,
      "loss": 1.6172,
      "step": 25000
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 1.2735612392425537,
      "learning_rate": 0.00035,
      "loss": 1.5903,
      "step": 30000
    },
    {
      "epoch": 0.8333333333333334,
      "eval_bleu": 22.604,
      "eval_gen_len": 27.7613,
      "eval_loss": 1.8276705741882324,
      "eval_runtime": 240.5149,
      "eval_samples_per_second": 12.473,
      "eval_steps_per_second": 1.559,
      "step": 30000
    },
    {
      "epoch": 0.9722222222222222,
      "grad_norm": 1.0282609462738037,
      "learning_rate": 0.00032500000000000004,
      "loss": 1.5633,
      "step": 35000
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 1.406827688217163,
      "learning_rate": 0.0003,
      "loss": 1.5151,
      "step": 40000
    },
    {
      "epoch": 1.1111111111111112,
      "eval_bleu": 22.1273,
      "eval_gen_len": 27.3187,
      "eval_loss": 1.8127936124801636,
      "eval_runtime": 234.7049,
      "eval_samples_per_second": 12.782,
      "eval_steps_per_second": 1.598,
      "step": 40000
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.174306035041809,
      "learning_rate": 0.000275,
      "loss": 1.5004,
      "step": 45000
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.5665515661239624,
      "learning_rate": 0.00025,
      "loss": 1.4866,
      "step": 50000
    },
    {
      "epoch": 1.3888888888888888,
      "eval_bleu": 22.8295,
      "eval_gen_len": 27.419,
      "eval_loss": 1.7999275922775269,
      "eval_runtime": 233.8115,
      "eval_samples_per_second": 12.831,
      "eval_steps_per_second": 1.604,
      "step": 50000
    },
    {
      "epoch": 1.5277777777777777,
      "grad_norm": 1.1425319910049438,
      "learning_rate": 0.00022500000000000002,
      "loss": 1.4799,
      "step": 55000
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.123904824256897,
      "learning_rate": 0.0002,
      "loss": 1.4696,
      "step": 60000
    },
    {
      "epoch": 1.6666666666666665,
      "eval_bleu": 22.9923,
      "eval_gen_len": 27.7387,
      "eval_loss": 1.780959963798523,
      "eval_runtime": 240.0938,
      "eval_samples_per_second": 12.495,
      "eval_steps_per_second": 1.562,
      "step": 60000
    },
    {
      "epoch": 1.8055555555555556,
      "grad_norm": 1.4292243719100952,
      "learning_rate": 0.000175,
      "loss": 1.4613,
      "step": 65000
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 1.1662226915359497,
      "learning_rate": 0.00015,
      "loss": 1.4508,
      "step": 70000
    },
    {
      "epoch": 1.9444444444444444,
      "eval_bleu": 23.1046,
      "eval_gen_len": 27.7057,
      "eval_loss": 1.7654317617416382,
      "eval_runtime": 236.6367,
      "eval_samples_per_second": 12.678,
      "eval_steps_per_second": 1.585,
      "step": 70000
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 0.9245423674583435,
      "learning_rate": 0.000125,
      "loss": 1.4235,
      "step": 75000
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.2502944469451904,
      "learning_rate": 0.0001,
      "loss": 1.4053,
      "step": 80000
    },
    {
      "epoch": 2.2222222222222223,
      "eval_bleu": 23.5079,
      "eval_gen_len": 27.643,
      "eval_loss": 1.758699655532837,
      "eval_runtime": 237.5663,
      "eval_samples_per_second": 12.628,
      "eval_steps_per_second": 1.579,
      "step": 80000
    },
    {
      "epoch": 2.361111111111111,
      "grad_norm": 0.9593023061752319,
      "learning_rate": 7.5e-05,
      "loss": 1.408,
      "step": 85000
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.440004825592041,
      "learning_rate": 5e-05,
      "loss": 1.3956,
      "step": 90000
    },
    {
      "epoch": 2.5,
      "eval_bleu": 23.3848,
      "eval_gen_len": 27.6637,
      "eval_loss": 1.752461552619934,
      "eval_runtime": 237.0184,
      "eval_samples_per_second": 12.657,
      "eval_steps_per_second": 1.582,
      "step": 90000
    },
    {
      "epoch": 2.638888888888889,
      "grad_norm": 1.1929932832717896,
      "learning_rate": 2.5e-05,
      "loss": 1.3938,
      "step": 95000
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 1.0216492414474487,
      "learning_rate": 0.0,
      "loss": 1.3903,
      "step": 100000
    },
    {
      "epoch": 2.7777777777777777,
      "eval_bleu": 23.6596,
      "eval_gen_len": 27.526,
      "eval_loss": 1.7469114065170288,
      "eval_runtime": 235.9542,
      "eval_samples_per_second": 12.714,
      "eval_steps_per_second": 1.589,
      "step": 100000
    },
    {
      "epoch": 2.7777777777777777,
      "step": 100000,
      "total_flos": 3.803274433029734e+16,
      "train_loss": 1.5316169482421875,
      "train_runtime": 15895.0874,
      "train_samples_per_second": 100.66,
      "train_steps_per_second": 6.291
    }
  ],
  "logging_steps": 5000,
  "max_steps": 100000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000,
  "total_flos": 3.803274433029734e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}