Upload 9 files
Browse filesadd trainer_states for run 1
- trainer_state_1.json +181 -0
- trainer_state_2.json +153 -0
- trainer_state_3.json +181 -0
- trainer_state_4.json +181 -0
- trainer_state_5.json +181 -0
- trainer_state_6.json +181 -0
- trainer_state_7.json +181 -0
- trainer_state_8.json +181 -0
- trainer_state_9.json +181 -0
trainer_state_1.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.0300140380859375,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_1-task2/checkpoint-1260",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.0692138671875,
|
| 14 |
+
"eval_rouge1": 95.7517,
|
| 15 |
+
"eval_rouge2": 94.6841,
|
| 16 |
+
"eval_rougeL": 95.6971,
|
| 17 |
+
"eval_rougeLsum": 95.7331,
|
| 18 |
+
"eval_runtime": 33.2038,
|
| 19 |
+
"eval_samples_per_second": 33.731,
|
| 20 |
+
"eval_steps_per_second": 1.054,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.034423828125,
|
| 26 |
+
"eval_rouge1": 97.351,
|
| 27 |
+
"eval_rouge2": 96.6251,
|
| 28 |
+
"eval_rougeL": 97.3032,
|
| 29 |
+
"eval_rougeLsum": 97.2964,
|
| 30 |
+
"eval_runtime": 32.6308,
|
| 31 |
+
"eval_samples_per_second": 34.323,
|
| 32 |
+
"eval_steps_per_second": 1.073,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 0.4508669972419739,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.0924,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.03924560546875,
|
| 45 |
+
"eval_rouge1": 97.2483,
|
| 46 |
+
"eval_rouge2": 96.6208,
|
| 47 |
+
"eval_rougeL": 97.2291,
|
| 48 |
+
"eval_rougeLsum": 97.2002,
|
| 49 |
+
"eval_runtime": 33.4876,
|
| 50 |
+
"eval_samples_per_second": 33.445,
|
| 51 |
+
"eval_steps_per_second": 1.045,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.0413818359375,
|
| 57 |
+
"eval_rouge1": 97.1906,
|
| 58 |
+
"eval_rouge2": 96.559,
|
| 59 |
+
"eval_rougeL": 97.1839,
|
| 60 |
+
"eval_rougeLsum": 97.1621,
|
| 61 |
+
"eval_runtime": 34.4429,
|
| 62 |
+
"eval_samples_per_second": 32.518,
|
| 63 |
+
"eval_steps_per_second": 1.016,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.1619143784046173,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0276,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.042449951171875,
|
| 76 |
+
"eval_rouge1": 97.4464,
|
| 77 |
+
"eval_rouge2": 96.8611,
|
| 78 |
+
"eval_rougeL": 97.4297,
|
| 79 |
+
"eval_rougeLsum": 97.4266,
|
| 80 |
+
"eval_runtime": 32.4102,
|
| 81 |
+
"eval_samples_per_second": 34.557,
|
| 82 |
+
"eval_steps_per_second": 1.08,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.0300140380859375,
|
| 88 |
+
"eval_rouge1": 98.1516,
|
| 89 |
+
"eval_rouge2": 97.6994,
|
| 90 |
+
"eval_rougeL": 98.1475,
|
| 91 |
+
"eval_rougeLsum": 98.155,
|
| 92 |
+
"eval_runtime": 32.3626,
|
| 93 |
+
"eval_samples_per_second": 34.608,
|
| 94 |
+
"eval_steps_per_second": 1.081,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.031494140625,
|
| 100 |
+
"eval_rouge1": 97.6953,
|
| 101 |
+
"eval_rouge2": 97.1861,
|
| 102 |
+
"eval_rougeL": 97.7355,
|
| 103 |
+
"eval_rougeLsum": 97.713,
|
| 104 |
+
"eval_runtime": 31.6892,
|
| 105 |
+
"eval_samples_per_second": 35.343,
|
| 106 |
+
"eval_steps_per_second": 1.104,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.17737896740436554,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0138,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.03729248046875,
|
| 119 |
+
"eval_rouge1": 98.0557,
|
| 120 |
+
"eval_rouge2": 97.5844,
|
| 121 |
+
"eval_rougeL": 98.0434,
|
| 122 |
+
"eval_rougeLsum": 98.0396,
|
| 123 |
+
"eval_runtime": 32.4888,
|
| 124 |
+
"eval_samples_per_second": 34.473,
|
| 125 |
+
"eval_steps_per_second": 1.077,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.0333251953125,
|
| 131 |
+
"eval_rouge1": 98.2547,
|
| 132 |
+
"eval_rouge2": 97.8119,
|
| 133 |
+
"eval_rougeL": 98.2452,
|
| 134 |
+
"eval_rougeLsum": 98.2669,
|
| 135 |
+
"eval_runtime": 31.823,
|
| 136 |
+
"eval_samples_per_second": 35.195,
|
| 137 |
+
"eval_steps_per_second": 1.1,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.1923867166042328,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0088,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.03363037109375,
|
| 150 |
+
"eval_rouge1": 98.2547,
|
| 151 |
+
"eval_rouge2": 97.8119,
|
| 152 |
+
"eval_rougeL": 98.2452,
|
| 153 |
+
"eval_rougeLsum": 98.2669,
|
| 154 |
+
"eval_runtime": 31.6927,
|
| 155 |
+
"eval_samples_per_second": 35.339,
|
| 156 |
+
"eval_steps_per_second": 1.104,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_2.json
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.0810546875,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_2-task2/checkpoint-6",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 30,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.10284423828125,
|
| 14 |
+
"eval_rouge1": 95.9465,
|
| 15 |
+
"eval_rouge2": 93.9682,
|
| 16 |
+
"eval_rougeL": 95.3944,
|
| 17 |
+
"eval_rougeLsum": 95.9557,
|
| 18 |
+
"eval_runtime": 29.9579,
|
| 19 |
+
"eval_samples_per_second": 37.386,
|
| 20 |
+
"eval_steps_per_second": 1.168,
|
| 21 |
+
"step": 3
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.0810546875,
|
| 26 |
+
"eval_rouge1": 96.5004,
|
| 27 |
+
"eval_rouge2": 94.78,
|
| 28 |
+
"eval_rougeL": 96.0088,
|
| 29 |
+
"eval_rougeLsum": 96.4773,
|
| 30 |
+
"eval_runtime": 28.554,
|
| 31 |
+
"eval_samples_per_second": 39.224,
|
| 32 |
+
"eval_steps_per_second": 1.226,
|
| 33 |
+
"step": 6
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 3.0,
|
| 37 |
+
"eval_loss": 0.0819091796875,
|
| 38 |
+
"eval_rouge1": 96.6176,
|
| 39 |
+
"eval_rouge2": 94.9705,
|
| 40 |
+
"eval_rougeL": 96.1457,
|
| 41 |
+
"eval_rougeLsum": 96.6106,
|
| 42 |
+
"eval_runtime": 28.3985,
|
| 43 |
+
"eval_samples_per_second": 39.439,
|
| 44 |
+
"eval_steps_per_second": 1.232,
|
| 45 |
+
"step": 9
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 4.0,
|
| 49 |
+
"eval_loss": 0.095703125,
|
| 50 |
+
"eval_rouge1": 96.1348,
|
| 51 |
+
"eval_rouge2": 94.1813,
|
| 52 |
+
"eval_rougeL": 95.5333,
|
| 53 |
+
"eval_rougeLsum": 96.1226,
|
| 54 |
+
"eval_runtime": 27.8269,
|
| 55 |
+
"eval_samples_per_second": 40.249,
|
| 56 |
+
"eval_steps_per_second": 1.258,
|
| 57 |
+
"step": 12
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"epoch": 5.0,
|
| 61 |
+
"eval_loss": 0.1134033203125,
|
| 62 |
+
"eval_rouge1": 95.4562,
|
| 63 |
+
"eval_rouge2": 93.2875,
|
| 64 |
+
"eval_rougeL": 94.812,
|
| 65 |
+
"eval_rougeLsum": 95.4534,
|
| 66 |
+
"eval_runtime": 27.905,
|
| 67 |
+
"eval_samples_per_second": 40.136,
|
| 68 |
+
"eval_steps_per_second": 1.254,
|
| 69 |
+
"step": 15
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"epoch": 6.0,
|
| 73 |
+
"eval_loss": 0.1260986328125,
|
| 74 |
+
"eval_rouge1": 94.9572,
|
| 75 |
+
"eval_rouge2": 92.576,
|
| 76 |
+
"eval_rougeL": 94.3038,
|
| 77 |
+
"eval_rougeLsum": 94.9406,
|
| 78 |
+
"eval_runtime": 27.7213,
|
| 79 |
+
"eval_samples_per_second": 40.402,
|
| 80 |
+
"eval_steps_per_second": 1.263,
|
| 81 |
+
"step": 18
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"epoch": 7.0,
|
| 85 |
+
"eval_loss": 0.132080078125,
|
| 86 |
+
"eval_rouge1": 94.7046,
|
| 87 |
+
"eval_rouge2": 92.2489,
|
| 88 |
+
"eval_rougeL": 94.0746,
|
| 89 |
+
"eval_rougeLsum": 94.7023,
|
| 90 |
+
"eval_runtime": 27.3971,
|
| 91 |
+
"eval_samples_per_second": 40.88,
|
| 92 |
+
"eval_steps_per_second": 1.278,
|
| 93 |
+
"step": 21
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 8.0,
|
| 97 |
+
"eval_loss": 0.1346435546875,
|
| 98 |
+
"eval_rouge1": 94.6117,
|
| 99 |
+
"eval_rouge2": 92.0736,
|
| 100 |
+
"eval_rougeL": 93.9435,
|
| 101 |
+
"eval_rougeLsum": 94.6048,
|
| 102 |
+
"eval_runtime": 27.2256,
|
| 103 |
+
"eval_samples_per_second": 41.138,
|
| 104 |
+
"eval_steps_per_second": 1.286,
|
| 105 |
+
"step": 24
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"epoch": 9.0,
|
| 109 |
+
"eval_loss": 0.1351318359375,
|
| 110 |
+
"eval_rouge1": 94.5465,
|
| 111 |
+
"eval_rouge2": 91.9795,
|
| 112 |
+
"eval_rougeL": 93.8758,
|
| 113 |
+
"eval_rougeLsum": 94.5584,
|
| 114 |
+
"eval_runtime": 27.1737,
|
| 115 |
+
"eval_samples_per_second": 41.216,
|
| 116 |
+
"eval_steps_per_second": 1.288,
|
| 117 |
+
"step": 27
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"epoch": 10.0,
|
| 121 |
+
"eval_loss": 0.1353759765625,
|
| 122 |
+
"eval_rouge1": 94.5465,
|
| 123 |
+
"eval_rouge2": 91.9795,
|
| 124 |
+
"eval_rougeL": 93.8758,
|
| 125 |
+
"eval_rougeLsum": 94.5584,
|
| 126 |
+
"eval_runtime": 27.1504,
|
| 127 |
+
"eval_samples_per_second": 41.252,
|
| 128 |
+
"eval_steps_per_second": 1.289,
|
| 129 |
+
"step": 30
|
| 130 |
+
}
|
| 131 |
+
],
|
| 132 |
+
"logging_steps": 500,
|
| 133 |
+
"max_steps": 30,
|
| 134 |
+
"num_input_tokens_seen": 0,
|
| 135 |
+
"num_train_epochs": 10,
|
| 136 |
+
"save_steps": 500,
|
| 137 |
+
"stateful_callbacks": {
|
| 138 |
+
"TrainerControl": {
|
| 139 |
+
"args": {
|
| 140 |
+
"should_epoch_stop": false,
|
| 141 |
+
"should_evaluate": false,
|
| 142 |
+
"should_log": false,
|
| 143 |
+
"should_save": true,
|
| 144 |
+
"should_training_stop": true
|
| 145 |
+
},
|
| 146 |
+
"attributes": {}
|
| 147 |
+
}
|
| 148 |
+
},
|
| 149 |
+
"total_flos": 274990104576000.0,
|
| 150 |
+
"train_batch_size": 16,
|
| 151 |
+
"trial_name": null,
|
| 152 |
+
"trial_params": null
|
| 153 |
+
}
|
trainer_state_3.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.000484466552734375,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_2-task3/checkpoint-1680",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.005573272705078125,
|
| 14 |
+
"eval_rouge1": 99.5247,
|
| 15 |
+
"eval_rouge2": 99.1868,
|
| 16 |
+
"eval_rougeL": 99.3767,
|
| 17 |
+
"eval_rougeLsum": 99.523,
|
| 18 |
+
"eval_runtime": 32.5017,
|
| 19 |
+
"eval_samples_per_second": 34.46,
|
| 20 |
+
"eval_steps_per_second": 1.077,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.003314971923828125,
|
| 26 |
+
"eval_rouge1": 99.6944,
|
| 27 |
+
"eval_rouge2": 99.4612,
|
| 28 |
+
"eval_rougeL": 99.5701,
|
| 29 |
+
"eval_rougeLsum": 99.6944,
|
| 30 |
+
"eval_runtime": 31.661,
|
| 31 |
+
"eval_samples_per_second": 35.375,
|
| 32 |
+
"eval_steps_per_second": 1.105,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 1.6153414249420166,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.0413,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.0029888153076171875,
|
| 45 |
+
"eval_rouge1": 99.779,
|
| 46 |
+
"eval_rouge2": 99.6013,
|
| 47 |
+
"eval_rougeL": 99.6771,
|
| 48 |
+
"eval_rougeLsum": 99.7743,
|
| 49 |
+
"eval_runtime": 32.3166,
|
| 50 |
+
"eval_samples_per_second": 34.657,
|
| 51 |
+
"eval_steps_per_second": 1.083,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.002841949462890625,
|
| 57 |
+
"eval_rouge1": 99.6692,
|
| 58 |
+
"eval_rouge2": 99.4798,
|
| 59 |
+
"eval_rougeL": 99.5698,
|
| 60 |
+
"eval_rougeLsum": 99.667,
|
| 61 |
+
"eval_runtime": 32.5644,
|
| 62 |
+
"eval_samples_per_second": 34.393,
|
| 63 |
+
"eval_steps_per_second": 1.075,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.001943291281349957,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0055,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.0016384124755859375,
|
| 76 |
+
"eval_rouge1": 99.8226,
|
| 77 |
+
"eval_rouge2": 99.6738,
|
| 78 |
+
"eval_rougeL": 99.7404,
|
| 79 |
+
"eval_rougeLsum": 99.8124,
|
| 80 |
+
"eval_runtime": 32.1973,
|
| 81 |
+
"eval_samples_per_second": 34.786,
|
| 82 |
+
"eval_steps_per_second": 1.087,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.0015192031860351562,
|
| 88 |
+
"eval_rouge1": 99.8683,
|
| 89 |
+
"eval_rouge2": 99.77,
|
| 90 |
+
"eval_rougeL": 99.8145,
|
| 91 |
+
"eval_rougeLsum": 99.8683,
|
| 92 |
+
"eval_runtime": 31.6836,
|
| 93 |
+
"eval_samples_per_second": 35.35,
|
| 94 |
+
"eval_steps_per_second": 1.105,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.001491546630859375,
|
| 100 |
+
"eval_rouge1": 99.9154,
|
| 101 |
+
"eval_rouge2": 99.8424,
|
| 102 |
+
"eval_rougeL": 99.8778,
|
| 103 |
+
"eval_rougeLsum": 99.9154,
|
| 104 |
+
"eval_runtime": 31.7073,
|
| 105 |
+
"eval_samples_per_second": 35.323,
|
| 106 |
+
"eval_steps_per_second": 1.104,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.034123487770557404,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0014,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.000484466552734375,
|
| 119 |
+
"eval_rouge1": 99.9577,
|
| 120 |
+
"eval_rouge2": 99.9212,
|
| 121 |
+
"eval_rougeL": 99.9389,
|
| 122 |
+
"eval_rougeLsum": 99.9577,
|
| 123 |
+
"eval_runtime": 31.5909,
|
| 124 |
+
"eval_samples_per_second": 35.453,
|
| 125 |
+
"eval_steps_per_second": 1.108,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.0007829666137695312,
|
| 131 |
+
"eval_rouge1": 99.9577,
|
| 132 |
+
"eval_rouge2": 99.9212,
|
| 133 |
+
"eval_rougeL": 99.9389,
|
| 134 |
+
"eval_rougeLsum": 99.9577,
|
| 135 |
+
"eval_runtime": 31.712,
|
| 136 |
+
"eval_samples_per_second": 35.318,
|
| 137 |
+
"eval_steps_per_second": 1.104,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 3.215530887246132e-05,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0007,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.0007658004760742188,
|
| 150 |
+
"eval_rouge1": 99.9577,
|
| 151 |
+
"eval_rouge2": 99.9212,
|
| 152 |
+
"eval_rougeL": 99.9389,
|
| 153 |
+
"eval_rougeLsum": 99.9577,
|
| 154 |
+
"eval_runtime": 31.6985,
|
| 155 |
+
"eval_samples_per_second": 35.333,
|
| 156 |
+
"eval_steps_per_second": 1.104,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_4.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.016448974609375,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_3-task4/checkpoint-1890",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.0394287109375,
|
| 14 |
+
"eval_rouge1": 98.8133,
|
| 15 |
+
"eval_rouge2": 98.2303,
|
| 16 |
+
"eval_rougeL": 98.6613,
|
| 17 |
+
"eval_rougeLsum": 98.8046,
|
| 18 |
+
"eval_runtime": 26.1194,
|
| 19 |
+
"eval_samples_per_second": 42.88,
|
| 20 |
+
"eval_steps_per_second": 1.34,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.032379150390625,
|
| 26 |
+
"eval_rouge1": 98.0957,
|
| 27 |
+
"eval_rouge2": 97.1666,
|
| 28 |
+
"eval_rougeL": 97.8621,
|
| 29 |
+
"eval_rougeLsum": 98.1056,
|
| 30 |
+
"eval_runtime": 25.8031,
|
| 31 |
+
"eval_samples_per_second": 43.406,
|
| 32 |
+
"eval_steps_per_second": 1.356,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 0.016574041917920113,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.0725,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.0189056396484375,
|
| 45 |
+
"eval_rouge1": 99.423,
|
| 46 |
+
"eval_rouge2": 99.1539,
|
| 47 |
+
"eval_rougeL": 99.359,
|
| 48 |
+
"eval_rougeLsum": 99.4285,
|
| 49 |
+
"eval_runtime": 25.7666,
|
| 50 |
+
"eval_samples_per_second": 43.467,
|
| 51 |
+
"eval_steps_per_second": 1.358,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.03497314453125,
|
| 57 |
+
"eval_rouge1": 98.9701,
|
| 58 |
+
"eval_rouge2": 98.497,
|
| 59 |
+
"eval_rougeL": 98.8414,
|
| 60 |
+
"eval_rougeLsum": 98.9712,
|
| 61 |
+
"eval_runtime": 25.9235,
|
| 62 |
+
"eval_samples_per_second": 43.204,
|
| 63 |
+
"eval_steps_per_second": 1.35,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.016128525137901306,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0226,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.0195159912109375,
|
| 76 |
+
"eval_rouge1": 99.2315,
|
| 77 |
+
"eval_rouge2": 98.8414,
|
| 78 |
+
"eval_rougeL": 99.1293,
|
| 79 |
+
"eval_rougeLsum": 99.2314,
|
| 80 |
+
"eval_runtime": 26.0919,
|
| 81 |
+
"eval_samples_per_second": 42.925,
|
| 82 |
+
"eval_steps_per_second": 1.341,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.020782470703125,
|
| 88 |
+
"eval_rouge1": 99.5165,
|
| 89 |
+
"eval_rouge2": 99.2985,
|
| 90 |
+
"eval_rougeL": 99.4726,
|
| 91 |
+
"eval_rougeLsum": 99.5153,
|
| 92 |
+
"eval_runtime": 25.9934,
|
| 93 |
+
"eval_samples_per_second": 43.088,
|
| 94 |
+
"eval_steps_per_second": 1.346,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.0180206298828125,
|
| 100 |
+
"eval_rouge1": 99.5187,
|
| 101 |
+
"eval_rouge2": 99.3048,
|
| 102 |
+
"eval_rougeL": 99.4708,
|
| 103 |
+
"eval_rougeLsum": 99.5346,
|
| 104 |
+
"eval_runtime": 25.8971,
|
| 105 |
+
"eval_samples_per_second": 43.248,
|
| 106 |
+
"eval_steps_per_second": 1.352,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.08678867667913437,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0096,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.016693115234375,
|
| 119 |
+
"eval_rouge1": 99.4715,
|
| 120 |
+
"eval_rouge2": 99.2326,
|
| 121 |
+
"eval_rougeL": 99.4096,
|
| 122 |
+
"eval_rougeLsum": 99.484,
|
| 123 |
+
"eval_runtime": 25.9209,
|
| 124 |
+
"eval_samples_per_second": 43.208,
|
| 125 |
+
"eval_steps_per_second": 1.35,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.016448974609375,
|
| 131 |
+
"eval_rouge1": 99.5187,
|
| 132 |
+
"eval_rouge2": 99.3048,
|
| 133 |
+
"eval_rougeL": 99.4708,
|
| 134 |
+
"eval_rougeLsum": 99.5346,
|
| 135 |
+
"eval_runtime": 25.8641,
|
| 136 |
+
"eval_samples_per_second": 43.303,
|
| 137 |
+
"eval_steps_per_second": 1.353,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.015132551081478596,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0063,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.0164794921875,
|
| 150 |
+
"eval_rouge1": 99.5187,
|
| 151 |
+
"eval_rouge2": 99.3048,
|
| 152 |
+
"eval_rougeL": 99.4708,
|
| 153 |
+
"eval_rougeLsum": 99.5346,
|
| 154 |
+
"eval_runtime": 25.9133,
|
| 155 |
+
"eval_samples_per_second": 43.221,
|
| 156 |
+
"eval_steps_per_second": 1.351,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_5.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.0059661865234375,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_4-task5/checkpoint-630",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.017303466796875,
|
| 14 |
+
"eval_rouge1": 98.4539,
|
| 15 |
+
"eval_rouge2": 97.5762,
|
| 16 |
+
"eval_rougeL": 98.1882,
|
| 17 |
+
"eval_rougeLsum": 98.4482,
|
| 18 |
+
"eval_runtime": 34.4843,
|
| 19 |
+
"eval_samples_per_second": 32.479,
|
| 20 |
+
"eval_steps_per_second": 1.015,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.00751495361328125,
|
| 26 |
+
"eval_rouge1": 99.1583,
|
| 27 |
+
"eval_rouge2": 98.6851,
|
| 28 |
+
"eval_rougeL": 99.0047,
|
| 29 |
+
"eval_rougeLsum": 99.1647,
|
| 30 |
+
"eval_runtime": 32.741,
|
| 31 |
+
"eval_samples_per_second": 34.208,
|
| 32 |
+
"eval_steps_per_second": 1.069,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 0.014579183422029018,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.0573,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.0059661865234375,
|
| 45 |
+
"eval_rouge1": 99.3646,
|
| 46 |
+
"eval_rouge2": 98.9205,
|
| 47 |
+
"eval_rougeL": 99.2221,
|
| 48 |
+
"eval_rougeLsum": 99.3603,
|
| 49 |
+
"eval_runtime": 32.9254,
|
| 50 |
+
"eval_samples_per_second": 34.016,
|
| 51 |
+
"eval_steps_per_second": 1.063,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.00925445556640625,
|
| 57 |
+
"eval_rouge1": 99.1633,
|
| 58 |
+
"eval_rouge2": 98.6891,
|
| 59 |
+
"eval_rougeL": 99.0235,
|
| 60 |
+
"eval_rougeLsum": 99.1601,
|
| 61 |
+
"eval_runtime": 33.2811,
|
| 62 |
+
"eval_samples_per_second": 33.653,
|
| 63 |
+
"eval_steps_per_second": 1.052,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.1038060188293457,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0102,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.007312774658203125,
|
| 76 |
+
"eval_rouge1": 99.3523,
|
| 77 |
+
"eval_rouge2": 98.8747,
|
| 78 |
+
"eval_rougeL": 99.1909,
|
| 79 |
+
"eval_rougeLsum": 99.3521,
|
| 80 |
+
"eval_runtime": 33.1107,
|
| 81 |
+
"eval_samples_per_second": 33.826,
|
| 82 |
+
"eval_steps_per_second": 1.057,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.008575439453125,
|
| 88 |
+
"eval_rouge1": 99.4615,
|
| 89 |
+
"eval_rouge2": 99.0736,
|
| 90 |
+
"eval_rougeL": 99.3351,
|
| 91 |
+
"eval_rougeLsum": 99.4494,
|
| 92 |
+
"eval_runtime": 32.8562,
|
| 93 |
+
"eval_samples_per_second": 34.088,
|
| 94 |
+
"eval_steps_per_second": 1.065,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.00952911376953125,
|
| 100 |
+
"eval_rouge1": 99.3799,
|
| 101 |
+
"eval_rouge2": 99.0097,
|
| 102 |
+
"eval_rougeL": 99.2562,
|
| 103 |
+
"eval_rougeLsum": 99.371,
|
| 104 |
+
"eval_runtime": 33.1813,
|
| 105 |
+
"eval_samples_per_second": 33.754,
|
| 106 |
+
"eval_steps_per_second": 1.055,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.025980567559599876,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0042,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.00867462158203125,
|
| 119 |
+
"eval_rouge1": 99.4488,
|
| 120 |
+
"eval_rouge2": 99.0827,
|
| 121 |
+
"eval_rougeL": 99.3346,
|
| 122 |
+
"eval_rougeLsum": 99.4596,
|
| 123 |
+
"eval_runtime": 32.7477,
|
| 124 |
+
"eval_samples_per_second": 34.201,
|
| 125 |
+
"eval_steps_per_second": 1.069,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.00786590576171875,
|
| 131 |
+
"eval_rouge1": 99.3538,
|
| 132 |
+
"eval_rouge2": 98.9422,
|
| 133 |
+
"eval_rougeL": 99.2192,
|
| 134 |
+
"eval_rougeLsum": 99.3563,
|
| 135 |
+
"eval_runtime": 33.0744,
|
| 136 |
+
"eval_samples_per_second": 33.863,
|
| 137 |
+
"eval_steps_per_second": 1.058,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.18442556262016296,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0023,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.007965087890625,
|
| 150 |
+
"eval_rouge1": 99.3538,
|
| 151 |
+
"eval_rouge2": 98.9422,
|
| 152 |
+
"eval_rougeL": 99.2192,
|
| 153 |
+
"eval_rougeLsum": 99.3563,
|
| 154 |
+
"eval_runtime": 33.0331,
|
| 155 |
+
"eval_samples_per_second": 33.905,
|
| 156 |
+
"eval_steps_per_second": 1.06,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_6.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.03607177734375,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_5-task6/checkpoint-1260",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.051544189453125,
|
| 14 |
+
"eval_rouge1": 97.5746,
|
| 15 |
+
"eval_rouge2": 95.9962,
|
| 16 |
+
"eval_rougeL": 96.8014,
|
| 17 |
+
"eval_rougeLsum": 97.5759,
|
| 18 |
+
"eval_runtime": 33.8471,
|
| 19 |
+
"eval_samples_per_second": 33.09,
|
| 20 |
+
"eval_steps_per_second": 1.034,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.038238525390625,
|
| 26 |
+
"eval_rouge1": 97.0504,
|
| 27 |
+
"eval_rouge2": 95.0089,
|
| 28 |
+
"eval_rougeL": 96.0484,
|
| 29 |
+
"eval_rougeLsum": 97.0773,
|
| 30 |
+
"eval_runtime": 33.7575,
|
| 31 |
+
"eval_samples_per_second": 33.178,
|
| 32 |
+
"eval_steps_per_second": 1.037,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 0.15559855103492737,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.0759,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.039306640625,
|
| 45 |
+
"eval_rouge1": 97.8822,
|
| 46 |
+
"eval_rouge2": 96.4368,
|
| 47 |
+
"eval_rougeL": 97.1693,
|
| 48 |
+
"eval_rougeLsum": 97.886,
|
| 49 |
+
"eval_runtime": 33.8292,
|
| 50 |
+
"eval_samples_per_second": 33.107,
|
| 51 |
+
"eval_steps_per_second": 1.035,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.036468505859375,
|
| 57 |
+
"eval_rouge1": 97.8707,
|
| 58 |
+
"eval_rouge2": 96.3824,
|
| 59 |
+
"eval_rougeL": 97.1565,
|
| 60 |
+
"eval_rougeLsum": 97.88,
|
| 61 |
+
"eval_runtime": 34.5353,
|
| 62 |
+
"eval_samples_per_second": 32.431,
|
| 63 |
+
"eval_steps_per_second": 1.013,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.021474618464708328,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0211,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.03961181640625,
|
| 76 |
+
"eval_rouge1": 97.5277,
|
| 77 |
+
"eval_rouge2": 95.8576,
|
| 78 |
+
"eval_rougeL": 96.691,
|
| 79 |
+
"eval_rougeLsum": 97.527,
|
| 80 |
+
"eval_runtime": 33.9424,
|
| 81 |
+
"eval_samples_per_second": 32.997,
|
| 82 |
+
"eval_steps_per_second": 1.031,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.03607177734375,
|
| 88 |
+
"eval_rouge1": 98.1371,
|
| 89 |
+
"eval_rouge2": 96.8791,
|
| 90 |
+
"eval_rougeL": 97.5059,
|
| 91 |
+
"eval_rougeLsum": 98.173,
|
| 92 |
+
"eval_runtime": 33.9961,
|
| 93 |
+
"eval_samples_per_second": 32.945,
|
| 94 |
+
"eval_steps_per_second": 1.03,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.040924072265625,
|
| 100 |
+
"eval_rouge1": 98.1004,
|
| 101 |
+
"eval_rouge2": 96.774,
|
| 102 |
+
"eval_rougeL": 97.4333,
|
| 103 |
+
"eval_rougeLsum": 98.1087,
|
| 104 |
+
"eval_runtime": 34.4539,
|
| 105 |
+
"eval_samples_per_second": 32.507,
|
| 106 |
+
"eval_steps_per_second": 1.016,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.08763577789068222,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0103,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.03826904296875,
|
| 119 |
+
"eval_rouge1": 97.8337,
|
| 120 |
+
"eval_rouge2": 96.3488,
|
| 121 |
+
"eval_rougeL": 97.096,
|
| 122 |
+
"eval_rougeLsum": 97.8664,
|
| 123 |
+
"eval_runtime": 34.3599,
|
| 124 |
+
"eval_samples_per_second": 32.596,
|
| 125 |
+
"eval_steps_per_second": 1.019,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.038909912109375,
|
| 131 |
+
"eval_rouge1": 97.9644,
|
| 132 |
+
"eval_rouge2": 96.525,
|
| 133 |
+
"eval_rougeL": 97.2236,
|
| 134 |
+
"eval_rougeLsum": 97.9585,
|
| 135 |
+
"eval_runtime": 33.8333,
|
| 136 |
+
"eval_samples_per_second": 33.103,
|
| 137 |
+
"eval_steps_per_second": 1.034,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.19326545298099518,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0071,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.039276123046875,
|
| 150 |
+
"eval_rouge1": 98.0097,
|
| 151 |
+
"eval_rouge2": 96.6105,
|
| 152 |
+
"eval_rougeL": 97.2833,
|
| 153 |
+
"eval_rougeLsum": 98.0092,
|
| 154 |
+
"eval_runtime": 33.8974,
|
| 155 |
+
"eval_samples_per_second": 33.041,
|
| 156 |
+
"eval_steps_per_second": 1.033,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_7.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.0484619140625,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_6-task7/checkpoint-1470",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.1253662109375,
|
| 14 |
+
"eval_rouge1": 93.102,
|
| 15 |
+
"eval_rouge2": 89.1364,
|
| 16 |
+
"eval_rougeL": 92.1128,
|
| 17 |
+
"eval_rougeLsum": 93.0898,
|
| 18 |
+
"eval_runtime": 30.5369,
|
| 19 |
+
"eval_samples_per_second": 36.677,
|
| 20 |
+
"eval_steps_per_second": 1.146,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.07708740234375,
|
| 26 |
+
"eval_rouge1": 94.6253,
|
| 27 |
+
"eval_rouge2": 91.7196,
|
| 28 |
+
"eval_rougeL": 93.9284,
|
| 29 |
+
"eval_rougeLsum": 94.6153,
|
| 30 |
+
"eval_runtime": 29.0183,
|
| 31 |
+
"eval_samples_per_second": 38.596,
|
| 32 |
+
"eval_steps_per_second": 1.206,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 0.4770593047142029,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.1042,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.05633544921875,
|
| 45 |
+
"eval_rouge1": 95.1246,
|
| 46 |
+
"eval_rouge2": 92.2081,
|
| 47 |
+
"eval_rougeL": 94.3701,
|
| 48 |
+
"eval_rougeLsum": 95.1249,
|
| 49 |
+
"eval_runtime": 28.82,
|
| 50 |
+
"eval_samples_per_second": 38.862,
|
| 51 |
+
"eval_steps_per_second": 1.214,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.052490234375,
|
| 57 |
+
"eval_rouge1": 95.9748,
|
| 58 |
+
"eval_rouge2": 93.6071,
|
| 59 |
+
"eval_rougeL": 95.3787,
|
| 60 |
+
"eval_rougeLsum": 95.9622,
|
| 61 |
+
"eval_runtime": 28.7568,
|
| 62 |
+
"eval_samples_per_second": 38.947,
|
| 63 |
+
"eval_steps_per_second": 1.217,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.2157868593931198,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0397,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.052734375,
|
| 76 |
+
"eval_rouge1": 96.4573,
|
| 77 |
+
"eval_rouge2": 94.4045,
|
| 78 |
+
"eval_rougeL": 95.96,
|
| 79 |
+
"eval_rougeLsum": 96.4689,
|
| 80 |
+
"eval_runtime": 28.2262,
|
| 81 |
+
"eval_samples_per_second": 39.679,
|
| 82 |
+
"eval_steps_per_second": 1.24,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.053009033203125,
|
| 88 |
+
"eval_rouge1": 96.692,
|
| 89 |
+
"eval_rouge2": 94.7143,
|
| 90 |
+
"eval_rougeL": 96.2205,
|
| 91 |
+
"eval_rougeLsum": 96.6725,
|
| 92 |
+
"eval_runtime": 28.1396,
|
| 93 |
+
"eval_samples_per_second": 39.802,
|
| 94 |
+
"eval_steps_per_second": 1.244,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.0484619140625,
|
| 100 |
+
"eval_rouge1": 96.2898,
|
| 101 |
+
"eval_rouge2": 94.1342,
|
| 102 |
+
"eval_rougeL": 95.7357,
|
| 103 |
+
"eval_rougeLsum": 96.2989,
|
| 104 |
+
"eval_runtime": 28.7901,
|
| 105 |
+
"eval_samples_per_second": 38.902,
|
| 106 |
+
"eval_steps_per_second": 1.216,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.13793426752090454,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0253,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.050872802734375,
|
| 119 |
+
"eval_rouge1": 96.419,
|
| 120 |
+
"eval_rouge2": 94.2908,
|
| 121 |
+
"eval_rougeL": 95.887,
|
| 122 |
+
"eval_rougeLsum": 96.431,
|
| 123 |
+
"eval_runtime": 28.8197,
|
| 124 |
+
"eval_samples_per_second": 38.862,
|
| 125 |
+
"eval_steps_per_second": 1.214,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.050994873046875,
|
| 131 |
+
"eval_rouge1": 96.5301,
|
| 132 |
+
"eval_rouge2": 94.465,
|
| 133 |
+
"eval_rougeL": 96.014,
|
| 134 |
+
"eval_rougeLsum": 96.5445,
|
| 135 |
+
"eval_runtime": 28.8499,
|
| 136 |
+
"eval_samples_per_second": 38.822,
|
| 137 |
+
"eval_steps_per_second": 1.213,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.44433069229125977,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0172,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.05072021484375,
|
| 150 |
+
"eval_rouge1": 96.5339,
|
| 151 |
+
"eval_rouge2": 94.4734,
|
| 152 |
+
"eval_rougeL": 96.0133,
|
| 153 |
+
"eval_rougeLsum": 96.5439,
|
| 154 |
+
"eval_runtime": 28.8295,
|
| 155 |
+
"eval_samples_per_second": 38.849,
|
| 156 |
+
"eval_steps_per_second": 1.214,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_8.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.053009033203125,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_7-task8/checkpoint-2100",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.061859130859375,
|
| 14 |
+
"eval_rouge1": 95.8441,
|
| 15 |
+
"eval_rouge2": 93.3788,
|
| 16 |
+
"eval_rougeL": 95.0225,
|
| 17 |
+
"eval_rougeLsum": 95.815,
|
| 18 |
+
"eval_runtime": 27.9858,
|
| 19 |
+
"eval_samples_per_second": 40.02,
|
| 20 |
+
"eval_steps_per_second": 1.251,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.06298828125,
|
| 26 |
+
"eval_rouge1": 96.0859,
|
| 27 |
+
"eval_rouge2": 93.638,
|
| 28 |
+
"eval_rougeL": 95.2187,
|
| 29 |
+
"eval_rougeLsum": 96.1001,
|
| 30 |
+
"eval_runtime": 27.0107,
|
| 31 |
+
"eval_samples_per_second": 41.465,
|
| 32 |
+
"eval_steps_per_second": 1.296,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 1.263279914855957,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.1261,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.065185546875,
|
| 45 |
+
"eval_rouge1": 96.0899,
|
| 46 |
+
"eval_rouge2": 93.5338,
|
| 47 |
+
"eval_rougeL": 95.2307,
|
| 48 |
+
"eval_rougeLsum": 96.1038,
|
| 49 |
+
"eval_runtime": 26.8529,
|
| 50 |
+
"eval_samples_per_second": 41.709,
|
| 51 |
+
"eval_steps_per_second": 1.303,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.056182861328125,
|
| 57 |
+
"eval_rouge1": 97.029,
|
| 58 |
+
"eval_rouge2": 95.1136,
|
| 59 |
+
"eval_rougeL": 96.381,
|
| 60 |
+
"eval_rougeLsum": 97.0201,
|
| 61 |
+
"eval_runtime": 27.0655,
|
| 62 |
+
"eval_samples_per_second": 41.381,
|
| 63 |
+
"eval_steps_per_second": 1.293,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.31182149052619934,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0504,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.0604248046875,
|
| 76 |
+
"eval_rouge1": 96.8222,
|
| 77 |
+
"eval_rouge2": 94.606,
|
| 78 |
+
"eval_rougeL": 96.0562,
|
| 79 |
+
"eval_rougeLsum": 96.8378,
|
| 80 |
+
"eval_runtime": 26.2531,
|
| 81 |
+
"eval_samples_per_second": 42.662,
|
| 82 |
+
"eval_steps_per_second": 1.333,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.057708740234375,
|
| 88 |
+
"eval_rouge1": 97.1054,
|
| 89 |
+
"eval_rouge2": 95.2868,
|
| 90 |
+
"eval_rougeL": 96.4767,
|
| 91 |
+
"eval_rougeLsum": 97.1103,
|
| 92 |
+
"eval_runtime": 27.0931,
|
| 93 |
+
"eval_samples_per_second": 41.339,
|
| 94 |
+
"eval_steps_per_second": 1.292,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.053466796875,
|
| 100 |
+
"eval_rouge1": 97.2485,
|
| 101 |
+
"eval_rouge2": 95.4551,
|
| 102 |
+
"eval_rougeL": 96.597,
|
| 103 |
+
"eval_rougeLsum": 97.2618,
|
| 104 |
+
"eval_runtime": 27.2167,
|
| 105 |
+
"eval_samples_per_second": 41.151,
|
| 106 |
+
"eval_steps_per_second": 1.286,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 1.2603168487548828,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0298,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.053192138671875,
|
| 119 |
+
"eval_rouge1": 97.2732,
|
| 120 |
+
"eval_rouge2": 95.5288,
|
| 121 |
+
"eval_rougeL": 96.7034,
|
| 122 |
+
"eval_rougeLsum": 97.2673,
|
| 123 |
+
"eval_runtime": 27.1844,
|
| 124 |
+
"eval_samples_per_second": 41.2,
|
| 125 |
+
"eval_steps_per_second": 1.288,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.053436279296875,
|
| 131 |
+
"eval_rouge1": 97.159,
|
| 132 |
+
"eval_rouge2": 95.4117,
|
| 133 |
+
"eval_rougeL": 96.5979,
|
| 134 |
+
"eval_rougeLsum": 97.1608,
|
| 135 |
+
"eval_runtime": 26.8091,
|
| 136 |
+
"eval_samples_per_second": 41.777,
|
| 137 |
+
"eval_steps_per_second": 1.306,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.7883169054985046,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0214,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.053009033203125,
|
| 150 |
+
"eval_rouge1": 97.2705,
|
| 151 |
+
"eval_rouge2": 95.5761,
|
| 152 |
+
"eval_rougeL": 96.7291,
|
| 153 |
+
"eval_rougeLsum": 97.2653,
|
| 154 |
+
"eval_runtime": 26.7465,
|
| 155 |
+
"eval_samples_per_second": 41.875,
|
| 156 |
+
"eval_steps_per_second": 1.309,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|
trainer_state_9.json
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": 0.0093231201171875,
|
| 3 |
+
"best_model_checkpoint": "model_fewrel_1_8-task9/checkpoint-1890",
|
| 4 |
+
"epoch": 10.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 2100,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1.0,
|
| 13 |
+
"eval_loss": 0.012847900390625,
|
| 14 |
+
"eval_rouge1": 98.6287,
|
| 15 |
+
"eval_rouge2": 98.0199,
|
| 16 |
+
"eval_rougeL": 98.3895,
|
| 17 |
+
"eval_rougeLsum": 98.5995,
|
| 18 |
+
"eval_runtime": 34.0174,
|
| 19 |
+
"eval_samples_per_second": 32.924,
|
| 20 |
+
"eval_steps_per_second": 1.029,
|
| 21 |
+
"step": 210
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 2.0,
|
| 25 |
+
"eval_loss": 0.0115509033203125,
|
| 26 |
+
"eval_rouge1": 98.9154,
|
| 27 |
+
"eval_rouge2": 98.5367,
|
| 28 |
+
"eval_rougeL": 98.8038,
|
| 29 |
+
"eval_rougeLsum": 98.9131,
|
| 30 |
+
"eval_runtime": 34.4471,
|
| 31 |
+
"eval_samples_per_second": 32.514,
|
| 32 |
+
"eval_steps_per_second": 1.016,
|
| 33 |
+
"step": 420
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 2.380952380952381,
|
| 37 |
+
"grad_norm": 0.4975210428237915,
|
| 38 |
+
"learning_rate": 0.0008665259359149131,
|
| 39 |
+
"loss": 0.055,
|
| 40 |
+
"step": 500
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 3.0,
|
| 44 |
+
"eval_loss": 0.01337432861328125,
|
| 45 |
+
"eval_rouge1": 98.6194,
|
| 46 |
+
"eval_rouge2": 98.0506,
|
| 47 |
+
"eval_rougeL": 98.3926,
|
| 48 |
+
"eval_rougeLsum": 98.5818,
|
| 49 |
+
"eval_runtime": 33.5087,
|
| 50 |
+
"eval_samples_per_second": 33.424,
|
| 51 |
+
"eval_steps_per_second": 1.045,
|
| 52 |
+
"step": 630
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 4.0,
|
| 56 |
+
"eval_loss": 0.01132965087890625,
|
| 57 |
+
"eval_rouge1": 98.8522,
|
| 58 |
+
"eval_rouge2": 98.4253,
|
| 59 |
+
"eval_rougeL": 98.7051,
|
| 60 |
+
"eval_rougeLsum": 98.8197,
|
| 61 |
+
"eval_runtime": 33.1182,
|
| 62 |
+
"eval_samples_per_second": 33.818,
|
| 63 |
+
"eval_steps_per_second": 1.057,
|
| 64 |
+
"step": 840
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 4.761904761904762,
|
| 68 |
+
"grad_norm": 0.9489365816116333,
|
| 69 |
+
"learning_rate": 0.0005373650467932121,
|
| 70 |
+
"loss": 0.0088,
|
| 71 |
+
"step": 1000
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"epoch": 5.0,
|
| 75 |
+
"eval_loss": 0.01326751708984375,
|
| 76 |
+
"eval_rouge1": 99.2134,
|
| 77 |
+
"eval_rouge2": 98.8765,
|
| 78 |
+
"eval_rougeL": 99.0941,
|
| 79 |
+
"eval_rougeLsum": 99.2096,
|
| 80 |
+
"eval_runtime": 36.9437,
|
| 81 |
+
"eval_samples_per_second": 30.316,
|
| 82 |
+
"eval_steps_per_second": 0.947,
|
| 83 |
+
"step": 1050
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"epoch": 6.0,
|
| 87 |
+
"eval_loss": 0.01346588134765625,
|
| 88 |
+
"eval_rouge1": 99.2312,
|
| 89 |
+
"eval_rouge2": 98.8839,
|
| 90 |
+
"eval_rougeL": 99.0944,
|
| 91 |
+
"eval_rougeLsum": 99.2153,
|
| 92 |
+
"eval_runtime": 34.9945,
|
| 93 |
+
"eval_samples_per_second": 32.005,
|
| 94 |
+
"eval_steps_per_second": 1.0,
|
| 95 |
+
"step": 1260
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 7.0,
|
| 99 |
+
"eval_loss": 0.00937652587890625,
|
| 100 |
+
"eval_rouge1": 99.5998,
|
| 101 |
+
"eval_rouge2": 99.3899,
|
| 102 |
+
"eval_rougeL": 99.5205,
|
| 103 |
+
"eval_rougeLsum": 99.5994,
|
| 104 |
+
"eval_runtime": 34.6991,
|
| 105 |
+
"eval_samples_per_second": 32.278,
|
| 106 |
+
"eval_steps_per_second": 1.009,
|
| 107 |
+
"step": 1470
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 7.142857142857143,
|
| 111 |
+
"grad_norm": 0.030887478962540627,
|
| 112 |
+
"learning_rate": 0.00018825509907063325,
|
| 113 |
+
"loss": 0.0032,
|
| 114 |
+
"step": 1500
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 8.0,
|
| 118 |
+
"eval_loss": 0.010223388671875,
|
| 119 |
+
"eval_rouge1": 99.4127,
|
| 120 |
+
"eval_rouge2": 99.1295,
|
| 121 |
+
"eval_rougeL": 99.3072,
|
| 122 |
+
"eval_rougeLsum": 99.3986,
|
| 123 |
+
"eval_runtime": 34.3802,
|
| 124 |
+
"eval_samples_per_second": 32.577,
|
| 125 |
+
"eval_steps_per_second": 1.018,
|
| 126 |
+
"step": 1680
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 9.0,
|
| 130 |
+
"eval_loss": 0.0093231201171875,
|
| 131 |
+
"eval_rouge1": 99.4127,
|
| 132 |
+
"eval_rouge2": 99.1295,
|
| 133 |
+
"eval_rougeL": 99.3072,
|
| 134 |
+
"eval_rougeLsum": 99.3986,
|
| 135 |
+
"eval_runtime": 34.2644,
|
| 136 |
+
"eval_samples_per_second": 32.687,
|
| 137 |
+
"eval_steps_per_second": 1.021,
|
| 138 |
+
"step": 1890
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 9.523809523809524,
|
| 142 |
+
"grad_norm": 0.0026726792566478252,
|
| 143 |
+
"learning_rate": 5.5845868874357386e-06,
|
| 144 |
+
"loss": 0.0011,
|
| 145 |
+
"step": 2000
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 10.0,
|
| 149 |
+
"eval_loss": 0.00940704345703125,
|
| 150 |
+
"eval_rouge1": 99.4127,
|
| 151 |
+
"eval_rouge2": 99.1295,
|
| 152 |
+
"eval_rougeL": 99.3072,
|
| 153 |
+
"eval_rougeLsum": 99.3986,
|
| 154 |
+
"eval_runtime": 34.3879,
|
| 155 |
+
"eval_samples_per_second": 32.57,
|
| 156 |
+
"eval_steps_per_second": 1.018,
|
| 157 |
+
"step": 2100
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"logging_steps": 500,
|
| 161 |
+
"max_steps": 2100,
|
| 162 |
+
"num_input_tokens_seen": 0,
|
| 163 |
+
"num_train_epochs": 10,
|
| 164 |
+
"save_steps": 500,
|
| 165 |
+
"stateful_callbacks": {
|
| 166 |
+
"TrainerControl": {
|
| 167 |
+
"args": {
|
| 168 |
+
"should_epoch_stop": false,
|
| 169 |
+
"should_evaluate": false,
|
| 170 |
+
"should_log": false,
|
| 171 |
+
"should_save": true,
|
| 172 |
+
"should_training_stop": true
|
| 173 |
+
},
|
| 174 |
+
"attributes": {}
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"total_flos": 2.3099168784384e+16,
|
| 178 |
+
"train_batch_size": 16,
|
| 179 |
+
"trial_name": null,
|
| 180 |
+
"trial_params": null
|
| 181 |
+
}
|