Sefika commited on
Commit
e81706f
·
verified ·
1 Parent(s): 301d50f

Upload 9 files

Browse files

add trainer_states for run 1

trainer_state_1.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0300140380859375,
3
+ "best_model_checkpoint": "model_fewrel_1_1-task2/checkpoint-1260",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.0692138671875,
14
+ "eval_rouge1": 95.7517,
15
+ "eval_rouge2": 94.6841,
16
+ "eval_rougeL": 95.6971,
17
+ "eval_rougeLsum": 95.7331,
18
+ "eval_runtime": 33.2038,
19
+ "eval_samples_per_second": 33.731,
20
+ "eval_steps_per_second": 1.054,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.034423828125,
26
+ "eval_rouge1": 97.351,
27
+ "eval_rouge2": 96.6251,
28
+ "eval_rougeL": 97.3032,
29
+ "eval_rougeLsum": 97.2964,
30
+ "eval_runtime": 32.6308,
31
+ "eval_samples_per_second": 34.323,
32
+ "eval_steps_per_second": 1.073,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 0.4508669972419739,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.0924,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.03924560546875,
45
+ "eval_rouge1": 97.2483,
46
+ "eval_rouge2": 96.6208,
47
+ "eval_rougeL": 97.2291,
48
+ "eval_rougeLsum": 97.2002,
49
+ "eval_runtime": 33.4876,
50
+ "eval_samples_per_second": 33.445,
51
+ "eval_steps_per_second": 1.045,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.0413818359375,
57
+ "eval_rouge1": 97.1906,
58
+ "eval_rouge2": 96.559,
59
+ "eval_rougeL": 97.1839,
60
+ "eval_rougeLsum": 97.1621,
61
+ "eval_runtime": 34.4429,
62
+ "eval_samples_per_second": 32.518,
63
+ "eval_steps_per_second": 1.016,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.1619143784046173,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0276,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.042449951171875,
76
+ "eval_rouge1": 97.4464,
77
+ "eval_rouge2": 96.8611,
78
+ "eval_rougeL": 97.4297,
79
+ "eval_rougeLsum": 97.4266,
80
+ "eval_runtime": 32.4102,
81
+ "eval_samples_per_second": 34.557,
82
+ "eval_steps_per_second": 1.08,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.0300140380859375,
88
+ "eval_rouge1": 98.1516,
89
+ "eval_rouge2": 97.6994,
90
+ "eval_rougeL": 98.1475,
91
+ "eval_rougeLsum": 98.155,
92
+ "eval_runtime": 32.3626,
93
+ "eval_samples_per_second": 34.608,
94
+ "eval_steps_per_second": 1.081,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.031494140625,
100
+ "eval_rouge1": 97.6953,
101
+ "eval_rouge2": 97.1861,
102
+ "eval_rougeL": 97.7355,
103
+ "eval_rougeLsum": 97.713,
104
+ "eval_runtime": 31.6892,
105
+ "eval_samples_per_second": 35.343,
106
+ "eval_steps_per_second": 1.104,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.17737896740436554,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0138,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.03729248046875,
119
+ "eval_rouge1": 98.0557,
120
+ "eval_rouge2": 97.5844,
121
+ "eval_rougeL": 98.0434,
122
+ "eval_rougeLsum": 98.0396,
123
+ "eval_runtime": 32.4888,
124
+ "eval_samples_per_second": 34.473,
125
+ "eval_steps_per_second": 1.077,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.0333251953125,
131
+ "eval_rouge1": 98.2547,
132
+ "eval_rouge2": 97.8119,
133
+ "eval_rougeL": 98.2452,
134
+ "eval_rougeLsum": 98.2669,
135
+ "eval_runtime": 31.823,
136
+ "eval_samples_per_second": 35.195,
137
+ "eval_steps_per_second": 1.1,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.1923867166042328,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0088,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.03363037109375,
150
+ "eval_rouge1": 98.2547,
151
+ "eval_rouge2": 97.8119,
152
+ "eval_rougeL": 98.2452,
153
+ "eval_rougeLsum": 98.2669,
154
+ "eval_runtime": 31.6927,
155
+ "eval_samples_per_second": 35.339,
156
+ "eval_steps_per_second": 1.104,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_2.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0810546875,
3
+ "best_model_checkpoint": "model_fewrel_1_2-task2/checkpoint-6",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 30,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.10284423828125,
14
+ "eval_rouge1": 95.9465,
15
+ "eval_rouge2": 93.9682,
16
+ "eval_rougeL": 95.3944,
17
+ "eval_rougeLsum": 95.9557,
18
+ "eval_runtime": 29.9579,
19
+ "eval_samples_per_second": 37.386,
20
+ "eval_steps_per_second": 1.168,
21
+ "step": 3
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.0810546875,
26
+ "eval_rouge1": 96.5004,
27
+ "eval_rouge2": 94.78,
28
+ "eval_rougeL": 96.0088,
29
+ "eval_rougeLsum": 96.4773,
30
+ "eval_runtime": 28.554,
31
+ "eval_samples_per_second": 39.224,
32
+ "eval_steps_per_second": 1.226,
33
+ "step": 6
34
+ },
35
+ {
36
+ "epoch": 3.0,
37
+ "eval_loss": 0.0819091796875,
38
+ "eval_rouge1": 96.6176,
39
+ "eval_rouge2": 94.9705,
40
+ "eval_rougeL": 96.1457,
41
+ "eval_rougeLsum": 96.6106,
42
+ "eval_runtime": 28.3985,
43
+ "eval_samples_per_second": 39.439,
44
+ "eval_steps_per_second": 1.232,
45
+ "step": 9
46
+ },
47
+ {
48
+ "epoch": 4.0,
49
+ "eval_loss": 0.095703125,
50
+ "eval_rouge1": 96.1348,
51
+ "eval_rouge2": 94.1813,
52
+ "eval_rougeL": 95.5333,
53
+ "eval_rougeLsum": 96.1226,
54
+ "eval_runtime": 27.8269,
55
+ "eval_samples_per_second": 40.249,
56
+ "eval_steps_per_second": 1.258,
57
+ "step": 12
58
+ },
59
+ {
60
+ "epoch": 5.0,
61
+ "eval_loss": 0.1134033203125,
62
+ "eval_rouge1": 95.4562,
63
+ "eval_rouge2": 93.2875,
64
+ "eval_rougeL": 94.812,
65
+ "eval_rougeLsum": 95.4534,
66
+ "eval_runtime": 27.905,
67
+ "eval_samples_per_second": 40.136,
68
+ "eval_steps_per_second": 1.254,
69
+ "step": 15
70
+ },
71
+ {
72
+ "epoch": 6.0,
73
+ "eval_loss": 0.1260986328125,
74
+ "eval_rouge1": 94.9572,
75
+ "eval_rouge2": 92.576,
76
+ "eval_rougeL": 94.3038,
77
+ "eval_rougeLsum": 94.9406,
78
+ "eval_runtime": 27.7213,
79
+ "eval_samples_per_second": 40.402,
80
+ "eval_steps_per_second": 1.263,
81
+ "step": 18
82
+ },
83
+ {
84
+ "epoch": 7.0,
85
+ "eval_loss": 0.132080078125,
86
+ "eval_rouge1": 94.7046,
87
+ "eval_rouge2": 92.2489,
88
+ "eval_rougeL": 94.0746,
89
+ "eval_rougeLsum": 94.7023,
90
+ "eval_runtime": 27.3971,
91
+ "eval_samples_per_second": 40.88,
92
+ "eval_steps_per_second": 1.278,
93
+ "step": 21
94
+ },
95
+ {
96
+ "epoch": 8.0,
97
+ "eval_loss": 0.1346435546875,
98
+ "eval_rouge1": 94.6117,
99
+ "eval_rouge2": 92.0736,
100
+ "eval_rougeL": 93.9435,
101
+ "eval_rougeLsum": 94.6048,
102
+ "eval_runtime": 27.2256,
103
+ "eval_samples_per_second": 41.138,
104
+ "eval_steps_per_second": 1.286,
105
+ "step": 24
106
+ },
107
+ {
108
+ "epoch": 9.0,
109
+ "eval_loss": 0.1351318359375,
110
+ "eval_rouge1": 94.5465,
111
+ "eval_rouge2": 91.9795,
112
+ "eval_rougeL": 93.8758,
113
+ "eval_rougeLsum": 94.5584,
114
+ "eval_runtime": 27.1737,
115
+ "eval_samples_per_second": 41.216,
116
+ "eval_steps_per_second": 1.288,
117
+ "step": 27
118
+ },
119
+ {
120
+ "epoch": 10.0,
121
+ "eval_loss": 0.1353759765625,
122
+ "eval_rouge1": 94.5465,
123
+ "eval_rouge2": 91.9795,
124
+ "eval_rougeL": 93.8758,
125
+ "eval_rougeLsum": 94.5584,
126
+ "eval_runtime": 27.1504,
127
+ "eval_samples_per_second": 41.252,
128
+ "eval_steps_per_second": 1.289,
129
+ "step": 30
130
+ }
131
+ ],
132
+ "logging_steps": 500,
133
+ "max_steps": 30,
134
+ "num_input_tokens_seen": 0,
135
+ "num_train_epochs": 10,
136
+ "save_steps": 500,
137
+ "stateful_callbacks": {
138
+ "TrainerControl": {
139
+ "args": {
140
+ "should_epoch_stop": false,
141
+ "should_evaluate": false,
142
+ "should_log": false,
143
+ "should_save": true,
144
+ "should_training_stop": true
145
+ },
146
+ "attributes": {}
147
+ }
148
+ },
149
+ "total_flos": 274990104576000.0,
150
+ "train_batch_size": 16,
151
+ "trial_name": null,
152
+ "trial_params": null
153
+ }
trainer_state_3.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.000484466552734375,
3
+ "best_model_checkpoint": "model_fewrel_1_2-task3/checkpoint-1680",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.005573272705078125,
14
+ "eval_rouge1": 99.5247,
15
+ "eval_rouge2": 99.1868,
16
+ "eval_rougeL": 99.3767,
17
+ "eval_rougeLsum": 99.523,
18
+ "eval_runtime": 32.5017,
19
+ "eval_samples_per_second": 34.46,
20
+ "eval_steps_per_second": 1.077,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.003314971923828125,
26
+ "eval_rouge1": 99.6944,
27
+ "eval_rouge2": 99.4612,
28
+ "eval_rougeL": 99.5701,
29
+ "eval_rougeLsum": 99.6944,
30
+ "eval_runtime": 31.661,
31
+ "eval_samples_per_second": 35.375,
32
+ "eval_steps_per_second": 1.105,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 1.6153414249420166,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.0413,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.0029888153076171875,
45
+ "eval_rouge1": 99.779,
46
+ "eval_rouge2": 99.6013,
47
+ "eval_rougeL": 99.6771,
48
+ "eval_rougeLsum": 99.7743,
49
+ "eval_runtime": 32.3166,
50
+ "eval_samples_per_second": 34.657,
51
+ "eval_steps_per_second": 1.083,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.002841949462890625,
57
+ "eval_rouge1": 99.6692,
58
+ "eval_rouge2": 99.4798,
59
+ "eval_rougeL": 99.5698,
60
+ "eval_rougeLsum": 99.667,
61
+ "eval_runtime": 32.5644,
62
+ "eval_samples_per_second": 34.393,
63
+ "eval_steps_per_second": 1.075,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.001943291281349957,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0055,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.0016384124755859375,
76
+ "eval_rouge1": 99.8226,
77
+ "eval_rouge2": 99.6738,
78
+ "eval_rougeL": 99.7404,
79
+ "eval_rougeLsum": 99.8124,
80
+ "eval_runtime": 32.1973,
81
+ "eval_samples_per_second": 34.786,
82
+ "eval_steps_per_second": 1.087,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.0015192031860351562,
88
+ "eval_rouge1": 99.8683,
89
+ "eval_rouge2": 99.77,
90
+ "eval_rougeL": 99.8145,
91
+ "eval_rougeLsum": 99.8683,
92
+ "eval_runtime": 31.6836,
93
+ "eval_samples_per_second": 35.35,
94
+ "eval_steps_per_second": 1.105,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.001491546630859375,
100
+ "eval_rouge1": 99.9154,
101
+ "eval_rouge2": 99.8424,
102
+ "eval_rougeL": 99.8778,
103
+ "eval_rougeLsum": 99.9154,
104
+ "eval_runtime": 31.7073,
105
+ "eval_samples_per_second": 35.323,
106
+ "eval_steps_per_second": 1.104,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.034123487770557404,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0014,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.000484466552734375,
119
+ "eval_rouge1": 99.9577,
120
+ "eval_rouge2": 99.9212,
121
+ "eval_rougeL": 99.9389,
122
+ "eval_rougeLsum": 99.9577,
123
+ "eval_runtime": 31.5909,
124
+ "eval_samples_per_second": 35.453,
125
+ "eval_steps_per_second": 1.108,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.0007829666137695312,
131
+ "eval_rouge1": 99.9577,
132
+ "eval_rouge2": 99.9212,
133
+ "eval_rougeL": 99.9389,
134
+ "eval_rougeLsum": 99.9577,
135
+ "eval_runtime": 31.712,
136
+ "eval_samples_per_second": 35.318,
137
+ "eval_steps_per_second": 1.104,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 3.215530887246132e-05,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0007,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.0007658004760742188,
150
+ "eval_rouge1": 99.9577,
151
+ "eval_rouge2": 99.9212,
152
+ "eval_rougeL": 99.9389,
153
+ "eval_rougeLsum": 99.9577,
154
+ "eval_runtime": 31.6985,
155
+ "eval_samples_per_second": 35.333,
156
+ "eval_steps_per_second": 1.104,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_4.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.016448974609375,
3
+ "best_model_checkpoint": "model_fewrel_1_3-task4/checkpoint-1890",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.0394287109375,
14
+ "eval_rouge1": 98.8133,
15
+ "eval_rouge2": 98.2303,
16
+ "eval_rougeL": 98.6613,
17
+ "eval_rougeLsum": 98.8046,
18
+ "eval_runtime": 26.1194,
19
+ "eval_samples_per_second": 42.88,
20
+ "eval_steps_per_second": 1.34,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.032379150390625,
26
+ "eval_rouge1": 98.0957,
27
+ "eval_rouge2": 97.1666,
28
+ "eval_rougeL": 97.8621,
29
+ "eval_rougeLsum": 98.1056,
30
+ "eval_runtime": 25.8031,
31
+ "eval_samples_per_second": 43.406,
32
+ "eval_steps_per_second": 1.356,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 0.016574041917920113,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.0725,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.0189056396484375,
45
+ "eval_rouge1": 99.423,
46
+ "eval_rouge2": 99.1539,
47
+ "eval_rougeL": 99.359,
48
+ "eval_rougeLsum": 99.4285,
49
+ "eval_runtime": 25.7666,
50
+ "eval_samples_per_second": 43.467,
51
+ "eval_steps_per_second": 1.358,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.03497314453125,
57
+ "eval_rouge1": 98.9701,
58
+ "eval_rouge2": 98.497,
59
+ "eval_rougeL": 98.8414,
60
+ "eval_rougeLsum": 98.9712,
61
+ "eval_runtime": 25.9235,
62
+ "eval_samples_per_second": 43.204,
63
+ "eval_steps_per_second": 1.35,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.016128525137901306,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0226,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.0195159912109375,
76
+ "eval_rouge1": 99.2315,
77
+ "eval_rouge2": 98.8414,
78
+ "eval_rougeL": 99.1293,
79
+ "eval_rougeLsum": 99.2314,
80
+ "eval_runtime": 26.0919,
81
+ "eval_samples_per_second": 42.925,
82
+ "eval_steps_per_second": 1.341,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.020782470703125,
88
+ "eval_rouge1": 99.5165,
89
+ "eval_rouge2": 99.2985,
90
+ "eval_rougeL": 99.4726,
91
+ "eval_rougeLsum": 99.5153,
92
+ "eval_runtime": 25.9934,
93
+ "eval_samples_per_second": 43.088,
94
+ "eval_steps_per_second": 1.346,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.0180206298828125,
100
+ "eval_rouge1": 99.5187,
101
+ "eval_rouge2": 99.3048,
102
+ "eval_rougeL": 99.4708,
103
+ "eval_rougeLsum": 99.5346,
104
+ "eval_runtime": 25.8971,
105
+ "eval_samples_per_second": 43.248,
106
+ "eval_steps_per_second": 1.352,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.08678867667913437,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0096,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.016693115234375,
119
+ "eval_rouge1": 99.4715,
120
+ "eval_rouge2": 99.2326,
121
+ "eval_rougeL": 99.4096,
122
+ "eval_rougeLsum": 99.484,
123
+ "eval_runtime": 25.9209,
124
+ "eval_samples_per_second": 43.208,
125
+ "eval_steps_per_second": 1.35,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.016448974609375,
131
+ "eval_rouge1": 99.5187,
132
+ "eval_rouge2": 99.3048,
133
+ "eval_rougeL": 99.4708,
134
+ "eval_rougeLsum": 99.5346,
135
+ "eval_runtime": 25.8641,
136
+ "eval_samples_per_second": 43.303,
137
+ "eval_steps_per_second": 1.353,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.015132551081478596,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0063,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.0164794921875,
150
+ "eval_rouge1": 99.5187,
151
+ "eval_rouge2": 99.3048,
152
+ "eval_rougeL": 99.4708,
153
+ "eval_rougeLsum": 99.5346,
154
+ "eval_runtime": 25.9133,
155
+ "eval_samples_per_second": 43.221,
156
+ "eval_steps_per_second": 1.351,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_5.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0059661865234375,
3
+ "best_model_checkpoint": "model_fewrel_1_4-task5/checkpoint-630",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.017303466796875,
14
+ "eval_rouge1": 98.4539,
15
+ "eval_rouge2": 97.5762,
16
+ "eval_rougeL": 98.1882,
17
+ "eval_rougeLsum": 98.4482,
18
+ "eval_runtime": 34.4843,
19
+ "eval_samples_per_second": 32.479,
20
+ "eval_steps_per_second": 1.015,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.00751495361328125,
26
+ "eval_rouge1": 99.1583,
27
+ "eval_rouge2": 98.6851,
28
+ "eval_rougeL": 99.0047,
29
+ "eval_rougeLsum": 99.1647,
30
+ "eval_runtime": 32.741,
31
+ "eval_samples_per_second": 34.208,
32
+ "eval_steps_per_second": 1.069,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 0.014579183422029018,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.0573,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.0059661865234375,
45
+ "eval_rouge1": 99.3646,
46
+ "eval_rouge2": 98.9205,
47
+ "eval_rougeL": 99.2221,
48
+ "eval_rougeLsum": 99.3603,
49
+ "eval_runtime": 32.9254,
50
+ "eval_samples_per_second": 34.016,
51
+ "eval_steps_per_second": 1.063,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.00925445556640625,
57
+ "eval_rouge1": 99.1633,
58
+ "eval_rouge2": 98.6891,
59
+ "eval_rougeL": 99.0235,
60
+ "eval_rougeLsum": 99.1601,
61
+ "eval_runtime": 33.2811,
62
+ "eval_samples_per_second": 33.653,
63
+ "eval_steps_per_second": 1.052,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.1038060188293457,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0102,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.007312774658203125,
76
+ "eval_rouge1": 99.3523,
77
+ "eval_rouge2": 98.8747,
78
+ "eval_rougeL": 99.1909,
79
+ "eval_rougeLsum": 99.3521,
80
+ "eval_runtime": 33.1107,
81
+ "eval_samples_per_second": 33.826,
82
+ "eval_steps_per_second": 1.057,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.008575439453125,
88
+ "eval_rouge1": 99.4615,
89
+ "eval_rouge2": 99.0736,
90
+ "eval_rougeL": 99.3351,
91
+ "eval_rougeLsum": 99.4494,
92
+ "eval_runtime": 32.8562,
93
+ "eval_samples_per_second": 34.088,
94
+ "eval_steps_per_second": 1.065,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.00952911376953125,
100
+ "eval_rouge1": 99.3799,
101
+ "eval_rouge2": 99.0097,
102
+ "eval_rougeL": 99.2562,
103
+ "eval_rougeLsum": 99.371,
104
+ "eval_runtime": 33.1813,
105
+ "eval_samples_per_second": 33.754,
106
+ "eval_steps_per_second": 1.055,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.025980567559599876,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0042,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.00867462158203125,
119
+ "eval_rouge1": 99.4488,
120
+ "eval_rouge2": 99.0827,
121
+ "eval_rougeL": 99.3346,
122
+ "eval_rougeLsum": 99.4596,
123
+ "eval_runtime": 32.7477,
124
+ "eval_samples_per_second": 34.201,
125
+ "eval_steps_per_second": 1.069,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.00786590576171875,
131
+ "eval_rouge1": 99.3538,
132
+ "eval_rouge2": 98.9422,
133
+ "eval_rougeL": 99.2192,
134
+ "eval_rougeLsum": 99.3563,
135
+ "eval_runtime": 33.0744,
136
+ "eval_samples_per_second": 33.863,
137
+ "eval_steps_per_second": 1.058,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.18442556262016296,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0023,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.007965087890625,
150
+ "eval_rouge1": 99.3538,
151
+ "eval_rouge2": 98.9422,
152
+ "eval_rougeL": 99.2192,
153
+ "eval_rougeLsum": 99.3563,
154
+ "eval_runtime": 33.0331,
155
+ "eval_samples_per_second": 33.905,
156
+ "eval_steps_per_second": 1.06,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_6.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.03607177734375,
3
+ "best_model_checkpoint": "model_fewrel_1_5-task6/checkpoint-1260",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.051544189453125,
14
+ "eval_rouge1": 97.5746,
15
+ "eval_rouge2": 95.9962,
16
+ "eval_rougeL": 96.8014,
17
+ "eval_rougeLsum": 97.5759,
18
+ "eval_runtime": 33.8471,
19
+ "eval_samples_per_second": 33.09,
20
+ "eval_steps_per_second": 1.034,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.038238525390625,
26
+ "eval_rouge1": 97.0504,
27
+ "eval_rouge2": 95.0089,
28
+ "eval_rougeL": 96.0484,
29
+ "eval_rougeLsum": 97.0773,
30
+ "eval_runtime": 33.7575,
31
+ "eval_samples_per_second": 33.178,
32
+ "eval_steps_per_second": 1.037,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 0.15559855103492737,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.0759,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.039306640625,
45
+ "eval_rouge1": 97.8822,
46
+ "eval_rouge2": 96.4368,
47
+ "eval_rougeL": 97.1693,
48
+ "eval_rougeLsum": 97.886,
49
+ "eval_runtime": 33.8292,
50
+ "eval_samples_per_second": 33.107,
51
+ "eval_steps_per_second": 1.035,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.036468505859375,
57
+ "eval_rouge1": 97.8707,
58
+ "eval_rouge2": 96.3824,
59
+ "eval_rougeL": 97.1565,
60
+ "eval_rougeLsum": 97.88,
61
+ "eval_runtime": 34.5353,
62
+ "eval_samples_per_second": 32.431,
63
+ "eval_steps_per_second": 1.013,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.021474618464708328,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0211,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.03961181640625,
76
+ "eval_rouge1": 97.5277,
77
+ "eval_rouge2": 95.8576,
78
+ "eval_rougeL": 96.691,
79
+ "eval_rougeLsum": 97.527,
80
+ "eval_runtime": 33.9424,
81
+ "eval_samples_per_second": 32.997,
82
+ "eval_steps_per_second": 1.031,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.03607177734375,
88
+ "eval_rouge1": 98.1371,
89
+ "eval_rouge2": 96.8791,
90
+ "eval_rougeL": 97.5059,
91
+ "eval_rougeLsum": 98.173,
92
+ "eval_runtime": 33.9961,
93
+ "eval_samples_per_second": 32.945,
94
+ "eval_steps_per_second": 1.03,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.040924072265625,
100
+ "eval_rouge1": 98.1004,
101
+ "eval_rouge2": 96.774,
102
+ "eval_rougeL": 97.4333,
103
+ "eval_rougeLsum": 98.1087,
104
+ "eval_runtime": 34.4539,
105
+ "eval_samples_per_second": 32.507,
106
+ "eval_steps_per_second": 1.016,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.08763577789068222,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0103,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.03826904296875,
119
+ "eval_rouge1": 97.8337,
120
+ "eval_rouge2": 96.3488,
121
+ "eval_rougeL": 97.096,
122
+ "eval_rougeLsum": 97.8664,
123
+ "eval_runtime": 34.3599,
124
+ "eval_samples_per_second": 32.596,
125
+ "eval_steps_per_second": 1.019,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.038909912109375,
131
+ "eval_rouge1": 97.9644,
132
+ "eval_rouge2": 96.525,
133
+ "eval_rougeL": 97.2236,
134
+ "eval_rougeLsum": 97.9585,
135
+ "eval_runtime": 33.8333,
136
+ "eval_samples_per_second": 33.103,
137
+ "eval_steps_per_second": 1.034,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.19326545298099518,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0071,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.039276123046875,
150
+ "eval_rouge1": 98.0097,
151
+ "eval_rouge2": 96.6105,
152
+ "eval_rougeL": 97.2833,
153
+ "eval_rougeLsum": 98.0092,
154
+ "eval_runtime": 33.8974,
155
+ "eval_samples_per_second": 33.041,
156
+ "eval_steps_per_second": 1.033,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_7.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0484619140625,
3
+ "best_model_checkpoint": "model_fewrel_1_6-task7/checkpoint-1470",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.1253662109375,
14
+ "eval_rouge1": 93.102,
15
+ "eval_rouge2": 89.1364,
16
+ "eval_rougeL": 92.1128,
17
+ "eval_rougeLsum": 93.0898,
18
+ "eval_runtime": 30.5369,
19
+ "eval_samples_per_second": 36.677,
20
+ "eval_steps_per_second": 1.146,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.07708740234375,
26
+ "eval_rouge1": 94.6253,
27
+ "eval_rouge2": 91.7196,
28
+ "eval_rougeL": 93.9284,
29
+ "eval_rougeLsum": 94.6153,
30
+ "eval_runtime": 29.0183,
31
+ "eval_samples_per_second": 38.596,
32
+ "eval_steps_per_second": 1.206,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 0.4770593047142029,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.1042,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.05633544921875,
45
+ "eval_rouge1": 95.1246,
46
+ "eval_rouge2": 92.2081,
47
+ "eval_rougeL": 94.3701,
48
+ "eval_rougeLsum": 95.1249,
49
+ "eval_runtime": 28.82,
50
+ "eval_samples_per_second": 38.862,
51
+ "eval_steps_per_second": 1.214,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.052490234375,
57
+ "eval_rouge1": 95.9748,
58
+ "eval_rouge2": 93.6071,
59
+ "eval_rougeL": 95.3787,
60
+ "eval_rougeLsum": 95.9622,
61
+ "eval_runtime": 28.7568,
62
+ "eval_samples_per_second": 38.947,
63
+ "eval_steps_per_second": 1.217,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.2157868593931198,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0397,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.052734375,
76
+ "eval_rouge1": 96.4573,
77
+ "eval_rouge2": 94.4045,
78
+ "eval_rougeL": 95.96,
79
+ "eval_rougeLsum": 96.4689,
80
+ "eval_runtime": 28.2262,
81
+ "eval_samples_per_second": 39.679,
82
+ "eval_steps_per_second": 1.24,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.053009033203125,
88
+ "eval_rouge1": 96.692,
89
+ "eval_rouge2": 94.7143,
90
+ "eval_rougeL": 96.2205,
91
+ "eval_rougeLsum": 96.6725,
92
+ "eval_runtime": 28.1396,
93
+ "eval_samples_per_second": 39.802,
94
+ "eval_steps_per_second": 1.244,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.0484619140625,
100
+ "eval_rouge1": 96.2898,
101
+ "eval_rouge2": 94.1342,
102
+ "eval_rougeL": 95.7357,
103
+ "eval_rougeLsum": 96.2989,
104
+ "eval_runtime": 28.7901,
105
+ "eval_samples_per_second": 38.902,
106
+ "eval_steps_per_second": 1.216,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.13793426752090454,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0253,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.050872802734375,
119
+ "eval_rouge1": 96.419,
120
+ "eval_rouge2": 94.2908,
121
+ "eval_rougeL": 95.887,
122
+ "eval_rougeLsum": 96.431,
123
+ "eval_runtime": 28.8197,
124
+ "eval_samples_per_second": 38.862,
125
+ "eval_steps_per_second": 1.214,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.050994873046875,
131
+ "eval_rouge1": 96.5301,
132
+ "eval_rouge2": 94.465,
133
+ "eval_rougeL": 96.014,
134
+ "eval_rougeLsum": 96.5445,
135
+ "eval_runtime": 28.8499,
136
+ "eval_samples_per_second": 38.822,
137
+ "eval_steps_per_second": 1.213,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.44433069229125977,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0172,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.05072021484375,
150
+ "eval_rouge1": 96.5339,
151
+ "eval_rouge2": 94.4734,
152
+ "eval_rougeL": 96.0133,
153
+ "eval_rougeLsum": 96.5439,
154
+ "eval_runtime": 28.8295,
155
+ "eval_samples_per_second": 38.849,
156
+ "eval_steps_per_second": 1.214,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_8.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.053009033203125,
3
+ "best_model_checkpoint": "model_fewrel_1_7-task8/checkpoint-2100",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.061859130859375,
14
+ "eval_rouge1": 95.8441,
15
+ "eval_rouge2": 93.3788,
16
+ "eval_rougeL": 95.0225,
17
+ "eval_rougeLsum": 95.815,
18
+ "eval_runtime": 27.9858,
19
+ "eval_samples_per_second": 40.02,
20
+ "eval_steps_per_second": 1.251,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.06298828125,
26
+ "eval_rouge1": 96.0859,
27
+ "eval_rouge2": 93.638,
28
+ "eval_rougeL": 95.2187,
29
+ "eval_rougeLsum": 96.1001,
30
+ "eval_runtime": 27.0107,
31
+ "eval_samples_per_second": 41.465,
32
+ "eval_steps_per_second": 1.296,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 1.263279914855957,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.1261,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.065185546875,
45
+ "eval_rouge1": 96.0899,
46
+ "eval_rouge2": 93.5338,
47
+ "eval_rougeL": 95.2307,
48
+ "eval_rougeLsum": 96.1038,
49
+ "eval_runtime": 26.8529,
50
+ "eval_samples_per_second": 41.709,
51
+ "eval_steps_per_second": 1.303,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.056182861328125,
57
+ "eval_rouge1": 97.029,
58
+ "eval_rouge2": 95.1136,
59
+ "eval_rougeL": 96.381,
60
+ "eval_rougeLsum": 97.0201,
61
+ "eval_runtime": 27.0655,
62
+ "eval_samples_per_second": 41.381,
63
+ "eval_steps_per_second": 1.293,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.31182149052619934,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0504,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.0604248046875,
76
+ "eval_rouge1": 96.8222,
77
+ "eval_rouge2": 94.606,
78
+ "eval_rougeL": 96.0562,
79
+ "eval_rougeLsum": 96.8378,
80
+ "eval_runtime": 26.2531,
81
+ "eval_samples_per_second": 42.662,
82
+ "eval_steps_per_second": 1.333,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.057708740234375,
88
+ "eval_rouge1": 97.1054,
89
+ "eval_rouge2": 95.2868,
90
+ "eval_rougeL": 96.4767,
91
+ "eval_rougeLsum": 97.1103,
92
+ "eval_runtime": 27.0931,
93
+ "eval_samples_per_second": 41.339,
94
+ "eval_steps_per_second": 1.292,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.053466796875,
100
+ "eval_rouge1": 97.2485,
101
+ "eval_rouge2": 95.4551,
102
+ "eval_rougeL": 96.597,
103
+ "eval_rougeLsum": 97.2618,
104
+ "eval_runtime": 27.2167,
105
+ "eval_samples_per_second": 41.151,
106
+ "eval_steps_per_second": 1.286,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 1.2603168487548828,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0298,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.053192138671875,
119
+ "eval_rouge1": 97.2732,
120
+ "eval_rouge2": 95.5288,
121
+ "eval_rougeL": 96.7034,
122
+ "eval_rougeLsum": 97.2673,
123
+ "eval_runtime": 27.1844,
124
+ "eval_samples_per_second": 41.2,
125
+ "eval_steps_per_second": 1.288,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.053436279296875,
131
+ "eval_rouge1": 97.159,
132
+ "eval_rouge2": 95.4117,
133
+ "eval_rougeL": 96.5979,
134
+ "eval_rougeLsum": 97.1608,
135
+ "eval_runtime": 26.8091,
136
+ "eval_samples_per_second": 41.777,
137
+ "eval_steps_per_second": 1.306,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.7883169054985046,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0214,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.053009033203125,
150
+ "eval_rouge1": 97.2705,
151
+ "eval_rouge2": 95.5761,
152
+ "eval_rougeL": 96.7291,
153
+ "eval_rougeLsum": 97.2653,
154
+ "eval_runtime": 26.7465,
155
+ "eval_samples_per_second": 41.875,
156
+ "eval_steps_per_second": 1.309,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }
trainer_state_9.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0093231201171875,
3
+ "best_model_checkpoint": "model_fewrel_1_8-task9/checkpoint-1890",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.012847900390625,
14
+ "eval_rouge1": 98.6287,
15
+ "eval_rouge2": 98.0199,
16
+ "eval_rougeL": 98.3895,
17
+ "eval_rougeLsum": 98.5995,
18
+ "eval_runtime": 34.0174,
19
+ "eval_samples_per_second": 32.924,
20
+ "eval_steps_per_second": 1.029,
21
+ "step": 210
22
+ },
23
+ {
24
+ "epoch": 2.0,
25
+ "eval_loss": 0.0115509033203125,
26
+ "eval_rouge1": 98.9154,
27
+ "eval_rouge2": 98.5367,
28
+ "eval_rougeL": 98.8038,
29
+ "eval_rougeLsum": 98.9131,
30
+ "eval_runtime": 34.4471,
31
+ "eval_samples_per_second": 32.514,
32
+ "eval_steps_per_second": 1.016,
33
+ "step": 420
34
+ },
35
+ {
36
+ "epoch": 2.380952380952381,
37
+ "grad_norm": 0.4975210428237915,
38
+ "learning_rate": 0.0008665259359149131,
39
+ "loss": 0.055,
40
+ "step": 500
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "eval_loss": 0.01337432861328125,
45
+ "eval_rouge1": 98.6194,
46
+ "eval_rouge2": 98.0506,
47
+ "eval_rougeL": 98.3926,
48
+ "eval_rougeLsum": 98.5818,
49
+ "eval_runtime": 33.5087,
50
+ "eval_samples_per_second": 33.424,
51
+ "eval_steps_per_second": 1.045,
52
+ "step": 630
53
+ },
54
+ {
55
+ "epoch": 4.0,
56
+ "eval_loss": 0.01132965087890625,
57
+ "eval_rouge1": 98.8522,
58
+ "eval_rouge2": 98.4253,
59
+ "eval_rougeL": 98.7051,
60
+ "eval_rougeLsum": 98.8197,
61
+ "eval_runtime": 33.1182,
62
+ "eval_samples_per_second": 33.818,
63
+ "eval_steps_per_second": 1.057,
64
+ "step": 840
65
+ },
66
+ {
67
+ "epoch": 4.761904761904762,
68
+ "grad_norm": 0.9489365816116333,
69
+ "learning_rate": 0.0005373650467932121,
70
+ "loss": 0.0088,
71
+ "step": 1000
72
+ },
73
+ {
74
+ "epoch": 5.0,
75
+ "eval_loss": 0.01326751708984375,
76
+ "eval_rouge1": 99.2134,
77
+ "eval_rouge2": 98.8765,
78
+ "eval_rougeL": 99.0941,
79
+ "eval_rougeLsum": 99.2096,
80
+ "eval_runtime": 36.9437,
81
+ "eval_samples_per_second": 30.316,
82
+ "eval_steps_per_second": 0.947,
83
+ "step": 1050
84
+ },
85
+ {
86
+ "epoch": 6.0,
87
+ "eval_loss": 0.01346588134765625,
88
+ "eval_rouge1": 99.2312,
89
+ "eval_rouge2": 98.8839,
90
+ "eval_rougeL": 99.0944,
91
+ "eval_rougeLsum": 99.2153,
92
+ "eval_runtime": 34.9945,
93
+ "eval_samples_per_second": 32.005,
94
+ "eval_steps_per_second": 1.0,
95
+ "step": 1260
96
+ },
97
+ {
98
+ "epoch": 7.0,
99
+ "eval_loss": 0.00937652587890625,
100
+ "eval_rouge1": 99.5998,
101
+ "eval_rouge2": 99.3899,
102
+ "eval_rougeL": 99.5205,
103
+ "eval_rougeLsum": 99.5994,
104
+ "eval_runtime": 34.6991,
105
+ "eval_samples_per_second": 32.278,
106
+ "eval_steps_per_second": 1.009,
107
+ "step": 1470
108
+ },
109
+ {
110
+ "epoch": 7.142857142857143,
111
+ "grad_norm": 0.030887478962540627,
112
+ "learning_rate": 0.00018825509907063325,
113
+ "loss": 0.0032,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_loss": 0.010223388671875,
119
+ "eval_rouge1": 99.4127,
120
+ "eval_rouge2": 99.1295,
121
+ "eval_rougeL": 99.3072,
122
+ "eval_rougeLsum": 99.3986,
123
+ "eval_runtime": 34.3802,
124
+ "eval_samples_per_second": 32.577,
125
+ "eval_steps_per_second": 1.018,
126
+ "step": 1680
127
+ },
128
+ {
129
+ "epoch": 9.0,
130
+ "eval_loss": 0.0093231201171875,
131
+ "eval_rouge1": 99.4127,
132
+ "eval_rouge2": 99.1295,
133
+ "eval_rougeL": 99.3072,
134
+ "eval_rougeLsum": 99.3986,
135
+ "eval_runtime": 34.2644,
136
+ "eval_samples_per_second": 32.687,
137
+ "eval_steps_per_second": 1.021,
138
+ "step": 1890
139
+ },
140
+ {
141
+ "epoch": 9.523809523809524,
142
+ "grad_norm": 0.0026726792566478252,
143
+ "learning_rate": 5.5845868874357386e-06,
144
+ "loss": 0.0011,
145
+ "step": 2000
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "eval_loss": 0.00940704345703125,
150
+ "eval_rouge1": 99.4127,
151
+ "eval_rouge2": 99.1295,
152
+ "eval_rougeL": 99.3072,
153
+ "eval_rougeLsum": 99.3986,
154
+ "eval_runtime": 34.3879,
155
+ "eval_samples_per_second": 32.57,
156
+ "eval_steps_per_second": 1.018,
157
+ "step": 2100
158
+ }
159
+ ],
160
+ "logging_steps": 500,
161
+ "max_steps": 2100,
162
+ "num_input_tokens_seen": 0,
163
+ "num_train_epochs": 10,
164
+ "save_steps": 500,
165
+ "stateful_callbacks": {
166
+ "TrainerControl": {
167
+ "args": {
168
+ "should_epoch_stop": false,
169
+ "should_evaluate": false,
170
+ "should_log": false,
171
+ "should_save": true,
172
+ "should_training_stop": true
173
+ },
174
+ "attributes": {}
175
+ }
176
+ },
177
+ "total_flos": 2.3099168784384e+16,
178
+ "train_batch_size": 16,
179
+ "trial_name": null,
180
+ "trial_params": null
181
+ }