VSPuzzler committed on
Commit
782f201
·
1 Parent(s): 0ff8303

Upload with huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +1 -1
  2. optimizer.pt +1 -1
  3. pytorch_model.bin +1 -1
  4. rng_state.pth +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +248 -106
  7. training_args.bin +1 -1
config.json CHANGED
@@ -30,6 +30,6 @@
30
  "sinusoidal_pos_embds": false,
31
  "tie_weights_": true,
32
  "torch_dtype": "float32",
33
- "transformers_version": "4.22.1",
34
  "vocab_size": 30522
35
  }
 
30
  "sinusoidal_pos_embds": false,
31
  "tie_weights_": true,
32
  "torch_dtype": "float32",
33
+ "transformers_version": "4.22.2",
34
  "vocab_size": 30522
35
  }
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:431ad2bedc905016d50214e1a771b3219558d30779f412266033ca17ed597886
3
  size 535706209
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7327fe504822a05662b22d3cfe21ff0e7de6c6e4b7c374380cae868ea4a93eb6
3
  size 535706209
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2f4c51e09cea740af5b2f44c2c1e35be36e170c124a1f4f3773393c5f001950
3
  size 267857393
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7076639dfca474f09c658516b24750b35f5c801e3c8a6e7d16258b7436dcbb07
3
  size 267857393
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:963cf45a726db07a7c0602709ff01c87a884dcaf23b158e5db3ae50351d7f5c3
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bb946997ff0f7320ddf7f019e45e0110a3e181f25be61b45566bc48e9f18dfa
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61560d7c3158ca687a6c99d6feb9d1ce2d657a43e44e3abe5cd9a9a2d2302612
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d77a8263d2c9ab2d2fd99749ca0bd684b2d23ea39ae6b5deffc35ec07d427d80
3
  size 623
trainer_state.json CHANGED
@@ -1,138 +1,280 @@
1
  {
2
- "best_metric": 0.5875,
3
- "best_model_checkpoint": "./results/checkpoint-500",
4
- "epoch": 2.5,
5
- "global_step": 500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
- {
11
- "epoch": 0.25,
12
- "eval_accuracy": 0.5875,
13
- "eval_latency_in_seconds": 0.016140898968749935,
14
- "eval_loss": 1.0195873975753784,
15
- "eval_runtime": 37.9935,
16
- "eval_samples_per_second": 21.056,
17
- "eval_steps_per_second": 1.316,
18
- "eval_total_time_in_seconds": 12.91271917499995,
19
- "step": 50
20
- },
21
- {
22
- "epoch": 0.5,
23
- "eval_accuracy": 0.5875,
24
- "eval_latency_in_seconds": 0.016577634310000065,
25
- "eval_loss": 0.9936361908912659,
26
- "eval_runtime": 30.0792,
27
- "eval_samples_per_second": 26.596,
28
- "eval_steps_per_second": 1.662,
29
- "eval_total_time_in_seconds": 13.262107448000052,
30
- "step": 100
31
- },
32
- {
33
- "epoch": 0.75,
34
- "eval_accuracy": 0.5875,
35
- "eval_latency_in_seconds": 0.016618314516250052,
36
- "eval_loss": 1.0124518871307373,
37
- "eval_runtime": 29.9038,
38
- "eval_samples_per_second": 26.752,
39
- "eval_steps_per_second": 1.672,
40
- "eval_total_time_in_seconds": 13.294651613000042,
41
- "step": 150
42
- },
43
  {
44
  "epoch": 1.0,
45
  "learning_rate": 1.9e-05,
46
- "loss": 1.007,
47
- "step": 200
48
- },
49
- {
50
- "epoch": 1.0,
51
- "eval_accuracy": 0.5875,
52
- "eval_latency_in_seconds": 0.01668884421499996,
53
- "eval_loss": 1.0121251344680786,
54
- "eval_runtime": 29.8443,
55
- "eval_samples_per_second": 26.806,
56
- "eval_steps_per_second": 1.675,
57
- "eval_total_time_in_seconds": 13.351075371999968,
58
  "step": 200
59
  },
60
  {
61
  "epoch": 1.25,
62
- "eval_accuracy": 0.5875,
63
- "eval_latency_in_seconds": 0.016656154167500006,
64
- "eval_loss": 0.9109314680099487,
65
- "eval_runtime": 29.8759,
66
- "eval_samples_per_second": 26.777,
67
- "eval_steps_per_second": 1.674,
68
- "eval_total_time_in_seconds": 13.324923334000005,
69
  "step": 250
70
  },
71
  {
72
- "epoch": 1.5,
73
- "eval_accuracy": 0.5875,
74
- "eval_latency_in_seconds": 0.016652411622499984,
75
- "eval_loss": 0.8836429119110107,
76
- "eval_runtime": 29.8232,
77
- "eval_samples_per_second": 26.825,
78
- "eval_steps_per_second": 1.677,
79
- "eval_total_time_in_seconds": 13.321929297999986,
80
- "step": 300
81
  },
82
  {
83
- "epoch": 1.75,
84
- "eval_accuracy": 0.5875,
85
- "eval_latency_in_seconds": 0.016655641563749983,
86
- "eval_loss": 0.8780828714370728,
87
- "eval_runtime": 29.9106,
88
- "eval_samples_per_second": 26.746,
89
- "eval_steps_per_second": 1.672,
90
- "eval_total_time_in_seconds": 13.324513250999985,
91
- "step": 350
92
  },
93
  {
94
- "epoch": 2.0,
95
- "learning_rate": 1.8e-05,
96
- "loss": 0.8906,
97
- "step": 400
98
  },
99
  {
100
- "epoch": 2.0,
101
- "eval_accuracy": 0.5875,
102
- "eval_latency_in_seconds": 0.016710829281249884,
103
- "eval_loss": 0.844797670841217,
104
- "eval_runtime": 29.9696,
105
- "eval_samples_per_second": 26.694,
106
- "eval_steps_per_second": 1.668,
107
- "eval_total_time_in_seconds": 13.368663424999909,
108
- "step": 400
109
  },
110
  {
111
- "epoch": 2.25,
112
- "eval_accuracy": 0.5875,
113
- "eval_latency_in_seconds": 0.016734712483749945,
114
- "eval_loss": 0.8375944495201111,
115
- "eval_runtime": 29.9532,
116
- "eval_samples_per_second": 26.708,
117
- "eval_steps_per_second": 1.669,
118
- "eval_total_time_in_seconds": 13.387769986999956,
119
- "step": 450
120
  },
121
  {
122
- "epoch": 2.5,
123
- "eval_accuracy": 0.5875,
124
- "eval_latency_in_seconds": 0.016665541630000006,
125
- "eval_loss": 0.828183114528656,
126
- "eval_runtime": 29.8508,
127
- "eval_samples_per_second": 26.8,
128
- "eval_steps_per_second": 1.675,
129
- "eval_total_time_in_seconds": 13.332433304000006,
130
- "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  ],
133
  "max_steps": 4000,
134
  "num_train_epochs": 20,
135
- "total_flos": 1059758088192000.0,
136
  "trial_name": null,
137
  "trial_params": null
138
  }
 
1
  {
2
+ "best_metric": 0.62125,
3
+ "best_model_checkpoint": "./results/checkpoint-4000",
4
+ "epoch": 20.0,
5
+ "global_step": 4000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  {
11
  "epoch": 1.0,
12
  "learning_rate": 1.9e-05,
13
+ "loss": 0.6815,
 
 
 
 
 
 
 
 
 
 
 
14
  "step": 200
15
  },
16
  {
17
  "epoch": 1.25,
18
+ "eval_accuracy": 0.63875,
19
+ "eval_loss": 0.9424235820770264,
20
+ "eval_runtime": 15.8114,
21
+ "eval_samples_per_second": 50.596,
22
+ "eval_steps_per_second": 3.162,
 
 
23
  "step": 250
24
  },
25
  {
26
+ "epoch": 2.0,
27
+ "learning_rate": 1.8e-05,
28
+ "loss": 0.4755,
29
+ "step": 400
 
 
 
 
 
30
  },
31
  {
32
+ "epoch": 2.5,
33
+ "eval_accuracy": 0.6175,
34
+ "eval_loss": 1.0750542879104614,
35
+ "eval_runtime": 15.7962,
36
+ "eval_samples_per_second": 50.645,
37
+ "eval_steps_per_second": 3.165,
38
+ "step": 500
 
 
39
  },
40
  {
41
+ "epoch": 3.0,
42
+ "learning_rate": 1.7e-05,
43
+ "loss": 0.3536,
44
+ "step": 600
45
  },
46
  {
47
+ "epoch": 3.75,
48
+ "eval_accuracy": 0.6125,
49
+ "eval_loss": 1.3325245380401611,
50
+ "eval_runtime": 15.7804,
51
+ "eval_samples_per_second": 50.696,
52
+ "eval_steps_per_second": 3.168,
53
+ "step": 750
 
 
54
  },
55
  {
56
+ "epoch": 4.0,
57
+ "learning_rate": 1.6000000000000003e-05,
58
+ "loss": 0.2045,
59
+ "step": 800
 
 
 
 
 
60
  },
61
  {
62
+ "epoch": 5.0,
63
+ "learning_rate": 1.5000000000000002e-05,
64
+ "loss": 0.124,
65
+ "step": 1000
66
+ },
67
+ {
68
+ "epoch": 5.0,
69
+ "eval_accuracy": 0.61625,
70
+ "eval_loss": 1.8875421285629272,
71
+ "eval_runtime": 15.7904,
72
+ "eval_samples_per_second": 50.664,
73
+ "eval_steps_per_second": 3.166,
74
+ "step": 1000
75
+ },
76
+ {
77
+ "epoch": 6.0,
78
+ "learning_rate": 1.4e-05,
79
+ "loss": 0.0825,
80
+ "step": 1200
81
+ },
82
+ {
83
+ "epoch": 6.25,
84
+ "eval_accuracy": 0.5975,
85
+ "eval_loss": 2.205134868621826,
86
+ "eval_runtime": 15.7943,
87
+ "eval_samples_per_second": 50.651,
88
+ "eval_steps_per_second": 3.166,
89
+ "step": 1250
90
+ },
91
+ {
92
+ "epoch": 7.0,
93
+ "learning_rate": 1.3000000000000001e-05,
94
+ "loss": 0.046,
95
+ "step": 1400
96
+ },
97
+ {
98
+ "epoch": 7.5,
99
+ "eval_accuracy": 0.59625,
100
+ "eval_loss": 2.5616235733032227,
101
+ "eval_runtime": 15.8017,
102
+ "eval_samples_per_second": 50.627,
103
+ "eval_steps_per_second": 3.164,
104
+ "step": 1500
105
+ },
106
+ {
107
+ "epoch": 8.0,
108
+ "learning_rate": 1.2e-05,
109
+ "loss": 0.0331,
110
+ "step": 1600
111
+ },
112
+ {
113
+ "epoch": 8.75,
114
+ "eval_accuracy": 0.62125,
115
+ "eval_loss": 2.4932045936584473,
116
+ "eval_runtime": 15.809,
117
+ "eval_samples_per_second": 50.604,
118
+ "eval_steps_per_second": 3.163,
119
+ "step": 1750
120
+ },
121
+ {
122
+ "epoch": 9.0,
123
+ "learning_rate": 1.1000000000000001e-05,
124
+ "loss": 0.033,
125
+ "step": 1800
126
+ },
127
+ {
128
+ "epoch": 10.0,
129
+ "learning_rate": 1e-05,
130
+ "loss": 0.0143,
131
+ "step": 2000
132
+ },
133
+ {
134
+ "epoch": 10.0,
135
+ "eval_accuracy": 0.5975,
136
+ "eval_loss": 2.796316623687744,
137
+ "eval_runtime": 15.7544,
138
+ "eval_samples_per_second": 50.78,
139
+ "eval_steps_per_second": 3.174,
140
+ "step": 2000
141
+ },
142
+ {
143
+ "epoch": 11.0,
144
+ "learning_rate": 9e-06,
145
+ "loss": 0.0177,
146
+ "step": 2200
147
+ },
148
+ {
149
+ "epoch": 11.25,
150
+ "eval_accuracy": 0.6125,
151
+ "eval_loss": 2.800485134124756,
152
+ "eval_runtime": 15.79,
153
+ "eval_samples_per_second": 50.665,
154
+ "eval_steps_per_second": 3.167,
155
+ "step": 2250
156
+ },
157
+ {
158
+ "epoch": 12.0,
159
+ "learning_rate": 8.000000000000001e-06,
160
+ "loss": 0.0125,
161
+ "step": 2400
162
+ },
163
+ {
164
+ "epoch": 12.5,
165
+ "eval_accuracy": 0.61375,
166
+ "eval_loss": 2.8941900730133057,
167
+ "eval_runtime": 15.8937,
168
+ "eval_samples_per_second": 50.334,
169
+ "eval_steps_per_second": 3.146,
170
+ "step": 2500
171
+ },
172
+ {
173
+ "epoch": 13.0,
174
+ "learning_rate": 7e-06,
175
+ "loss": 0.0066,
176
+ "step": 2600
177
+ },
178
+ {
179
+ "epoch": 13.75,
180
+ "eval_accuracy": 0.60125,
181
+ "eval_loss": 3.0244345664978027,
182
+ "eval_runtime": 15.9227,
183
+ "eval_samples_per_second": 50.243,
184
+ "eval_steps_per_second": 3.14,
185
+ "step": 2750
186
+ },
187
+ {
188
+ "epoch": 14.0,
189
+ "learning_rate": 6e-06,
190
+ "loss": 0.0109,
191
+ "step": 2800
192
+ },
193
+ {
194
+ "epoch": 15.0,
195
+ "learning_rate": 5e-06,
196
+ "loss": 0.0082,
197
+ "step": 3000
198
+ },
199
+ {
200
+ "epoch": 15.0,
201
+ "eval_accuracy": 0.6175,
202
+ "eval_loss": 2.967337131500244,
203
+ "eval_runtime": 15.7712,
204
+ "eval_samples_per_second": 50.725,
205
+ "eval_steps_per_second": 3.17,
206
+ "step": 3000
207
+ },
208
+ {
209
+ "epoch": 16.0,
210
+ "learning_rate": 4.000000000000001e-06,
211
+ "loss": 0.0086,
212
+ "step": 3200
213
+ },
214
+ {
215
+ "epoch": 16.25,
216
+ "eval_accuracy": 0.61375,
217
+ "eval_loss": 2.9286487102508545,
218
+ "eval_runtime": 15.7803,
219
+ "eval_samples_per_second": 50.696,
220
+ "eval_steps_per_second": 3.169,
221
+ "step": 3250
222
+ },
223
+ {
224
+ "epoch": 17.0,
225
+ "learning_rate": 3e-06,
226
+ "loss": 0.0057,
227
+ "step": 3400
228
+ },
229
+ {
230
+ "epoch": 17.5,
231
+ "eval_accuracy": 0.61875,
232
+ "eval_loss": 2.9928808212280273,
233
+ "eval_runtime": 15.7926,
234
+ "eval_samples_per_second": 50.657,
235
+ "eval_steps_per_second": 3.166,
236
+ "step": 3500
237
+ },
238
+ {
239
+ "epoch": 18.0,
240
+ "learning_rate": 2.0000000000000003e-06,
241
+ "loss": 0.0041,
242
+ "step": 3600
243
+ },
244
+ {
245
+ "epoch": 18.75,
246
+ "eval_accuracy": 0.62375,
247
+ "eval_loss": 3.0005481243133545,
248
+ "eval_runtime": 15.78,
249
+ "eval_samples_per_second": 50.697,
250
+ "eval_steps_per_second": 3.169,
251
+ "step": 3750
252
+ },
253
+ {
254
+ "epoch": 19.0,
255
+ "learning_rate": 1.0000000000000002e-06,
256
+ "loss": 0.0029,
257
+ "step": 3800
258
+ },
259
+ {
260
+ "epoch": 20.0,
261
+ "learning_rate": 0.0,
262
+ "loss": 0.0023,
263
+ "step": 4000
264
+ },
265
+ {
266
+ "epoch": 20.0,
267
+ "eval_accuracy": 0.62125,
268
+ "eval_loss": 3.0099549293518066,
269
+ "eval_runtime": 15.792,
270
+ "eval_samples_per_second": 50.659,
271
+ "eval_steps_per_second": 3.166,
272
+ "step": 4000
273
  }
274
  ],
275
  "max_steps": 4000,
276
  "num_train_epochs": 20,
277
+ "total_flos": 8478064705536000.0,
278
  "trial_name": null,
279
  "trial_params": null
280
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8ff86c8a019682990b3e68731bbdebe11c9e8cba636f73bf25cbdcdd1ec4db
3
  size 3375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16feb7e2c9585fbbe041e34b576d6b2ceef8285c36959cd644796abb4a7413a9
3
  size 3375