VSPuzzler committed on
Commit
f0c505b
·
1 Parent(s): e05d52e

Upload with huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +1 -1
  2. optimizer.pt +1 -1
  3. pytorch_model.bin +1 -1
  4. rng_state.pth +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +694 -34
  7. training_args.bin +2 -2
config.json CHANGED
@@ -30,6 +30,6 @@
30
  "sinusoidal_pos_embds": false,
31
  "tie_weights_": true,
32
  "torch_dtype": "float32",
33
- "transformers_version": "4.21.3",
34
  "vocab_size": 30522
35
  }
 
30
  "sinusoidal_pos_embds": false,
31
  "tie_weights_": true,
32
  "torch_dtype": "float32",
33
+ "transformers_version": "4.22.1",
34
  "vocab_size": 30522
35
  }
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c405f6c40d1200845afd7ec4a280ad78efb3b4aabdfb7199f110d3784f20f9c4
3
  size 535706209
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:926621184a6b7a8ac5a4774f5c694efcbfc318bc5db3481564c2e2441ce39f30
3
  size 535706209
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0117a50d4b2287f08ff497c869353972a1746688126e083cbb1fd3b315b22eef
3
  size 267857393
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7107ccbb7746e6dbde30bd9edb06601ecf5b7d95f0b337000cbae60a0f34cec
3
  size 267857393
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24e9e3beccc7ce8a393ea90cf69333474d4b0305b2f9b9b904790dea80380696
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24340d944dcaa627af1c5f51313a436ed4a3ad2a03a0651b1cd4a51bbe28284d
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e35b3e1d3db27a8ab844c20114e9e1cec88dd3ed6b54599b8cbd45a7f79a10d2
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b5c9ed8372e0eec740369001de73734db6b5b259af9ae69a359108a1dd0fd17
3
  size 623
trainer_state.json CHANGED
@@ -1,105 +1,765 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
  "epoch": 15.0,
5
  "global_step": 3000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  {
11
  "epoch": 1.0,
12
- "learning_rate": 1.9333333333333333e-05,
13
- "loss": 0.0569,
 
 
 
 
 
14
  "step": 200
15
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  {
17
  "epoch": 2.0,
18
- "learning_rate": 1.866666666666667e-05,
19
- "loss": 0.0366,
 
 
 
 
 
20
  "step": 400
21
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  {
23
  "epoch": 3.0,
24
- "learning_rate": 1.8e-05,
25
- "loss": 0.0292,
 
 
 
 
 
 
 
 
 
 
 
26
  "step": 600
27
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  {
29
  "epoch": 4.0,
30
- "learning_rate": 1.7333333333333336e-05,
31
- "loss": 0.032,
 
 
 
 
 
32
  "step": 800
33
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  {
35
  "epoch": 5.0,
36
- "learning_rate": 1.6666666666666667e-05,
37
- "loss": 0.0253,
 
 
 
 
 
38
  "step": 1000
39
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  {
41
  "epoch": 6.0,
42
- "learning_rate": 1.6000000000000003e-05,
43
- "loss": 0.0312,
 
 
 
 
 
 
 
 
 
 
 
44
  "step": 1200
45
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  {
47
  "epoch": 7.0,
48
- "learning_rate": 1.5333333333333334e-05,
49
- "loss": 0.0285,
 
 
 
 
 
50
  "step": 1400
51
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  {
53
  "epoch": 8.0,
54
- "learning_rate": 1.4666666666666666e-05,
55
- "loss": 0.029,
 
 
 
 
 
56
  "step": 1600
57
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  {
59
  "epoch": 9.0,
60
- "learning_rate": 1.4e-05,
61
- "loss": 0.0308,
 
 
 
 
 
 
 
 
 
 
 
62
  "step": 1800
63
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  {
65
  "epoch": 10.0,
66
- "learning_rate": 1.3333333333333333e-05,
67
- "loss": 0.0324,
 
 
 
 
 
68
  "step": 2000
69
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  {
71
  "epoch": 11.0,
72
- "learning_rate": 1.2666666666666667e-05,
73
- "loss": 0.0165,
 
 
 
 
 
74
  "step": 2200
75
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  {
77
  "epoch": 12.0,
78
- "learning_rate": 1.2e-05,
79
- "loss": 0.0207,
80
  "step": 2400
81
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  {
83
  "epoch": 13.0,
84
- "learning_rate": 1.1333333333333334e-05,
85
- "loss": 0.0157,
86
  "step": 2600
87
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  {
89
  "epoch": 14.0,
90
- "learning_rate": 1.0666666666666667e-05,
91
- "loss": 0.0136,
92
  "step": 2800
93
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  {
95
  "epoch": 15.0,
96
- "learning_rate": 1e-05,
97
- "loss": 0.0058,
 
 
 
 
 
 
 
 
 
 
 
98
  "step": 3000
99
  }
100
  ],
101
- "max_steps": 6000,
102
- "num_train_epochs": 30,
103
  "total_flos": 6358548529152000.0,
104
  "trial_name": null,
105
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5875,
3
+ "best_model_checkpoint": "./results/checkpoint-500",
4
  "epoch": 15.0,
5
  "global_step": 3000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
+ {
11
+ "epoch": 0.25,
12
+ "eval_accuracy": 0.5875,
13
+ "eval_latency_in_seconds": 0.016140898968749935,
14
+ "eval_loss": 1.0195873975753784,
15
+ "eval_runtime": 37.9935,
16
+ "eval_samples_per_second": 21.056,
17
+ "eval_steps_per_second": 1.316,
18
+ "eval_total_time_in_seconds": 12.91271917499995,
19
+ "step": 50
20
+ },
21
+ {
22
+ "epoch": 0.5,
23
+ "eval_accuracy": 0.5875,
24
+ "eval_latency_in_seconds": 0.016577634310000065,
25
+ "eval_loss": 0.9936361908912659,
26
+ "eval_runtime": 30.0792,
27
+ "eval_samples_per_second": 26.596,
28
+ "eval_steps_per_second": 1.662,
29
+ "eval_total_time_in_seconds": 13.262107448000052,
30
+ "step": 100
31
+ },
32
+ {
33
+ "epoch": 0.75,
34
+ "eval_accuracy": 0.5875,
35
+ "eval_latency_in_seconds": 0.016618314516250052,
36
+ "eval_loss": 1.0124518871307373,
37
+ "eval_runtime": 29.9038,
38
+ "eval_samples_per_second": 26.752,
39
+ "eval_steps_per_second": 1.672,
40
+ "eval_total_time_in_seconds": 13.294651613000042,
41
+ "step": 150
42
+ },
43
+ {
44
+ "epoch": 1.0,
45
+ "learning_rate": 1.9e-05,
46
+ "loss": 1.007,
47
+ "step": 200
48
+ },
49
  {
50
  "epoch": 1.0,
51
+ "eval_accuracy": 0.5875,
52
+ "eval_latency_in_seconds": 0.01668884421499996,
53
+ "eval_loss": 1.0121251344680786,
54
+ "eval_runtime": 29.8443,
55
+ "eval_samples_per_second": 26.806,
56
+ "eval_steps_per_second": 1.675,
57
+ "eval_total_time_in_seconds": 13.351075371999968,
58
  "step": 200
59
  },
60
+ {
61
+ "epoch": 1.25,
62
+ "eval_accuracy": 0.5875,
63
+ "eval_latency_in_seconds": 0.016656154167500006,
64
+ "eval_loss": 0.9109314680099487,
65
+ "eval_runtime": 29.8759,
66
+ "eval_samples_per_second": 26.777,
67
+ "eval_steps_per_second": 1.674,
68
+ "eval_total_time_in_seconds": 13.324923334000005,
69
+ "step": 250
70
+ },
71
+ {
72
+ "epoch": 1.5,
73
+ "eval_accuracy": 0.5875,
74
+ "eval_latency_in_seconds": 0.016652411622499984,
75
+ "eval_loss": 0.8836429119110107,
76
+ "eval_runtime": 29.8232,
77
+ "eval_samples_per_second": 26.825,
78
+ "eval_steps_per_second": 1.677,
79
+ "eval_total_time_in_seconds": 13.321929297999986,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 1.75,
84
+ "eval_accuracy": 0.5875,
85
+ "eval_latency_in_seconds": 0.016655641563749983,
86
+ "eval_loss": 0.8780828714370728,
87
+ "eval_runtime": 29.9106,
88
+ "eval_samples_per_second": 26.746,
89
+ "eval_steps_per_second": 1.672,
90
+ "eval_total_time_in_seconds": 13.324513250999985,
91
+ "step": 350
92
+ },
93
+ {
94
+ "epoch": 2.0,
95
+ "learning_rate": 1.8e-05,
96
+ "loss": 0.8906,
97
+ "step": 400
98
+ },
99
  {
100
  "epoch": 2.0,
101
+ "eval_accuracy": 0.5875,
102
+ "eval_latency_in_seconds": 0.016710829281249884,
103
+ "eval_loss": 0.844797670841217,
104
+ "eval_runtime": 29.9696,
105
+ "eval_samples_per_second": 26.694,
106
+ "eval_steps_per_second": 1.668,
107
+ "eval_total_time_in_seconds": 13.368663424999909,
108
  "step": 400
109
  },
110
+ {
111
+ "epoch": 2.25,
112
+ "eval_accuracy": 0.5875,
113
+ "eval_latency_in_seconds": 0.016734712483749945,
114
+ "eval_loss": 0.8375944495201111,
115
+ "eval_runtime": 29.9532,
116
+ "eval_samples_per_second": 26.708,
117
+ "eval_steps_per_second": 1.669,
118
+ "eval_total_time_in_seconds": 13.387769986999956,
119
+ "step": 450
120
+ },
121
+ {
122
+ "epoch": 2.5,
123
+ "eval_accuracy": 0.5875,
124
+ "eval_latency_in_seconds": 0.016665541630000006,
125
+ "eval_loss": 0.828183114528656,
126
+ "eval_runtime": 29.8508,
127
+ "eval_samples_per_second": 26.8,
128
+ "eval_steps_per_second": 1.675,
129
+ "eval_total_time_in_seconds": 13.332433304000006,
130
+ "step": 500
131
+ },
132
+ {
133
+ "epoch": 2.75,
134
+ "eval_accuracy": 0.5875,
135
+ "eval_latency_in_seconds": 0.01663229147999999,
136
+ "eval_loss": 0.8245559930801392,
137
+ "eval_runtime": 29.8164,
138
+ "eval_samples_per_second": 26.831,
139
+ "eval_steps_per_second": 1.677,
140
+ "eval_total_time_in_seconds": 13.305833183999994,
141
+ "step": 550
142
+ },
143
  {
144
  "epoch": 3.0,
145
+ "learning_rate": 1.7e-05,
146
+ "loss": 0.7455,
147
+ "step": 600
148
+ },
149
+ {
150
+ "epoch": 3.0,
151
+ "eval_accuracy": 0.5875,
152
+ "eval_latency_in_seconds": 0.01658837374625023,
153
+ "eval_loss": 0.8258004784584045,
154
+ "eval_runtime": 29.782,
155
+ "eval_samples_per_second": 26.862,
156
+ "eval_steps_per_second": 1.679,
157
+ "eval_total_time_in_seconds": 13.270698997000181,
158
  "step": 600
159
  },
160
+ {
161
+ "epoch": 3.25,
162
+ "eval_accuracy": 0.5875,
163
+ "eval_latency_in_seconds": 0.016743442093749936,
164
+ "eval_loss": 0.8827661275863647,
165
+ "eval_runtime": 30.0887,
166
+ "eval_samples_per_second": 26.588,
167
+ "eval_steps_per_second": 1.662,
168
+ "eval_total_time_in_seconds": 13.394753674999947,
169
+ "step": 650
170
+ },
171
+ {
172
+ "epoch": 3.5,
173
+ "eval_accuracy": 0.5875,
174
+ "eval_latency_in_seconds": 0.01667224465624997,
175
+ "eval_loss": 0.8894978165626526,
176
+ "eval_runtime": 30.1041,
177
+ "eval_samples_per_second": 26.574,
178
+ "eval_steps_per_second": 1.661,
179
+ "eval_total_time_in_seconds": 13.337795724999978,
180
+ "step": 700
181
+ },
182
+ {
183
+ "epoch": 3.75,
184
+ "eval_accuracy": 0.5875,
185
+ "eval_latency_in_seconds": 0.016693762731250103,
186
+ "eval_loss": 0.9063799381256104,
187
+ "eval_runtime": 29.8535,
188
+ "eval_samples_per_second": 26.797,
189
+ "eval_steps_per_second": 1.675,
190
+ "eval_total_time_in_seconds": 13.355010185000083,
191
+ "step": 750
192
+ },
193
+ {
194
+ "epoch": 4.0,
195
+ "learning_rate": 1.6000000000000003e-05,
196
+ "loss": 0.549,
197
+ "step": 800
198
+ },
199
  {
200
  "epoch": 4.0,
201
+ "eval_accuracy": 0.5875,
202
+ "eval_latency_in_seconds": 0.016649374238749886,
203
+ "eval_loss": 0.9239490032196045,
204
+ "eval_runtime": 29.9071,
205
+ "eval_samples_per_second": 26.749,
206
+ "eval_steps_per_second": 1.672,
207
+ "eval_total_time_in_seconds": 13.319499390999908,
208
  "step": 800
209
  },
210
+ {
211
+ "epoch": 4.25,
212
+ "eval_accuracy": 0.5875,
213
+ "eval_latency_in_seconds": 0.01666813849999983,
214
+ "eval_loss": 0.9834891557693481,
215
+ "eval_runtime": 29.9398,
216
+ "eval_samples_per_second": 26.72,
217
+ "eval_steps_per_second": 1.67,
218
+ "eval_total_time_in_seconds": 13.334510799999862,
219
+ "step": 850
220
+ },
221
+ {
222
+ "epoch": 4.5,
223
+ "eval_accuracy": 0.5875,
224
+ "eval_latency_in_seconds": 0.01672061295875011,
225
+ "eval_loss": 1.0434612035751343,
226
+ "eval_runtime": 30.0191,
227
+ "eval_samples_per_second": 26.65,
228
+ "eval_steps_per_second": 1.666,
229
+ "eval_total_time_in_seconds": 13.376490367000088,
230
+ "step": 900
231
+ },
232
+ {
233
+ "epoch": 4.75,
234
+ "eval_accuracy": 0.5875,
235
+ "eval_latency_in_seconds": 0.016696340333749903,
236
+ "eval_loss": 1.0434249639511108,
237
+ "eval_runtime": 30.0388,
238
+ "eval_samples_per_second": 26.632,
239
+ "eval_steps_per_second": 1.665,
240
+ "eval_total_time_in_seconds": 13.357072266999921,
241
+ "step": 950
242
+ },
243
+ {
244
+ "epoch": 5.0,
245
+ "learning_rate": 1.5000000000000002e-05,
246
+ "loss": 0.37,
247
+ "step": 1000
248
+ },
249
  {
250
  "epoch": 5.0,
251
+ "eval_accuracy": 0.5875,
252
+ "eval_latency_in_seconds": 0.01674834000874995,
253
+ "eval_loss": 1.0716580152511597,
254
+ "eval_runtime": 30.0668,
255
+ "eval_samples_per_second": 26.607,
256
+ "eval_steps_per_second": 1.663,
257
+ "eval_total_time_in_seconds": 13.39867200699996,
258
  "step": 1000
259
  },
260
+ {
261
+ "epoch": 5.25,
262
+ "eval_accuracy": 0.5875,
263
+ "eval_latency_in_seconds": 0.01658757922625,
264
+ "eval_loss": 1.1702197790145874,
265
+ "eval_runtime": 29.7005,
266
+ "eval_samples_per_second": 26.936,
267
+ "eval_steps_per_second": 1.683,
268
+ "eval_total_time_in_seconds": 13.270063381,
269
+ "step": 1050
270
+ },
271
+ {
272
+ "epoch": 5.5,
273
+ "eval_accuracy": 0.5875,
274
+ "eval_latency_in_seconds": 0.016588210039999753,
275
+ "eval_loss": 1.363202691078186,
276
+ "eval_runtime": 29.7061,
277
+ "eval_samples_per_second": 26.931,
278
+ "eval_steps_per_second": 1.683,
279
+ "eval_total_time_in_seconds": 13.270568031999801,
280
+ "step": 1100
281
+ },
282
+ {
283
+ "epoch": 5.75,
284
+ "eval_accuracy": 0.5875,
285
+ "eval_latency_in_seconds": 0.016613341418749882,
286
+ "eval_loss": 1.3025741577148438,
287
+ "eval_runtime": 29.8243,
288
+ "eval_samples_per_second": 26.824,
289
+ "eval_steps_per_second": 1.676,
290
+ "eval_total_time_in_seconds": 13.290673134999906,
291
+ "step": 1150
292
+ },
293
  {
294
  "epoch": 6.0,
295
+ "learning_rate": 1.4e-05,
296
+ "loss": 0.231,
297
+ "step": 1200
298
+ },
299
+ {
300
+ "epoch": 6.0,
301
+ "eval_accuracy": 0.5875,
302
+ "eval_latency_in_seconds": 0.016670319153750484,
303
+ "eval_loss": 1.241083025932312,
304
+ "eval_runtime": 29.8584,
305
+ "eval_samples_per_second": 26.793,
306
+ "eval_steps_per_second": 1.675,
307
+ "eval_total_time_in_seconds": 13.336255323000387,
308
  "step": 1200
309
  },
310
+ {
311
+ "epoch": 6.25,
312
+ "eval_accuracy": 0.5875,
313
+ "eval_latency_in_seconds": 0.01664680648125,
314
+ "eval_loss": 1.3705066442489624,
315
+ "eval_runtime": 29.9316,
316
+ "eval_samples_per_second": 26.728,
317
+ "eval_steps_per_second": 1.67,
318
+ "eval_total_time_in_seconds": 13.317445184999997,
319
+ "step": 1250
320
+ },
321
+ {
322
+ "epoch": 6.5,
323
+ "eval_accuracy": 0.5875,
324
+ "eval_latency_in_seconds": 0.01666591738875013,
325
+ "eval_loss": 1.5176929235458374,
326
+ "eval_runtime": 29.9921,
327
+ "eval_samples_per_second": 26.674,
328
+ "eval_steps_per_second": 1.667,
329
+ "eval_total_time_in_seconds": 13.332733911000105,
330
+ "step": 1300
331
+ },
332
+ {
333
+ "epoch": 6.75,
334
+ "eval_accuracy": 0.5875,
335
+ "eval_latency_in_seconds": 0.016700183542499757,
336
+ "eval_loss": 1.8310248851776123,
337
+ "eval_runtime": 29.9705,
338
+ "eval_samples_per_second": 26.693,
339
+ "eval_steps_per_second": 1.668,
340
+ "eval_total_time_in_seconds": 13.360146833999806,
341
+ "step": 1350
342
+ },
343
+ {
344
+ "epoch": 7.0,
345
+ "learning_rate": 1.3000000000000001e-05,
346
+ "loss": 0.1348,
347
+ "step": 1400
348
+ },
349
  {
350
  "epoch": 7.0,
351
+ "eval_accuracy": 0.5875,
352
+ "eval_latency_in_seconds": 0.016722027651250075,
353
+ "eval_loss": 1.6683160066604614,
354
+ "eval_runtime": 30.0677,
355
+ "eval_samples_per_second": 26.607,
356
+ "eval_steps_per_second": 1.663,
357
+ "eval_total_time_in_seconds": 13.377622121000059,
358
  "step": 1400
359
  },
360
+ {
361
+ "epoch": 7.25,
362
+ "eval_accuracy": 0.5875,
363
+ "eval_latency_in_seconds": 0.016639053724999825,
364
+ "eval_loss": 1.8127371072769165,
365
+ "eval_runtime": 29.9019,
366
+ "eval_samples_per_second": 26.754,
367
+ "eval_steps_per_second": 1.672,
368
+ "eval_total_time_in_seconds": 13.31124297999986,
369
+ "step": 1450
370
+ },
371
+ {
372
+ "epoch": 7.5,
373
+ "eval_accuracy": 0.5875,
374
+ "eval_latency_in_seconds": 0.016672658619999652,
375
+ "eval_loss": 1.8947601318359375,
376
+ "eval_runtime": 29.8732,
377
+ "eval_samples_per_second": 26.78,
378
+ "eval_steps_per_second": 1.674,
379
+ "eval_total_time_in_seconds": 13.338126895999721,
380
+ "step": 1500
381
+ },
382
+ {
383
+ "epoch": 7.75,
384
+ "eval_accuracy": 0.5875,
385
+ "eval_latency_in_seconds": 0.016528693013750057,
386
+ "eval_loss": 1.9334509372711182,
387
+ "eval_runtime": 29.6754,
388
+ "eval_samples_per_second": 26.958,
389
+ "eval_steps_per_second": 1.685,
390
+ "eval_total_time_in_seconds": 13.222954411000046,
391
+ "step": 1550
392
+ },
393
+ {
394
+ "epoch": 8.0,
395
+ "learning_rate": 1.2e-05,
396
+ "loss": 0.0896,
397
+ "step": 1600
398
+ },
399
  {
400
  "epoch": 8.0,
401
+ "eval_accuracy": 0.5875,
402
+ "eval_latency_in_seconds": 0.016713374875000114,
403
+ "eval_loss": 1.8743096590042114,
404
+ "eval_runtime": 29.9755,
405
+ "eval_samples_per_second": 26.688,
406
+ "eval_steps_per_second": 1.668,
407
+ "eval_total_time_in_seconds": 13.37069990000009,
408
  "step": 1600
409
  },
410
+ {
411
+ "epoch": 8.25,
412
+ "eval_accuracy": 0.5875,
413
+ "eval_latency_in_seconds": 0.01665160638624968,
414
+ "eval_loss": 1.999316692352295,
415
+ "eval_runtime": 29.865,
416
+ "eval_samples_per_second": 26.787,
417
+ "eval_steps_per_second": 1.674,
418
+ "eval_total_time_in_seconds": 13.321285108999746,
419
+ "step": 1650
420
+ },
421
+ {
422
+ "epoch": 8.5,
423
+ "eval_accuracy": 0.5875,
424
+ "eval_latency_in_seconds": 0.016658729771249912,
425
+ "eval_loss": 2.0449576377868652,
426
+ "eval_runtime": 29.8258,
427
+ "eval_samples_per_second": 26.822,
428
+ "eval_steps_per_second": 1.676,
429
+ "eval_total_time_in_seconds": 13.326983816999928,
430
+ "step": 1700
431
+ },
432
+ {
433
+ "epoch": 8.75,
434
+ "eval_accuracy": 0.5875,
435
+ "eval_latency_in_seconds": 0.016647704048749573,
436
+ "eval_loss": 2.0946853160858154,
437
+ "eval_runtime": 29.8963,
438
+ "eval_samples_per_second": 26.759,
439
+ "eval_steps_per_second": 1.672,
440
+ "eval_total_time_in_seconds": 13.31816323899966,
441
+ "step": 1750
442
+ },
443
  {
444
  "epoch": 9.0,
445
+ "learning_rate": 1.1000000000000001e-05,
446
+ "loss": 0.0599,
447
+ "step": 1800
448
+ },
449
+ {
450
+ "epoch": 9.0,
451
+ "eval_accuracy": 0.5875,
452
+ "eval_latency_in_seconds": 0.016691148148750015,
453
+ "eval_loss": 2.231701612472534,
454
+ "eval_runtime": 30.1095,
455
+ "eval_samples_per_second": 26.57,
456
+ "eval_steps_per_second": 1.661,
457
+ "eval_total_time_in_seconds": 13.352918519000013,
458
  "step": 1800
459
  },
460
+ {
461
+ "epoch": 9.25,
462
+ "eval_accuracy": 0.5875,
463
+ "eval_latency_in_seconds": 0.0166487254499998,
464
+ "eval_loss": 2.204172134399414,
465
+ "eval_runtime": 29.9643,
466
+ "eval_samples_per_second": 26.698,
467
+ "eval_steps_per_second": 1.669,
468
+ "eval_total_time_in_seconds": 13.318980359999841,
469
+ "step": 1850
470
+ },
471
+ {
472
+ "epoch": 9.5,
473
+ "eval_accuracy": 0.5875,
474
+ "eval_latency_in_seconds": 0.016727415717500093,
475
+ "eval_loss": 2.282806396484375,
476
+ "eval_runtime": 30.0555,
477
+ "eval_samples_per_second": 26.617,
478
+ "eval_steps_per_second": 1.664,
479
+ "eval_total_time_in_seconds": 13.381932574000075,
480
+ "step": 1900
481
+ },
482
+ {
483
+ "epoch": 9.75,
484
+ "eval_accuracy": 0.5875,
485
+ "eval_latency_in_seconds": 0.0166388489374998,
486
+ "eval_loss": 2.298581123352051,
487
+ "eval_runtime": 29.9445,
488
+ "eval_samples_per_second": 26.716,
489
+ "eval_steps_per_second": 1.67,
490
+ "eval_total_time_in_seconds": 13.311079149999841,
491
+ "step": 1950
492
+ },
493
+ {
494
+ "epoch": 10.0,
495
+ "learning_rate": 1e-05,
496
+ "loss": 0.0495,
497
+ "step": 2000
498
+ },
499
  {
500
  "epoch": 10.0,
501
+ "eval_accuracy": 0.5875,
502
+ "eval_latency_in_seconds": 0.01665341812874999,
503
+ "eval_loss": 2.227576494216919,
504
+ "eval_runtime": 29.8758,
505
+ "eval_samples_per_second": 26.777,
506
+ "eval_steps_per_second": 1.674,
507
+ "eval_total_time_in_seconds": 13.322734502999992,
508
  "step": 2000
509
  },
510
+ {
511
+ "epoch": 10.25,
512
+ "eval_accuracy": 0.5875,
513
+ "eval_latency_in_seconds": 0.016612832117500035,
514
+ "eval_loss": 2.2979378700256348,
515
+ "eval_runtime": 29.8189,
516
+ "eval_samples_per_second": 26.829,
517
+ "eval_steps_per_second": 1.677,
518
+ "eval_total_time_in_seconds": 13.290265694000027,
519
+ "step": 2050
520
+ },
521
+ {
522
+ "epoch": 10.5,
523
+ "eval_accuracy": 0.5875,
524
+ "eval_latency_in_seconds": 0.016637017281250336,
525
+ "eval_loss": 2.284951686859131,
526
+ "eval_runtime": 29.8695,
527
+ "eval_samples_per_second": 26.783,
528
+ "eval_steps_per_second": 1.674,
529
+ "eval_total_time_in_seconds": 13.30961382500027,
530
+ "step": 2100
531
+ },
532
+ {
533
+ "epoch": 10.75,
534
+ "eval_accuracy": 0.5875,
535
+ "eval_latency_in_seconds": 0.016614138501249726,
536
+ "eval_loss": 2.3448538780212402,
537
+ "eval_runtime": 29.8656,
538
+ "eval_samples_per_second": 26.787,
539
+ "eval_steps_per_second": 1.674,
540
+ "eval_total_time_in_seconds": 13.291310800999781,
541
+ "step": 2150
542
+ },
543
+ {
544
+ "epoch": 11.0,
545
+ "learning_rate": 9e-06,
546
+ "loss": 0.0276,
547
+ "step": 2200
548
+ },
549
  {
550
  "epoch": 11.0,
551
+ "eval_accuracy": 0.5875,
552
+ "eval_latency_in_seconds": 0.016691225096250265,
553
+ "eval_loss": 2.307952880859375,
554
+ "eval_runtime": 29.8828,
555
+ "eval_samples_per_second": 26.771,
556
+ "eval_steps_per_second": 1.673,
557
+ "eval_total_time_in_seconds": 13.352980077000211,
558
  "step": 2200
559
  },
560
+ {
561
+ "epoch": 11.25,
562
+ "eval_accuracy": 0.5875,
563
+ "eval_latency_in_seconds": 0.01659034708999968,
564
+ "eval_loss": 2.3699042797088623,
565
+ "eval_runtime": 29.8101,
566
+ "eval_samples_per_second": 26.837,
567
+ "eval_steps_per_second": 1.677,
568
+ "eval_total_time_in_seconds": 13.272277671999746,
569
+ "step": 2250
570
+ },
571
+ {
572
+ "epoch": 11.5,
573
+ "eval_accuracy": 0.5875,
574
+ "eval_latency_in_seconds": 0.01659643314749985,
575
+ "eval_loss": 2.4644832611083984,
576
+ "eval_runtime": 29.8395,
577
+ "eval_samples_per_second": 26.81,
578
+ "eval_steps_per_second": 1.676,
579
+ "eval_total_time_in_seconds": 13.277146517999881,
580
+ "step": 2300
581
+ },
582
+ {
583
+ "epoch": 11.75,
584
+ "eval_accuracy": 0.5875,
585
+ "eval_latency_in_seconds": 0.016764325188750036,
586
+ "eval_loss": 2.4088258743286133,
587
+ "eval_runtime": 30.0842,
588
+ "eval_samples_per_second": 26.592,
589
+ "eval_steps_per_second": 1.662,
590
+ "eval_total_time_in_seconds": 13.411460151000028,
591
+ "step": 2350
592
+ },
593
  {
594
  "epoch": 12.0,
595
+ "learning_rate": 8.000000000000001e-06,
596
+ "loss": 0.0131,
597
  "step": 2400
598
  },
599
+ {
600
+ "epoch": 12.0,
601
+ "eval_accuracy": 0.5875,
602
+ "eval_latency_in_seconds": 0.016665852497500282,
603
+ "eval_loss": 2.4499921798706055,
604
+ "eval_runtime": 29.9819,
605
+ "eval_samples_per_second": 26.683,
606
+ "eval_steps_per_second": 1.668,
607
+ "eval_total_time_in_seconds": 13.332681998000226,
608
+ "step": 2400
609
+ },
610
+ {
611
+ "epoch": 12.25,
612
+ "eval_accuracy": 0.5875,
613
+ "eval_latency_in_seconds": 0.01666696527499994,
614
+ "eval_loss": 2.471630096435547,
615
+ "eval_runtime": 29.8811,
616
+ "eval_samples_per_second": 26.773,
617
+ "eval_steps_per_second": 1.673,
618
+ "eval_total_time_in_seconds": 13.333572219999951,
619
+ "step": 2450
620
+ },
621
+ {
622
+ "epoch": 12.5,
623
+ "eval_accuracy": 0.5875,
624
+ "eval_latency_in_seconds": 0.016629205762500307,
625
+ "eval_loss": 2.5404670238494873,
626
+ "eval_runtime": 29.8336,
627
+ "eval_samples_per_second": 26.815,
628
+ "eval_steps_per_second": 1.676,
629
+ "eval_total_time_in_seconds": 13.303364610000244,
630
+ "step": 2500
631
+ },
632
+ {
633
+ "epoch": 12.75,
634
+ "eval_accuracy": 0.5875,
635
+ "eval_latency_in_seconds": 0.016587090722500763,
636
+ "eval_loss": 2.5341074466705322,
637
+ "eval_runtime": 29.8717,
638
+ "eval_samples_per_second": 26.781,
639
+ "eval_steps_per_second": 1.674,
640
+ "eval_total_time_in_seconds": 13.269672578000609,
641
+ "step": 2550
642
+ },
643
  {
644
  "epoch": 13.0,
645
+ "learning_rate": 7e-06,
646
+ "loss": 0.0127,
647
  "step": 2600
648
  },
649
+ {
650
+ "epoch": 13.0,
651
+ "eval_accuracy": 0.5875,
652
+ "eval_latency_in_seconds": 0.016596427140000286,
653
+ "eval_loss": 2.550121784210205,
654
+ "eval_runtime": 29.7591,
655
+ "eval_samples_per_second": 26.883,
656
+ "eval_steps_per_second": 1.68,
657
+ "eval_total_time_in_seconds": 13.27714171200023,
658
+ "step": 2600
659
+ },
660
+ {
661
+ "epoch": 13.25,
662
+ "eval_accuracy": 0.5875,
663
+ "eval_latency_in_seconds": 0.01661448852124977,
664
+ "eval_loss": 2.5775039196014404,
665
+ "eval_runtime": 29.7738,
666
+ "eval_samples_per_second": 26.869,
667
+ "eval_steps_per_second": 1.679,
668
+ "eval_total_time_in_seconds": 13.291590816999815,
669
+ "step": 2650
670
+ },
671
+ {
672
+ "epoch": 13.5,
673
+ "eval_accuracy": 0.5875,
674
+ "eval_latency_in_seconds": 0.01660655404249951,
675
+ "eval_loss": 2.5819220542907715,
676
+ "eval_runtime": 29.8343,
677
+ "eval_samples_per_second": 26.815,
678
+ "eval_steps_per_second": 1.676,
679
+ "eval_total_time_in_seconds": 13.285243233999608,
680
+ "step": 2700
681
+ },
682
+ {
683
+ "epoch": 13.75,
684
+ "eval_accuracy": 0.5875,
685
+ "eval_latency_in_seconds": 0.01656317381874942,
686
+ "eval_loss": 2.562864303588867,
687
+ "eval_runtime": 29.749,
688
+ "eval_samples_per_second": 26.892,
689
+ "eval_steps_per_second": 1.681,
690
+ "eval_total_time_in_seconds": 13.250539054999535,
691
+ "step": 2750
692
+ },
693
  {
694
  "epoch": 14.0,
695
+ "learning_rate": 6e-06,
696
+ "loss": 0.0112,
697
  "step": 2800
698
  },
699
+ {
700
+ "epoch": 14.0,
701
+ "eval_accuracy": 0.5875,
702
+ "eval_latency_in_seconds": 0.016711680048749712,
703
+ "eval_loss": 2.6634762287139893,
704
+ "eval_runtime": 30.0388,
705
+ "eval_samples_per_second": 26.632,
706
+ "eval_steps_per_second": 1.665,
707
+ "eval_total_time_in_seconds": 13.36934403899977,
708
+ "step": 2800
709
+ },
710
+ {
711
+ "epoch": 14.25,
712
+ "eval_accuracy": 0.5875,
713
+ "eval_latency_in_seconds": 0.01673866046249941,
714
+ "eval_loss": 2.649559259414673,
715
+ "eval_runtime": 30.035,
716
+ "eval_samples_per_second": 26.636,
717
+ "eval_steps_per_second": 1.665,
718
+ "eval_total_time_in_seconds": 13.390928369999529,
719
+ "step": 2850
720
+ },
721
+ {
722
+ "epoch": 14.5,
723
+ "eval_accuracy": 0.5875,
724
+ "eval_latency_in_seconds": 0.016693803422499515,
725
+ "eval_loss": 2.6672680377960205,
726
+ "eval_runtime": 29.947,
727
+ "eval_samples_per_second": 26.714,
728
+ "eval_steps_per_second": 1.67,
729
+ "eval_total_time_in_seconds": 13.35504273799961,
730
+ "step": 2900
731
+ },
732
+ {
733
+ "epoch": 14.75,
734
+ "eval_accuracy": 0.5875,
735
+ "eval_latency_in_seconds": 0.016712438349999276,
736
+ "eval_loss": 2.7292675971984863,
737
+ "eval_runtime": 29.9456,
738
+ "eval_samples_per_second": 26.715,
739
+ "eval_steps_per_second": 1.67,
740
+ "eval_total_time_in_seconds": 13.36995067999942,
741
+ "step": 2950
742
+ },
743
  {
744
  "epoch": 15.0,
745
+ "learning_rate": 5e-06,
746
+ "loss": 0.0108,
747
+ "step": 3000
748
+ },
749
+ {
750
+ "epoch": 15.0,
751
+ "eval_accuracy": 0.5875,
752
+ "eval_latency_in_seconds": 0.016635910904999492,
753
+ "eval_loss": 2.664646625518799,
754
+ "eval_runtime": 29.8887,
755
+ "eval_samples_per_second": 26.766,
756
+ "eval_steps_per_second": 1.673,
757
+ "eval_total_time_in_seconds": 13.308728723999593,
758
  "step": 3000
759
  }
760
  ],
761
+ "max_steps": 4000,
762
+ "num_train_epochs": 20,
763
  "total_flos": 6358548529152000.0,
764
  "trial_name": null,
765
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3a822423ddf836858e1b92c846c9074238ed0a832e2ec766ae1ee99843630ed
3
- size 3311
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c8ff86c8a019682990b3e68731bbdebe11c9e8cba636f73bf25cbdcdd1ec4db
3
+ size 3375