bamertl commited on
Commit
c505446
·
1 Parent(s): e5bf063

Upload 7 files

Browse files
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "NEGATIVE",
13
+ "1": "POSITIVE"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "NEGATIVE": 0,
18
+ "POSITIVE": 1
19
+ },
20
+ "max_position_embeddings": 512,
21
+ "model_type": "distilbert",
22
+ "n_heads": 12,
23
+ "n_layers": 6,
24
+ "pad_token_id": 0,
25
+ "problem_type": "single_label_classification",
26
+ "qa_dropout": 0.1,
27
+ "seq_classif_dropout": 0.2,
28
+ "sinusoidal_pos_embds": false,
29
+ "tie_weights_": true,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.28.1",
32
+ "vocab_size": 30522
33
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5953f81e3a65d865ba17b3722fefc988408cbb0cff88779e5a5eb813dc12c26
3
+ size 267855533
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "DistilBertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
trainer_state.json ADDED
@@ -0,0 +1,1036 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5703100562095642,
3
+ "best_model_checkpoint": "my_awesome_model/checkpoint-10000",
4
+ "epoch": 0.9983028850953379,
5
+ "global_step": 10000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "eval_accuracy": 0.5599324704558244,
13
+ "eval_loss": 0.6817283630371094,
14
+ "eval_runtime": 87.555,
15
+ "eval_samples_per_second": 385.621,
16
+ "eval_steps_per_second": 24.111,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.02,
21
+ "eval_accuracy": 0.6039155288333383,
22
+ "eval_loss": 0.6646063923835754,
23
+ "eval_runtime": 98.0865,
24
+ "eval_samples_per_second": 344.217,
25
+ "eval_steps_per_second": 21.522,
26
+ "step": 200
27
+ },
28
+ {
29
+ "epoch": 0.03,
30
+ "eval_accuracy": 0.6300684180908095,
31
+ "eval_loss": 0.6491487622261047,
32
+ "eval_runtime": 112.999,
33
+ "eval_samples_per_second": 298.79,
34
+ "eval_steps_per_second": 18.682,
35
+ "step": 300
36
+ },
37
+ {
38
+ "epoch": 0.04,
39
+ "eval_accuracy": 0.5918609128335752,
40
+ "eval_loss": 0.6735296249389648,
41
+ "eval_runtime": 122.3302,
42
+ "eval_samples_per_second": 275.999,
43
+ "eval_steps_per_second": 17.257,
44
+ "step": 400
45
+ },
46
+ {
47
+ "epoch": 0.05,
48
+ "learning_rate": 1.9001697114904664e-05,
49
+ "loss": 0.6666,
50
+ "step": 500
51
+ },
52
+ {
53
+ "epoch": 0.05,
54
+ "eval_accuracy": 0.6279951426117347,
55
+ "eval_loss": 0.6454014778137207,
56
+ "eval_runtime": 125.5195,
57
+ "eval_samples_per_second": 268.986,
58
+ "eval_steps_per_second": 16.818,
59
+ "step": 500
60
+ },
61
+ {
62
+ "epoch": 0.06,
63
+ "eval_accuracy": 0.6282024701596423,
64
+ "eval_loss": 0.643072247505188,
65
+ "eval_runtime": 134.8724,
66
+ "eval_samples_per_second": 250.333,
67
+ "eval_steps_per_second": 15.652,
68
+ "step": 600
69
+ },
70
+ {
71
+ "epoch": 0.07,
72
+ "eval_accuracy": 0.6378283920267749,
73
+ "eval_loss": 0.6360176205635071,
74
+ "eval_runtime": 135.5933,
75
+ "eval_samples_per_second": 249.002,
76
+ "eval_steps_per_second": 15.569,
77
+ "step": 700
78
+ },
79
+ {
80
+ "epoch": 0.08,
81
+ "eval_accuracy": 0.6515120101886681,
82
+ "eval_loss": 0.6259913444519043,
83
+ "eval_runtime": 135.7915,
84
+ "eval_samples_per_second": 248.639,
85
+ "eval_steps_per_second": 15.546,
86
+ "step": 800
87
+ },
88
+ {
89
+ "epoch": 0.09,
90
+ "eval_accuracy": 0.6522524657169091,
91
+ "eval_loss": 0.6238675713539124,
92
+ "eval_runtime": 145.4218,
93
+ "eval_samples_per_second": 232.173,
94
+ "eval_steps_per_second": 14.516,
95
+ "step": 900
96
+ },
97
+ {
98
+ "epoch": 0.1,
99
+ "learning_rate": 1.8003394229809327e-05,
100
+ "loss": 0.6365,
101
+ "step": 1000
102
+ },
103
+ {
104
+ "epoch": 0.1,
105
+ "eval_accuracy": 0.6556881793679472,
106
+ "eval_loss": 0.6250938177108765,
107
+ "eval_runtime": 142.1467,
108
+ "eval_samples_per_second": 237.522,
109
+ "eval_steps_per_second": 14.851,
110
+ "step": 1000
111
+ },
112
+ {
113
+ "epoch": 0.11,
114
+ "eval_accuracy": 0.6417676154370169,
115
+ "eval_loss": 0.6320825815200806,
116
+ "eval_runtime": 144.5395,
117
+ "eval_samples_per_second": 233.59,
118
+ "eval_steps_per_second": 14.605,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "epoch": 0.12,
123
+ "eval_accuracy": 0.656517489559577,
124
+ "eval_loss": 0.6173945665359497,
125
+ "eval_runtime": 145.4484,
126
+ "eval_samples_per_second": 232.13,
127
+ "eval_steps_per_second": 14.514,
128
+ "step": 1200
129
+ },
130
+ {
131
+ "epoch": 0.13,
132
+ "eval_accuracy": 0.6573171815300773,
133
+ "eval_loss": 0.6152162551879883,
134
+ "eval_runtime": 148.4944,
135
+ "eval_samples_per_second": 227.369,
136
+ "eval_steps_per_second": 14.216,
137
+ "step": 1300
138
+ },
139
+ {
140
+ "epoch": 0.14,
141
+ "eval_accuracy": 0.6613156413825786,
142
+ "eval_loss": 0.6118360757827759,
143
+ "eval_runtime": 147.9224,
144
+ "eval_samples_per_second": 228.248,
145
+ "eval_steps_per_second": 14.271,
146
+ "step": 1400
147
+ },
148
+ {
149
+ "epoch": 0.15,
150
+ "learning_rate": 1.700509134471399e-05,
151
+ "loss": 0.6256,
152
+ "step": 1500
153
+ },
154
+ {
155
+ "epoch": 0.15,
156
+ "eval_accuracy": 0.6628557888813198,
157
+ "eval_loss": 0.6129273176193237,
158
+ "eval_runtime": 144.8646,
159
+ "eval_samples_per_second": 233.066,
160
+ "eval_steps_per_second": 14.572,
161
+ "step": 1500
162
+ },
163
+ {
164
+ "epoch": 0.16,
165
+ "eval_accuracy": 0.6606048040754672,
166
+ "eval_loss": 0.6123291254043579,
167
+ "eval_runtime": 147.7984,
168
+ "eval_samples_per_second": 228.44,
169
+ "eval_steps_per_second": 14.283,
170
+ "step": 1600
171
+ },
172
+ {
173
+ "epoch": 0.17,
174
+ "eval_accuracy": 0.6520451381690016,
175
+ "eval_loss": 0.6141842603683472,
176
+ "eval_runtime": 146.6547,
177
+ "eval_samples_per_second": 230.221,
178
+ "eval_steps_per_second": 14.394,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "epoch": 0.18,
183
+ "eval_accuracy": 0.6619968604685602,
184
+ "eval_loss": 0.6088994145393372,
185
+ "eval_runtime": 144.5506,
186
+ "eval_samples_per_second": 233.572,
187
+ "eval_steps_per_second": 14.604,
188
+ "step": 1800
189
+ },
190
+ {
191
+ "epoch": 0.19,
192
+ "eval_accuracy": 0.6693125610875811,
193
+ "eval_loss": 0.6045596599578857,
194
+ "eval_runtime": 149.624,
195
+ "eval_samples_per_second": 225.652,
196
+ "eval_steps_per_second": 14.109,
197
+ "step": 1900
198
+ },
199
+ {
200
+ "epoch": 0.2,
201
+ "learning_rate": 1.600678845961865e-05,
202
+ "loss": 0.6206,
203
+ "step": 2000
204
+ },
205
+ {
206
+ "epoch": 0.2,
207
+ "eval_accuracy": 0.6670911945028581,
208
+ "eval_loss": 0.6107525825500488,
209
+ "eval_runtime": 150.716,
210
+ "eval_samples_per_second": 224.017,
211
+ "eval_steps_per_second": 14.006,
212
+ "step": 2000
213
+ },
214
+ {
215
+ "epoch": 0.21,
216
+ "eval_accuracy": 0.6699937801735628,
217
+ "eval_loss": 0.6015494465827942,
218
+ "eval_runtime": 145.7183,
219
+ "eval_samples_per_second": 231.7,
220
+ "eval_steps_per_second": 14.487,
221
+ "step": 2100
222
+ },
223
+ {
224
+ "epoch": 0.22,
225
+ "eval_accuracy": 0.67514735065012,
226
+ "eval_loss": 0.6029151678085327,
227
+ "eval_runtime": 146.7946,
228
+ "eval_samples_per_second": 230.002,
229
+ "eval_steps_per_second": 14.381,
230
+ "step": 2200
231
+ },
232
+ {
233
+ "epoch": 0.23,
234
+ "eval_accuracy": 0.6646032639279685,
235
+ "eval_loss": 0.611104428768158,
236
+ "eval_runtime": 148.7299,
237
+ "eval_samples_per_second": 227.009,
238
+ "eval_steps_per_second": 14.194,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "epoch": 0.24,
243
+ "eval_accuracy": 0.6710304179131001,
244
+ "eval_loss": 0.598213791847229,
245
+ "eval_runtime": 148.4521,
246
+ "eval_samples_per_second": 227.434,
247
+ "eval_steps_per_second": 14.22,
248
+ "step": 2400
249
+ },
250
+ {
251
+ "epoch": 0.25,
252
+ "learning_rate": 1.5008485574523313e-05,
253
+ "loss": 0.611,
254
+ "step": 2500
255
+ },
256
+ {
257
+ "epoch": 0.25,
258
+ "eval_accuracy": 0.666528448301395,
259
+ "eval_loss": 0.6279574632644653,
260
+ "eval_runtime": 146.8286,
261
+ "eval_samples_per_second": 229.948,
262
+ "eval_steps_per_second": 14.377,
263
+ "step": 2500
264
+ },
265
+ {
266
+ "epoch": 0.26,
267
+ "eval_accuracy": 0.6672392856085063,
268
+ "eval_loss": 0.607886791229248,
269
+ "eval_runtime": 152.0709,
270
+ "eval_samples_per_second": 222.021,
271
+ "eval_steps_per_second": 13.882,
272
+ "step": 2600
273
+ },
274
+ {
275
+ "epoch": 0.27,
276
+ "eval_accuracy": 0.67185972810473,
277
+ "eval_loss": 0.5957935452461243,
278
+ "eval_runtime": 151.6655,
279
+ "eval_samples_per_second": 222.615,
280
+ "eval_steps_per_second": 13.919,
281
+ "step": 2700
282
+ },
283
+ {
284
+ "epoch": 0.28,
285
+ "eval_accuracy": 0.6712377454610076,
286
+ "eval_loss": 0.6077716946601868,
287
+ "eval_runtime": 153.1917,
288
+ "eval_samples_per_second": 220.397,
289
+ "eval_steps_per_second": 13.78,
290
+ "step": 2800
291
+ },
292
+ {
293
+ "epoch": 0.29,
294
+ "eval_accuracy": 0.6713858365666558,
295
+ "eval_loss": 0.5977779626846313,
296
+ "eval_runtime": 148.182,
297
+ "eval_samples_per_second": 227.848,
298
+ "eval_steps_per_second": 14.246,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "epoch": 0.3,
303
+ "learning_rate": 1.4010182689427974e-05,
304
+ "loss": 0.6131,
305
+ "step": 3000
306
+ },
307
+ {
308
+ "epoch": 0.3,
309
+ "eval_accuracy": 0.6731333116133045,
310
+ "eval_loss": 0.5991169810295105,
311
+ "eval_runtime": 148.2187,
312
+ "eval_samples_per_second": 227.792,
313
+ "eval_steps_per_second": 14.242,
314
+ "step": 3000
315
+ },
316
+ {
317
+ "epoch": 0.31,
318
+ "eval_accuracy": 0.6711785090187483,
319
+ "eval_loss": 0.6033596396446228,
320
+ "eval_runtime": 147.6774,
321
+ "eval_samples_per_second": 228.627,
322
+ "eval_steps_per_second": 14.295,
323
+ "step": 3100
324
+ },
325
+ {
326
+ "epoch": 0.32,
327
+ "eval_accuracy": 0.6733110209400823,
328
+ "eval_loss": 0.5941784977912903,
329
+ "eval_runtime": 146.1758,
330
+ "eval_samples_per_second": 230.975,
331
+ "eval_steps_per_second": 14.442,
332
+ "step": 3200
333
+ },
334
+ {
335
+ "epoch": 0.33,
336
+ "eval_accuracy": 0.6770132985812872,
337
+ "eval_loss": 0.5986897945404053,
338
+ "eval_runtime": 132.6551,
339
+ "eval_samples_per_second": 254.517,
340
+ "eval_steps_per_second": 15.913,
341
+ "step": 3300
342
+ },
343
+ {
344
+ "epoch": 0.34,
345
+ "eval_accuracy": 0.6789681011758434,
346
+ "eval_loss": 0.5917928218841553,
347
+ "eval_runtime": 126.761,
348
+ "eval_samples_per_second": 266.352,
349
+ "eval_steps_per_second": 16.653,
350
+ "step": 3400
351
+ },
352
+ {
353
+ "epoch": 0.35,
354
+ "learning_rate": 1.3011879804332637e-05,
355
+ "loss": 0.6037,
356
+ "step": 3500
357
+ },
358
+ {
359
+ "epoch": 0.35,
360
+ "eval_accuracy": 0.6815448864141219,
361
+ "eval_loss": 0.591849148273468,
362
+ "eval_runtime": 127.1262,
363
+ "eval_samples_per_second": 265.586,
364
+ "eval_steps_per_second": 16.606,
365
+ "step": 3500
366
+ },
367
+ {
368
+ "epoch": 0.36,
369
+ "eval_accuracy": 0.6831738885762522,
370
+ "eval_loss": 0.5918092131614685,
371
+ "eval_runtime": 123.6609,
372
+ "eval_samples_per_second": 273.029,
373
+ "eval_steps_per_second": 17.071,
374
+ "step": 3600
375
+ },
376
+ {
377
+ "epoch": 0.37,
378
+ "eval_accuracy": 0.6806563397802328,
379
+ "eval_loss": 0.5881184935569763,
380
+ "eval_runtime": 123.6786,
381
+ "eval_samples_per_second": 272.99,
382
+ "eval_steps_per_second": 17.068,
383
+ "step": 3700
384
+ },
385
+ {
386
+ "epoch": 0.38,
387
+ "eval_accuracy": 0.6733998756034713,
388
+ "eval_loss": 0.6080012321472168,
389
+ "eval_runtime": 123.437,
390
+ "eval_samples_per_second": 273.524,
391
+ "eval_steps_per_second": 17.102,
392
+ "step": 3800
393
+ },
394
+ {
395
+ "epoch": 0.39,
396
+ "eval_accuracy": 0.6735479667091194,
397
+ "eval_loss": 0.6100932359695435,
398
+ "eval_runtime": 118.7243,
399
+ "eval_samples_per_second": 284.382,
400
+ "eval_steps_per_second": 17.781,
401
+ "step": 3900
402
+ },
403
+ {
404
+ "epoch": 0.4,
405
+ "learning_rate": 1.2013576919237297e-05,
406
+ "loss": 0.596,
407
+ "step": 4000
408
+ },
409
+ {
410
+ "epoch": 0.4,
411
+ "eval_accuracy": 0.6837662529988449,
412
+ "eval_loss": 0.5860297679901123,
413
+ "eval_runtime": 120.6458,
414
+ "eval_samples_per_second": 279.852,
415
+ "eval_steps_per_second": 17.498,
416
+ "step": 4000
417
+ },
418
+ {
419
+ "epoch": 0.41,
420
+ "eval_accuracy": 0.683588543672067,
421
+ "eval_loss": 0.5865428447723389,
422
+ "eval_runtime": 119.5342,
423
+ "eval_samples_per_second": 282.455,
424
+ "eval_steps_per_second": 17.66,
425
+ "step": 4100
426
+ },
427
+ {
428
+ "epoch": 0.42,
429
+ "eval_accuracy": 0.6822261055001037,
430
+ "eval_loss": 0.5836812257766724,
431
+ "eval_runtime": 119.7355,
432
+ "eval_samples_per_second": 281.98,
433
+ "eval_steps_per_second": 17.631,
434
+ "step": 4200
435
+ },
436
+ {
437
+ "epoch": 0.43,
438
+ "eval_accuracy": 0.6840328169890116,
439
+ "eval_loss": 0.5837206840515137,
440
+ "eval_runtime": 120.8178,
441
+ "eval_samples_per_second": 279.454,
442
+ "eval_steps_per_second": 17.473,
443
+ "step": 4300
444
+ },
445
+ {
446
+ "epoch": 0.44,
447
+ "eval_accuracy": 0.6855137280454936,
448
+ "eval_loss": 0.5865352153778076,
449
+ "eval_runtime": 122.1434,
450
+ "eval_samples_per_second": 276.421,
451
+ "eval_steps_per_second": 17.283,
452
+ "step": 4400
453
+ },
454
+ {
455
+ "epoch": 0.45,
456
+ "learning_rate": 1.101527403414196e-05,
457
+ "loss": 0.5948,
458
+ "step": 4500
459
+ },
460
+ {
461
+ "epoch": 0.45,
462
+ "eval_accuracy": 0.6864022746793828,
463
+ "eval_loss": 0.5826652646064758,
464
+ "eval_runtime": 121.7629,
465
+ "eval_samples_per_second": 277.285,
466
+ "eval_steps_per_second": 17.337,
467
+ "step": 4500
468
+ },
469
+ {
470
+ "epoch": 0.46,
471
+ "eval_accuracy": 0.6799751206942511,
472
+ "eval_loss": 0.5849358439445496,
473
+ "eval_runtime": 118.4911,
474
+ "eval_samples_per_second": 284.941,
475
+ "eval_steps_per_second": 17.816,
476
+ "step": 4600
477
+ },
478
+ {
479
+ "epoch": 0.47,
480
+ "eval_accuracy": 0.6880608950626426,
481
+ "eval_loss": 0.584037721157074,
482
+ "eval_runtime": 121.4782,
483
+ "eval_samples_per_second": 277.935,
484
+ "eval_steps_per_second": 17.378,
485
+ "step": 4700
486
+ },
487
+ {
488
+ "epoch": 0.48,
489
+ "eval_accuracy": 0.6825815241536594,
490
+ "eval_loss": 0.5843004584312439,
491
+ "eval_runtime": 120.7083,
492
+ "eval_samples_per_second": 279.707,
493
+ "eval_steps_per_second": 17.488,
494
+ "step": 4800
495
+ },
496
+ {
497
+ "epoch": 0.49,
498
+ "eval_accuracy": 0.6855729644877528,
499
+ "eval_loss": 0.5819188356399536,
500
+ "eval_runtime": 119.2894,
501
+ "eval_samples_per_second": 283.034,
502
+ "eval_steps_per_second": 17.696,
503
+ "step": 4900
504
+ },
505
+ {
506
+ "epoch": 0.5,
507
+ "learning_rate": 1.0016971149046621e-05,
508
+ "loss": 0.5862,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "epoch": 0.5,
513
+ "eval_accuracy": 0.6876462399668276,
514
+ "eval_loss": 0.5809924602508545,
515
+ "eval_runtime": 118.4258,
516
+ "eval_samples_per_second": 285.098,
517
+ "eval_steps_per_second": 17.826,
518
+ "step": 5000
519
+ },
520
+ {
521
+ "epoch": 0.51,
522
+ "eval_accuracy": 0.67268903829636,
523
+ "eval_loss": 0.5965932607650757,
524
+ "eval_runtime": 120.9456,
525
+ "eval_samples_per_second": 279.158,
526
+ "eval_steps_per_second": 17.454,
527
+ "step": 5100
528
+ },
529
+ {
530
+ "epoch": 0.52,
531
+ "eval_accuracy": 0.6881793679471611,
532
+ "eval_loss": 0.582930862903595,
533
+ "eval_runtime": 121.936,
534
+ "eval_samples_per_second": 276.891,
535
+ "eval_steps_per_second": 17.312,
536
+ "step": 5200
537
+ },
538
+ {
539
+ "epoch": 0.53,
540
+ "eval_accuracy": 0.6695198886354885,
541
+ "eval_loss": 0.5954956412315369,
542
+ "eval_runtime": 120.2241,
543
+ "eval_samples_per_second": 280.834,
544
+ "eval_steps_per_second": 17.559,
545
+ "step": 5300
546
+ },
547
+ {
548
+ "epoch": 0.54,
549
+ "eval_accuracy": 0.6855433462666233,
550
+ "eval_loss": 0.5801506638526917,
551
+ "eval_runtime": 123.0327,
552
+ "eval_samples_per_second": 274.423,
553
+ "eval_steps_per_second": 17.158,
554
+ "step": 5400
555
+ },
556
+ {
557
+ "epoch": 0.55,
558
+ "learning_rate": 9.018668263951284e-06,
559
+ "loss": 0.5868,
560
+ "step": 5500
561
+ },
562
+ {
563
+ "epoch": 0.55,
564
+ "eval_accuracy": 0.6835589254509374,
565
+ "eval_loss": 0.5841577649116516,
566
+ "eval_runtime": 122.0071,
567
+ "eval_samples_per_second": 276.73,
568
+ "eval_steps_per_second": 17.302,
569
+ "step": 5500
570
+ },
571
+ {
572
+ "epoch": 0.56,
573
+ "eval_accuracy": 0.6864318929005124,
574
+ "eval_loss": 0.5806400775909424,
575
+ "eval_runtime": 121.8556,
576
+ "eval_samples_per_second": 277.074,
577
+ "eval_steps_per_second": 17.324,
578
+ "step": 5600
579
+ },
580
+ {
581
+ "epoch": 0.57,
582
+ "eval_accuracy": 0.6876462399668276,
583
+ "eval_loss": 0.582034170627594,
584
+ "eval_runtime": 121.9909,
585
+ "eval_samples_per_second": 276.767,
586
+ "eval_steps_per_second": 17.305,
587
+ "step": 5700
588
+ },
589
+ {
590
+ "epoch": 0.58,
591
+ "eval_accuracy": 0.6849213636229008,
592
+ "eval_loss": 0.5870340466499329,
593
+ "eval_runtime": 123.4237,
594
+ "eval_samples_per_second": 273.554,
595
+ "eval_steps_per_second": 17.104,
596
+ "step": 5800
597
+ },
598
+ {
599
+ "epoch": 0.59,
600
+ "eval_accuracy": 0.6870538755442348,
601
+ "eval_loss": 0.579352080821991,
602
+ "eval_runtime": 123.7102,
603
+ "eval_samples_per_second": 272.92,
604
+ "eval_steps_per_second": 17.064,
605
+ "step": 5900
606
+ },
607
+ {
608
+ "epoch": 0.6,
609
+ "learning_rate": 8.020365378855946e-06,
610
+ "loss": 0.5868,
611
+ "step": 6000
612
+ },
613
+ {
614
+ "epoch": 0.6,
615
+ "eval_accuracy": 0.6886828777063649,
616
+ "eval_loss": 0.5769456624984741,
617
+ "eval_runtime": 121.7224,
618
+ "eval_samples_per_second": 277.377,
619
+ "eval_steps_per_second": 17.343,
620
+ "step": 6000
621
+ },
622
+ {
623
+ "epoch": 0.61,
624
+ "eval_accuracy": 0.6878239492936055,
625
+ "eval_loss": 0.5787535309791565,
626
+ "eval_runtime": 118.515,
627
+ "eval_samples_per_second": 284.884,
628
+ "eval_steps_per_second": 17.812,
629
+ "step": 6100
630
+ },
631
+ {
632
+ "epoch": 0.62,
633
+ "eval_accuracy": 0.6861653289103457,
634
+ "eval_loss": 0.5844166278839111,
635
+ "eval_runtime": 119.9114,
636
+ "eval_samples_per_second": 281.566,
637
+ "eval_steps_per_second": 17.605,
638
+ "step": 6200
639
+ },
640
+ {
641
+ "epoch": 0.63,
642
+ "eval_accuracy": 0.6863430382371235,
643
+ "eval_loss": 0.5797388553619385,
644
+ "eval_runtime": 121.613,
645
+ "eval_samples_per_second": 277.627,
646
+ "eval_steps_per_second": 17.358,
647
+ "step": 6300
648
+ },
649
+ {
650
+ "epoch": 0.64,
651
+ "eval_accuracy": 0.6922370642419217,
652
+ "eval_loss": 0.5815860033035278,
653
+ "eval_runtime": 121.9323,
654
+ "eval_samples_per_second": 276.9,
655
+ "eval_steps_per_second": 17.313,
656
+ "step": 6400
657
+ },
658
+ {
659
+ "epoch": 0.65,
660
+ "learning_rate": 7.022062493760607e-06,
661
+ "loss": 0.5872,
662
+ "step": 6500
663
+ },
664
+ {
665
+ "epoch": 0.65,
666
+ "eval_accuracy": 0.6891863874655688,
667
+ "eval_loss": 0.5755571126937866,
668
+ "eval_runtime": 120.5961,
669
+ "eval_samples_per_second": 279.968,
670
+ "eval_steps_per_second": 17.505,
671
+ "step": 6500
672
+ },
673
+ {
674
+ "epoch": 0.66,
675
+ "eval_accuracy": 0.6912892811657732,
676
+ "eval_loss": 0.5765994191169739,
677
+ "eval_runtime": 118.5726,
678
+ "eval_samples_per_second": 284.745,
679
+ "eval_steps_per_second": 17.803,
680
+ "step": 6600
681
+ },
682
+ {
683
+ "epoch": 0.67,
684
+ "eval_accuracy": 0.6933033202025887,
685
+ "eval_loss": 0.5784236192703247,
686
+ "eval_runtime": 120.5154,
687
+ "eval_samples_per_second": 280.155,
688
+ "eval_steps_per_second": 17.516,
689
+ "step": 6700
690
+ },
691
+ {
692
+ "epoch": 0.68,
693
+ "eval_accuracy": 0.6937475935195332,
694
+ "eval_loss": 0.5751758813858032,
695
+ "eval_runtime": 121.1939,
696
+ "eval_samples_per_second": 278.587,
697
+ "eval_steps_per_second": 17.418,
698
+ "step": 6800
699
+ },
700
+ {
701
+ "epoch": 0.69,
702
+ "eval_accuracy": 0.6900453158783284,
703
+ "eval_loss": 0.5770964622497559,
704
+ "eval_runtime": 122.265,
705
+ "eval_samples_per_second": 276.146,
706
+ "eval_steps_per_second": 17.266,
707
+ "step": 6900
708
+ },
709
+ {
710
+ "epoch": 0.7,
711
+ "learning_rate": 6.02375960866527e-06,
712
+ "loss": 0.584,
713
+ "step": 7000
714
+ },
715
+ {
716
+ "epoch": 0.7,
717
+ "eval_accuracy": 0.691466990492551,
718
+ "eval_loss": 0.5755699872970581,
719
+ "eval_runtime": 120.2886,
720
+ "eval_samples_per_second": 280.683,
721
+ "eval_steps_per_second": 17.549,
722
+ "step": 7000
723
+ },
724
+ {
725
+ "epoch": 0.71,
726
+ "eval_accuracy": 0.6928294286645144,
727
+ "eval_loss": 0.5778803825378418,
728
+ "eval_runtime": 119.22,
729
+ "eval_samples_per_second": 283.199,
730
+ "eval_steps_per_second": 17.707,
731
+ "step": 7100
732
+ },
733
+ {
734
+ "epoch": 0.72,
735
+ "eval_accuracy": 0.691052335396736,
736
+ "eval_loss": 0.5816224813461304,
737
+ "eval_runtime": 121.6119,
738
+ "eval_samples_per_second": 277.629,
739
+ "eval_steps_per_second": 17.359,
740
+ "step": 7200
741
+ },
742
+ {
743
+ "epoch": 0.73,
744
+ "eval_accuracy": 0.6899268429938098,
745
+ "eval_loss": 0.5753040313720703,
746
+ "eval_runtime": 122.0359,
747
+ "eval_samples_per_second": 276.665,
748
+ "eval_steps_per_second": 17.298,
749
+ "step": 7300
750
+ },
751
+ {
752
+ "epoch": 0.74,
753
+ "eval_accuracy": 0.6927109557799959,
754
+ "eval_loss": 0.578673779964447,
755
+ "eval_runtime": 120.6028,
756
+ "eval_samples_per_second": 279.952,
757
+ "eval_steps_per_second": 17.504,
758
+ "step": 7400
759
+ },
760
+ {
761
+ "epoch": 0.75,
762
+ "learning_rate": 5.0254567235699314e-06,
763
+ "loss": 0.5812,
764
+ "step": 7500
765
+ },
766
+ {
767
+ "epoch": 0.75,
768
+ "eval_accuracy": 0.6879720403992536,
769
+ "eval_loss": 0.5793206691741943,
770
+ "eval_runtime": 121.5038,
771
+ "eval_samples_per_second": 277.876,
772
+ "eval_steps_per_second": 17.374,
773
+ "step": 7500
774
+ },
775
+ {
776
+ "epoch": 0.76,
777
+ "eval_accuracy": 0.6891567692444391,
778
+ "eval_loss": 0.5748796463012695,
779
+ "eval_runtime": 120.9422,
780
+ "eval_samples_per_second": 279.166,
781
+ "eval_steps_per_second": 17.455,
782
+ "step": 7600
783
+ },
784
+ {
785
+ "epoch": 0.77,
786
+ "eval_accuracy": 0.6884459319373278,
787
+ "eval_loss": 0.5827967524528503,
788
+ "eval_runtime": 120.4061,
789
+ "eval_samples_per_second": 280.409,
790
+ "eval_steps_per_second": 17.532,
791
+ "step": 7700
792
+ },
793
+ {
794
+ "epoch": 0.78,
795
+ "eval_accuracy": 0.6921482095785327,
796
+ "eval_loss": 0.5722939372062683,
797
+ "eval_runtime": 120.2444,
798
+ "eval_samples_per_second": 280.787,
799
+ "eval_steps_per_second": 17.556,
800
+ "step": 7800
801
+ },
802
+ {
803
+ "epoch": 0.79,
804
+ "eval_accuracy": 0.6930071379912922,
805
+ "eval_loss": 0.5720646381378174,
806
+ "eval_runtime": 120.4878,
807
+ "eval_samples_per_second": 280.219,
808
+ "eval_steps_per_second": 17.52,
809
+ "step": 7900
810
+ },
811
+ {
812
+ "epoch": 0.8,
813
+ "learning_rate": 4.027153838474593e-06,
814
+ "loss": 0.5754,
815
+ "step": 8000
816
+ },
817
+ {
818
+ "epoch": 0.8,
819
+ "eval_accuracy": 0.6890382963599206,
820
+ "eval_loss": 0.5755317211151123,
821
+ "eval_runtime": 121.8943,
822
+ "eval_samples_per_second": 276.986,
823
+ "eval_steps_per_second": 17.318,
824
+ "step": 8000
825
+ },
826
+ {
827
+ "epoch": 0.81,
828
+ "eval_accuracy": 0.6926813375588662,
829
+ "eval_loss": 0.580621063709259,
830
+ "eval_runtime": 120.5342,
831
+ "eval_samples_per_second": 280.111,
832
+ "eval_steps_per_second": 17.514,
833
+ "step": 8100
834
+ },
835
+ {
836
+ "epoch": 0.82,
837
+ "eval_accuracy": 0.6944880490477742,
838
+ "eval_loss": 0.5727642178535461,
839
+ "eval_runtime": 121.4458,
840
+ "eval_samples_per_second": 278.009,
841
+ "eval_steps_per_second": 17.382,
842
+ "step": 8200
843
+ },
844
+ {
845
+ "epoch": 0.83,
846
+ "eval_accuracy": 0.6914966087136807,
847
+ "eval_loss": 0.5764381885528564,
848
+ "eval_runtime": 119.5977,
849
+ "eval_samples_per_second": 282.305,
850
+ "eval_steps_per_second": 17.651,
851
+ "step": 8300
852
+ },
853
+ {
854
+ "epoch": 0.84,
855
+ "eval_accuracy": 0.693925302846311,
856
+ "eval_loss": 0.5707160830497742,
857
+ "eval_runtime": 119.8766,
858
+ "eval_samples_per_second": 281.648,
859
+ "eval_steps_per_second": 17.61,
860
+ "step": 8400
861
+ },
862
+ {
863
+ "epoch": 0.85,
864
+ "learning_rate": 3.0288509533792554e-06,
865
+ "loss": 0.5743,
866
+ "step": 8500
867
+ },
868
+ {
869
+ "epoch": 0.85,
870
+ "eval_accuracy": 0.6939845392885703,
871
+ "eval_loss": 0.5710459351539612,
872
+ "eval_runtime": 121.7301,
873
+ "eval_samples_per_second": 277.36,
874
+ "eval_steps_per_second": 17.342,
875
+ "step": 8500
876
+ },
877
+ {
878
+ "epoch": 0.86,
879
+ "eval_accuracy": 0.6937475935195332,
880
+ "eval_loss": 0.5757591128349304,
881
+ "eval_runtime": 121.0195,
882
+ "eval_samples_per_second": 278.988,
883
+ "eval_steps_per_second": 17.443,
884
+ "step": 8600
885
+ },
886
+ {
887
+ "epoch": 0.87,
888
+ "eval_accuracy": 0.6917039362615881,
889
+ "eval_loss": 0.574606716632843,
890
+ "eval_runtime": 124.4692,
891
+ "eval_samples_per_second": 271.256,
892
+ "eval_steps_per_second": 16.96,
893
+ "step": 8700
894
+ },
895
+ {
896
+ "epoch": 0.88,
897
+ "eval_accuracy": 0.6962355240944229,
898
+ "eval_loss": 0.5732554197311401,
899
+ "eval_runtime": 120.9096,
900
+ "eval_samples_per_second": 279.242,
901
+ "eval_steps_per_second": 17.459,
902
+ "step": 8800
903
+ },
904
+ {
905
+ "epoch": 0.89,
906
+ "eval_accuracy": 0.6945769037111631,
907
+ "eval_loss": 0.5710918307304382,
908
+ "eval_runtime": 119.8896,
909
+ "eval_samples_per_second": 281.617,
910
+ "eval_steps_per_second": 17.608,
911
+ "step": 8900
912
+ },
913
+ {
914
+ "epoch": 0.9,
915
+ "learning_rate": 2.0305480682839176e-06,
916
+ "loss": 0.5664,
917
+ "step": 9000
918
+ },
919
+ {
920
+ "epoch": 0.9,
921
+ "eval_accuracy": 0.6929775197701626,
922
+ "eval_loss": 0.57152259349823,
923
+ "eval_runtime": 120.0446,
924
+ "eval_samples_per_second": 281.254,
925
+ "eval_steps_per_second": 17.585,
926
+ "step": 9000
927
+ },
928
+ {
929
+ "epoch": 0.91,
930
+ "eval_accuracy": 0.6967686520747564,
931
+ "eval_loss": 0.5711397528648376,
932
+ "eval_runtime": 119.8872,
933
+ "eval_samples_per_second": 281.623,
934
+ "eval_steps_per_second": 17.608,
935
+ "step": 9100
936
+ },
937
+ {
938
+ "epoch": 0.92,
939
+ "eval_accuracy": 0.6950211770281077,
940
+ "eval_loss": 0.5720114707946777,
941
+ "eval_runtime": 119.8996,
942
+ "eval_samples_per_second": 281.594,
943
+ "eval_steps_per_second": 17.606,
944
+ "step": 9200
945
+ },
946
+ {
947
+ "epoch": 0.93,
948
+ "eval_accuracy": 0.6930663744335516,
949
+ "eval_loss": 0.5728961825370789,
950
+ "eval_runtime": 121.5387,
951
+ "eval_samples_per_second": 277.796,
952
+ "eval_steps_per_second": 17.369,
953
+ "step": 9300
954
+ },
955
+ {
956
+ "epoch": 0.94,
957
+ "eval_accuracy": 0.6957912507774783,
958
+ "eval_loss": 0.5721818804740906,
959
+ "eval_runtime": 121.3258,
960
+ "eval_samples_per_second": 278.284,
961
+ "eval_steps_per_second": 17.399,
962
+ "step": 9400
963
+ },
964
+ {
965
+ "epoch": 0.95,
966
+ "learning_rate": 1.0322451831885795e-06,
967
+ "loss": 0.5659,
968
+ "step": 9500
969
+ },
970
+ {
971
+ "epoch": 0.95,
972
+ "eval_accuracy": 0.6956727778929598,
973
+ "eval_loss": 0.5728616118431091,
974
+ "eval_runtime": 122.9243,
975
+ "eval_samples_per_second": 274.665,
976
+ "eval_steps_per_second": 17.173,
977
+ "step": 9500
978
+ },
979
+ {
980
+ "epoch": 0.96,
981
+ "eval_accuracy": 0.695406213902793,
982
+ "eval_loss": 0.571672797203064,
983
+ "eval_runtime": 117.4658,
984
+ "eval_samples_per_second": 287.428,
985
+ "eval_steps_per_second": 17.971,
986
+ "step": 9600
987
+ },
988
+ {
989
+ "epoch": 0.97,
990
+ "eval_accuracy": 0.6966205609691082,
991
+ "eval_loss": 0.569725513458252,
992
+ "eval_runtime": 122.1907,
993
+ "eval_samples_per_second": 276.314,
994
+ "eval_steps_per_second": 17.276,
995
+ "step": 9700
996
+ },
997
+ {
998
+ "epoch": 0.98,
999
+ "eval_accuracy": 0.6969759796226639,
1000
+ "eval_loss": 0.5699070692062378,
1001
+ "eval_runtime": 119.8449,
1002
+ "eval_samples_per_second": 281.722,
1003
+ "eval_steps_per_second": 17.614,
1004
+ "step": 9800
1005
+ },
1006
+ {
1007
+ "epoch": 0.99,
1008
+ "eval_accuracy": 0.6957616325563487,
1009
+ "eval_loss": 0.5702488422393799,
1010
+ "eval_runtime": 121.0927,
1011
+ "eval_samples_per_second": 278.82,
1012
+ "eval_steps_per_second": 17.433,
1013
+ "step": 9900
1014
+ },
1015
+ {
1016
+ "epoch": 1.0,
1017
+ "learning_rate": 3.394229809324149e-08,
1018
+ "loss": 0.5699,
1019
+ "step": 10000
1020
+ },
1021
+ {
1022
+ "epoch": 1.0,
1023
+ "eval_accuracy": 0.6955839232295709,
1024
+ "eval_loss": 0.5703100562095642,
1025
+ "eval_runtime": 120.0243,
1026
+ "eval_samples_per_second": 281.301,
1027
+ "eval_steps_per_second": 17.588,
1028
+ "step": 10000
1029
+ }
1030
+ ],
1031
+ "max_steps": 10017,
1032
+ "num_train_epochs": 1,
1033
+ "total_flos": 7221381585213888.0,
1034
+ "trial_name": null,
1035
+ "trial_params": null
1036
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff