AlGe committed on
Commit
8574b5f
·
1 Parent(s): 70a0088

Initial model commit

Browse files
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert/distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForMaskedLM"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "pad_token_id": 0,
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": false,
20
+ "tie_weights_": true,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.38.0.dev0",
23
+ "vocab_size": 30522
24
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d896ec994c0f1257c864da336cdb270ff52c37d333d8a8010a0adec25af553a
3
+ size 267954768
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b61fdd1721cddde02e5c9b44dc44abab444041ee5339c9c7eeb77ff1aceb33b
3
+ size 535972218
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0262c33f41012cbd6f2c2e1c1d2ee441ed917ca305d539e54003cf41a55ef078
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f138103487a53a67a03b5f54425825e5c29acd5184bb795b4c677e1677fe9332
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 1000000000000000019884624838656,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "DistilBertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
trainer_state.json ADDED
@@ -0,0 +1,681 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.148005148005148,
5
+ "eval_steps": 2000,
6
+ "global_step": 40000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "learning_rate": 4.9679536679536684e-05,
14
+ "loss": 2.6056,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 4.935778635778636e-05,
20
+ "loss": 2.1389,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.19,
25
+ "learning_rate": 4.9036036036036035e-05,
26
+ "loss": 1.9778,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.26,
31
+ "learning_rate": 4.8714285714285714e-05,
32
+ "loss": 1.8847,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.26,
37
+ "eval_accuracy": 0.6430994438209938,
38
+ "eval_loss": 1.7282191514968872,
39
+ "eval_runtime": 168.5943,
40
+ "eval_samples_per_second": 163.131,
41
+ "eval_steps_per_second": 20.392,
42
+ "step": 2000
43
+ },
44
+ {
45
+ "epoch": 0.32,
46
+ "learning_rate": 4.8392535392535393e-05,
47
+ "loss": 1.8149,
48
+ "step": 2500
49
+ },
50
+ {
51
+ "epoch": 0.39,
52
+ "learning_rate": 4.807078507078507e-05,
53
+ "loss": 1.7584,
54
+ "step": 3000
55
+ },
56
+ {
57
+ "epoch": 0.45,
58
+ "learning_rate": 4.774903474903475e-05,
59
+ "loss": 1.7177,
60
+ "step": 3500
61
+ },
62
+ {
63
+ "epoch": 0.51,
64
+ "learning_rate": 4.742728442728443e-05,
65
+ "loss": 1.6753,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.51,
70
+ "eval_accuracy": 0.6748989378707684,
71
+ "eval_loss": 1.55081307888031,
72
+ "eval_runtime": 171.3896,
73
+ "eval_samples_per_second": 160.471,
74
+ "eval_steps_per_second": 20.06,
75
+ "step": 4000
76
+ },
77
+ {
78
+ "epoch": 0.58,
79
+ "learning_rate": 4.710553410553411e-05,
80
+ "loss": 1.6493,
81
+ "step": 4500
82
+ },
83
+ {
84
+ "epoch": 0.64,
85
+ "learning_rate": 4.678378378378379e-05,
86
+ "loss": 1.6196,
87
+ "step": 5000
88
+ },
89
+ {
90
+ "epoch": 0.71,
91
+ "learning_rate": 4.646203346203347e-05,
92
+ "loss": 1.5848,
93
+ "step": 5500
94
+ },
95
+ {
96
+ "epoch": 0.77,
97
+ "learning_rate": 4.614028314028314e-05,
98
+ "loss": 1.5664,
99
+ "step": 6000
100
+ },
101
+ {
102
+ "epoch": 0.77,
103
+ "eval_accuracy": 0.6920701992459631,
104
+ "eval_loss": 1.4559696912765503,
105
+ "eval_runtime": 172.8272,
106
+ "eval_samples_per_second": 159.136,
107
+ "eval_steps_per_second": 19.893,
108
+ "step": 6000
109
+ },
110
+ {
111
+ "epoch": 0.84,
112
+ "learning_rate": 4.581853281853282e-05,
113
+ "loss": 1.5418,
114
+ "step": 6500
115
+ },
116
+ {
117
+ "epoch": 0.9,
118
+ "learning_rate": 4.54967824967825e-05,
119
+ "loss": 1.5232,
120
+ "step": 7000
121
+ },
122
+ {
123
+ "epoch": 0.97,
124
+ "learning_rate": 4.517503217503218e-05,
125
+ "loss": 1.5102,
126
+ "step": 7500
127
+ },
128
+ {
129
+ "epoch": 1.03,
130
+ "learning_rate": 4.4853281853281856e-05,
131
+ "loss": 1.4973,
132
+ "step": 8000
133
+ },
134
+ {
135
+ "epoch": 1.03,
136
+ "eval_accuracy": 0.7051554952905896,
137
+ "eval_loss": 1.3886678218841553,
138
+ "eval_runtime": 170.2873,
139
+ "eval_samples_per_second": 161.509,
140
+ "eval_steps_per_second": 20.189,
141
+ "step": 8000
142
+ },
143
+ {
144
+ "epoch": 1.09,
145
+ "learning_rate": 4.4531531531531535e-05,
146
+ "loss": 1.4758,
147
+ "step": 8500
148
+ },
149
+ {
150
+ "epoch": 1.16,
151
+ "learning_rate": 4.421042471042471e-05,
152
+ "loss": 1.4534,
153
+ "step": 9000
154
+ },
155
+ {
156
+ "epoch": 1.22,
157
+ "learning_rate": 4.388867438867439e-05,
158
+ "loss": 1.45,
159
+ "step": 9500
160
+ },
161
+ {
162
+ "epoch": 1.29,
163
+ "learning_rate": 4.356692406692407e-05,
164
+ "loss": 1.4371,
165
+ "step": 10000
166
+ },
167
+ {
168
+ "epoch": 1.29,
169
+ "eval_accuracy": 0.7145240916645529,
170
+ "eval_loss": 1.3381266593933105,
171
+ "eval_runtime": 171.6166,
172
+ "eval_samples_per_second": 160.258,
173
+ "eval_steps_per_second": 20.033,
174
+ "step": 10000
175
+ },
176
+ {
177
+ "epoch": 1.35,
178
+ "learning_rate": 4.324517374517374e-05,
179
+ "loss": 1.4315,
180
+ "step": 10500
181
+ },
182
+ {
183
+ "epoch": 1.42,
184
+ "learning_rate": 4.292342342342343e-05,
185
+ "loss": 1.4166,
186
+ "step": 11000
187
+ },
188
+ {
189
+ "epoch": 1.48,
190
+ "learning_rate": 4.2602316602316605e-05,
191
+ "loss": 1.4071,
192
+ "step": 11500
193
+ },
194
+ {
195
+ "epoch": 1.54,
196
+ "learning_rate": 4.228056628056628e-05,
197
+ "loss": 1.3949,
198
+ "step": 12000
199
+ },
200
+ {
201
+ "epoch": 1.54,
202
+ "eval_accuracy": 0.7217752604656028,
203
+ "eval_loss": 1.3031212091445923,
204
+ "eval_runtime": 171.5333,
205
+ "eval_samples_per_second": 160.336,
206
+ "eval_steps_per_second": 20.043,
207
+ "step": 12000
208
+ },
209
+ {
210
+ "epoch": 1.61,
211
+ "learning_rate": 4.195881595881596e-05,
212
+ "loss": 1.3929,
213
+ "step": 12500
214
+ },
215
+ {
216
+ "epoch": 1.67,
217
+ "learning_rate": 4.163770913770914e-05,
218
+ "loss": 1.3755,
219
+ "step": 13000
220
+ },
221
+ {
222
+ "epoch": 1.74,
223
+ "learning_rate": 4.131595881595882e-05,
224
+ "loss": 1.3731,
225
+ "step": 13500
226
+ },
227
+ {
228
+ "epoch": 1.8,
229
+ "learning_rate": 4.09942084942085e-05,
230
+ "loss": 1.3656,
231
+ "step": 14000
232
+ },
233
+ {
234
+ "epoch": 1.8,
235
+ "eval_accuracy": 0.7279450116990841,
236
+ "eval_loss": 1.2706010341644287,
237
+ "eval_runtime": 172.4413,
238
+ "eval_samples_per_second": 159.492,
239
+ "eval_steps_per_second": 19.937,
240
+ "step": 14000
241
+ },
242
+ {
243
+ "epoch": 1.87,
244
+ "learning_rate": 4.067245817245817e-05,
245
+ "loss": 1.3562,
246
+ "step": 14500
247
+ },
248
+ {
249
+ "epoch": 1.93,
250
+ "learning_rate": 4.0350707850707856e-05,
251
+ "loss": 1.345,
252
+ "step": 15000
253
+ },
254
+ {
255
+ "epoch": 1.99,
256
+ "learning_rate": 4.0028957528957535e-05,
257
+ "loss": 1.3401,
258
+ "step": 15500
259
+ },
260
+ {
261
+ "epoch": 2.06,
262
+ "learning_rate": 3.970720720720721e-05,
263
+ "loss": 1.3271,
264
+ "step": 16000
265
+ },
266
+ {
267
+ "epoch": 2.06,
268
+ "eval_accuracy": 0.7319083261136123,
269
+ "eval_loss": 1.2469459772109985,
270
+ "eval_runtime": 172.8063,
271
+ "eval_samples_per_second": 159.155,
272
+ "eval_steps_per_second": 19.895,
273
+ "step": 16000
274
+ },
275
+ {
276
+ "epoch": 2.12,
277
+ "learning_rate": 3.9385456885456886e-05,
278
+ "loss": 1.3179,
279
+ "step": 16500
280
+ },
281
+ {
282
+ "epoch": 2.19,
283
+ "learning_rate": 3.9064350064350063e-05,
284
+ "loss": 1.3207,
285
+ "step": 17000
286
+ },
287
+ {
288
+ "epoch": 2.25,
289
+ "learning_rate": 3.874259974259974e-05,
290
+ "loss": 1.3092,
291
+ "step": 17500
292
+ },
293
+ {
294
+ "epoch": 2.32,
295
+ "learning_rate": 3.842084942084943e-05,
296
+ "loss": 1.3016,
297
+ "step": 18000
298
+ },
299
+ {
300
+ "epoch": 2.32,
301
+ "eval_accuracy": 0.7369295282061049,
302
+ "eval_loss": 1.2243499755859375,
303
+ "eval_runtime": 171.841,
304
+ "eval_samples_per_second": 160.049,
305
+ "eval_steps_per_second": 20.007,
306
+ "step": 18000
307
+ },
308
+ {
309
+ "epoch": 2.38,
310
+ "learning_rate": 3.80990990990991e-05,
311
+ "loss": 1.2989,
312
+ "step": 18500
313
+ },
314
+ {
315
+ "epoch": 2.45,
316
+ "learning_rate": 3.777734877734878e-05,
317
+ "loss": 1.2902,
318
+ "step": 19000
319
+ },
320
+ {
321
+ "epoch": 2.51,
322
+ "learning_rate": 3.745559845559846e-05,
323
+ "loss": 1.2897,
324
+ "step": 19500
325
+ },
326
+ {
327
+ "epoch": 2.57,
328
+ "learning_rate": 3.713384813384814e-05,
329
+ "loss": 1.2867,
330
+ "step": 20000
331
+ },
332
+ {
333
+ "epoch": 2.57,
334
+ "eval_accuracy": 0.7412163432241851,
335
+ "eval_loss": 1.1988307237625122,
336
+ "eval_runtime": 173.4853,
337
+ "eval_samples_per_second": 158.532,
338
+ "eval_steps_per_second": 19.817,
339
+ "step": 20000
340
+ },
341
+ {
342
+ "epoch": 2.64,
343
+ "learning_rate": 3.681209781209781e-05,
344
+ "loss": 1.2756,
345
+ "step": 20500
346
+ },
347
+ {
348
+ "epoch": 2.7,
349
+ "learning_rate": 3.6490990990990994e-05,
350
+ "loss": 1.273,
351
+ "step": 21000
352
+ },
353
+ {
354
+ "epoch": 2.77,
355
+ "learning_rate": 3.616924066924067e-05,
356
+ "loss": 1.2704,
357
+ "step": 21500
358
+ },
359
+ {
360
+ "epoch": 2.83,
361
+ "learning_rate": 3.5847490347490345e-05,
362
+ "loss": 1.2668,
363
+ "step": 22000
364
+ },
365
+ {
366
+ "epoch": 2.83,
367
+ "eval_accuracy": 0.7440720045251332,
368
+ "eval_loss": 1.1828033924102783,
369
+ "eval_runtime": 174.2101,
370
+ "eval_samples_per_second": 157.873,
371
+ "eval_steps_per_second": 19.735,
372
+ "step": 22000
373
+ },
374
+ {
375
+ "epoch": 2.9,
376
+ "learning_rate": 3.552574002574003e-05,
377
+ "loss": 1.2682,
378
+ "step": 22500
379
+ },
380
+ {
381
+ "epoch": 2.96,
382
+ "learning_rate": 3.52039897039897e-05,
383
+ "loss": 1.2624,
384
+ "step": 23000
385
+ },
386
+ {
387
+ "epoch": 3.02,
388
+ "learning_rate": 3.488223938223938e-05,
389
+ "loss": 1.2505,
390
+ "step": 23500
391
+ },
392
+ {
393
+ "epoch": 3.09,
394
+ "learning_rate": 3.4561132561132566e-05,
395
+ "loss": 1.2426,
396
+ "step": 24000
397
+ },
398
+ {
399
+ "epoch": 3.09,
400
+ "eval_accuracy": 0.7478223153253316,
401
+ "eval_loss": 1.166755199432373,
402
+ "eval_runtime": 174.8548,
403
+ "eval_samples_per_second": 157.29,
404
+ "eval_steps_per_second": 19.662,
405
+ "step": 24000
406
+ },
407
+ {
408
+ "epoch": 3.15,
409
+ "learning_rate": 3.423938223938224e-05,
410
+ "loss": 1.2337,
411
+ "step": 24500
412
+ },
413
+ {
414
+ "epoch": 3.22,
415
+ "learning_rate": 3.391763191763192e-05,
416
+ "loss": 1.2389,
417
+ "step": 25000
418
+ },
419
+ {
420
+ "epoch": 3.28,
421
+ "learning_rate": 3.3595881595881596e-05,
422
+ "loss": 1.2314,
423
+ "step": 25500
424
+ },
425
+ {
426
+ "epoch": 3.35,
427
+ "learning_rate": 3.3274131274131275e-05,
428
+ "loss": 1.2284,
429
+ "step": 26000
430
+ },
431
+ {
432
+ "epoch": 3.35,
433
+ "eval_accuracy": 0.7501192662767082,
434
+ "eval_loss": 1.1552441120147705,
435
+ "eval_runtime": 176.5773,
436
+ "eval_samples_per_second": 155.756,
437
+ "eval_steps_per_second": 19.47,
438
+ "step": 26000
439
+ },
440
+ {
441
+ "epoch": 3.41,
442
+ "learning_rate": 3.2952380952380954e-05,
443
+ "loss": 1.2244,
444
+ "step": 26500
445
+ },
446
+ {
447
+ "epoch": 3.47,
448
+ "learning_rate": 3.263063063063063e-05,
449
+ "loss": 1.2217,
450
+ "step": 27000
451
+ },
452
+ {
453
+ "epoch": 3.54,
454
+ "learning_rate": 3.230888030888031e-05,
455
+ "loss": 1.2204,
456
+ "step": 27500
457
+ },
458
+ {
459
+ "epoch": 3.6,
460
+ "learning_rate": 3.198712998712999e-05,
461
+ "loss": 1.2177,
462
+ "step": 28000
463
+ },
464
+ {
465
+ "epoch": 3.6,
466
+ "eval_accuracy": 0.7529280009402969,
467
+ "eval_loss": 1.140738606452942,
468
+ "eval_runtime": 175.8979,
469
+ "eval_samples_per_second": 156.358,
470
+ "eval_steps_per_second": 19.545,
471
+ "step": 28000
472
+ },
473
+ {
474
+ "epoch": 3.67,
475
+ "learning_rate": 3.166537966537967e-05,
476
+ "loss": 1.2126,
477
+ "step": 28500
478
+ },
479
+ {
480
+ "epoch": 3.73,
481
+ "learning_rate": 3.134427284427285e-05,
482
+ "loss": 1.2123,
483
+ "step": 29000
484
+ },
485
+ {
486
+ "epoch": 3.8,
487
+ "learning_rate": 3.1022522522522526e-05,
488
+ "loss": 1.2015,
489
+ "step": 29500
490
+ },
491
+ {
492
+ "epoch": 3.86,
493
+ "learning_rate": 3.07014157014157e-05,
494
+ "loss": 1.2045,
495
+ "step": 30000
496
+ },
497
+ {
498
+ "epoch": 3.86,
499
+ "eval_accuracy": 0.7548470260547614,
500
+ "eval_loss": 1.1305773258209229,
501
+ "eval_runtime": 177.469,
502
+ "eval_samples_per_second": 154.974,
503
+ "eval_steps_per_second": 19.372,
504
+ "step": 30000
505
+ },
506
+ {
507
+ "epoch": 3.93,
508
+ "learning_rate": 3.037966537966538e-05,
509
+ "loss": 1.2081,
510
+ "step": 30500
511
+ },
512
+ {
513
+ "epoch": 3.99,
514
+ "learning_rate": 3.0057915057915058e-05,
515
+ "loss": 1.2009,
516
+ "step": 31000
517
+ },
518
+ {
519
+ "epoch": 4.05,
520
+ "learning_rate": 2.973616473616474e-05,
521
+ "loss": 1.191,
522
+ "step": 31500
523
+ },
524
+ {
525
+ "epoch": 4.12,
526
+ "learning_rate": 2.9414414414414416e-05,
527
+ "loss": 1.1901,
528
+ "step": 32000
529
+ },
530
+ {
531
+ "epoch": 4.12,
532
+ "eval_accuracy": 0.7579209971336668,
533
+ "eval_loss": 1.1146818399429321,
534
+ "eval_runtime": 176.2162,
535
+ "eval_samples_per_second": 156.075,
536
+ "eval_steps_per_second": 19.51,
537
+ "step": 32000
538
+ },
539
+ {
540
+ "epoch": 4.18,
541
+ "learning_rate": 2.9092664092664095e-05,
542
+ "loss": 1.1939,
543
+ "step": 32500
544
+ },
545
+ {
546
+ "epoch": 4.25,
547
+ "learning_rate": 2.877091377091377e-05,
548
+ "loss": 1.179,
549
+ "step": 33000
550
+ },
551
+ {
552
+ "epoch": 4.31,
553
+ "learning_rate": 2.8449163449163453e-05,
554
+ "loss": 1.1831,
555
+ "step": 33500
556
+ },
557
+ {
558
+ "epoch": 4.38,
559
+ "learning_rate": 2.812741312741313e-05,
560
+ "loss": 1.1801,
561
+ "step": 34000
562
+ },
563
+ {
564
+ "epoch": 4.38,
565
+ "eval_accuracy": 0.7587203295541968,
566
+ "eval_loss": 1.110528588294983,
567
+ "eval_runtime": 177.1827,
568
+ "eval_samples_per_second": 155.224,
569
+ "eval_steps_per_second": 19.404,
570
+ "step": 34000
571
+ },
572
+ {
573
+ "epoch": 4.44,
574
+ "learning_rate": 2.7805662805662808e-05,
575
+ "loss": 1.1761,
576
+ "step": 34500
577
+ },
578
+ {
579
+ "epoch": 4.5,
580
+ "learning_rate": 2.7483912483912483e-05,
581
+ "loss": 1.1767,
582
+ "step": 35000
583
+ },
584
+ {
585
+ "epoch": 4.57,
586
+ "learning_rate": 2.7162805662805664e-05,
587
+ "loss": 1.166,
588
+ "step": 35500
589
+ },
590
+ {
591
+ "epoch": 4.63,
592
+ "learning_rate": 2.6841055341055343e-05,
593
+ "loss": 1.167,
594
+ "step": 36000
595
+ },
596
+ {
597
+ "epoch": 4.63,
598
+ "eval_accuracy": 0.7609539556365092,
599
+ "eval_loss": 1.1016037464141846,
600
+ "eval_runtime": 176.8657,
601
+ "eval_samples_per_second": 155.502,
602
+ "eval_steps_per_second": 19.438,
603
+ "step": 36000
604
+ },
605
+ {
606
+ "epoch": 4.7,
607
+ "learning_rate": 2.6519305019305018e-05,
608
+ "loss": 1.161,
609
+ "step": 36500
610
+ },
611
+ {
612
+ "epoch": 4.76,
613
+ "learning_rate": 2.61975546975547e-05,
614
+ "loss": 1.1658,
615
+ "step": 37000
616
+ },
617
+ {
618
+ "epoch": 4.83,
619
+ "learning_rate": 2.5876447876447878e-05,
620
+ "loss": 1.1656,
621
+ "step": 37500
622
+ },
623
+ {
624
+ "epoch": 4.89,
625
+ "learning_rate": 2.5554697554697553e-05,
626
+ "loss": 1.1543,
627
+ "step": 38000
628
+ },
629
+ {
630
+ "epoch": 4.89,
631
+ "eval_accuracy": 0.7620338199796781,
632
+ "eval_loss": 1.0933340787887573,
633
+ "eval_runtime": 176.4351,
634
+ "eval_samples_per_second": 155.882,
635
+ "eval_steps_per_second": 19.486,
636
+ "step": 38000
637
+ },
638
+ {
639
+ "epoch": 4.95,
640
+ "learning_rate": 2.5233590733590734e-05,
641
+ "loss": 1.1568,
642
+ "step": 38500
643
+ },
644
+ {
645
+ "epoch": 5.02,
646
+ "learning_rate": 2.4911840411840413e-05,
647
+ "loss": 1.1591,
648
+ "step": 39000
649
+ },
650
+ {
651
+ "epoch": 5.08,
652
+ "learning_rate": 2.459009009009009e-05,
653
+ "loss": 1.1485,
654
+ "step": 39500
655
+ },
656
+ {
657
+ "epoch": 5.15,
658
+ "learning_rate": 2.426833976833977e-05,
659
+ "loss": 1.1481,
660
+ "step": 40000
661
+ },
662
+ {
663
+ "epoch": 5.15,
664
+ "eval_accuracy": 0.7639144921830399,
665
+ "eval_loss": 1.0837804079055786,
666
+ "eval_runtime": 179.9041,
667
+ "eval_samples_per_second": 152.876,
668
+ "eval_steps_per_second": 19.11,
669
+ "step": 40000
670
+ }
671
+ ],
672
+ "logging_steps": 500,
673
+ "max_steps": 77700,
674
+ "num_input_tokens_seen": 0,
675
+ "num_train_epochs": 10,
676
+ "save_steps": 2000,
677
+ "total_flos": 1.6967761699709952e+17,
678
+ "train_batch_size": 32,
679
+ "trial_name": null,
680
+ "trial_params": null
681
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1de5b3ec4a2c8531ca2b480371f1a5d94402246fde794b6ca393ec530f9d9dcd
3
+ size 4728
vocab.txt ADDED
The diff for this file is too large to render. See raw diff