fenffef commited on
Commit
104a3bb
·
1 Parent(s): 0020371

Upload 8 files

Browse files
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./exps/bart-base-chinese",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 101,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 102,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 102,
27
+ "forced_eos_token_id": 102,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": 100,
42
+ "max_position_embeddings": 1024,
43
+ "model_type": "bart",
44
+ "no_repeat_ngram_size": 3,
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_beams": 4,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 0,
50
+ "scale_embedding": false,
51
+ "task_specific_params": {
52
+ "summarization": {
53
+ "length_penalty": 1.0,
54
+ "max_length": 128,
55
+ "min_length": 12,
56
+ "num_beams": 4
57
+ },
58
+ "summarization_cnn": {
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "num_beams": 4
63
+ },
64
+ "summarization_xsum": {
65
+ "length_penalty": 1.0,
66
+ "max_length": 62,
67
+ "min_length": 11,
68
+ "num_beams": 6
69
+ }
70
+ },
71
+ "tokenizer_class": "BertTokenizer",
72
+ "torch_dtype": "float32",
73
+ "transformers_version": "4.35.0",
74
+ "use_cache": true,
75
+ "vocab_size": 51271
76
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 101,
4
+ "decoder_start_token_id": 102,
5
+ "early_stopping": true,
6
+ "eos_token_id": 102,
7
+ "forced_eos_token_id": 102,
8
+ "no_repeat_ngram_size": 3,
9
+ "num_beams": 4,
10
+ "pad_token_id": 0,
11
+ "transformers_version": "4.35.0"
12
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc9b29007b36839abcb5984ef93d01db43ff18df367cde740b9c13164d6d486
3
+ size 129236992
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 1000000000000000019884624838656,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
trainer_state.json ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.999672313792313,
5
+ "eval_steps": 500,
6
+ "global_step": 76290,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2.980993577139861e-06,
14
+ "loss": 1.8224,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 2.9619871542797224e-06,
20
+ "loss": 1.6774,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 2.9429807314195834e-06,
26
+ "loss": 1.651,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.26,
31
+ "learning_rate": 2.9239743085594444e-06,
32
+ "loss": 1.6331,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 2.9049678856993053e-06,
38
+ "loss": 1.6218,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.39,
43
+ "learning_rate": 2.8859614628391667e-06,
44
+ "loss": 1.6129,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.46,
49
+ "learning_rate": 2.8669550399790273e-06,
50
+ "loss": 1.6042,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.52,
55
+ "learning_rate": 2.8479486171188882e-06,
56
+ "loss": 1.596,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.59,
61
+ "learning_rate": 2.8289421942587496e-06,
62
+ "loss": 1.5899,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.66,
67
+ "learning_rate": 2.8099357713986106e-06,
68
+ "loss": 1.5834,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.72,
73
+ "learning_rate": 2.7909293485384715e-06,
74
+ "loss": 1.5772,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.79,
79
+ "learning_rate": 2.771922925678333e-06,
80
+ "loss": 1.5719,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.85,
85
+ "learning_rate": 2.752916502818194e-06,
86
+ "loss": 1.567,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.92,
91
+ "learning_rate": 2.733910079958055e-06,
92
+ "loss": 1.5625,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.98,
97
+ "learning_rate": 2.714903657097916e-06,
98
+ "loss": 1.5588,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 1.0,
103
+ "eval_loss": 1.544177770614624,
104
+ "eval_runtime": 7.0475,
105
+ "eval_samples_per_second": 139.908,
106
+ "eval_steps_per_second": 0.851,
107
+ "step": 7629
108
+ },
109
+ {
110
+ "epoch": 1.05,
111
+ "learning_rate": 2.695897234237777e-06,
112
+ "loss": 1.5539,
113
+ "step": 8000
114
+ },
115
+ {
116
+ "epoch": 1.11,
117
+ "learning_rate": 2.6768908113776377e-06,
118
+ "loss": 1.5502,
119
+ "step": 8500
120
+ },
121
+ {
122
+ "epoch": 1.18,
123
+ "learning_rate": 2.6578843885174987e-06,
124
+ "loss": 1.5473,
125
+ "step": 9000
126
+ },
127
+ {
128
+ "epoch": 1.25,
129
+ "learning_rate": 2.63887796565736e-06,
130
+ "loss": 1.5443,
131
+ "step": 9500
132
+ },
133
+ {
134
+ "epoch": 1.31,
135
+ "learning_rate": 2.619871542797221e-06,
136
+ "loss": 1.5411,
137
+ "step": 10000
138
+ },
139
+ {
140
+ "epoch": 1.38,
141
+ "learning_rate": 2.600865119937082e-06,
142
+ "loss": 1.5391,
143
+ "step": 10500
144
+ },
145
+ {
146
+ "epoch": 1.44,
147
+ "learning_rate": 2.5818586970769434e-06,
148
+ "loss": 1.5366,
149
+ "step": 11000
150
+ },
151
+ {
152
+ "epoch": 1.51,
153
+ "learning_rate": 2.5628522742168044e-06,
154
+ "loss": 1.5344,
155
+ "step": 11500
156
+ },
157
+ {
158
+ "epoch": 1.57,
159
+ "learning_rate": 2.5438458513566653e-06,
160
+ "loss": 1.532,
161
+ "step": 12000
162
+ },
163
+ {
164
+ "epoch": 1.64,
165
+ "learning_rate": 2.5248394284965263e-06,
166
+ "loss": 1.5295,
167
+ "step": 12500
168
+ },
169
+ {
170
+ "epoch": 1.7,
171
+ "learning_rate": 2.5058330056363877e-06,
172
+ "loss": 1.5286,
173
+ "step": 13000
174
+ },
175
+ {
176
+ "epoch": 1.77,
177
+ "learning_rate": 2.4868265827762487e-06,
178
+ "loss": 1.526,
179
+ "step": 13500
180
+ },
181
+ {
182
+ "epoch": 1.84,
183
+ "learning_rate": 2.4678201599161096e-06,
184
+ "loss": 1.5247,
185
+ "step": 14000
186
+ },
187
+ {
188
+ "epoch": 1.9,
189
+ "learning_rate": 2.448813737055971e-06,
190
+ "loss": 1.5235,
191
+ "step": 14500
192
+ },
193
+ {
194
+ "epoch": 1.97,
195
+ "learning_rate": 2.429807314195832e-06,
196
+ "loss": 1.5219,
197
+ "step": 15000
198
+ },
199
+ {
200
+ "epoch": 2.0,
201
+ "eval_loss": 1.5164680480957031,
202
+ "eval_runtime": 6.9148,
203
+ "eval_samples_per_second": 142.593,
204
+ "eval_steps_per_second": 0.868,
205
+ "step": 15258
206
+ },
207
+ {
208
+ "epoch": 2.03,
209
+ "learning_rate": 2.410800891335693e-06,
210
+ "loss": 1.5194,
211
+ "step": 15500
212
+ },
213
+ {
214
+ "epoch": 2.1,
215
+ "learning_rate": 2.391794468475554e-06,
216
+ "loss": 1.5185,
217
+ "step": 16000
218
+ },
219
+ {
220
+ "epoch": 2.16,
221
+ "learning_rate": 2.372788045615415e-06,
222
+ "loss": 1.5166,
223
+ "step": 16500
224
+ },
225
+ {
226
+ "epoch": 2.23,
227
+ "learning_rate": 2.353781622755276e-06,
228
+ "loss": 1.516,
229
+ "step": 17000
230
+ },
231
+ {
232
+ "epoch": 2.29,
233
+ "learning_rate": 2.334775199895137e-06,
234
+ "loss": 1.5149,
235
+ "step": 17500
236
+ },
237
+ {
238
+ "epoch": 2.36,
239
+ "learning_rate": 2.315768777034998e-06,
240
+ "loss": 1.5137,
241
+ "step": 18000
242
+ },
243
+ {
244
+ "epoch": 2.42,
245
+ "learning_rate": 2.296762354174859e-06,
246
+ "loss": 1.5127,
247
+ "step": 18500
248
+ },
249
+ {
250
+ "epoch": 2.49,
251
+ "learning_rate": 2.27775593131472e-06,
252
+ "loss": 1.5111,
253
+ "step": 19000
254
+ },
255
+ {
256
+ "epoch": 2.56,
257
+ "learning_rate": 2.258749508454581e-06,
258
+ "loss": 1.5103,
259
+ "step": 19500
260
+ },
261
+ {
262
+ "epoch": 2.62,
263
+ "learning_rate": 2.239743085594442e-06,
264
+ "loss": 1.5095,
265
+ "step": 20000
266
+ },
267
+ {
268
+ "epoch": 2.69,
269
+ "learning_rate": 2.220736662734303e-06,
270
+ "loss": 1.5082,
271
+ "step": 20500
272
+ },
273
+ {
274
+ "epoch": 2.75,
275
+ "learning_rate": 2.2017302398741644e-06,
276
+ "loss": 1.5075,
277
+ "step": 21000
278
+ },
279
+ {
280
+ "epoch": 2.82,
281
+ "learning_rate": 2.1827238170140254e-06,
282
+ "loss": 1.5064,
283
+ "step": 21500
284
+ },
285
+ {
286
+ "epoch": 2.88,
287
+ "learning_rate": 2.1637173941538863e-06,
288
+ "loss": 1.5056,
289
+ "step": 22000
290
+ },
291
+ {
292
+ "epoch": 2.95,
293
+ "learning_rate": 2.1447109712937477e-06,
294
+ "loss": 1.5053,
295
+ "step": 22500
296
+ },
297
+ {
298
+ "epoch": 3.0,
299
+ "eval_loss": 1.5032986402511597,
300
+ "eval_runtime": 7.0258,
301
+ "eval_samples_per_second": 140.341,
302
+ "eval_steps_per_second": 0.854,
303
+ "step": 22887
304
+ },
305
+ {
306
+ "epoch": 3.01,
307
+ "learning_rate": 2.1257045484336087e-06,
308
+ "loss": 1.5047,
309
+ "step": 23000
310
+ },
311
+ {
312
+ "epoch": 3.08,
313
+ "learning_rate": 2.1066981255734696e-06,
314
+ "loss": 1.5028,
315
+ "step": 23500
316
+ },
317
+ {
318
+ "epoch": 3.15,
319
+ "learning_rate": 2.0876917027133306e-06,
320
+ "loss": 1.5022,
321
+ "step": 24000
322
+ },
323
+ {
324
+ "epoch": 3.21,
325
+ "learning_rate": 2.068685279853192e-06,
326
+ "loss": 1.5014,
327
+ "step": 24500
328
+ },
329
+ {
330
+ "epoch": 3.28,
331
+ "learning_rate": 2.049678856993053e-06,
332
+ "loss": 1.5005,
333
+ "step": 25000
334
+ },
335
+ {
336
+ "epoch": 3.34,
337
+ "learning_rate": 2.030672434132914e-06,
338
+ "loss": 1.5001,
339
+ "step": 25500
340
+ },
341
+ {
342
+ "epoch": 3.41,
343
+ "learning_rate": 2.011666011272775e-06,
344
+ "loss": 1.4992,
345
+ "step": 26000
346
+ },
347
+ {
348
+ "epoch": 3.47,
349
+ "learning_rate": 1.9926595884126363e-06,
350
+ "loss": 1.4986,
351
+ "step": 26500
352
+ },
353
+ {
354
+ "epoch": 3.54,
355
+ "learning_rate": 1.9736531655524972e-06,
356
+ "loss": 1.498,
357
+ "step": 27000
358
+ },
359
+ {
360
+ "epoch": 3.6,
361
+ "learning_rate": 1.9546467426923586e-06,
362
+ "loss": 1.4974,
363
+ "step": 27500
364
+ },
365
+ {
366
+ "epoch": 3.67,
367
+ "learning_rate": 1.9356403198322196e-06,
368
+ "loss": 1.4969,
369
+ "step": 28000
370
+ },
371
+ {
372
+ "epoch": 3.74,
373
+ "learning_rate": 1.91663389697208e-06,
374
+ "loss": 1.4963,
375
+ "step": 28500
376
+ },
377
+ {
378
+ "epoch": 3.8,
379
+ "learning_rate": 1.8976274741119413e-06,
380
+ "loss": 1.4957,
381
+ "step": 29000
382
+ },
383
+ {
384
+ "epoch": 3.87,
385
+ "learning_rate": 1.8786210512518025e-06,
386
+ "loss": 1.4952,
387
+ "step": 29500
388
+ },
389
+ {
390
+ "epoch": 3.93,
391
+ "learning_rate": 1.8596146283916634e-06,
392
+ "loss": 1.4945,
393
+ "step": 30000
394
+ },
395
+ {
396
+ "epoch": 4.0,
397
+ "learning_rate": 1.8406082055315244e-06,
398
+ "loss": 1.4935,
399
+ "step": 30500
400
+ },
401
+ {
402
+ "epoch": 4.0,
403
+ "eval_loss": 1.4955875873565674,
404
+ "eval_runtime": 6.9086,
405
+ "eval_samples_per_second": 142.721,
406
+ "eval_steps_per_second": 0.868,
407
+ "step": 30517
408
+ },
409
+ {
410
+ "epoch": 4.06,
411
+ "learning_rate": 1.8216017826713858e-06,
412
+ "loss": 1.493,
413
+ "step": 31000
414
+ },
415
+ {
416
+ "epoch": 4.13,
417
+ "learning_rate": 1.8025953598112466e-06,
418
+ "loss": 1.4925,
419
+ "step": 31500
420
+ },
421
+ {
422
+ "epoch": 4.19,
423
+ "learning_rate": 1.7835889369511077e-06,
424
+ "loss": 1.4921,
425
+ "step": 32000
426
+ },
427
+ {
428
+ "epoch": 4.26,
429
+ "learning_rate": 1.764582514090969e-06,
430
+ "loss": 1.4916,
431
+ "step": 32500
432
+ },
433
+ {
434
+ "epoch": 4.33,
435
+ "learning_rate": 1.7455760912308294e-06,
436
+ "loss": 1.4914,
437
+ "step": 33000
438
+ },
439
+ {
440
+ "epoch": 4.39,
441
+ "learning_rate": 1.7265696683706906e-06,
442
+ "loss": 1.4906,
443
+ "step": 33500
444
+ },
445
+ {
446
+ "epoch": 4.46,
447
+ "learning_rate": 1.7075632455105518e-06,
448
+ "loss": 1.4902,
449
+ "step": 34000
450
+ },
451
+ {
452
+ "epoch": 4.52,
453
+ "learning_rate": 1.688556822650413e-06,
454
+ "loss": 1.4898,
455
+ "step": 34500
456
+ },
457
+ {
458
+ "epoch": 4.59,
459
+ "learning_rate": 1.669550399790274e-06,
460
+ "loss": 1.4892,
461
+ "step": 35000
462
+ },
463
+ {
464
+ "epoch": 4.65,
465
+ "learning_rate": 1.650543976930135e-06,
466
+ "loss": 1.4891,
467
+ "step": 35500
468
+ },
469
+ {
470
+ "epoch": 4.72,
471
+ "learning_rate": 1.6315375540699963e-06,
472
+ "loss": 1.4886,
473
+ "step": 36000
474
+ },
475
+ {
476
+ "epoch": 4.78,
477
+ "learning_rate": 1.6125311312098573e-06,
478
+ "loss": 1.4881,
479
+ "step": 36500
480
+ },
481
+ {
482
+ "epoch": 4.85,
483
+ "learning_rate": 1.5935247083497182e-06,
484
+ "loss": 1.4879,
485
+ "step": 37000
486
+ },
487
+ {
488
+ "epoch": 4.92,
489
+ "learning_rate": 1.5745182854895796e-06,
490
+ "loss": 1.4878,
491
+ "step": 37500
492
+ },
493
+ {
494
+ "epoch": 4.98,
495
+ "learning_rate": 1.5555118626294406e-06,
496
+ "loss": 1.4873,
497
+ "step": 38000
498
+ },
499
+ {
500
+ "epoch": 5.0,
501
+ "eval_loss": 1.4904649257659912,
502
+ "eval_runtime": 6.8983,
503
+ "eval_samples_per_second": 142.934,
504
+ "eval_steps_per_second": 0.87,
505
+ "step": 38146
506
+ },
507
+ {
508
+ "epoch": 5.05,
509
+ "learning_rate": 1.5365054397693011e-06,
510
+ "loss": 1.4875,
511
+ "step": 38500
512
+ },
513
+ {
514
+ "epoch": 5.11,
515
+ "learning_rate": 1.5174990169091623e-06,
516
+ "loss": 1.4863,
517
+ "step": 39000
518
+ },
519
+ {
520
+ "epoch": 5.18,
521
+ "learning_rate": 1.4984925940490235e-06,
522
+ "loss": 1.4857,
523
+ "step": 39500
524
+ },
525
+ {
526
+ "epoch": 5.24,
527
+ "learning_rate": 1.4794861711888844e-06,
528
+ "loss": 1.4854,
529
+ "step": 40000
530
+ },
531
+ {
532
+ "epoch": 5.31,
533
+ "learning_rate": 1.4604797483287458e-06,
534
+ "loss": 1.4848,
535
+ "step": 40500
536
+ },
537
+ {
538
+ "epoch": 5.37,
539
+ "learning_rate": 1.4414733254686068e-06,
540
+ "loss": 1.485,
541
+ "step": 41000
542
+ },
543
+ {
544
+ "epoch": 5.44,
545
+ "learning_rate": 1.4224669026084677e-06,
546
+ "loss": 1.4848,
547
+ "step": 41500
548
+ },
549
+ {
550
+ "epoch": 5.51,
551
+ "learning_rate": 1.4034604797483287e-06,
552
+ "loss": 1.4842,
553
+ "step": 42000
554
+ },
555
+ {
556
+ "epoch": 5.57,
557
+ "learning_rate": 1.38445405688819e-06,
558
+ "loss": 1.4839,
559
+ "step": 42500
560
+ },
561
+ {
562
+ "epoch": 5.64,
563
+ "learning_rate": 1.365447634028051e-06,
564
+ "loss": 1.4836,
565
+ "step": 43000
566
+ },
567
+ {
568
+ "epoch": 5.7,
569
+ "learning_rate": 1.346441211167912e-06,
570
+ "loss": 1.4834,
571
+ "step": 43500
572
+ },
573
+ {
574
+ "epoch": 5.77,
575
+ "learning_rate": 1.327434788307773e-06,
576
+ "loss": 1.4831,
577
+ "step": 44000
578
+ },
579
+ {
580
+ "epoch": 5.83,
581
+ "learning_rate": 1.308428365447634e-06,
582
+ "loss": 1.483,
583
+ "step": 44500
584
+ },
585
+ {
586
+ "epoch": 5.9,
587
+ "learning_rate": 1.289421942587495e-06,
588
+ "loss": 1.4827,
589
+ "step": 45000
590
+ },
591
+ {
592
+ "epoch": 5.96,
593
+ "learning_rate": 1.2704155197273563e-06,
594
+ "loss": 1.4826,
595
+ "step": 45500
596
+ },
597
+ {
598
+ "epoch": 6.0,
599
+ "eval_loss": 1.4877293109893799,
600
+ "eval_runtime": 6.98,
601
+ "eval_samples_per_second": 141.261,
602
+ "eval_steps_per_second": 0.86,
603
+ "step": 45775
604
+ },
605
+ {
606
+ "epoch": 6.03,
607
+ "learning_rate": 1.2514090968672173e-06,
608
+ "loss": 1.4818,
609
+ "step": 46000
610
+ },
611
+ {
612
+ "epoch": 6.09,
613
+ "learning_rate": 1.2324026740070782e-06,
614
+ "loss": 1.4819,
615
+ "step": 46500
616
+ },
617
+ {
618
+ "epoch": 6.16,
619
+ "learning_rate": 1.2133962511469392e-06,
620
+ "loss": 1.4815,
621
+ "step": 47000
622
+ },
623
+ {
624
+ "epoch": 6.23,
625
+ "learning_rate": 1.1943898282868006e-06,
626
+ "loss": 1.4814,
627
+ "step": 47500
628
+ },
629
+ {
630
+ "epoch": 6.29,
631
+ "learning_rate": 1.1753834054266616e-06,
632
+ "loss": 1.4809,
633
+ "step": 48000
634
+ },
635
+ {
636
+ "epoch": 6.36,
637
+ "learning_rate": 1.1563769825665225e-06,
638
+ "loss": 1.4803,
639
+ "step": 48500
640
+ },
641
+ {
642
+ "epoch": 6.42,
643
+ "learning_rate": 1.1373705597063835e-06,
644
+ "loss": 1.4808,
645
+ "step": 49000
646
+ },
647
+ {
648
+ "epoch": 6.49,
649
+ "learning_rate": 1.1183641368462449e-06,
650
+ "loss": 1.4806,
651
+ "step": 49500
652
+ },
653
+ {
654
+ "epoch": 6.55,
655
+ "learning_rate": 1.0993577139861058e-06,
656
+ "loss": 1.4803,
657
+ "step": 50000
658
+ },
659
+ {
660
+ "epoch": 6.62,
661
+ "learning_rate": 1.0803512911259668e-06,
662
+ "loss": 1.4801,
663
+ "step": 50500
664
+ },
665
+ {
666
+ "epoch": 6.68,
667
+ "learning_rate": 1.0613448682658278e-06,
668
+ "loss": 1.4797,
669
+ "step": 51000
670
+ },
671
+ {
672
+ "epoch": 6.75,
673
+ "learning_rate": 1.0423384454056887e-06,
674
+ "loss": 1.4796,
675
+ "step": 51500
676
+ },
677
+ {
678
+ "epoch": 6.82,
679
+ "learning_rate": 1.02333202254555e-06,
680
+ "loss": 1.4795,
681
+ "step": 52000
682
+ },
683
+ {
684
+ "epoch": 6.88,
685
+ "learning_rate": 1.004325599685411e-06,
686
+ "loss": 1.4793,
687
+ "step": 52500
688
+ },
689
+ {
690
+ "epoch": 6.95,
691
+ "learning_rate": 9.85319176825272e-07,
692
+ "loss": 1.479,
693
+ "step": 53000
694
+ },
695
+ {
696
+ "epoch": 7.0,
697
+ "eval_loss": 1.4859654903411865,
698
+ "eval_runtime": 7.0683,
699
+ "eval_samples_per_second": 139.495,
700
+ "eval_steps_per_second": 0.849,
701
+ "step": 53404
702
+ },
703
+ {
704
+ "epoch": 7.01,
705
+ "learning_rate": 9.66312753965133e-07,
706
+ "loss": 1.4786,
707
+ "step": 53500
708
+ },
709
+ {
710
+ "epoch": 7.08,
711
+ "learning_rate": 9.473063311049942e-07,
712
+ "loss": 1.4788,
713
+ "step": 54000
714
+ },
715
+ {
716
+ "epoch": 7.14,
717
+ "learning_rate": 9.282999082448553e-07,
718
+ "loss": 1.4785,
719
+ "step": 54500
720
+ },
721
+ {
722
+ "epoch": 7.21,
723
+ "learning_rate": 9.092934853847164e-07,
724
+ "loss": 1.4784,
725
+ "step": 55000
726
+ },
727
+ {
728
+ "epoch": 7.27,
729
+ "learning_rate": 8.902870625245774e-07,
730
+ "loss": 1.4781,
731
+ "step": 55500
732
+ },
733
+ {
734
+ "epoch": 7.34,
735
+ "learning_rate": 8.712806396644386e-07,
736
+ "loss": 1.478,
737
+ "step": 56000
738
+ },
739
+ {
740
+ "epoch": 7.41,
741
+ "learning_rate": 8.522742168042993e-07,
742
+ "loss": 1.4779,
743
+ "step": 56500
744
+ },
745
+ {
746
+ "epoch": 7.47,
747
+ "learning_rate": 8.332677939441605e-07,
748
+ "loss": 1.4779,
749
+ "step": 57000
750
+ },
751
+ {
752
+ "epoch": 7.54,
753
+ "learning_rate": 8.142613710840215e-07,
754
+ "loss": 1.4774,
755
+ "step": 57500
756
+ },
757
+ {
758
+ "epoch": 7.6,
759
+ "learning_rate": 7.952549482238825e-07,
760
+ "loss": 1.4775,
761
+ "step": 58000
762
+ },
763
+ {
764
+ "epoch": 7.67,
765
+ "learning_rate": 7.762485253637437e-07,
766
+ "loss": 1.4772,
767
+ "step": 58500
768
+ },
769
+ {
770
+ "epoch": 7.73,
771
+ "learning_rate": 7.572421025036047e-07,
772
+ "loss": 1.4771,
773
+ "step": 59000
774
+ },
775
+ {
776
+ "epoch": 7.8,
777
+ "learning_rate": 7.382356796434658e-07,
778
+ "loss": 1.4771,
779
+ "step": 59500
780
+ },
781
+ {
782
+ "epoch": 7.86,
783
+ "learning_rate": 7.192292567833268e-07,
784
+ "loss": 1.4769,
785
+ "step": 60000
786
+ },
787
+ {
788
+ "epoch": 7.93,
789
+ "learning_rate": 7.00222833923188e-07,
790
+ "loss": 1.4769,
791
+ "step": 60500
792
+ },
793
+ {
794
+ "epoch": 8.0,
795
+ "learning_rate": 6.812164110630491e-07,
796
+ "loss": 1.4771,
797
+ "step": 61000
798
+ },
799
+ {
800
+ "epoch": 8.0,
801
+ "eval_loss": 1.483931303024292,
802
+ "eval_runtime": 6.9286,
803
+ "eval_samples_per_second": 142.308,
804
+ "eval_steps_per_second": 0.866,
805
+ "step": 61034
806
+ },
807
+ {
808
+ "epoch": 8.06,
809
+ "learning_rate": 6.622099882029101e-07,
810
+ "loss": 1.4775,
811
+ "step": 61500
812
+ },
813
+ {
814
+ "epoch": 8.13,
815
+ "learning_rate": 6.432035653427712e-07,
816
+ "loss": 1.4765,
817
+ "step": 62000
818
+ },
819
+ {
820
+ "epoch": 8.19,
821
+ "learning_rate": 6.24197142482632e-07,
822
+ "loss": 1.4762,
823
+ "step": 62500
824
+ },
825
+ {
826
+ "epoch": 8.26,
827
+ "learning_rate": 6.05190719622493e-07,
828
+ "loss": 1.4764,
829
+ "step": 63000
830
+ },
831
+ {
832
+ "epoch": 8.32,
833
+ "learning_rate": 5.861842967623541e-07,
834
+ "loss": 1.4762,
835
+ "step": 63500
836
+ },
837
+ {
838
+ "epoch": 8.39,
839
+ "learning_rate": 5.671778739022152e-07,
840
+ "loss": 1.4762,
841
+ "step": 64000
842
+ },
843
+ {
844
+ "epoch": 8.45,
845
+ "learning_rate": 5.481714510420763e-07,
846
+ "loss": 1.4761,
847
+ "step": 64500
848
+ },
849
+ {
850
+ "epoch": 8.52,
851
+ "learning_rate": 5.291650281819373e-07,
852
+ "loss": 1.4759,
853
+ "step": 65000
854
+ },
855
+ {
856
+ "epoch": 8.59,
857
+ "learning_rate": 5.101586053217985e-07,
858
+ "loss": 1.4758,
859
+ "step": 65500
860
+ },
861
+ {
862
+ "epoch": 8.65,
863
+ "learning_rate": 4.911521824616596e-07,
864
+ "loss": 1.4755,
865
+ "step": 66000
866
+ },
867
+ {
868
+ "epoch": 8.72,
869
+ "learning_rate": 4.7214575960152063e-07,
870
+ "loss": 1.4754,
871
+ "step": 66500
872
+ },
873
+ {
874
+ "epoch": 8.78,
875
+ "learning_rate": 4.531393367413817e-07,
876
+ "loss": 1.4757,
877
+ "step": 67000
878
+ },
879
+ {
880
+ "epoch": 8.85,
881
+ "learning_rate": 4.341329138812427e-07,
882
+ "loss": 1.4758,
883
+ "step": 67500
884
+ },
885
+ {
886
+ "epoch": 8.91,
887
+ "learning_rate": 4.151264910211035e-07,
888
+ "loss": 1.4754,
889
+ "step": 68000
890
+ },
891
+ {
892
+ "epoch": 8.98,
893
+ "learning_rate": 3.961200681609647e-07,
894
+ "loss": 1.4756,
895
+ "step": 68500
896
+ },
897
+ {
898
+ "epoch": 9.0,
899
+ "eval_loss": 1.4828386306762695,
900
+ "eval_runtime": 6.934,
901
+ "eval_samples_per_second": 142.198,
902
+ "eval_steps_per_second": 0.865,
903
+ "step": 68663
904
+ },
905
+ {
906
+ "epoch": 9.04,
907
+ "learning_rate": 3.771136453008257e-07,
908
+ "loss": 1.4747,
909
+ "step": 69000
910
+ },
911
+ {
912
+ "epoch": 9.11,
913
+ "learning_rate": 3.581072224406868e-07,
914
+ "loss": 1.4753,
915
+ "step": 69500
916
+ },
917
+ {
918
+ "epoch": 9.18,
919
+ "learning_rate": 3.3910079958054786e-07,
920
+ "loss": 1.4752,
921
+ "step": 70000
922
+ },
923
+ {
924
+ "epoch": 9.24,
925
+ "learning_rate": 3.20094376720409e-07,
926
+ "loss": 1.4752,
927
+ "step": 70500
928
+ },
929
+ {
930
+ "epoch": 9.31,
931
+ "learning_rate": 3.0108795386027005e-07,
932
+ "loss": 1.4751,
933
+ "step": 71000
934
+ },
935
+ {
936
+ "epoch": 9.37,
937
+ "learning_rate": 2.820815310001311e-07,
938
+ "loss": 1.4751,
939
+ "step": 71500
940
+ },
941
+ {
942
+ "epoch": 9.44,
943
+ "learning_rate": 2.630751081399922e-07,
944
+ "loss": 1.4749,
945
+ "step": 72000
946
+ },
947
+ {
948
+ "epoch": 9.5,
949
+ "learning_rate": 2.4406868527985326e-07,
950
+ "loss": 1.4748,
951
+ "step": 72500
952
+ },
953
+ {
954
+ "epoch": 9.57,
955
+ "learning_rate": 2.2506226241971439e-07,
956
+ "loss": 1.4751,
957
+ "step": 73000
958
+ },
959
+ {
960
+ "epoch": 9.63,
961
+ "learning_rate": 2.0605583955957543e-07,
962
+ "loss": 1.4747,
963
+ "step": 73500
964
+ },
965
+ {
966
+ "epoch": 9.7,
967
+ "learning_rate": 1.870494166994362e-07,
968
+ "loss": 1.4748,
969
+ "step": 74000
970
+ },
971
+ {
972
+ "epoch": 9.77,
973
+ "learning_rate": 1.6804299383929728e-07,
974
+ "loss": 1.475,
975
+ "step": 74500
976
+ },
977
+ {
978
+ "epoch": 9.83,
979
+ "learning_rate": 1.4903657097915838e-07,
980
+ "loss": 1.4746,
981
+ "step": 75000
982
+ },
983
+ {
984
+ "epoch": 9.9,
985
+ "learning_rate": 1.3003014811901947e-07,
986
+ "loss": 1.4752,
987
+ "step": 75500
988
+ },
989
+ {
990
+ "epoch": 9.96,
991
+ "learning_rate": 1.1102372525888054e-07,
992
+ "loss": 1.4749,
993
+ "step": 76000
994
+ },
995
+ {
996
+ "epoch": 10.0,
997
+ "eval_loss": 1.4823230504989624,
998
+ "eval_runtime": 6.8873,
999
+ "eval_samples_per_second": 143.161,
1000
+ "eval_steps_per_second": 0.871,
1001
+ "step": 76290
1002
+ }
1003
+ ],
1004
+ "logging_steps": 500,
1005
+ "max_steps": 76290,
1006
+ "num_train_epochs": 10,
1007
+ "save_steps": 500,
1008
+ "total_flos": 1.609765756918825e+18,
1009
+ "trial_name": null,
1010
+ "trial_params": null
1011
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60713a2452599cf347f1ec66d0e7a5505f4578e8c3138a0e9f3cead71b304ec3
3
+ size 4335
vocab.txt ADDED
The diff for this file is too large to render. See raw diff