fenffef commited on
Commit
7812b01
·
1 Parent(s): 8da1868

Upload 9 files

Browse files
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "init_mem_cpu_alloc_delta": 2530299,
4
+ "init_mem_cpu_peaked_delta": 19115,
5
+ "init_mem_gpu_alloc_delta": 560977408,
6
+ "init_mem_gpu_peaked_delta": 0,
7
+ "train_mem_cpu_alloc_delta": 18524938,
8
+ "train_mem_cpu_peaked_delta": 158585376,
9
+ "train_mem_gpu_alloc_delta": 1692413440,
10
+ "train_mem_gpu_peaked_delta": 10976766464,
11
+ "train_runtime": 81966.5582,
12
+ "train_samples": 2039878,
13
+ "train_samples_per_second": 0.486
14
+ }
config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./exps/bart-base-chinese",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 101,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 102,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 102,
27
+ "forced_eos_token_id": 102,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": 100,
42
+ "max_position_embeddings": 1024,
43
+ "model_type": "bart",
44
+ "no_repeat_ngram_size": 3,
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_beams": 4,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 0,
50
+ "scale_embedding": false,
51
+ "task_specific_params": {
52
+ "summarization": {
53
+ "length_penalty": 1.0,
54
+ "max_length": 128,
55
+ "min_length": 12,
56
+ "num_beams": 4
57
+ },
58
+ "summarization_cnn": {
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "num_beams": 4
63
+ },
64
+ "summarization_xsum": {
65
+ "length_penalty": 1.0,
66
+ "max_length": 62,
67
+ "min_length": 11,
68
+ "num_beams": 6
69
+ }
70
+ },
71
+ "tokenizer_class": "BertTokenizer",
72
+ "transformers_version": "4.4.1",
73
+ "use_cache": true,
74
+ "vocab_size": 51271
75
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea53bcee568c3824685208d1531a0b60c92b162f80828761d9598ae07e9db8c6
3
+ size 561073657
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./exps/bart-base-chinese"}
train_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "init_mem_cpu_alloc_delta": 2530299,
4
+ "init_mem_cpu_peaked_delta": 19115,
5
+ "init_mem_gpu_alloc_delta": 560977408,
6
+ "init_mem_gpu_peaked_delta": 0,
7
+ "train_mem_cpu_alloc_delta": 18524938,
8
+ "train_mem_cpu_peaked_delta": 158585376,
9
+ "train_mem_gpu_alloc_delta": 1692413440,
10
+ "train_mem_gpu_peaked_delta": 10976766464,
11
+ "train_runtime": 81966.5582,
12
+ "train_samples": 2039878,
13
+ "train_samples_per_second": 0.486
14
+ }
trainer_state.json ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.999937252933425,
5
+ "global_step": 39840,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.13,
12
+ "learning_rate": 2.9636044176706825e-06,
13
+ "loss": 2.8976,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 2.927208835341366e-06,
19
+ "loss": 2.6491,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 2.8908132530120482e-06,
25
+ "loss": 2.5824,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 2.854417670682731e-06,
31
+ "loss": 2.5408,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.63,
36
+ "learning_rate": 2.8180220883534135e-06,
37
+ "loss": 2.5086,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 2.7816265060240964e-06,
43
+ "loss": 2.4829,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 2.7452309236947792e-06,
49
+ "loss": 2.4628,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "eval_loss": 2.4354708194732666,
55
+ "eval_runtime": 73.1046,
56
+ "eval_samples_per_second": 281.843,
57
+ "step": 3984
58
+ },
59
+ {
60
+ "epoch": 1.0,
61
+ "learning_rate": 2.7088353413654617e-06,
62
+ "loss": 2.4451,
63
+ "step": 4000
64
+ },
65
+ {
66
+ "epoch": 1.13,
67
+ "learning_rate": 2.672439759036145e-06,
68
+ "loss": 2.4269,
69
+ "step": 4500
70
+ },
71
+ {
72
+ "epoch": 1.26,
73
+ "learning_rate": 2.6360441767068274e-06,
74
+ "loss": 2.4118,
75
+ "step": 5000
76
+ },
77
+ {
78
+ "epoch": 1.38,
79
+ "learning_rate": 2.5996485943775102e-06,
80
+ "loss": 2.3986,
81
+ "step": 5500
82
+ },
83
+ {
84
+ "epoch": 1.51,
85
+ "learning_rate": 2.5632530120481927e-06,
86
+ "loss": 2.387,
87
+ "step": 6000
88
+ },
89
+ {
90
+ "epoch": 1.63,
91
+ "learning_rate": 2.5268574297188755e-06,
92
+ "loss": 2.376,
93
+ "step": 6500
94
+ },
95
+ {
96
+ "epoch": 1.76,
97
+ "learning_rate": 2.4904618473895584e-06,
98
+ "loss": 2.366,
99
+ "step": 7000
100
+ },
101
+ {
102
+ "epoch": 1.88,
103
+ "learning_rate": 2.454066265060241e-06,
104
+ "loss": 2.3572,
105
+ "step": 7500
106
+ },
107
+ {
108
+ "epoch": 2.0,
109
+ "eval_loss": 2.3434979915618896,
110
+ "eval_runtime": 73.6648,
111
+ "eval_samples_per_second": 279.699,
112
+ "step": 7968
113
+ },
114
+ {
115
+ "epoch": 2.01,
116
+ "learning_rate": 2.4176706827309237e-06,
117
+ "loss": 2.3493,
118
+ "step": 8000
119
+ },
120
+ {
121
+ "epoch": 2.13,
122
+ "learning_rate": 2.3812751004016065e-06,
123
+ "loss": 2.3394,
124
+ "step": 8500
125
+ },
126
+ {
127
+ "epoch": 2.26,
128
+ "learning_rate": 2.344879518072289e-06,
129
+ "loss": 2.3331,
130
+ "step": 9000
131
+ },
132
+ {
133
+ "epoch": 2.38,
134
+ "learning_rate": 2.308483935742972e-06,
135
+ "loss": 2.3268,
136
+ "step": 9500
137
+ },
138
+ {
139
+ "epoch": 2.51,
140
+ "learning_rate": 2.2720883534136547e-06,
141
+ "loss": 2.3215,
142
+ "step": 10000
143
+ },
144
+ {
145
+ "epoch": 2.64,
146
+ "learning_rate": 2.2356927710843376e-06,
147
+ "loss": 2.3149,
148
+ "step": 10500
149
+ },
150
+ {
151
+ "epoch": 2.76,
152
+ "learning_rate": 2.19929718875502e-06,
153
+ "loss": 2.3102,
154
+ "step": 11000
155
+ },
156
+ {
157
+ "epoch": 2.89,
158
+ "learning_rate": 2.162901606425703e-06,
159
+ "loss": 2.3042,
160
+ "step": 11500
161
+ },
162
+ {
163
+ "epoch": 3.0,
164
+ "eval_loss": 2.2957260608673096,
165
+ "eval_runtime": 73.5103,
166
+ "eval_samples_per_second": 280.287,
167
+ "step": 11952
168
+ },
169
+ {
170
+ "epoch": 3.01,
171
+ "learning_rate": 2.1265060240963857e-06,
172
+ "loss": 2.3003,
173
+ "step": 12000
174
+ },
175
+ {
176
+ "epoch": 3.14,
177
+ "learning_rate": 2.090110441767068e-06,
178
+ "loss": 2.2937,
179
+ "step": 12500
180
+ },
181
+ {
182
+ "epoch": 3.26,
183
+ "learning_rate": 2.053714859437751e-06,
184
+ "loss": 2.2903,
185
+ "step": 13000
186
+ },
187
+ {
188
+ "epoch": 3.39,
189
+ "learning_rate": 2.017319277108434e-06,
190
+ "loss": 2.2849,
191
+ "step": 13500
192
+ },
193
+ {
194
+ "epoch": 3.51,
195
+ "learning_rate": 1.9809236947791167e-06,
196
+ "loss": 2.2819,
197
+ "step": 14000
198
+ },
199
+ {
200
+ "epoch": 3.64,
201
+ "learning_rate": 1.9445281124497996e-06,
202
+ "loss": 2.2775,
203
+ "step": 14500
204
+ },
205
+ {
206
+ "epoch": 3.77,
207
+ "learning_rate": 1.908132530120482e-06,
208
+ "loss": 2.2734,
209
+ "step": 15000
210
+ },
211
+ {
212
+ "epoch": 3.89,
213
+ "learning_rate": 1.871736947791165e-06,
214
+ "loss": 2.2714,
215
+ "step": 15500
216
+ },
217
+ {
218
+ "epoch": 4.0,
219
+ "eval_loss": 2.2644200325012207,
220
+ "eval_runtime": 73.6047,
221
+ "eval_samples_per_second": 279.928,
222
+ "step": 15936
223
+ },
224
+ {
225
+ "epoch": 4.02,
226
+ "learning_rate": 1.8353413654618473e-06,
227
+ "loss": 2.2707,
228
+ "step": 16000
229
+ },
230
+ {
231
+ "epoch": 4.14,
232
+ "learning_rate": 1.7989457831325302e-06,
233
+ "loss": 2.265,
234
+ "step": 16500
235
+ },
236
+ {
237
+ "epoch": 4.27,
238
+ "learning_rate": 1.7625502008032132e-06,
239
+ "loss": 2.2617,
240
+ "step": 17000
241
+ },
242
+ {
243
+ "epoch": 4.39,
244
+ "learning_rate": 1.7261546184738955e-06,
245
+ "loss": 2.2593,
246
+ "step": 17500
247
+ },
248
+ {
249
+ "epoch": 4.52,
250
+ "learning_rate": 1.6897590361445783e-06,
251
+ "loss": 2.256,
252
+ "step": 18000
253
+ },
254
+ {
255
+ "epoch": 4.64,
256
+ "learning_rate": 1.6533634538152614e-06,
257
+ "loss": 2.2539,
258
+ "step": 18500
259
+ },
260
+ {
261
+ "epoch": 4.77,
262
+ "learning_rate": 1.616967871485944e-06,
263
+ "loss": 2.2507,
264
+ "step": 19000
265
+ },
266
+ {
267
+ "epoch": 4.89,
268
+ "learning_rate": 1.5805722891566265e-06,
269
+ "loss": 2.2492,
270
+ "step": 19500
271
+ },
272
+ {
273
+ "epoch": 5.0,
274
+ "eval_loss": 2.2463769912719727,
275
+ "eval_runtime": 73.2603,
276
+ "eval_samples_per_second": 281.244,
277
+ "step": 19920
278
+ },
279
+ {
280
+ "epoch": 5.02,
281
+ "learning_rate": 1.5441767068273091e-06,
282
+ "loss": 2.2493,
283
+ "step": 20000
284
+ },
285
+ {
286
+ "epoch": 5.15,
287
+ "learning_rate": 1.5077811244979922e-06,
288
+ "loss": 2.2447,
289
+ "step": 20500
290
+ },
291
+ {
292
+ "epoch": 5.27,
293
+ "learning_rate": 1.4713855421686746e-06,
294
+ "loss": 2.2426,
295
+ "step": 21000
296
+ },
297
+ {
298
+ "epoch": 5.4,
299
+ "learning_rate": 1.4349899598393575e-06,
300
+ "loss": 2.2412,
301
+ "step": 21500
302
+ },
303
+ {
304
+ "epoch": 5.52,
305
+ "learning_rate": 1.3985943775100403e-06,
306
+ "loss": 2.2393,
307
+ "step": 22000
308
+ },
309
+ {
310
+ "epoch": 5.65,
311
+ "learning_rate": 1.3621987951807228e-06,
312
+ "loss": 2.2375,
313
+ "step": 22500
314
+ },
315
+ {
316
+ "epoch": 5.77,
317
+ "learning_rate": 1.3258032128514056e-06,
318
+ "loss": 2.236,
319
+ "step": 23000
320
+ },
321
+ {
322
+ "epoch": 5.9,
323
+ "learning_rate": 1.2894076305220885e-06,
324
+ "loss": 2.2342,
325
+ "step": 23500
326
+ },
327
+ {
328
+ "epoch": 6.0,
329
+ "eval_loss": 2.2327213287353516,
330
+ "eval_runtime": 73.1203,
331
+ "eval_samples_per_second": 281.782,
332
+ "step": 23904
333
+ },
334
+ {
335
+ "epoch": 6.02,
336
+ "learning_rate": 1.253012048192771e-06,
337
+ "loss": 2.2346,
338
+ "step": 24000
339
+ },
340
+ {
341
+ "epoch": 6.15,
342
+ "learning_rate": 1.2166164658634538e-06,
343
+ "loss": 2.2312,
344
+ "step": 24500
345
+ },
346
+ {
347
+ "epoch": 6.28,
348
+ "learning_rate": 1.1802208835341366e-06,
349
+ "loss": 2.2291,
350
+ "step": 25000
351
+ },
352
+ {
353
+ "epoch": 6.4,
354
+ "learning_rate": 1.1438253012048195e-06,
355
+ "loss": 2.2283,
356
+ "step": 25500
357
+ },
358
+ {
359
+ "epoch": 6.53,
360
+ "learning_rate": 1.107429718875502e-06,
361
+ "loss": 2.2279,
362
+ "step": 26000
363
+ },
364
+ {
365
+ "epoch": 6.65,
366
+ "learning_rate": 1.0710341365461848e-06,
367
+ "loss": 2.2269,
368
+ "step": 26500
369
+ },
370
+ {
371
+ "epoch": 6.78,
372
+ "learning_rate": 1.0346385542168676e-06,
373
+ "loss": 2.2251,
374
+ "step": 27000
375
+ },
376
+ {
377
+ "epoch": 6.9,
378
+ "learning_rate": 9.9824297188755e-07,
379
+ "loss": 2.2245,
380
+ "step": 27500
381
+ },
382
+ {
383
+ "epoch": 7.0,
384
+ "eval_loss": 2.223916530609131,
385
+ "eval_runtime": 72.7667,
386
+ "eval_samples_per_second": 283.152,
387
+ "step": 27888
388
+ },
389
+ {
390
+ "epoch": 7.03,
391
+ "learning_rate": 9.61847389558233e-07,
392
+ "loss": 2.2247,
393
+ "step": 28000
394
+ },
395
+ {
396
+ "epoch": 7.15,
397
+ "learning_rate": 9.254518072289157e-07,
398
+ "loss": 2.221,
399
+ "step": 28500
400
+ },
401
+ {
402
+ "epoch": 7.28,
403
+ "learning_rate": 8.890562248995984e-07,
404
+ "loss": 2.2214,
405
+ "step": 29000
406
+ },
407
+ {
408
+ "epoch": 7.4,
409
+ "learning_rate": 8.526606425702813e-07,
410
+ "loss": 2.2206,
411
+ "step": 29500
412
+ },
413
+ {
414
+ "epoch": 7.53,
415
+ "learning_rate": 8.162650602409637e-07,
416
+ "loss": 2.2187,
417
+ "step": 30000
418
+ },
419
+ {
420
+ "epoch": 7.66,
421
+ "learning_rate": 7.798694779116466e-07,
422
+ "loss": 2.2178,
423
+ "step": 30500
424
+ },
425
+ {
426
+ "epoch": 7.78,
427
+ "learning_rate": 7.434738955823293e-07,
428
+ "loss": 2.2178,
429
+ "step": 31000
430
+ },
431
+ {
432
+ "epoch": 7.91,
433
+ "learning_rate": 7.070783132530122e-07,
434
+ "loss": 2.2178,
435
+ "step": 31500
436
+ },
437
+ {
438
+ "epoch": 8.0,
439
+ "eval_loss": 2.2174229621887207,
440
+ "eval_runtime": 72.9337,
441
+ "eval_samples_per_second": 282.503,
442
+ "step": 31872
443
+ },
444
+ {
445
+ "epoch": 8.03,
446
+ "learning_rate": 6.706827309236949e-07,
447
+ "loss": 2.2183,
448
+ "step": 32000
449
+ },
450
+ {
451
+ "epoch": 8.16,
452
+ "learning_rate": 6.342871485943774e-07,
453
+ "loss": 2.2157,
454
+ "step": 32500
455
+ },
456
+ {
457
+ "epoch": 8.28,
458
+ "learning_rate": 5.978915662650604e-07,
459
+ "loss": 2.2157,
460
+ "step": 33000
461
+ },
462
+ {
463
+ "epoch": 8.41,
464
+ "learning_rate": 5.614959839357429e-07,
465
+ "loss": 2.2133,
466
+ "step": 33500
467
+ },
468
+ {
469
+ "epoch": 8.53,
470
+ "learning_rate": 5.251004016064259e-07,
471
+ "loss": 2.2145,
472
+ "step": 34000
473
+ },
474
+ {
475
+ "epoch": 8.66,
476
+ "learning_rate": 4.887048192771084e-07,
477
+ "loss": 2.2136,
478
+ "step": 34500
479
+ },
480
+ {
481
+ "epoch": 8.79,
482
+ "learning_rate": 4.5230923694779105e-07,
483
+ "loss": 2.214,
484
+ "step": 35000
485
+ },
486
+ {
487
+ "epoch": 8.91,
488
+ "learning_rate": 4.15913654618474e-07,
489
+ "loss": 2.2134,
490
+ "step": 35500
491
+ },
492
+ {
493
+ "epoch": 9.0,
494
+ "eval_loss": 2.213841676712036,
495
+ "eval_runtime": 72.9648,
496
+ "eval_samples_per_second": 282.383,
497
+ "step": 35856
498
+ },
499
+ {
500
+ "epoch": 9.04,
501
+ "learning_rate": 3.795180722891565e-07,
502
+ "loss": 2.2125,
503
+ "step": 36000
504
+ },
505
+ {
506
+ "epoch": 9.16,
507
+ "learning_rate": 3.431224899598395e-07,
508
+ "loss": 2.2122,
509
+ "step": 36500
510
+ },
511
+ {
512
+ "epoch": 9.29,
513
+ "learning_rate": 3.0672690763052206e-07,
514
+ "loss": 2.2127,
515
+ "step": 37000
516
+ },
517
+ {
518
+ "epoch": 9.41,
519
+ "learning_rate": 2.703313253012047e-07,
520
+ "loss": 2.2116,
521
+ "step": 37500
522
+ },
523
+ {
524
+ "epoch": 9.54,
525
+ "learning_rate": 2.3393574297188764e-07,
526
+ "loss": 2.2102,
527
+ "step": 38000
528
+ },
529
+ {
530
+ "epoch": 9.66,
531
+ "learning_rate": 1.975401606425702e-07,
532
+ "loss": 2.2104,
533
+ "step": 38500
534
+ },
535
+ {
536
+ "epoch": 9.79,
537
+ "learning_rate": 1.6114457831325312e-07,
538
+ "loss": 2.2104,
539
+ "step": 39000
540
+ },
541
+ {
542
+ "epoch": 9.91,
543
+ "learning_rate": 1.2474899598393572e-07,
544
+ "loss": 2.2113,
545
+ "step": 39500
546
+ },
547
+ {
548
+ "epoch": 10.0,
549
+ "eval_loss": 2.212301254272461,
550
+ "eval_runtime": 77.5042,
551
+ "eval_samples_per_second": 265.844,
552
+ "step": 39840
553
+ },
554
+ {
555
+ "epoch": 10.0,
556
+ "step": 39840,
557
+ "total_flos": 1.5755091071960033e+18,
558
+ "train_runtime": 81966.5582,
559
+ "train_samples_per_second": 0.486
560
+ }
561
+ ],
562
+ "max_steps": 39840,
563
+ "num_train_epochs": 10,
564
+ "total_flos": 1.5755091071960033e+18,
565
+ "trial_name": null,
566
+ "trial_params": null
567
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:651fdb408f64c77e296af06e77c855fdd81fc3fb4ad48b701668de9f5632f042
3
+ size 2415
vocab.txt ADDED
The diff for this file is too large to render. See raw diff