d888d commited on
Commit
91d75e1
·
verified ·
1 Parent(s): 666e67d

Upload 7 files

Browse files
Files changed (7) hide show
  1. config.json +55 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +455 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "conv_act": "gelu",
8
+ "conv_kernel_size": 3,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "id2label": {
13
+ "0": "LABEL_0",
14
+ "1": "LABEL_1",
15
+ "2": "LABEL_2",
16
+ "3": "LABEL_3",
17
+ "4": "LABEL_4",
18
+ "5": "LABEL_5",
19
+ "6": "LABEL_6"
20
+ },
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 4096,
23
+ "label2id": {
24
+ "LABEL_0": 0,
25
+ "LABEL_1": 1,
26
+ "LABEL_2": 2,
27
+ "LABEL_3": 3,
28
+ "LABEL_4": 4,
29
+ "LABEL_5": 5,
30
+ "LABEL_6": 6
31
+ },
32
+ "layer_norm_eps": 1e-07,
33
+ "max_position_embeddings": 512,
34
+ "max_relative_positions": -1,
35
+ "model_type": "deberta-v2",
36
+ "norm_rel_ebd": "layer_norm",
37
+ "num_attention_heads": 16,
38
+ "num_hidden_layers": 24,
39
+ "pad_token_id": 0,
40
+ "pooler_dropout": 0,
41
+ "pooler_hidden_act": "gelu",
42
+ "pooler_hidden_size": 1024,
43
+ "pos_att_type": [
44
+ "c2p",
45
+ "p2c"
46
+ ],
47
+ "position_biased_input": false,
48
+ "position_buckets": 256,
49
+ "relative_attention": true,
50
+ "share_att_key": true,
51
+ "torch_dtype": "float32",
52
+ "transformers_version": "4.39.3",
53
+ "type_vocab_size": 0,
54
+ "vocab_size": 12800
55
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d25aa6140ed4b2d72ce0f4e5d1d88c168938042a3b58e634ed8160b51e0ca083
3
+ size 1280651436
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bfdccdb1d1ae0e5544a7a20f5becd4c6192a9c7f336dccdf15c070a62410f9d
3
+ size 2561537340
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28489139e1e95de773495fd7a149c1d0795a7f2a165720f3a2989ae391bd644e
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b8f05025975eda8c6fd7151df997f2fdf1334b654149418d27c46930a65f882
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8554500158408583,
3
+ "best_model_checkpoint": "cn_output/run-0/checkpoint-5775",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5775,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05,
13
+ "grad_norm": 43.52214813232422,
14
+ "learning_rate": 2.7348179693000015e-05,
15
+ "loss": 1.6704,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.1,
20
+ "grad_norm": 16.620838165283203,
21
+ "learning_rate": 2.6866273442903096e-05,
22
+ "loss": 1.3854,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.16,
27
+ "grad_norm": 14.869579315185547,
28
+ "learning_rate": 2.638436719280618e-05,
29
+ "loss": 1.2394,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.21,
34
+ "grad_norm": 16.870759963989258,
35
+ "learning_rate": 2.5902460942709264e-05,
36
+ "loss": 1.1392,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.26,
41
+ "grad_norm": 12.891343116760254,
42
+ "learning_rate": 2.542055469261235e-05,
43
+ "loss": 1.0319,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.31,
48
+ "grad_norm": 7.518686771392822,
49
+ "learning_rate": 2.493864844251543e-05,
50
+ "loss": 1.0726,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.36,
55
+ "grad_norm": 9.59931755065918,
56
+ "learning_rate": 2.4456742192418514e-05,
57
+ "loss": 1.0907,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.42,
62
+ "grad_norm": 15.09188461303711,
63
+ "learning_rate": 2.3974835942321598e-05,
64
+ "loss": 1.038,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.47,
69
+ "grad_norm": 12.715774536132812,
70
+ "learning_rate": 2.3492929692224682e-05,
71
+ "loss": 0.9121,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.52,
76
+ "grad_norm": 23.419095993041992,
77
+ "learning_rate": 2.3011023442127766e-05,
78
+ "loss": 0.8541,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.57,
83
+ "grad_norm": 24.277725219726562,
84
+ "learning_rate": 2.252911719203085e-05,
85
+ "loss": 0.8723,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 0.62,
90
+ "grad_norm": 22.929729461669922,
91
+ "learning_rate": 2.2047210941933934e-05,
92
+ "loss": 0.8625,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 0.68,
97
+ "grad_norm": 12.207918167114258,
98
+ "learning_rate": 2.1565304691837015e-05,
99
+ "loss": 0.8384,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 0.73,
104
+ "grad_norm": 26.28716278076172,
105
+ "learning_rate": 2.10833984417401e-05,
106
+ "loss": 0.8521,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 0.78,
111
+ "grad_norm": 9.733681678771973,
112
+ "learning_rate": 2.0601492191643184e-05,
113
+ "loss": 0.7984,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.83,
118
+ "grad_norm": 25.5473690032959,
119
+ "learning_rate": 2.0119585941546264e-05,
120
+ "loss": 0.7452,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 0.88,
125
+ "grad_norm": 28.462318420410156,
126
+ "learning_rate": 1.963767969144935e-05,
127
+ "loss": 0.7016,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 0.94,
132
+ "grad_norm": 30.851057052612305,
133
+ "learning_rate": 1.9155773441352433e-05,
134
+ "loss": 0.6933,
135
+ "step": 1800
136
+ },
137
+ {
138
+ "epoch": 0.99,
139
+ "grad_norm": 20.66396141052246,
140
+ "learning_rate": 1.8673867191255517e-05,
141
+ "loss": 0.6987,
142
+ "step": 1900
143
+ },
144
+ {
145
+ "epoch": 1.0,
146
+ "eval_accuracy": 0.7933753943217665,
147
+ "eval_f1": 0.7855091479380132,
148
+ "eval_loss": 0.635335385799408,
149
+ "eval_runtime": 15.4807,
150
+ "eval_samples_per_second": 122.863,
151
+ "eval_steps_per_second": 3.876,
152
+ "step": 1925
153
+ },
154
+ {
155
+ "epoch": 1.04,
156
+ "grad_norm": 18.86337661743164,
157
+ "learning_rate": 1.8191960941158598e-05,
158
+ "loss": 0.5131,
159
+ "step": 2000
160
+ },
161
+ {
162
+ "epoch": 1.09,
163
+ "grad_norm": 23.430830001831055,
164
+ "learning_rate": 1.7710054691061682e-05,
165
+ "loss": 0.4268,
166
+ "step": 2100
167
+ },
168
+ {
169
+ "epoch": 1.14,
170
+ "grad_norm": 11.48133373260498,
171
+ "learning_rate": 1.7228148440964766e-05,
172
+ "loss": 0.4388,
173
+ "step": 2200
174
+ },
175
+ {
176
+ "epoch": 1.19,
177
+ "grad_norm": 21.7901668548584,
178
+ "learning_rate": 1.674624219086785e-05,
179
+ "loss": 0.4276,
180
+ "step": 2300
181
+ },
182
+ {
183
+ "epoch": 1.25,
184
+ "grad_norm": 3.37796688079834,
185
+ "learning_rate": 1.6264335940770934e-05,
186
+ "loss": 0.3975,
187
+ "step": 2400
188
+ },
189
+ {
190
+ "epoch": 1.3,
191
+ "grad_norm": 1.8390240669250488,
192
+ "learning_rate": 1.578242969067402e-05,
193
+ "loss": 0.4863,
194
+ "step": 2500
195
+ },
196
+ {
197
+ "epoch": 1.35,
198
+ "grad_norm": 0.9483298063278198,
199
+ "learning_rate": 1.5300523440577103e-05,
200
+ "loss": 0.4216,
201
+ "step": 2600
202
+ },
203
+ {
204
+ "epoch": 1.4,
205
+ "grad_norm": 13.241854667663574,
206
+ "learning_rate": 1.4818617190480184e-05,
207
+ "loss": 0.4522,
208
+ "step": 2700
209
+ },
210
+ {
211
+ "epoch": 1.45,
212
+ "grad_norm": 13.804593086242676,
213
+ "learning_rate": 1.4336710940383268e-05,
214
+ "loss": 0.3998,
215
+ "step": 2800
216
+ },
217
+ {
218
+ "epoch": 1.51,
219
+ "grad_norm": 20.864044189453125,
220
+ "learning_rate": 1.3854804690286352e-05,
221
+ "loss": 0.3561,
222
+ "step": 2900
223
+ },
224
+ {
225
+ "epoch": 1.56,
226
+ "grad_norm": 11.546530723571777,
227
+ "learning_rate": 1.3372898440189434e-05,
228
+ "loss": 0.4525,
229
+ "step": 3000
230
+ },
231
+ {
232
+ "epoch": 1.61,
233
+ "grad_norm": 21.35649871826172,
234
+ "learning_rate": 1.2890992190092519e-05,
235
+ "loss": 0.422,
236
+ "step": 3100
237
+ },
238
+ {
239
+ "epoch": 1.66,
240
+ "grad_norm": 9.798705101013184,
241
+ "learning_rate": 1.2409085939995601e-05,
242
+ "loss": 0.3685,
243
+ "step": 3200
244
+ },
245
+ {
246
+ "epoch": 1.71,
247
+ "grad_norm": 1.4076740741729736,
248
+ "learning_rate": 1.1927179689898684e-05,
249
+ "loss": 0.3969,
250
+ "step": 3300
251
+ },
252
+ {
253
+ "epoch": 1.77,
254
+ "grad_norm": 2.5313684940338135,
255
+ "learning_rate": 1.1445273439801768e-05,
256
+ "loss": 0.3632,
257
+ "step": 3400
258
+ },
259
+ {
260
+ "epoch": 1.82,
261
+ "grad_norm": 4.284488677978516,
262
+ "learning_rate": 1.0963367189704852e-05,
263
+ "loss": 0.3758,
264
+ "step": 3500
265
+ },
266
+ {
267
+ "epoch": 1.87,
268
+ "grad_norm": 87.45575714111328,
269
+ "learning_rate": 1.0481460939607936e-05,
270
+ "loss": 0.3407,
271
+ "step": 3600
272
+ },
273
+ {
274
+ "epoch": 1.92,
275
+ "grad_norm": 18.49854850769043,
276
+ "learning_rate": 9.999554689511019e-06,
277
+ "loss": 0.4158,
278
+ "step": 3700
279
+ },
280
+ {
281
+ "epoch": 1.97,
282
+ "grad_norm": 10.284146308898926,
283
+ "learning_rate": 9.517648439414103e-06,
284
+ "loss": 0.3669,
285
+ "step": 3800
286
+ },
287
+ {
288
+ "epoch": 2.0,
289
+ "eval_accuracy": 0.8496319663512093,
290
+ "eval_f1": 0.8396298133039037,
291
+ "eval_loss": 0.6180713176727295,
292
+ "eval_runtime": 15.5038,
293
+ "eval_samples_per_second": 122.679,
294
+ "eval_steps_per_second": 3.87,
295
+ "step": 3850
296
+ },
297
+ {
298
+ "epoch": 2.03,
299
+ "grad_norm": 2.4232983589172363,
300
+ "learning_rate": 9.035742189317185e-06,
301
+ "loss": 0.2593,
302
+ "step": 3900
303
+ },
304
+ {
305
+ "epoch": 2.08,
306
+ "grad_norm": 14.904269218444824,
307
+ "learning_rate": 8.553835939220268e-06,
308
+ "loss": 0.1672,
309
+ "step": 4000
310
+ },
311
+ {
312
+ "epoch": 2.13,
313
+ "grad_norm": 0.02660948596894741,
314
+ "learning_rate": 8.071929689123352e-06,
315
+ "loss": 0.1319,
316
+ "step": 4100
317
+ },
318
+ {
319
+ "epoch": 2.18,
320
+ "grad_norm": 0.1543588936328888,
321
+ "learning_rate": 7.590023439026435e-06,
322
+ "loss": 0.1873,
323
+ "step": 4200
324
+ },
325
+ {
326
+ "epoch": 2.23,
327
+ "grad_norm": 0.04116074740886688,
328
+ "learning_rate": 7.108117188929519e-06,
329
+ "loss": 0.1706,
330
+ "step": 4300
331
+ },
332
+ {
333
+ "epoch": 2.29,
334
+ "grad_norm": 21.272350311279297,
335
+ "learning_rate": 6.626210938832602e-06,
336
+ "loss": 0.2408,
337
+ "step": 4400
338
+ },
339
+ {
340
+ "epoch": 2.34,
341
+ "grad_norm": 0.016322173178195953,
342
+ "learning_rate": 6.144304688735686e-06,
343
+ "loss": 0.1878,
344
+ "step": 4500
345
+ },
346
+ {
347
+ "epoch": 2.39,
348
+ "grad_norm": 0.12828181684017181,
349
+ "learning_rate": 5.662398438638769e-06,
350
+ "loss": 0.1664,
351
+ "step": 4600
352
+ },
353
+ {
354
+ "epoch": 2.44,
355
+ "grad_norm": 20.833871841430664,
356
+ "learning_rate": 5.180492188541853e-06,
357
+ "loss": 0.1066,
358
+ "step": 4700
359
+ },
360
+ {
361
+ "epoch": 2.49,
362
+ "grad_norm": 0.006888058967888355,
363
+ "learning_rate": 4.698585938444936e-06,
364
+ "loss": 0.1298,
365
+ "step": 4800
366
+ },
367
+ {
368
+ "epoch": 2.55,
369
+ "grad_norm": 0.0369136743247509,
370
+ "learning_rate": 4.21667968834802e-06,
371
+ "loss": 0.1151,
372
+ "step": 4900
373
+ },
374
+ {
375
+ "epoch": 2.6,
376
+ "grad_norm": 0.13510233163833618,
377
+ "learning_rate": 3.7347734382511036e-06,
378
+ "loss": 0.173,
379
+ "step": 5000
380
+ },
381
+ {
382
+ "epoch": 2.65,
383
+ "grad_norm": 0.05811930075287819,
384
+ "learning_rate": 3.252867188154187e-06,
385
+ "loss": 0.1359,
386
+ "step": 5100
387
+ },
388
+ {
389
+ "epoch": 2.7,
390
+ "grad_norm": 0.07640138268470764,
391
+ "learning_rate": 2.7709609380572702e-06,
392
+ "loss": 0.1293,
393
+ "step": 5200
394
+ },
395
+ {
396
+ "epoch": 2.75,
397
+ "grad_norm": 0.019262025132775307,
398
+ "learning_rate": 2.289054687960354e-06,
399
+ "loss": 0.1205,
400
+ "step": 5300
401
+ },
402
+ {
403
+ "epoch": 2.81,
404
+ "grad_norm": 16.199689865112305,
405
+ "learning_rate": 1.8071484378634369e-06,
406
+ "loss": 0.1713,
407
+ "step": 5400
408
+ },
409
+ {
410
+ "epoch": 2.86,
411
+ "grad_norm": 0.010484320111572742,
412
+ "learning_rate": 1.3252421877665204e-06,
413
+ "loss": 0.145,
414
+ "step": 5500
415
+ },
416
+ {
417
+ "epoch": 2.91,
418
+ "grad_norm": 0.06876770406961441,
419
+ "learning_rate": 8.43335937669604e-07,
420
+ "loss": 0.1405,
421
+ "step": 5600
422
+ },
423
+ {
424
+ "epoch": 2.96,
425
+ "grad_norm": 0.3863673210144043,
426
+ "learning_rate": 3.6142968757268745e-07,
427
+ "loss": 0.1295,
428
+ "step": 5700
429
+ },
430
+ {
431
+ "epoch": 3.0,
432
+ "eval_accuracy": 0.8654048370136698,
433
+ "eval_f1": 0.8554500158408583,
434
+ "eval_loss": 0.7277432680130005,
435
+ "eval_runtime": 15.4912,
436
+ "eval_samples_per_second": 122.779,
437
+ "eval_steps_per_second": 3.873,
438
+ "step": 5775
439
+ }
440
+ ],
441
+ "logging_steps": 100,
442
+ "max_steps": 5775,
443
+ "num_input_tokens_seen": 0,
444
+ "num_train_epochs": 3,
445
+ "save_steps": 500,
446
+ "total_flos": 4219575531135264.0,
447
+ "train_batch_size": 8,
448
+ "trial_name": null,
449
+ "trial_params": {
450
+ "learning_rate": 2.783008594309693e-05,
451
+ "lr_scheduler_type": "linear",
452
+ "num_train_epochs": 3,
453
+ "per_device_train_batch_size": 8
454
+ }
455
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f29ef3614e8dc39f7548f4c40ef000ff35bec5715582ffcddd443752ddbd2a5
3
+ size 4920