Token Classification
Safetensors
English
deberta-v2
shawnrushefsky commited on
Commit
a9f0fc3
·
1 Parent(s): 9e35bf4
README.md DELETED
@@ -1,3 +0,0 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[MASK]": 128000
3
- }
 
 
 
 
checkpoint-6562/added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[MASK]": 128000
3
- }
 
 
 
 
checkpoint-6562/config.json DELETED
@@ -1,69 +0,0 @@
1
- {
2
- "architectures": [
3
- "DebertaV2ForTokenClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "hidden_act": "gelu",
7
- "hidden_dropout_prob": 0.1,
8
- "hidden_size": 768,
9
- "id2label": {
10
- "0": "O",
11
- "1": "B-CHA",
12
- "2": "I-CHA",
13
- "3": "B-LOC",
14
- "4": "I-LOC",
15
- "5": "B-FAC",
16
- "6": "I-FAC",
17
- "7": "B-OBJ",
18
- "8": "I-OBJ",
19
- "9": "B-EVT",
20
- "10": "I-EVT",
21
- "11": "B-ORG",
22
- "12": "I-ORG",
23
- "13": "B-MISC",
24
- "14": "I-MISC"
25
- },
26
- "initializer_range": 0.02,
27
- "intermediate_size": 3072,
28
- "label2id": {
29
- "B-CHA": 1,
30
- "B-EVT": 9,
31
- "B-FAC": 5,
32
- "B-LOC": 3,
33
- "B-MISC": 13,
34
- "B-OBJ": 7,
35
- "B-ORG": 11,
36
- "I-CHA": 2,
37
- "I-EVT": 10,
38
- "I-FAC": 6,
39
- "I-LOC": 4,
40
- "I-MISC": 14,
41
- "I-OBJ": 8,
42
- "I-ORG": 12,
43
- "O": 0
44
- },
45
- "layer_norm_eps": 1e-07,
46
- "legacy": true,
47
- "max_position_embeddings": 512,
48
- "max_relative_positions": -1,
49
- "model_type": "deberta-v2",
50
- "norm_rel_ebd": "layer_norm",
51
- "num_attention_heads": 12,
52
- "num_hidden_layers": 12,
53
- "pad_token_id": 0,
54
- "pooler_dropout": 0,
55
- "pooler_hidden_act": "gelu",
56
- "pooler_hidden_size": 768,
57
- "pos_att_type": [
58
- "p2c",
59
- "c2p"
60
- ],
61
- "position_biased_input": false,
62
- "position_buckets": 256,
63
- "relative_attention": true,
64
- "share_att_key": true,
65
- "torch_dtype": "float32",
66
- "transformers_version": "4.55.4",
67
- "type_vocab_size": 0,
68
- "vocab_size": 128100
69
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6562/special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "[CLS]",
3
- "cls_token": "[CLS]",
4
- "eos_token": "[SEP]",
5
- "mask_token": "[MASK]",
6
- "pad_token": "[PAD]",
7
- "sep_token": "[SEP]",
8
- "unk_token": {
9
- "content": "[UNK]",
10
- "lstrip": false,
11
- "normalized": true,
12
- "rstrip": false,
13
- "single_word": false
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6562/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-6562/tokenizer_config.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "[CLS]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "[SEP]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "[UNK]",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "128000": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "bos_token": "[CLS]",
45
- "clean_up_tokenization_spaces": false,
46
- "cls_token": "[CLS]",
47
- "do_lower_case": false,
48
- "eos_token": "[SEP]",
49
- "extra_special_tokens": {},
50
- "mask_token": "[MASK]",
51
- "model_max_length": 1000000000000000019884624838656,
52
- "pad_token": "[PAD]",
53
- "sep_token": "[SEP]",
54
- "sp_model_kwargs": {},
55
- "split_by_punct": false,
56
- "tokenizer_class": "DebertaV2Tokenizer",
57
- "unk_token": "[UNK]",
58
- "vocab_type": "spm"
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6562/trainer_state.json DELETED
@@ -1,951 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.8520971302428256,
6
- "eval_steps": 500,
7
- "global_step": 6562,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.006492663290481756,
14
- "grad_norm": 23.6250057220459,
15
- "learning_rate": 1.9600000000000003e-06,
16
- "loss": 3.3492,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.012985326580963512,
21
- "grad_norm": 1.5745655298233032,
22
- "learning_rate": 3.96e-06,
23
- "loss": 1.3837,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.019477989871445268,
28
- "grad_norm": 1.3241935968399048,
29
- "learning_rate": 5.9600000000000005e-06,
30
- "loss": 0.6673,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.025970653161927024,
35
- "grad_norm": 1.068557620048523,
36
- "learning_rate": 7.960000000000002e-06,
37
- "loss": 0.4426,
38
- "step": 200
39
- },
40
- {
41
- "epoch": 0.032463316452408776,
42
- "grad_norm": 1.2953921556472778,
43
- "learning_rate": 9.960000000000001e-06,
44
- "loss": 0.363,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.038955979742890535,
49
- "grad_norm": 1.4753903150558472,
50
- "learning_rate": 1.196e-05,
51
- "loss": 0.3397,
52
- "step": 300
53
- },
54
- {
55
- "epoch": 0.04544864303337229,
56
- "grad_norm": 1.245597243309021,
57
- "learning_rate": 1.396e-05,
58
- "loss": 0.3125,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.05194130632385405,
63
- "grad_norm": 1.3421887159347534,
64
- "learning_rate": 1.5960000000000003e-05,
65
- "loss": 0.3086,
66
- "step": 400
67
- },
68
- {
69
- "epoch": 0.0584339696143358,
70
- "grad_norm": 1.7852529287338257,
71
- "learning_rate": 1.796e-05,
72
- "loss": 0.2883,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.06492663290481755,
77
- "grad_norm": 1.086064100265503,
78
- "learning_rate": 1.9960000000000002e-05,
79
- "loss": 0.2947,
80
- "step": 500
81
- },
82
- {
83
- "epoch": 0.07141929619529931,
84
- "grad_norm": 0.8753414750099182,
85
- "learning_rate": 1.9974213919221156e-05,
86
- "loss": 0.2955,
87
- "step": 550
88
- },
89
- {
90
- "epoch": 0.07791195948578107,
91
- "grad_norm": 0.7875685095787048,
92
- "learning_rate": 1.9947901591895804e-05,
93
- "loss": 0.2773,
94
- "step": 600
95
- },
96
- {
97
- "epoch": 0.08440462277626282,
98
- "grad_norm": 1.1609506607055664,
99
- "learning_rate": 1.9921589264570452e-05,
100
- "loss": 0.2696,
101
- "step": 650
102
- },
103
- {
104
- "epoch": 0.09089728606674458,
105
- "grad_norm": 0.9162603616714478,
106
- "learning_rate": 1.98952769372451e-05,
107
- "loss": 0.2667,
108
- "step": 700
109
- },
110
- {
111
- "epoch": 0.09738994935722634,
112
- "grad_norm": 1.0398370027542114,
113
- "learning_rate": 1.986896460991975e-05,
114
- "loss": 0.277,
115
- "step": 750
116
- },
117
- {
118
- "epoch": 0.1038826126477081,
119
- "grad_norm": 0.9404798150062561,
120
- "learning_rate": 1.9842652282594397e-05,
121
- "loss": 0.2584,
122
- "step": 800
123
- },
124
- {
125
- "epoch": 0.11037527593818984,
126
- "grad_norm": 1.0353909730911255,
127
- "learning_rate": 1.9816339955269045e-05,
128
- "loss": 0.2533,
129
- "step": 850
130
- },
131
- {
132
- "epoch": 0.1168679392286716,
133
- "grad_norm": 1.485606074333191,
134
- "learning_rate": 1.9790027627943694e-05,
135
- "loss": 0.2506,
136
- "step": 900
137
- },
138
- {
139
- "epoch": 0.12336060251915336,
140
- "grad_norm": 0.9874151945114136,
141
- "learning_rate": 1.9763715300618342e-05,
142
- "loss": 0.2515,
143
- "step": 950
144
- },
145
- {
146
- "epoch": 0.1298532658096351,
147
- "grad_norm": 1.036007285118103,
148
- "learning_rate": 1.973740297329299e-05,
149
- "loss": 0.244,
150
- "step": 1000
151
- },
152
- {
153
- "epoch": 0.13634592910011686,
154
- "grad_norm": 0.946956217288971,
155
- "learning_rate": 1.9711090645967635e-05,
156
- "loss": 0.2518,
157
- "step": 1050
158
- },
159
- {
160
- "epoch": 0.14283859239059862,
161
- "grad_norm": 0.9096735119819641,
162
- "learning_rate": 1.9684778318642287e-05,
163
- "loss": 0.2543,
164
- "step": 1100
165
- },
166
- {
167
- "epoch": 0.14933125568108038,
168
- "grad_norm": 0.9371875524520874,
169
- "learning_rate": 1.965846599131693e-05,
170
- "loss": 0.2605,
171
- "step": 1150
172
- },
173
- {
174
- "epoch": 0.15582391897156214,
175
- "grad_norm": 0.8072580695152283,
176
- "learning_rate": 1.9632153663991583e-05,
177
- "loss": 0.2423,
178
- "step": 1200
179
- },
180
- {
181
- "epoch": 0.1623165822620439,
182
- "grad_norm": 0.6907565593719482,
183
- "learning_rate": 1.9605841336666228e-05,
184
- "loss": 0.2315,
185
- "step": 1250
186
- },
187
- {
188
- "epoch": 0.16880924555252563,
189
- "grad_norm": 0.6676374673843384,
190
- "learning_rate": 1.957952900934088e-05,
191
- "loss": 0.2363,
192
- "step": 1300
193
- },
194
- {
195
- "epoch": 0.1753019088430074,
196
- "grad_norm": 0.7281391024589539,
197
- "learning_rate": 1.9553216682015525e-05,
198
- "loss": 0.2399,
199
- "step": 1350
200
- },
201
- {
202
- "epoch": 0.18179457213348915,
203
- "grad_norm": 0.8105588555335999,
204
- "learning_rate": 1.9526904354690177e-05,
205
- "loss": 0.2448,
206
- "step": 1400
207
- },
208
- {
209
- "epoch": 0.1882872354239709,
210
- "grad_norm": 0.7466333508491516,
211
- "learning_rate": 1.950059202736482e-05,
212
- "loss": 0.2388,
213
- "step": 1450
214
- },
215
- {
216
- "epoch": 0.19477989871445267,
217
- "grad_norm": 0.5949985384941101,
218
- "learning_rate": 1.947427970003947e-05,
219
- "loss": 0.233,
220
- "step": 1500
221
- },
222
- {
223
- "epoch": 0.20127256200493443,
224
- "grad_norm": 0.7439960241317749,
225
- "learning_rate": 1.9447967372714118e-05,
226
- "loss": 0.2448,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 0.2077652252954162,
231
- "grad_norm": 0.9508784413337708,
232
- "learning_rate": 1.9421655045388766e-05,
233
- "loss": 0.2353,
234
- "step": 1600
235
- },
236
- {
237
- "epoch": 0.21425788858589795,
238
- "grad_norm": 0.8785332441329956,
239
- "learning_rate": 1.9395342718063415e-05,
240
- "loss": 0.2249,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 0.22075055187637968,
245
- "grad_norm": 0.7568134069442749,
246
- "learning_rate": 1.9369030390738063e-05,
247
- "loss": 0.2307,
248
- "step": 1700
249
- },
250
- {
251
- "epoch": 0.22724321516686144,
252
- "grad_norm": 0.6832641363143921,
253
- "learning_rate": 1.934271806341271e-05,
254
- "loss": 0.2263,
255
- "step": 1750
256
- },
257
- {
258
- "epoch": 0.2337358784573432,
259
- "grad_norm": 0.9721575379371643,
260
- "learning_rate": 1.931640573608736e-05,
261
- "loss": 0.2287,
262
- "step": 1800
263
- },
264
- {
265
- "epoch": 0.24022854174782496,
266
- "grad_norm": 0.620299220085144,
267
- "learning_rate": 1.9290093408762008e-05,
268
- "loss": 0.226,
269
- "step": 1850
270
- },
271
- {
272
- "epoch": 0.24672120503830672,
273
- "grad_norm": 0.6656680703163147,
274
- "learning_rate": 1.9263781081436656e-05,
275
- "loss": 0.2223,
276
- "step": 1900
277
- },
278
- {
279
- "epoch": 0.25321386832878845,
280
- "grad_norm": 0.8536450266838074,
281
- "learning_rate": 1.9237468754111304e-05,
282
- "loss": 0.2332,
283
- "step": 1950
284
- },
285
- {
286
- "epoch": 0.2597065316192702,
287
- "grad_norm": 0.9754176139831543,
288
- "learning_rate": 1.921115642678595e-05,
289
- "loss": 0.2211,
290
- "step": 2000
291
- },
292
- {
293
- "epoch": 0.26619919490975197,
294
- "grad_norm": 0.6166796684265137,
295
- "learning_rate": 1.9184844099460598e-05,
296
- "loss": 0.2201,
297
- "step": 2050
298
- },
299
- {
300
- "epoch": 0.2726918582002337,
301
- "grad_norm": 0.8182518482208252,
302
- "learning_rate": 1.9158531772135246e-05,
303
- "loss": 0.2213,
304
- "step": 2100
305
- },
306
- {
307
- "epoch": 0.2791845214907155,
308
- "grad_norm": 0.8245829939842224,
309
- "learning_rate": 1.9132219444809894e-05,
310
- "loss": 0.2208,
311
- "step": 2150
312
- },
313
- {
314
- "epoch": 0.28567718478119725,
315
- "grad_norm": 0.6423109173774719,
316
- "learning_rate": 1.9105907117484542e-05,
317
- "loss": 0.2308,
318
- "step": 2200
319
- },
320
- {
321
- "epoch": 0.292169848071679,
322
- "grad_norm": 0.7164100408554077,
323
- "learning_rate": 1.907959479015919e-05,
324
- "loss": 0.2193,
325
- "step": 2250
326
- },
327
- {
328
- "epoch": 0.29866251136216077,
329
- "grad_norm": 0.7910536527633667,
330
- "learning_rate": 1.905328246283384e-05,
331
- "loss": 0.2292,
332
- "step": 2300
333
- },
334
- {
335
- "epoch": 0.3051551746526425,
336
- "grad_norm": 0.9699934720993042,
337
- "learning_rate": 1.9026970135508487e-05,
338
- "loss": 0.2175,
339
- "step": 2350
340
- },
341
- {
342
- "epoch": 0.3116478379431243,
343
- "grad_norm": 1.0665416717529297,
344
- "learning_rate": 1.9000657808183136e-05,
345
- "loss": 0.2252,
346
- "step": 2400
347
- },
348
- {
349
- "epoch": 0.31814050123360604,
350
- "grad_norm": 0.9915699362754822,
351
- "learning_rate": 1.8974345480857784e-05,
352
- "loss": 0.215,
353
- "step": 2450
354
- },
355
- {
356
- "epoch": 0.3246331645240878,
357
- "grad_norm": 0.7021859288215637,
358
- "learning_rate": 1.8948033153532432e-05,
359
- "loss": 0.2123,
360
- "step": 2500
361
- },
362
- {
363
- "epoch": 0.33112582781456956,
364
- "grad_norm": 0.6402600407600403,
365
- "learning_rate": 1.892172082620708e-05,
366
- "loss": 0.2176,
367
- "step": 2550
368
- },
369
- {
370
- "epoch": 0.33761849110505127,
371
- "grad_norm": 0.9925078749656677,
372
- "learning_rate": 1.8895408498881725e-05,
373
- "loss": 0.223,
374
- "step": 2600
375
- },
376
- {
377
- "epoch": 0.344111154395533,
378
- "grad_norm": 0.5188687443733215,
379
- "learning_rate": 1.8869096171556377e-05,
380
- "loss": 0.2174,
381
- "step": 2650
382
- },
383
- {
384
- "epoch": 0.3506038176860148,
385
- "grad_norm": 0.7427687644958496,
386
- "learning_rate": 1.8842783844231022e-05,
387
- "loss": 0.2181,
388
- "step": 2700
389
- },
390
- {
391
- "epoch": 0.35709648097649654,
392
- "grad_norm": 0.6055501103401184,
393
- "learning_rate": 1.8816471516905674e-05,
394
- "loss": 0.2097,
395
- "step": 2750
396
- },
397
- {
398
- "epoch": 0.3635891442669783,
399
- "grad_norm": 0.6529932022094727,
400
- "learning_rate": 1.879015918958032e-05,
401
- "loss": 0.2133,
402
- "step": 2800
403
- },
404
- {
405
- "epoch": 0.37008180755746006,
406
- "grad_norm": 0.9499914646148682,
407
- "learning_rate": 1.8763846862254967e-05,
408
- "loss": 0.2189,
409
- "step": 2850
410
- },
411
- {
412
- "epoch": 0.3765744708479418,
413
- "grad_norm": 0.8217771053314209,
414
- "learning_rate": 1.8737534534929615e-05,
415
- "loss": 0.2172,
416
- "step": 2900
417
- },
418
- {
419
- "epoch": 0.3830671341384236,
420
- "grad_norm": 1.031895399093628,
421
- "learning_rate": 1.8711222207604263e-05,
422
- "loss": 0.2171,
423
- "step": 2950
424
- },
425
- {
426
- "epoch": 0.38955979742890534,
427
- "grad_norm": 0.7179540395736694,
428
- "learning_rate": 1.8684909880278912e-05,
429
- "loss": 0.2179,
430
- "step": 3000
431
- },
432
- {
433
- "epoch": 0.3960524607193871,
434
- "grad_norm": 0.8115535974502563,
435
- "learning_rate": 1.865859755295356e-05,
436
- "loss": 0.2131,
437
- "step": 3050
438
- },
439
- {
440
- "epoch": 0.40254512400986886,
441
- "grad_norm": 0.7659513354301453,
442
- "learning_rate": 1.863228522562821e-05,
443
- "loss": 0.2189,
444
- "step": 3100
445
- },
446
- {
447
- "epoch": 0.4090377873003506,
448
- "grad_norm": 0.8645811080932617,
449
- "learning_rate": 1.8605972898302857e-05,
450
- "loss": 0.2453,
451
- "step": 3150
452
- },
453
- {
454
- "epoch": 0.4155304505908324,
455
- "grad_norm": 0.8571991920471191,
456
- "learning_rate": 1.8579660570977505e-05,
457
- "loss": 0.2076,
458
- "step": 3200
459
- },
460
- {
461
- "epoch": 0.42202311388131414,
462
- "grad_norm": 1.222774863243103,
463
- "learning_rate": 1.8553348243652153e-05,
464
- "loss": 0.2094,
465
- "step": 3250
466
- },
467
- {
468
- "epoch": 0.4285157771717959,
469
- "grad_norm": 0.9631436467170715,
470
- "learning_rate": 1.85270359163268e-05,
471
- "loss": 0.2123,
472
- "step": 3300
473
- },
474
- {
475
- "epoch": 0.4350084404622776,
476
- "grad_norm": 0.5590534806251526,
477
- "learning_rate": 1.8500723589001446e-05,
478
- "loss": 0.2028,
479
- "step": 3350
480
- },
481
- {
482
- "epoch": 0.44150110375275936,
483
- "grad_norm": 0.7615678310394287,
484
- "learning_rate": 1.8474411261676098e-05,
485
- "loss": 0.2125,
486
- "step": 3400
487
- },
488
- {
489
- "epoch": 0.4479937670432411,
490
- "grad_norm": 0.7124540209770203,
491
- "learning_rate": 1.8448098934350743e-05,
492
- "loss": 0.206,
493
- "step": 3450
494
- },
495
- {
496
- "epoch": 0.4544864303337229,
497
- "grad_norm": 0.8492136001586914,
498
- "learning_rate": 1.8421786607025395e-05,
499
- "loss": 0.2054,
500
- "step": 3500
501
- },
502
- {
503
- "epoch": 0.46097909362420464,
504
- "grad_norm": 1.0388613939285278,
505
- "learning_rate": 1.839547427970004e-05,
506
- "loss": 0.2117,
507
- "step": 3550
508
- },
509
- {
510
- "epoch": 0.4674717569146864,
511
- "grad_norm": 0.8087108731269836,
512
- "learning_rate": 1.836916195237469e-05,
513
- "loss": 0.2192,
514
- "step": 3600
515
- },
516
- {
517
- "epoch": 0.47396442020516816,
518
- "grad_norm": 0.7446919679641724,
519
- "learning_rate": 1.8342849625049336e-05,
520
- "loss": 0.2112,
521
- "step": 3650
522
- },
523
- {
524
- "epoch": 0.4804570834956499,
525
- "grad_norm": 0.6669062972068787,
526
- "learning_rate": 1.8316537297723984e-05,
527
- "loss": 0.2167,
528
- "step": 3700
529
- },
530
- {
531
- "epoch": 0.4869497467861317,
532
- "grad_norm": 0.519498884677887,
533
- "learning_rate": 1.8290224970398633e-05,
534
- "loss": 0.2069,
535
- "step": 3750
536
- },
537
- {
538
- "epoch": 0.49344241007661344,
539
- "grad_norm": 0.5735522508621216,
540
- "learning_rate": 1.826391264307328e-05,
541
- "loss": 0.2102,
542
- "step": 3800
543
- },
544
- {
545
- "epoch": 0.4999350733670952,
546
- "grad_norm": 0.6683741211891174,
547
- "learning_rate": 1.823760031574793e-05,
548
- "loss": 0.2141,
549
- "step": 3850
550
- },
551
- {
552
- "epoch": 0.5064277366575769,
553
- "grad_norm": 0.6239739060401917,
554
- "learning_rate": 1.8211287988422578e-05,
555
- "loss": 0.1975,
556
- "step": 3900
557
- },
558
- {
559
- "epoch": 0.5129203999480587,
560
- "grad_norm": 0.6327545046806335,
561
- "learning_rate": 1.8184975661097226e-05,
562
- "loss": 0.2165,
563
- "step": 3950
564
- },
565
- {
566
- "epoch": 0.5194130632385404,
567
- "grad_norm": 0.6098693609237671,
568
- "learning_rate": 1.8158663333771874e-05,
569
- "loss": 0.2034,
570
- "step": 4000
571
- },
572
- {
573
- "epoch": 0.5259057265290222,
574
- "grad_norm": 0.7356197237968445,
575
- "learning_rate": 1.8132351006446522e-05,
576
- "loss": 0.2041,
577
- "step": 4050
578
- },
579
- {
580
- "epoch": 0.5323983898195039,
581
- "grad_norm": 0.7616349458694458,
582
- "learning_rate": 1.810603867912117e-05,
583
- "loss": 0.2121,
584
- "step": 4100
585
- },
586
- {
587
- "epoch": 0.5388910531099858,
588
- "grad_norm": 0.6561347842216492,
589
- "learning_rate": 1.807972635179582e-05,
590
- "loss": 0.2002,
591
- "step": 4150
592
- },
593
- {
594
- "epoch": 0.5453837164004675,
595
- "grad_norm": 0.6744963526725769,
596
- "learning_rate": 1.8053414024470467e-05,
597
- "loss": 0.2029,
598
- "step": 4200
599
- },
600
- {
601
- "epoch": 0.5518763796909493,
602
- "grad_norm": 0.7836194038391113,
603
- "learning_rate": 1.8027101697145112e-05,
604
- "loss": 0.2055,
605
- "step": 4250
606
- },
607
- {
608
- "epoch": 0.558369042981431,
609
- "grad_norm": 0.6983553767204285,
610
- "learning_rate": 1.800078936981976e-05,
611
- "loss": 0.2139,
612
- "step": 4300
613
- },
614
- {
615
- "epoch": 0.5648617062719128,
616
- "grad_norm": 0.6315485239028931,
617
- "learning_rate": 1.797447704249441e-05,
618
- "loss": 0.1978,
619
- "step": 4350
620
- },
621
- {
622
- "epoch": 0.5713543695623945,
623
- "grad_norm": 0.5928835868835449,
624
- "learning_rate": 1.7948164715169057e-05,
625
- "loss": 0.2027,
626
- "step": 4400
627
- },
628
- {
629
- "epoch": 0.5778470328528762,
630
- "grad_norm": 0.8622831702232361,
631
- "learning_rate": 1.7921852387843705e-05,
632
- "loss": 0.2104,
633
- "step": 4450
634
- },
635
- {
636
- "epoch": 0.584339696143358,
637
- "grad_norm": 0.7967308759689331,
638
- "learning_rate": 1.7895540060518354e-05,
639
- "loss": 0.2328,
640
- "step": 4500
641
- },
642
- {
643
- "epoch": 0.5908323594338397,
644
- "grad_norm": 0.8289620280265808,
645
- "learning_rate": 1.7869227733193002e-05,
646
- "loss": 0.2099,
647
- "step": 4550
648
- },
649
- {
650
- "epoch": 0.5973250227243215,
651
- "grad_norm": 0.7185404300689697,
652
- "learning_rate": 1.784291540586765e-05,
653
- "loss": 0.2045,
654
- "step": 4600
655
- },
656
- {
657
- "epoch": 0.6038176860148032,
658
- "grad_norm": 0.6304630637168884,
659
- "learning_rate": 1.78166030785423e-05,
660
- "loss": 0.2055,
661
- "step": 4650
662
- },
663
- {
664
- "epoch": 0.610310349305285,
665
- "grad_norm": 0.614983856678009,
666
- "learning_rate": 1.7790290751216947e-05,
667
- "loss": 0.1979,
668
- "step": 4700
669
- },
670
- {
671
- "epoch": 0.6168030125957668,
672
- "grad_norm": 0.568352460861206,
673
- "learning_rate": 1.7763978423891595e-05,
674
- "loss": 0.197,
675
- "step": 4750
676
- },
677
- {
678
- "epoch": 0.6232956758862486,
679
- "grad_norm": 0.8283701539039612,
680
- "learning_rate": 1.773766609656624e-05,
681
- "loss": 0.1986,
682
- "step": 4800
683
- },
684
- {
685
- "epoch": 0.6297883391767303,
686
- "grad_norm": 0.888967752456665,
687
- "learning_rate": 1.7711353769240892e-05,
688
- "loss": 0.2271,
689
- "step": 4850
690
- },
691
- {
692
- "epoch": 0.6362810024672121,
693
- "grad_norm": 0.5292450189590454,
694
- "learning_rate": 1.7685041441915537e-05,
695
- "loss": 0.1971,
696
- "step": 4900
697
- },
698
- {
699
- "epoch": 0.6427736657576938,
700
- "grad_norm": 0.5274189114570618,
701
- "learning_rate": 1.765872911459019e-05,
702
- "loss": 0.1963,
703
- "step": 4950
704
- },
705
- {
706
- "epoch": 0.6492663290481756,
707
- "grad_norm": 0.5699307322502136,
708
- "learning_rate": 1.7632416787264833e-05,
709
- "loss": 0.1984,
710
- "step": 5000
711
- },
712
- {
713
- "epoch": 0.6557589923386573,
714
- "grad_norm": 0.8022367358207703,
715
- "learning_rate": 1.7606104459939485e-05,
716
- "loss": 0.2053,
717
- "step": 5050
718
- },
719
- {
720
- "epoch": 0.6622516556291391,
721
- "grad_norm": 0.6432430148124695,
722
- "learning_rate": 1.757979213261413e-05,
723
- "loss": 0.1991,
724
- "step": 5100
725
- },
726
- {
727
- "epoch": 0.6687443189196208,
728
- "grad_norm": 0.7671304941177368,
729
- "learning_rate": 1.755347980528878e-05,
730
- "loss": 0.2064,
731
- "step": 5150
732
- },
733
- {
734
- "epoch": 0.6752369822101025,
735
- "grad_norm": 0.5107030272483826,
736
- "learning_rate": 1.7527167477963426e-05,
737
- "loss": 0.2049,
738
- "step": 5200
739
- },
740
- {
741
- "epoch": 0.6817296455005843,
742
- "grad_norm": 0.7239235639572144,
743
- "learning_rate": 1.7500855150638075e-05,
744
- "loss": 0.2001,
745
- "step": 5250
746
- },
747
- {
748
- "epoch": 0.688222308791066,
749
- "grad_norm": 0.6116129755973816,
750
- "learning_rate": 1.7474542823312723e-05,
751
- "loss": 0.2193,
752
- "step": 5300
753
- },
754
- {
755
- "epoch": 0.6947149720815479,
756
- "grad_norm": 0.5425911545753479,
757
- "learning_rate": 1.744823049598737e-05,
758
- "loss": 0.2001,
759
- "step": 5350
760
- },
761
- {
762
- "epoch": 0.7012076353720296,
763
- "grad_norm": 0.6464748382568359,
764
- "learning_rate": 1.742191816866202e-05,
765
- "loss": 0.1963,
766
- "step": 5400
767
- },
768
- {
769
- "epoch": 0.7077002986625114,
770
- "grad_norm": 0.8812252879142761,
771
- "learning_rate": 1.7395605841336668e-05,
772
- "loss": 0.196,
773
- "step": 5450
774
- },
775
- {
776
- "epoch": 0.7141929619529931,
777
- "grad_norm": 0.6928241848945618,
778
- "learning_rate": 1.7369293514011316e-05,
779
- "loss": 0.2011,
780
- "step": 5500
781
- },
782
- {
783
- "epoch": 0.7206856252434749,
784
- "grad_norm": 0.6892450451850891,
785
- "learning_rate": 1.7342981186685965e-05,
786
- "loss": 0.1949,
787
- "step": 5550
788
- },
789
- {
790
- "epoch": 0.7271782885339566,
791
- "grad_norm": 0.4782065749168396,
792
- "learning_rate": 1.7316668859360613e-05,
793
- "loss": 0.194,
794
- "step": 5600
795
- },
796
- {
797
- "epoch": 0.7336709518244384,
798
- "grad_norm": 0.6438505053520203,
799
- "learning_rate": 1.729035653203526e-05,
800
- "loss": 0.1967,
801
- "step": 5650
802
- },
803
- {
804
- "epoch": 0.7401636151149201,
805
- "grad_norm": 0.5797818899154663,
806
- "learning_rate": 1.726404420470991e-05,
807
- "loss": 0.2185,
808
- "step": 5700
809
- },
810
- {
811
- "epoch": 0.7466562784054019,
812
- "grad_norm": 0.6884586811065674,
813
- "learning_rate": 1.7237731877384554e-05,
814
- "loss": 0.1977,
815
- "step": 5750
816
- },
817
- {
818
- "epoch": 0.7531489416958836,
819
- "grad_norm": 0.648883581161499,
820
- "learning_rate": 1.7211419550059206e-05,
821
- "loss": 0.1964,
822
- "step": 5800
823
- },
824
- {
825
- "epoch": 0.7596416049863655,
826
- "grad_norm": 0.6440086960792542,
827
- "learning_rate": 1.718510722273385e-05,
828
- "loss": 0.2014,
829
- "step": 5850
830
- },
831
- {
832
- "epoch": 0.7661342682768472,
833
- "grad_norm": 0.5619300007820129,
834
- "learning_rate": 1.71587948954085e-05,
835
- "loss": 0.1909,
836
- "step": 5900
837
- },
838
- {
839
- "epoch": 0.7726269315673289,
840
- "grad_norm": 0.6859204769134521,
841
- "learning_rate": 1.7132482568083147e-05,
842
- "loss": 0.2049,
843
- "step": 5950
844
- },
845
- {
846
- "epoch": 0.7791195948578107,
847
- "grad_norm": 0.6132592558860779,
848
- "learning_rate": 1.7106170240757796e-05,
849
- "loss": 0.2,
850
- "step": 6000
851
- },
852
- {
853
- "epoch": 0.7856122581482924,
854
- "grad_norm": 0.7050901055335999,
855
- "learning_rate": 1.7079857913432444e-05,
856
- "loss": 0.189,
857
- "step": 6050
858
- },
859
- {
860
- "epoch": 0.7921049214387742,
861
- "grad_norm": 0.6752614974975586,
862
- "learning_rate": 1.7053545586107092e-05,
863
- "loss": 0.2248,
864
- "step": 6100
865
- },
866
- {
867
- "epoch": 0.7985975847292559,
868
- "grad_norm": 0.7186923623085022,
869
- "learning_rate": 1.702723325878174e-05,
870
- "loss": 0.1903,
871
- "step": 6150
872
- },
873
- {
874
- "epoch": 0.8050902480197377,
875
- "grad_norm": 0.5991400480270386,
876
- "learning_rate": 1.700092093145639e-05,
877
- "loss": 0.197,
878
- "step": 6200
879
- },
880
- {
881
- "epoch": 0.8115829113102194,
882
- "grad_norm": 0.9522245526313782,
883
- "learning_rate": 1.6974608604131037e-05,
884
- "loss": 0.1962,
885
- "step": 6250
886
- },
887
- {
888
- "epoch": 0.8180755746007012,
889
- "grad_norm": 0.8645381927490234,
890
- "learning_rate": 1.6948296276805686e-05,
891
- "loss": 0.1901,
892
- "step": 6300
893
- },
894
- {
895
- "epoch": 0.8245682378911829,
896
- "grad_norm": 0.5243034958839417,
897
- "learning_rate": 1.6921983949480334e-05,
898
- "loss": 0.194,
899
- "step": 6350
900
- },
901
- {
902
- "epoch": 0.8310609011816648,
903
- "grad_norm": 0.5842151641845703,
904
- "learning_rate": 1.6895671622154982e-05,
905
- "loss": 0.1917,
906
- "step": 6400
907
- },
908
- {
909
- "epoch": 0.8375535644721465,
910
- "grad_norm": 0.6111485362052917,
911
- "learning_rate": 1.6869359294829627e-05,
912
- "loss": 0.2042,
913
- "step": 6450
914
- },
915
- {
916
- "epoch": 0.8440462277626283,
917
- "grad_norm": 0.6515288949012756,
918
- "learning_rate": 1.684304696750428e-05,
919
- "loss": 0.2032,
920
- "step": 6500
921
- },
922
- {
923
- "epoch": 0.85053889105311,
924
- "grad_norm": 0.7596396207809448,
925
- "learning_rate": 1.6816734640178924e-05,
926
- "loss": 0.193,
927
- "step": 6550
928
- }
929
- ],
930
- "logging_steps": 50,
931
- "max_steps": 38505,
932
- "num_input_tokens_seen": 0,
933
- "num_train_epochs": 5,
934
- "save_steps": 386,
935
- "stateful_callbacks": {
936
- "TrainerControl": {
937
- "args": {
938
- "should_epoch_stop": false,
939
- "should_evaluate": false,
940
- "should_log": false,
941
- "should_save": true,
942
- "should_training_stop": false
943
- },
944
- "attributes": {}
945
- }
946
- },
947
- "total_flos": 5.487557747765412e+17,
948
- "train_batch_size": 40,
949
- "trial_name": null,
950
- "trial_params": null
951
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6948/added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[MASK]": 128000
3
- }
 
 
 
 
checkpoint-6948/config.json DELETED
@@ -1,69 +0,0 @@
1
- {
2
- "architectures": [
3
- "DebertaV2ForTokenClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "hidden_act": "gelu",
7
- "hidden_dropout_prob": 0.1,
8
- "hidden_size": 768,
9
- "id2label": {
10
- "0": "O",
11
- "1": "B-CHA",
12
- "2": "I-CHA",
13
- "3": "B-LOC",
14
- "4": "I-LOC",
15
- "5": "B-FAC",
16
- "6": "I-FAC",
17
- "7": "B-OBJ",
18
- "8": "I-OBJ",
19
- "9": "B-EVT",
20
- "10": "I-EVT",
21
- "11": "B-ORG",
22
- "12": "I-ORG",
23
- "13": "B-MISC",
24
- "14": "I-MISC"
25
- },
26
- "initializer_range": 0.02,
27
- "intermediate_size": 3072,
28
- "label2id": {
29
- "B-CHA": 1,
30
- "B-EVT": 9,
31
- "B-FAC": 5,
32
- "B-LOC": 3,
33
- "B-MISC": 13,
34
- "B-OBJ": 7,
35
- "B-ORG": 11,
36
- "I-CHA": 2,
37
- "I-EVT": 10,
38
- "I-FAC": 6,
39
- "I-LOC": 4,
40
- "I-MISC": 14,
41
- "I-OBJ": 8,
42
- "I-ORG": 12,
43
- "O": 0
44
- },
45
- "layer_norm_eps": 1e-07,
46
- "legacy": true,
47
- "max_position_embeddings": 512,
48
- "max_relative_positions": -1,
49
- "model_type": "deberta-v2",
50
- "norm_rel_ebd": "layer_norm",
51
- "num_attention_heads": 12,
52
- "num_hidden_layers": 12,
53
- "pad_token_id": 0,
54
- "pooler_dropout": 0,
55
- "pooler_hidden_act": "gelu",
56
- "pooler_hidden_size": 768,
57
- "pos_att_type": [
58
- "p2c",
59
- "c2p"
60
- ],
61
- "position_biased_input": false,
62
- "position_buckets": 256,
63
- "relative_attention": true,
64
- "share_att_key": true,
65
- "torch_dtype": "float32",
66
- "transformers_version": "4.55.4",
67
- "type_vocab_size": 0,
68
- "vocab_size": 128100
69
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6948/special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "[CLS]",
3
- "cls_token": "[CLS]",
4
- "eos_token": "[SEP]",
5
- "mask_token": "[MASK]",
6
- "pad_token": "[PAD]",
7
- "sep_token": "[SEP]",
8
- "unk_token": {
9
- "content": "[UNK]",
10
- "lstrip": false,
11
- "normalized": true,
12
- "rstrip": false,
13
- "single_word": false
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6948/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-6948/tokenizer_config.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "[CLS]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "[SEP]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "[UNK]",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "128000": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "bos_token": "[CLS]",
45
- "clean_up_tokenization_spaces": false,
46
- "cls_token": "[CLS]",
47
- "do_lower_case": false,
48
- "eos_token": "[SEP]",
49
- "extra_special_tokens": {},
50
- "mask_token": "[MASK]",
51
- "model_max_length": 1000000000000000019884624838656,
52
- "pad_token": "[PAD]",
53
- "sep_token": "[SEP]",
54
- "sp_model_kwargs": {},
55
- "split_by_punct": false,
56
- "tokenizer_class": "DebertaV2Tokenizer",
57
- "unk_token": "[UNK]",
58
- "vocab_type": "spm"
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-6948/trainer_state.json DELETED
@@ -1,1000 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.9022204908453447,
6
- "eval_steps": 500,
7
- "global_step": 6948,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.006492663290481756,
14
- "grad_norm": 23.6250057220459,
15
- "learning_rate": 1.9600000000000003e-06,
16
- "loss": 3.3492,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.012985326580963512,
21
- "grad_norm": 1.5745655298233032,
22
- "learning_rate": 3.96e-06,
23
- "loss": 1.3837,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.019477989871445268,
28
- "grad_norm": 1.3241935968399048,
29
- "learning_rate": 5.9600000000000005e-06,
30
- "loss": 0.6673,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.025970653161927024,
35
- "grad_norm": 1.068557620048523,
36
- "learning_rate": 7.960000000000002e-06,
37
- "loss": 0.4426,
38
- "step": 200
39
- },
40
- {
41
- "epoch": 0.032463316452408776,
42
- "grad_norm": 1.2953921556472778,
43
- "learning_rate": 9.960000000000001e-06,
44
- "loss": 0.363,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.038955979742890535,
49
- "grad_norm": 1.4753903150558472,
50
- "learning_rate": 1.196e-05,
51
- "loss": 0.3397,
52
- "step": 300
53
- },
54
- {
55
- "epoch": 0.04544864303337229,
56
- "grad_norm": 1.245597243309021,
57
- "learning_rate": 1.396e-05,
58
- "loss": 0.3125,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.05194130632385405,
63
- "grad_norm": 1.3421887159347534,
64
- "learning_rate": 1.5960000000000003e-05,
65
- "loss": 0.3086,
66
- "step": 400
67
- },
68
- {
69
- "epoch": 0.0584339696143358,
70
- "grad_norm": 1.7852529287338257,
71
- "learning_rate": 1.796e-05,
72
- "loss": 0.2883,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.06492663290481755,
77
- "grad_norm": 1.086064100265503,
78
- "learning_rate": 1.9960000000000002e-05,
79
- "loss": 0.2947,
80
- "step": 500
81
- },
82
- {
83
- "epoch": 0.07141929619529931,
84
- "grad_norm": 0.8753414750099182,
85
- "learning_rate": 1.9974213919221156e-05,
86
- "loss": 0.2955,
87
- "step": 550
88
- },
89
- {
90
- "epoch": 0.07791195948578107,
91
- "grad_norm": 0.7875685095787048,
92
- "learning_rate": 1.9947901591895804e-05,
93
- "loss": 0.2773,
94
- "step": 600
95
- },
96
- {
97
- "epoch": 0.08440462277626282,
98
- "grad_norm": 1.1609506607055664,
99
- "learning_rate": 1.9921589264570452e-05,
100
- "loss": 0.2696,
101
- "step": 650
102
- },
103
- {
104
- "epoch": 0.09089728606674458,
105
- "grad_norm": 0.9162603616714478,
106
- "learning_rate": 1.98952769372451e-05,
107
- "loss": 0.2667,
108
- "step": 700
109
- },
110
- {
111
- "epoch": 0.09738994935722634,
112
- "grad_norm": 1.0398370027542114,
113
- "learning_rate": 1.986896460991975e-05,
114
- "loss": 0.277,
115
- "step": 750
116
- },
117
- {
118
- "epoch": 0.1038826126477081,
119
- "grad_norm": 0.9404798150062561,
120
- "learning_rate": 1.9842652282594397e-05,
121
- "loss": 0.2584,
122
- "step": 800
123
- },
124
- {
125
- "epoch": 0.11037527593818984,
126
- "grad_norm": 1.0353909730911255,
127
- "learning_rate": 1.9816339955269045e-05,
128
- "loss": 0.2533,
129
- "step": 850
130
- },
131
- {
132
- "epoch": 0.1168679392286716,
133
- "grad_norm": 1.485606074333191,
134
- "learning_rate": 1.9790027627943694e-05,
135
- "loss": 0.2506,
136
- "step": 900
137
- },
138
- {
139
- "epoch": 0.12336060251915336,
140
- "grad_norm": 0.9874151945114136,
141
- "learning_rate": 1.9763715300618342e-05,
142
- "loss": 0.2515,
143
- "step": 950
144
- },
145
- {
146
- "epoch": 0.1298532658096351,
147
- "grad_norm": 1.036007285118103,
148
- "learning_rate": 1.973740297329299e-05,
149
- "loss": 0.244,
150
- "step": 1000
151
- },
152
- {
153
- "epoch": 0.13634592910011686,
154
- "grad_norm": 0.946956217288971,
155
- "learning_rate": 1.9711090645967635e-05,
156
- "loss": 0.2518,
157
- "step": 1050
158
- },
159
- {
160
- "epoch": 0.14283859239059862,
161
- "grad_norm": 0.9096735119819641,
162
- "learning_rate": 1.9684778318642287e-05,
163
- "loss": 0.2543,
164
- "step": 1100
165
- },
166
- {
167
- "epoch": 0.14933125568108038,
168
- "grad_norm": 0.9371875524520874,
169
- "learning_rate": 1.965846599131693e-05,
170
- "loss": 0.2605,
171
- "step": 1150
172
- },
173
- {
174
- "epoch": 0.15582391897156214,
175
- "grad_norm": 0.8072580695152283,
176
- "learning_rate": 1.9632153663991583e-05,
177
- "loss": 0.2423,
178
- "step": 1200
179
- },
180
- {
181
- "epoch": 0.1623165822620439,
182
- "grad_norm": 0.6907565593719482,
183
- "learning_rate": 1.9605841336666228e-05,
184
- "loss": 0.2315,
185
- "step": 1250
186
- },
187
- {
188
- "epoch": 0.16880924555252563,
189
- "grad_norm": 0.6676374673843384,
190
- "learning_rate": 1.957952900934088e-05,
191
- "loss": 0.2363,
192
- "step": 1300
193
- },
194
- {
195
- "epoch": 0.1753019088430074,
196
- "grad_norm": 0.7281391024589539,
197
- "learning_rate": 1.9553216682015525e-05,
198
- "loss": 0.2399,
199
- "step": 1350
200
- },
201
- {
202
- "epoch": 0.18179457213348915,
203
- "grad_norm": 0.8105588555335999,
204
- "learning_rate": 1.9526904354690177e-05,
205
- "loss": 0.2448,
206
- "step": 1400
207
- },
208
- {
209
- "epoch": 0.1882872354239709,
210
- "grad_norm": 0.7466333508491516,
211
- "learning_rate": 1.950059202736482e-05,
212
- "loss": 0.2388,
213
- "step": 1450
214
- },
215
- {
216
- "epoch": 0.19477989871445267,
217
- "grad_norm": 0.5949985384941101,
218
- "learning_rate": 1.947427970003947e-05,
219
- "loss": 0.233,
220
- "step": 1500
221
- },
222
- {
223
- "epoch": 0.20127256200493443,
224
- "grad_norm": 0.7439960241317749,
225
- "learning_rate": 1.9447967372714118e-05,
226
- "loss": 0.2448,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 0.2077652252954162,
231
- "grad_norm": 0.9508784413337708,
232
- "learning_rate": 1.9421655045388766e-05,
233
- "loss": 0.2353,
234
- "step": 1600
235
- },
236
- {
237
- "epoch": 0.21425788858589795,
238
- "grad_norm": 0.8785332441329956,
239
- "learning_rate": 1.9395342718063415e-05,
240
- "loss": 0.2249,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 0.22075055187637968,
245
- "grad_norm": 0.7568134069442749,
246
- "learning_rate": 1.9369030390738063e-05,
247
- "loss": 0.2307,
248
- "step": 1700
249
- },
250
- {
251
- "epoch": 0.22724321516686144,
252
- "grad_norm": 0.6832641363143921,
253
- "learning_rate": 1.934271806341271e-05,
254
- "loss": 0.2263,
255
- "step": 1750
256
- },
257
- {
258
- "epoch": 0.2337358784573432,
259
- "grad_norm": 0.9721575379371643,
260
- "learning_rate": 1.931640573608736e-05,
261
- "loss": 0.2287,
262
- "step": 1800
263
- },
264
- {
265
- "epoch": 0.24022854174782496,
266
- "grad_norm": 0.620299220085144,
267
- "learning_rate": 1.9290093408762008e-05,
268
- "loss": 0.226,
269
- "step": 1850
270
- },
271
- {
272
- "epoch": 0.24672120503830672,
273
- "grad_norm": 0.6656680703163147,
274
- "learning_rate": 1.9263781081436656e-05,
275
- "loss": 0.2223,
276
- "step": 1900
277
- },
278
- {
279
- "epoch": 0.25321386832878845,
280
- "grad_norm": 0.8536450266838074,
281
- "learning_rate": 1.9237468754111304e-05,
282
- "loss": 0.2332,
283
- "step": 1950
284
- },
285
- {
286
- "epoch": 0.2597065316192702,
287
- "grad_norm": 0.9754176139831543,
288
- "learning_rate": 1.921115642678595e-05,
289
- "loss": 0.2211,
290
- "step": 2000
291
- },
292
- {
293
- "epoch": 0.26619919490975197,
294
- "grad_norm": 0.6166796684265137,
295
- "learning_rate": 1.9184844099460598e-05,
296
- "loss": 0.2201,
297
- "step": 2050
298
- },
299
- {
300
- "epoch": 0.2726918582002337,
301
- "grad_norm": 0.8182518482208252,
302
- "learning_rate": 1.9158531772135246e-05,
303
- "loss": 0.2213,
304
- "step": 2100
305
- },
306
- {
307
- "epoch": 0.2791845214907155,
308
- "grad_norm": 0.8245829939842224,
309
- "learning_rate": 1.9132219444809894e-05,
310
- "loss": 0.2208,
311
- "step": 2150
312
- },
313
- {
314
- "epoch": 0.28567718478119725,
315
- "grad_norm": 0.6423109173774719,
316
- "learning_rate": 1.9105907117484542e-05,
317
- "loss": 0.2308,
318
- "step": 2200
319
- },
320
- {
321
- "epoch": 0.292169848071679,
322
- "grad_norm": 0.7164100408554077,
323
- "learning_rate": 1.907959479015919e-05,
324
- "loss": 0.2193,
325
- "step": 2250
326
- },
327
- {
328
- "epoch": 0.29866251136216077,
329
- "grad_norm": 0.7910536527633667,
330
- "learning_rate": 1.905328246283384e-05,
331
- "loss": 0.2292,
332
- "step": 2300
333
- },
334
- {
335
- "epoch": 0.3051551746526425,
336
- "grad_norm": 0.9699934720993042,
337
- "learning_rate": 1.9026970135508487e-05,
338
- "loss": 0.2175,
339
- "step": 2350
340
- },
341
- {
342
- "epoch": 0.3116478379431243,
343
- "grad_norm": 1.0665416717529297,
344
- "learning_rate": 1.9000657808183136e-05,
345
- "loss": 0.2252,
346
- "step": 2400
347
- },
348
- {
349
- "epoch": 0.31814050123360604,
350
- "grad_norm": 0.9915699362754822,
351
- "learning_rate": 1.8974345480857784e-05,
352
- "loss": 0.215,
353
- "step": 2450
354
- },
355
- {
356
- "epoch": 0.3246331645240878,
357
- "grad_norm": 0.7021859288215637,
358
- "learning_rate": 1.8948033153532432e-05,
359
- "loss": 0.2123,
360
- "step": 2500
361
- },
362
- {
363
- "epoch": 0.33112582781456956,
364
- "grad_norm": 0.6402600407600403,
365
- "learning_rate": 1.892172082620708e-05,
366
- "loss": 0.2176,
367
- "step": 2550
368
- },
369
- {
370
- "epoch": 0.33761849110505127,
371
- "grad_norm": 0.9925078749656677,
372
- "learning_rate": 1.8895408498881725e-05,
373
- "loss": 0.223,
374
- "step": 2600
375
- },
376
- {
377
- "epoch": 0.344111154395533,
378
- "grad_norm": 0.5188687443733215,
379
- "learning_rate": 1.8869096171556377e-05,
380
- "loss": 0.2174,
381
- "step": 2650
382
- },
383
- {
384
- "epoch": 0.3506038176860148,
385
- "grad_norm": 0.7427687644958496,
386
- "learning_rate": 1.8842783844231022e-05,
387
- "loss": 0.2181,
388
- "step": 2700
389
- },
390
- {
391
- "epoch": 0.35709648097649654,
392
- "grad_norm": 0.6055501103401184,
393
- "learning_rate": 1.8816471516905674e-05,
394
- "loss": 0.2097,
395
- "step": 2750
396
- },
397
- {
398
- "epoch": 0.3635891442669783,
399
- "grad_norm": 0.6529932022094727,
400
- "learning_rate": 1.879015918958032e-05,
401
- "loss": 0.2133,
402
- "step": 2800
403
- },
404
- {
405
- "epoch": 0.37008180755746006,
406
- "grad_norm": 0.9499914646148682,
407
- "learning_rate": 1.8763846862254967e-05,
408
- "loss": 0.2189,
409
- "step": 2850
410
- },
411
- {
412
- "epoch": 0.3765744708479418,
413
- "grad_norm": 0.8217771053314209,
414
- "learning_rate": 1.8737534534929615e-05,
415
- "loss": 0.2172,
416
- "step": 2900
417
- },
418
- {
419
- "epoch": 0.3830671341384236,
420
- "grad_norm": 1.031895399093628,
421
- "learning_rate": 1.8711222207604263e-05,
422
- "loss": 0.2171,
423
- "step": 2950
424
- },
425
- {
426
- "epoch": 0.38955979742890534,
427
- "grad_norm": 0.7179540395736694,
428
- "learning_rate": 1.8684909880278912e-05,
429
- "loss": 0.2179,
430
- "step": 3000
431
- },
432
- {
433
- "epoch": 0.3960524607193871,
434
- "grad_norm": 0.8115535974502563,
435
- "learning_rate": 1.865859755295356e-05,
436
- "loss": 0.2131,
437
- "step": 3050
438
- },
439
- {
440
- "epoch": 0.40254512400986886,
441
- "grad_norm": 0.7659513354301453,
442
- "learning_rate": 1.863228522562821e-05,
443
- "loss": 0.2189,
444
- "step": 3100
445
- },
446
- {
447
- "epoch": 0.4090377873003506,
448
- "grad_norm": 0.8645811080932617,
449
- "learning_rate": 1.8605972898302857e-05,
450
- "loss": 0.2453,
451
- "step": 3150
452
- },
453
- {
454
- "epoch": 0.4155304505908324,
455
- "grad_norm": 0.8571991920471191,
456
- "learning_rate": 1.8579660570977505e-05,
457
- "loss": 0.2076,
458
- "step": 3200
459
- },
460
- {
461
- "epoch": 0.42202311388131414,
462
- "grad_norm": 1.222774863243103,
463
- "learning_rate": 1.8553348243652153e-05,
464
- "loss": 0.2094,
465
- "step": 3250
466
- },
467
- {
468
- "epoch": 0.4285157771717959,
469
- "grad_norm": 0.9631436467170715,
470
- "learning_rate": 1.85270359163268e-05,
471
- "loss": 0.2123,
472
- "step": 3300
473
- },
474
- {
475
- "epoch": 0.4350084404622776,
476
- "grad_norm": 0.5590534806251526,
477
- "learning_rate": 1.8500723589001446e-05,
478
- "loss": 0.2028,
479
- "step": 3350
480
- },
481
- {
482
- "epoch": 0.44150110375275936,
483
- "grad_norm": 0.7615678310394287,
484
- "learning_rate": 1.8474411261676098e-05,
485
- "loss": 0.2125,
486
- "step": 3400
487
- },
488
- {
489
- "epoch": 0.4479937670432411,
490
- "grad_norm": 0.7124540209770203,
491
- "learning_rate": 1.8448098934350743e-05,
492
- "loss": 0.206,
493
- "step": 3450
494
- },
495
- {
496
- "epoch": 0.4544864303337229,
497
- "grad_norm": 0.8492136001586914,
498
- "learning_rate": 1.8421786607025395e-05,
499
- "loss": 0.2054,
500
- "step": 3500
501
- },
502
- {
503
- "epoch": 0.46097909362420464,
504
- "grad_norm": 1.0388613939285278,
505
- "learning_rate": 1.839547427970004e-05,
506
- "loss": 0.2117,
507
- "step": 3550
508
- },
509
- {
510
- "epoch": 0.4674717569146864,
511
- "grad_norm": 0.8087108731269836,
512
- "learning_rate": 1.836916195237469e-05,
513
- "loss": 0.2192,
514
- "step": 3600
515
- },
516
- {
517
- "epoch": 0.47396442020516816,
518
- "grad_norm": 0.7446919679641724,
519
- "learning_rate": 1.8342849625049336e-05,
520
- "loss": 0.2112,
521
- "step": 3650
522
- },
523
- {
524
- "epoch": 0.4804570834956499,
525
- "grad_norm": 0.6669062972068787,
526
- "learning_rate": 1.8316537297723984e-05,
527
- "loss": 0.2167,
528
- "step": 3700
529
- },
530
- {
531
- "epoch": 0.4869497467861317,
532
- "grad_norm": 0.519498884677887,
533
- "learning_rate": 1.8290224970398633e-05,
534
- "loss": 0.2069,
535
- "step": 3750
536
- },
537
- {
538
- "epoch": 0.49344241007661344,
539
- "grad_norm": 0.5735522508621216,
540
- "learning_rate": 1.826391264307328e-05,
541
- "loss": 0.2102,
542
- "step": 3800
543
- },
544
- {
545
- "epoch": 0.4999350733670952,
546
- "grad_norm": 0.6683741211891174,
547
- "learning_rate": 1.823760031574793e-05,
548
- "loss": 0.2141,
549
- "step": 3850
550
- },
551
- {
552
- "epoch": 0.5064277366575769,
553
- "grad_norm": 0.6239739060401917,
554
- "learning_rate": 1.8211287988422578e-05,
555
- "loss": 0.1975,
556
- "step": 3900
557
- },
558
- {
559
- "epoch": 0.5129203999480587,
560
- "grad_norm": 0.6327545046806335,
561
- "learning_rate": 1.8184975661097226e-05,
562
- "loss": 0.2165,
563
- "step": 3950
564
- },
565
- {
566
- "epoch": 0.5194130632385404,
567
- "grad_norm": 0.6098693609237671,
568
- "learning_rate": 1.8158663333771874e-05,
569
- "loss": 0.2034,
570
- "step": 4000
571
- },
572
- {
573
- "epoch": 0.5259057265290222,
574
- "grad_norm": 0.7356197237968445,
575
- "learning_rate": 1.8132351006446522e-05,
576
- "loss": 0.2041,
577
- "step": 4050
578
- },
579
- {
580
- "epoch": 0.5323983898195039,
581
- "grad_norm": 0.7616349458694458,
582
- "learning_rate": 1.810603867912117e-05,
583
- "loss": 0.2121,
584
- "step": 4100
585
- },
586
- {
587
- "epoch": 0.5388910531099858,
588
- "grad_norm": 0.6561347842216492,
589
- "learning_rate": 1.807972635179582e-05,
590
- "loss": 0.2002,
591
- "step": 4150
592
- },
593
- {
594
- "epoch": 0.5453837164004675,
595
- "grad_norm": 0.6744963526725769,
596
- "learning_rate": 1.8053414024470467e-05,
597
- "loss": 0.2029,
598
- "step": 4200
599
- },
600
- {
601
- "epoch": 0.5518763796909493,
602
- "grad_norm": 0.7836194038391113,
603
- "learning_rate": 1.8027101697145112e-05,
604
- "loss": 0.2055,
605
- "step": 4250
606
- },
607
- {
608
- "epoch": 0.558369042981431,
609
- "grad_norm": 0.6983553767204285,
610
- "learning_rate": 1.800078936981976e-05,
611
- "loss": 0.2139,
612
- "step": 4300
613
- },
614
- {
615
- "epoch": 0.5648617062719128,
616
- "grad_norm": 0.6315485239028931,
617
- "learning_rate": 1.797447704249441e-05,
618
- "loss": 0.1978,
619
- "step": 4350
620
- },
621
- {
622
- "epoch": 0.5713543695623945,
623
- "grad_norm": 0.5928835868835449,
624
- "learning_rate": 1.7948164715169057e-05,
625
- "loss": 0.2027,
626
- "step": 4400
627
- },
628
- {
629
- "epoch": 0.5778470328528762,
630
- "grad_norm": 0.8622831702232361,
631
- "learning_rate": 1.7921852387843705e-05,
632
- "loss": 0.2104,
633
- "step": 4450
634
- },
635
- {
636
- "epoch": 0.584339696143358,
637
- "grad_norm": 0.7967308759689331,
638
- "learning_rate": 1.7895540060518354e-05,
639
- "loss": 0.2328,
640
- "step": 4500
641
- },
642
- {
643
- "epoch": 0.5908323594338397,
644
- "grad_norm": 0.8289620280265808,
645
- "learning_rate": 1.7869227733193002e-05,
646
- "loss": 0.2099,
647
- "step": 4550
648
- },
649
- {
650
- "epoch": 0.5973250227243215,
651
- "grad_norm": 0.7185404300689697,
652
- "learning_rate": 1.784291540586765e-05,
653
- "loss": 0.2045,
654
- "step": 4600
655
- },
656
- {
657
- "epoch": 0.6038176860148032,
658
- "grad_norm": 0.6304630637168884,
659
- "learning_rate": 1.78166030785423e-05,
660
- "loss": 0.2055,
661
- "step": 4650
662
- },
663
- {
664
- "epoch": 0.610310349305285,
665
- "grad_norm": 0.614983856678009,
666
- "learning_rate": 1.7790290751216947e-05,
667
- "loss": 0.1979,
668
- "step": 4700
669
- },
670
- {
671
- "epoch": 0.6168030125957668,
672
- "grad_norm": 0.568352460861206,
673
- "learning_rate": 1.7763978423891595e-05,
674
- "loss": 0.197,
675
- "step": 4750
676
- },
677
- {
678
- "epoch": 0.6232956758862486,
679
- "grad_norm": 0.8283701539039612,
680
- "learning_rate": 1.773766609656624e-05,
681
- "loss": 0.1986,
682
- "step": 4800
683
- },
684
- {
685
- "epoch": 0.6297883391767303,
686
- "grad_norm": 0.888967752456665,
687
- "learning_rate": 1.7711353769240892e-05,
688
- "loss": 0.2271,
689
- "step": 4850
690
- },
691
- {
692
- "epoch": 0.6362810024672121,
693
- "grad_norm": 0.5292450189590454,
694
- "learning_rate": 1.7685041441915537e-05,
695
- "loss": 0.1971,
696
- "step": 4900
697
- },
698
- {
699
- "epoch": 0.6427736657576938,
700
- "grad_norm": 0.5274189114570618,
701
- "learning_rate": 1.765872911459019e-05,
702
- "loss": 0.1963,
703
- "step": 4950
704
- },
705
- {
706
- "epoch": 0.6492663290481756,
707
- "grad_norm": 0.5699307322502136,
708
- "learning_rate": 1.7632416787264833e-05,
709
- "loss": 0.1984,
710
- "step": 5000
711
- },
712
- {
713
- "epoch": 0.6557589923386573,
714
- "grad_norm": 0.8022367358207703,
715
- "learning_rate": 1.7606104459939485e-05,
716
- "loss": 0.2053,
717
- "step": 5050
718
- },
719
- {
720
- "epoch": 0.6622516556291391,
721
- "grad_norm": 0.6432430148124695,
722
- "learning_rate": 1.757979213261413e-05,
723
- "loss": 0.1991,
724
- "step": 5100
725
- },
726
- {
727
- "epoch": 0.6687443189196208,
728
- "grad_norm": 0.7671304941177368,
729
- "learning_rate": 1.755347980528878e-05,
730
- "loss": 0.2064,
731
- "step": 5150
732
- },
733
- {
734
- "epoch": 0.6752369822101025,
735
- "grad_norm": 0.5107030272483826,
736
- "learning_rate": 1.7527167477963426e-05,
737
- "loss": 0.2049,
738
- "step": 5200
739
- },
740
- {
741
- "epoch": 0.6817296455005843,
742
- "grad_norm": 0.7239235639572144,
743
- "learning_rate": 1.7500855150638075e-05,
744
- "loss": 0.2001,
745
- "step": 5250
746
- },
747
- {
748
- "epoch": 0.688222308791066,
749
- "grad_norm": 0.6116129755973816,
750
- "learning_rate": 1.7474542823312723e-05,
751
- "loss": 0.2193,
752
- "step": 5300
753
- },
754
- {
755
- "epoch": 0.6947149720815479,
756
- "grad_norm": 0.5425911545753479,
757
- "learning_rate": 1.744823049598737e-05,
758
- "loss": 0.2001,
759
- "step": 5350
760
- },
761
- {
762
- "epoch": 0.7012076353720296,
763
- "grad_norm": 0.6464748382568359,
764
- "learning_rate": 1.742191816866202e-05,
765
- "loss": 0.1963,
766
- "step": 5400
767
- },
768
- {
769
- "epoch": 0.7077002986625114,
770
- "grad_norm": 0.8812252879142761,
771
- "learning_rate": 1.7395605841336668e-05,
772
- "loss": 0.196,
773
- "step": 5450
774
- },
775
- {
776
- "epoch": 0.7141929619529931,
777
- "grad_norm": 0.6928241848945618,
778
- "learning_rate": 1.7369293514011316e-05,
779
- "loss": 0.2011,
780
- "step": 5500
781
- },
782
- {
783
- "epoch": 0.7206856252434749,
784
- "grad_norm": 0.6892450451850891,
785
- "learning_rate": 1.7342981186685965e-05,
786
- "loss": 0.1949,
787
- "step": 5550
788
- },
789
- {
790
- "epoch": 0.7271782885339566,
791
- "grad_norm": 0.4782065749168396,
792
- "learning_rate": 1.7316668859360613e-05,
793
- "loss": 0.194,
794
- "step": 5600
795
- },
796
- {
797
- "epoch": 0.7336709518244384,
798
- "grad_norm": 0.6438505053520203,
799
- "learning_rate": 1.729035653203526e-05,
800
- "loss": 0.1967,
801
- "step": 5650
802
- },
803
- {
804
- "epoch": 0.7401636151149201,
805
- "grad_norm": 0.5797818899154663,
806
- "learning_rate": 1.726404420470991e-05,
807
- "loss": 0.2185,
808
- "step": 5700
809
- },
810
- {
811
- "epoch": 0.7466562784054019,
812
- "grad_norm": 0.6884586811065674,
813
- "learning_rate": 1.7237731877384554e-05,
814
- "loss": 0.1977,
815
- "step": 5750
816
- },
817
- {
818
- "epoch": 0.7531489416958836,
819
- "grad_norm": 0.648883581161499,
820
- "learning_rate": 1.7211419550059206e-05,
821
- "loss": 0.1964,
822
- "step": 5800
823
- },
824
- {
825
- "epoch": 0.7596416049863655,
826
- "grad_norm": 0.6440086960792542,
827
- "learning_rate": 1.718510722273385e-05,
828
- "loss": 0.2014,
829
- "step": 5850
830
- },
831
- {
832
- "epoch": 0.7661342682768472,
833
- "grad_norm": 0.5619300007820129,
834
- "learning_rate": 1.71587948954085e-05,
835
- "loss": 0.1909,
836
- "step": 5900
837
- },
838
- {
839
- "epoch": 0.7726269315673289,
840
- "grad_norm": 0.6859204769134521,
841
- "learning_rate": 1.7132482568083147e-05,
842
- "loss": 0.2049,
843
- "step": 5950
844
- },
845
- {
846
- "epoch": 0.7791195948578107,
847
- "grad_norm": 0.6132592558860779,
848
- "learning_rate": 1.7106170240757796e-05,
849
- "loss": 0.2,
850
- "step": 6000
851
- },
852
- {
853
- "epoch": 0.7856122581482924,
854
- "grad_norm": 0.7050901055335999,
855
- "learning_rate": 1.7079857913432444e-05,
856
- "loss": 0.189,
857
- "step": 6050
858
- },
859
- {
860
- "epoch": 0.7921049214387742,
861
- "grad_norm": 0.6752614974975586,
862
- "learning_rate": 1.7053545586107092e-05,
863
- "loss": 0.2248,
864
- "step": 6100
865
- },
866
- {
867
- "epoch": 0.7985975847292559,
868
- "grad_norm": 0.7186923623085022,
869
- "learning_rate": 1.702723325878174e-05,
870
- "loss": 0.1903,
871
- "step": 6150
872
- },
873
- {
874
- "epoch": 0.8050902480197377,
875
- "grad_norm": 0.5991400480270386,
876
- "learning_rate": 1.700092093145639e-05,
877
- "loss": 0.197,
878
- "step": 6200
879
- },
880
- {
881
- "epoch": 0.8115829113102194,
882
- "grad_norm": 0.9522245526313782,
883
- "learning_rate": 1.6974608604131037e-05,
884
- "loss": 0.1962,
885
- "step": 6250
886
- },
887
- {
888
- "epoch": 0.8180755746007012,
889
- "grad_norm": 0.8645381927490234,
890
- "learning_rate": 1.6948296276805686e-05,
891
- "loss": 0.1901,
892
- "step": 6300
893
- },
894
- {
895
- "epoch": 0.8245682378911829,
896
- "grad_norm": 0.5243034958839417,
897
- "learning_rate": 1.6921983949480334e-05,
898
- "loss": 0.194,
899
- "step": 6350
900
- },
901
- {
902
- "epoch": 0.8310609011816648,
903
- "grad_norm": 0.5842151641845703,
904
- "learning_rate": 1.6895671622154982e-05,
905
- "loss": 0.1917,
906
- "step": 6400
907
- },
908
- {
909
- "epoch": 0.8375535644721465,
910
- "grad_norm": 0.6111485362052917,
911
- "learning_rate": 1.6869359294829627e-05,
912
- "loss": 0.2042,
913
- "step": 6450
914
- },
915
- {
916
- "epoch": 0.8440462277626283,
917
- "grad_norm": 0.6515288949012756,
918
- "learning_rate": 1.684304696750428e-05,
919
- "loss": 0.2032,
920
- "step": 6500
921
- },
922
- {
923
- "epoch": 0.85053889105311,
924
- "grad_norm": 0.7596396207809448,
925
- "learning_rate": 1.6816734640178924e-05,
926
- "loss": 0.193,
927
- "step": 6550
928
- },
929
- {
930
- "epoch": 0.8570315543435918,
931
- "grad_norm": 0.6944254636764526,
932
- "learning_rate": 1.6790422312853575e-05,
933
- "loss": 0.1899,
934
- "step": 6600
935
- },
936
- {
937
- "epoch": 0.8635242176340735,
938
- "grad_norm": 0.6190508604049683,
939
- "learning_rate": 1.676410998552822e-05,
940
- "loss": 0.1987,
941
- "step": 6650
942
- },
943
- {
944
- "epoch": 0.8700168809245552,
945
- "grad_norm": 1.1515477895736694,
946
- "learning_rate": 1.673779765820287e-05,
947
- "loss": 0.196,
948
- "step": 6700
949
- },
950
- {
951
- "epoch": 0.876509544215037,
952
- "grad_norm": 0.5803254842758179,
953
- "learning_rate": 1.6711485330877517e-05,
954
- "loss": 0.1923,
955
- "step": 6750
956
- },
957
- {
958
- "epoch": 0.8830022075055187,
959
- "grad_norm": 0.8052871227264404,
960
- "learning_rate": 1.6685173003552165e-05,
961
- "loss": 0.1894,
962
- "step": 6800
963
- },
964
- {
965
- "epoch": 0.8894948707960005,
966
- "grad_norm": 0.9313941597938538,
967
- "learning_rate": 1.6658860676226813e-05,
968
- "loss": 0.1889,
969
- "step": 6850
970
- },
971
- {
972
- "epoch": 0.8959875340864822,
973
- "grad_norm": 0.5186671614646912,
974
- "learning_rate": 1.663254834890146e-05,
975
- "loss": 0.1895,
976
- "step": 6900
977
- }
978
- ],
979
- "logging_steps": 50,
980
- "max_steps": 38505,
981
- "num_input_tokens_seen": 0,
982
- "num_train_epochs": 5,
983
- "save_steps": 386,
984
- "stateful_callbacks": {
985
- "TrainerControl": {
986
- "args": {
987
- "should_epoch_stop": false,
988
- "should_evaluate": false,
989
- "should_log": false,
990
- "should_save": true,
991
- "should_training_stop": false
992
- },
993
- "attributes": {}
994
- }
995
- },
996
- "total_flos": 5.810355262497751e+17,
997
- "train_batch_size": 40,
998
- "trial_name": null,
999
- "trial_params": null
1000
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-7334/added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[MASK]": 128000
3
- }
 
 
 
 
checkpoint-7334/config.json DELETED
@@ -1,69 +0,0 @@
1
- {
2
- "architectures": [
3
- "DebertaV2ForTokenClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "hidden_act": "gelu",
7
- "hidden_dropout_prob": 0.1,
8
- "hidden_size": 768,
9
- "id2label": {
10
- "0": "O",
11
- "1": "B-CHA",
12
- "2": "I-CHA",
13
- "3": "B-LOC",
14
- "4": "I-LOC",
15
- "5": "B-FAC",
16
- "6": "I-FAC",
17
- "7": "B-OBJ",
18
- "8": "I-OBJ",
19
- "9": "B-EVT",
20
- "10": "I-EVT",
21
- "11": "B-ORG",
22
- "12": "I-ORG",
23
- "13": "B-MISC",
24
- "14": "I-MISC"
25
- },
26
- "initializer_range": 0.02,
27
- "intermediate_size": 3072,
28
- "label2id": {
29
- "B-CHA": 1,
30
- "B-EVT": 9,
31
- "B-FAC": 5,
32
- "B-LOC": 3,
33
- "B-MISC": 13,
34
- "B-OBJ": 7,
35
- "B-ORG": 11,
36
- "I-CHA": 2,
37
- "I-EVT": 10,
38
- "I-FAC": 6,
39
- "I-LOC": 4,
40
- "I-MISC": 14,
41
- "I-OBJ": 8,
42
- "I-ORG": 12,
43
- "O": 0
44
- },
45
- "layer_norm_eps": 1e-07,
46
- "legacy": true,
47
- "max_position_embeddings": 512,
48
- "max_relative_positions": -1,
49
- "model_type": "deberta-v2",
50
- "norm_rel_ebd": "layer_norm",
51
- "num_attention_heads": 12,
52
- "num_hidden_layers": 12,
53
- "pad_token_id": 0,
54
- "pooler_dropout": 0,
55
- "pooler_hidden_act": "gelu",
56
- "pooler_hidden_size": 768,
57
- "pos_att_type": [
58
- "p2c",
59
- "c2p"
60
- ],
61
- "position_biased_input": false,
62
- "position_buckets": 256,
63
- "relative_attention": true,
64
- "share_att_key": true,
65
- "torch_dtype": "float32",
66
- "transformers_version": "4.55.4",
67
- "type_vocab_size": 0,
68
- "vocab_size": 128100
69
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-7334/special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "[CLS]",
3
- "cls_token": "[CLS]",
4
- "eos_token": "[SEP]",
5
- "mask_token": "[MASK]",
6
- "pad_token": "[PAD]",
7
- "sep_token": "[SEP]",
8
- "unk_token": {
9
- "content": "[UNK]",
10
- "lstrip": false,
11
- "normalized": true,
12
- "rstrip": false,
13
- "single_word": false
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-7334/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-7334/tokenizer_config.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "[CLS]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "[SEP]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "[UNK]",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "128000": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "bos_token": "[CLS]",
45
- "clean_up_tokenization_spaces": false,
46
- "cls_token": "[CLS]",
47
- "do_lower_case": false,
48
- "eos_token": "[SEP]",
49
- "extra_special_tokens": {},
50
- "mask_token": "[MASK]",
51
- "model_max_length": 1000000000000000019884624838656,
52
- "pad_token": "[PAD]",
53
- "sep_token": "[SEP]",
54
- "sp_model_kwargs": {},
55
- "split_by_punct": false,
56
- "tokenizer_class": "DebertaV2Tokenizer",
57
- "unk_token": "[UNK]",
58
- "vocab_type": "spm"
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-7334/trainer_state.json DELETED
@@ -1,1056 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.9523438514478639,
6
- "eval_steps": 500,
7
- "global_step": 7334,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.006492663290481756,
14
- "grad_norm": 23.6250057220459,
15
- "learning_rate": 1.9600000000000003e-06,
16
- "loss": 3.3492,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.012985326580963512,
21
- "grad_norm": 1.5745655298233032,
22
- "learning_rate": 3.96e-06,
23
- "loss": 1.3837,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.019477989871445268,
28
- "grad_norm": 1.3241935968399048,
29
- "learning_rate": 5.9600000000000005e-06,
30
- "loss": 0.6673,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.025970653161927024,
35
- "grad_norm": 1.068557620048523,
36
- "learning_rate": 7.960000000000002e-06,
37
- "loss": 0.4426,
38
- "step": 200
39
- },
40
- {
41
- "epoch": 0.032463316452408776,
42
- "grad_norm": 1.2953921556472778,
43
- "learning_rate": 9.960000000000001e-06,
44
- "loss": 0.363,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.038955979742890535,
49
- "grad_norm": 1.4753903150558472,
50
- "learning_rate": 1.196e-05,
51
- "loss": 0.3397,
52
- "step": 300
53
- },
54
- {
55
- "epoch": 0.04544864303337229,
56
- "grad_norm": 1.245597243309021,
57
- "learning_rate": 1.396e-05,
58
- "loss": 0.3125,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.05194130632385405,
63
- "grad_norm": 1.3421887159347534,
64
- "learning_rate": 1.5960000000000003e-05,
65
- "loss": 0.3086,
66
- "step": 400
67
- },
68
- {
69
- "epoch": 0.0584339696143358,
70
- "grad_norm": 1.7852529287338257,
71
- "learning_rate": 1.796e-05,
72
- "loss": 0.2883,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.06492663290481755,
77
- "grad_norm": 1.086064100265503,
78
- "learning_rate": 1.9960000000000002e-05,
79
- "loss": 0.2947,
80
- "step": 500
81
- },
82
- {
83
- "epoch": 0.07141929619529931,
84
- "grad_norm": 0.8753414750099182,
85
- "learning_rate": 1.9974213919221156e-05,
86
- "loss": 0.2955,
87
- "step": 550
88
- },
89
- {
90
- "epoch": 0.07791195948578107,
91
- "grad_norm": 0.7875685095787048,
92
- "learning_rate": 1.9947901591895804e-05,
93
- "loss": 0.2773,
94
- "step": 600
95
- },
96
- {
97
- "epoch": 0.08440462277626282,
98
- "grad_norm": 1.1609506607055664,
99
- "learning_rate": 1.9921589264570452e-05,
100
- "loss": 0.2696,
101
- "step": 650
102
- },
103
- {
104
- "epoch": 0.09089728606674458,
105
- "grad_norm": 0.9162603616714478,
106
- "learning_rate": 1.98952769372451e-05,
107
- "loss": 0.2667,
108
- "step": 700
109
- },
110
- {
111
- "epoch": 0.09738994935722634,
112
- "grad_norm": 1.0398370027542114,
113
- "learning_rate": 1.986896460991975e-05,
114
- "loss": 0.277,
115
- "step": 750
116
- },
117
- {
118
- "epoch": 0.1038826126477081,
119
- "grad_norm": 0.9404798150062561,
120
- "learning_rate": 1.9842652282594397e-05,
121
- "loss": 0.2584,
122
- "step": 800
123
- },
124
- {
125
- "epoch": 0.11037527593818984,
126
- "grad_norm": 1.0353909730911255,
127
- "learning_rate": 1.9816339955269045e-05,
128
- "loss": 0.2533,
129
- "step": 850
130
- },
131
- {
132
- "epoch": 0.1168679392286716,
133
- "grad_norm": 1.485606074333191,
134
- "learning_rate": 1.9790027627943694e-05,
135
- "loss": 0.2506,
136
- "step": 900
137
- },
138
- {
139
- "epoch": 0.12336060251915336,
140
- "grad_norm": 0.9874151945114136,
141
- "learning_rate": 1.9763715300618342e-05,
142
- "loss": 0.2515,
143
- "step": 950
144
- },
145
- {
146
- "epoch": 0.1298532658096351,
147
- "grad_norm": 1.036007285118103,
148
- "learning_rate": 1.973740297329299e-05,
149
- "loss": 0.244,
150
- "step": 1000
151
- },
152
- {
153
- "epoch": 0.13634592910011686,
154
- "grad_norm": 0.946956217288971,
155
- "learning_rate": 1.9711090645967635e-05,
156
- "loss": 0.2518,
157
- "step": 1050
158
- },
159
- {
160
- "epoch": 0.14283859239059862,
161
- "grad_norm": 0.9096735119819641,
162
- "learning_rate": 1.9684778318642287e-05,
163
- "loss": 0.2543,
164
- "step": 1100
165
- },
166
- {
167
- "epoch": 0.14933125568108038,
168
- "grad_norm": 0.9371875524520874,
169
- "learning_rate": 1.965846599131693e-05,
170
- "loss": 0.2605,
171
- "step": 1150
172
- },
173
- {
174
- "epoch": 0.15582391897156214,
175
- "grad_norm": 0.8072580695152283,
176
- "learning_rate": 1.9632153663991583e-05,
177
- "loss": 0.2423,
178
- "step": 1200
179
- },
180
- {
181
- "epoch": 0.1623165822620439,
182
- "grad_norm": 0.6907565593719482,
183
- "learning_rate": 1.9605841336666228e-05,
184
- "loss": 0.2315,
185
- "step": 1250
186
- },
187
- {
188
- "epoch": 0.16880924555252563,
189
- "grad_norm": 0.6676374673843384,
190
- "learning_rate": 1.957952900934088e-05,
191
- "loss": 0.2363,
192
- "step": 1300
193
- },
194
- {
195
- "epoch": 0.1753019088430074,
196
- "grad_norm": 0.7281391024589539,
197
- "learning_rate": 1.9553216682015525e-05,
198
- "loss": 0.2399,
199
- "step": 1350
200
- },
201
- {
202
- "epoch": 0.18179457213348915,
203
- "grad_norm": 0.8105588555335999,
204
- "learning_rate": 1.9526904354690177e-05,
205
- "loss": 0.2448,
206
- "step": 1400
207
- },
208
- {
209
- "epoch": 0.1882872354239709,
210
- "grad_norm": 0.7466333508491516,
211
- "learning_rate": 1.950059202736482e-05,
212
- "loss": 0.2388,
213
- "step": 1450
214
- },
215
- {
216
- "epoch": 0.19477989871445267,
217
- "grad_norm": 0.5949985384941101,
218
- "learning_rate": 1.947427970003947e-05,
219
- "loss": 0.233,
220
- "step": 1500
221
- },
222
- {
223
- "epoch": 0.20127256200493443,
224
- "grad_norm": 0.7439960241317749,
225
- "learning_rate": 1.9447967372714118e-05,
226
- "loss": 0.2448,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 0.2077652252954162,
231
- "grad_norm": 0.9508784413337708,
232
- "learning_rate": 1.9421655045388766e-05,
233
- "loss": 0.2353,
234
- "step": 1600
235
- },
236
- {
237
- "epoch": 0.21425788858589795,
238
- "grad_norm": 0.8785332441329956,
239
- "learning_rate": 1.9395342718063415e-05,
240
- "loss": 0.2249,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 0.22075055187637968,
245
- "grad_norm": 0.7568134069442749,
246
- "learning_rate": 1.9369030390738063e-05,
247
- "loss": 0.2307,
248
- "step": 1700
249
- },
250
- {
251
- "epoch": 0.22724321516686144,
252
- "grad_norm": 0.6832641363143921,
253
- "learning_rate": 1.934271806341271e-05,
254
- "loss": 0.2263,
255
- "step": 1750
256
- },
257
- {
258
- "epoch": 0.2337358784573432,
259
- "grad_norm": 0.9721575379371643,
260
- "learning_rate": 1.931640573608736e-05,
261
- "loss": 0.2287,
262
- "step": 1800
263
- },
264
- {
265
- "epoch": 0.24022854174782496,
266
- "grad_norm": 0.620299220085144,
267
- "learning_rate": 1.9290093408762008e-05,
268
- "loss": 0.226,
269
- "step": 1850
270
- },
271
- {
272
- "epoch": 0.24672120503830672,
273
- "grad_norm": 0.6656680703163147,
274
- "learning_rate": 1.9263781081436656e-05,
275
- "loss": 0.2223,
276
- "step": 1900
277
- },
278
- {
279
- "epoch": 0.25321386832878845,
280
- "grad_norm": 0.8536450266838074,
281
- "learning_rate": 1.9237468754111304e-05,
282
- "loss": 0.2332,
283
- "step": 1950
284
- },
285
- {
286
- "epoch": 0.2597065316192702,
287
- "grad_norm": 0.9754176139831543,
288
- "learning_rate": 1.921115642678595e-05,
289
- "loss": 0.2211,
290
- "step": 2000
291
- },
292
- {
293
- "epoch": 0.26619919490975197,
294
- "grad_norm": 0.6166796684265137,
295
- "learning_rate": 1.9184844099460598e-05,
296
- "loss": 0.2201,
297
- "step": 2050
298
- },
299
- {
300
- "epoch": 0.2726918582002337,
301
- "grad_norm": 0.8182518482208252,
302
- "learning_rate": 1.9158531772135246e-05,
303
- "loss": 0.2213,
304
- "step": 2100
305
- },
306
- {
307
- "epoch": 0.2791845214907155,
308
- "grad_norm": 0.8245829939842224,
309
- "learning_rate": 1.9132219444809894e-05,
310
- "loss": 0.2208,
311
- "step": 2150
312
- },
313
- {
314
- "epoch": 0.28567718478119725,
315
- "grad_norm": 0.6423109173774719,
316
- "learning_rate": 1.9105907117484542e-05,
317
- "loss": 0.2308,
318
- "step": 2200
319
- },
320
- {
321
- "epoch": 0.292169848071679,
322
- "grad_norm": 0.7164100408554077,
323
- "learning_rate": 1.907959479015919e-05,
324
- "loss": 0.2193,
325
- "step": 2250
326
- },
327
- {
328
- "epoch": 0.29866251136216077,
329
- "grad_norm": 0.7910536527633667,
330
- "learning_rate": 1.905328246283384e-05,
331
- "loss": 0.2292,
332
- "step": 2300
333
- },
334
- {
335
- "epoch": 0.3051551746526425,
336
- "grad_norm": 0.9699934720993042,
337
- "learning_rate": 1.9026970135508487e-05,
338
- "loss": 0.2175,
339
- "step": 2350
340
- },
341
- {
342
- "epoch": 0.3116478379431243,
343
- "grad_norm": 1.0665416717529297,
344
- "learning_rate": 1.9000657808183136e-05,
345
- "loss": 0.2252,
346
- "step": 2400
347
- },
348
- {
349
- "epoch": 0.31814050123360604,
350
- "grad_norm": 0.9915699362754822,
351
- "learning_rate": 1.8974345480857784e-05,
352
- "loss": 0.215,
353
- "step": 2450
354
- },
355
- {
356
- "epoch": 0.3246331645240878,
357
- "grad_norm": 0.7021859288215637,
358
- "learning_rate": 1.8948033153532432e-05,
359
- "loss": 0.2123,
360
- "step": 2500
361
- },
362
- {
363
- "epoch": 0.33112582781456956,
364
- "grad_norm": 0.6402600407600403,
365
- "learning_rate": 1.892172082620708e-05,
366
- "loss": 0.2176,
367
- "step": 2550
368
- },
369
- {
370
- "epoch": 0.33761849110505127,
371
- "grad_norm": 0.9925078749656677,
372
- "learning_rate": 1.8895408498881725e-05,
373
- "loss": 0.223,
374
- "step": 2600
375
- },
376
- {
377
- "epoch": 0.344111154395533,
378
- "grad_norm": 0.5188687443733215,
379
- "learning_rate": 1.8869096171556377e-05,
380
- "loss": 0.2174,
381
- "step": 2650
382
- },
383
- {
384
- "epoch": 0.3506038176860148,
385
- "grad_norm": 0.7427687644958496,
386
- "learning_rate": 1.8842783844231022e-05,
387
- "loss": 0.2181,
388
- "step": 2700
389
- },
390
- {
391
- "epoch": 0.35709648097649654,
392
- "grad_norm": 0.6055501103401184,
393
- "learning_rate": 1.8816471516905674e-05,
394
- "loss": 0.2097,
395
- "step": 2750
396
- },
397
- {
398
- "epoch": 0.3635891442669783,
399
- "grad_norm": 0.6529932022094727,
400
- "learning_rate": 1.879015918958032e-05,
401
- "loss": 0.2133,
402
- "step": 2800
403
- },
404
- {
405
- "epoch": 0.37008180755746006,
406
- "grad_norm": 0.9499914646148682,
407
- "learning_rate": 1.8763846862254967e-05,
408
- "loss": 0.2189,
409
- "step": 2850
410
- },
411
- {
412
- "epoch": 0.3765744708479418,
413
- "grad_norm": 0.8217771053314209,
414
- "learning_rate": 1.8737534534929615e-05,
415
- "loss": 0.2172,
416
- "step": 2900
417
- },
418
- {
419
- "epoch": 0.3830671341384236,
420
- "grad_norm": 1.031895399093628,
421
- "learning_rate": 1.8711222207604263e-05,
422
- "loss": 0.2171,
423
- "step": 2950
424
- },
425
- {
426
- "epoch": 0.38955979742890534,
427
- "grad_norm": 0.7179540395736694,
428
- "learning_rate": 1.8684909880278912e-05,
429
- "loss": 0.2179,
430
- "step": 3000
431
- },
432
- {
433
- "epoch": 0.3960524607193871,
434
- "grad_norm": 0.8115535974502563,
435
- "learning_rate": 1.865859755295356e-05,
436
- "loss": 0.2131,
437
- "step": 3050
438
- },
439
- {
440
- "epoch": 0.40254512400986886,
441
- "grad_norm": 0.7659513354301453,
442
- "learning_rate": 1.863228522562821e-05,
443
- "loss": 0.2189,
444
- "step": 3100
445
- },
446
- {
447
- "epoch": 0.4090377873003506,
448
- "grad_norm": 0.8645811080932617,
449
- "learning_rate": 1.8605972898302857e-05,
450
- "loss": 0.2453,
451
- "step": 3150
452
- },
453
- {
454
- "epoch": 0.4155304505908324,
455
- "grad_norm": 0.8571991920471191,
456
- "learning_rate": 1.8579660570977505e-05,
457
- "loss": 0.2076,
458
- "step": 3200
459
- },
460
- {
461
- "epoch": 0.42202311388131414,
462
- "grad_norm": 1.222774863243103,
463
- "learning_rate": 1.8553348243652153e-05,
464
- "loss": 0.2094,
465
- "step": 3250
466
- },
467
- {
468
- "epoch": 0.4285157771717959,
469
- "grad_norm": 0.9631436467170715,
470
- "learning_rate": 1.85270359163268e-05,
471
- "loss": 0.2123,
472
- "step": 3300
473
- },
474
- {
475
- "epoch": 0.4350084404622776,
476
- "grad_norm": 0.5590534806251526,
477
- "learning_rate": 1.8500723589001446e-05,
478
- "loss": 0.2028,
479
- "step": 3350
480
- },
481
- {
482
- "epoch": 0.44150110375275936,
483
- "grad_norm": 0.7615678310394287,
484
- "learning_rate": 1.8474411261676098e-05,
485
- "loss": 0.2125,
486
- "step": 3400
487
- },
488
- {
489
- "epoch": 0.4479937670432411,
490
- "grad_norm": 0.7124540209770203,
491
- "learning_rate": 1.8448098934350743e-05,
492
- "loss": 0.206,
493
- "step": 3450
494
- },
495
- {
496
- "epoch": 0.4544864303337229,
497
- "grad_norm": 0.8492136001586914,
498
- "learning_rate": 1.8421786607025395e-05,
499
- "loss": 0.2054,
500
- "step": 3500
501
- },
502
- {
503
- "epoch": 0.46097909362420464,
504
- "grad_norm": 1.0388613939285278,
505
- "learning_rate": 1.839547427970004e-05,
506
- "loss": 0.2117,
507
- "step": 3550
508
- },
509
- {
510
- "epoch": 0.4674717569146864,
511
- "grad_norm": 0.8087108731269836,
512
- "learning_rate": 1.836916195237469e-05,
513
- "loss": 0.2192,
514
- "step": 3600
515
- },
516
- {
517
- "epoch": 0.47396442020516816,
518
- "grad_norm": 0.7446919679641724,
519
- "learning_rate": 1.8342849625049336e-05,
520
- "loss": 0.2112,
521
- "step": 3650
522
- },
523
- {
524
- "epoch": 0.4804570834956499,
525
- "grad_norm": 0.6669062972068787,
526
- "learning_rate": 1.8316537297723984e-05,
527
- "loss": 0.2167,
528
- "step": 3700
529
- },
530
- {
531
- "epoch": 0.4869497467861317,
532
- "grad_norm": 0.519498884677887,
533
- "learning_rate": 1.8290224970398633e-05,
534
- "loss": 0.2069,
535
- "step": 3750
536
- },
537
- {
538
- "epoch": 0.49344241007661344,
539
- "grad_norm": 0.5735522508621216,
540
- "learning_rate": 1.826391264307328e-05,
541
- "loss": 0.2102,
542
- "step": 3800
543
- },
544
- {
545
- "epoch": 0.4999350733670952,
546
- "grad_norm": 0.6683741211891174,
547
- "learning_rate": 1.823760031574793e-05,
548
- "loss": 0.2141,
549
- "step": 3850
550
- },
551
- {
552
- "epoch": 0.5064277366575769,
553
- "grad_norm": 0.6239739060401917,
554
- "learning_rate": 1.8211287988422578e-05,
555
- "loss": 0.1975,
556
- "step": 3900
557
- },
558
- {
559
- "epoch": 0.5129203999480587,
560
- "grad_norm": 0.6327545046806335,
561
- "learning_rate": 1.8184975661097226e-05,
562
- "loss": 0.2165,
563
- "step": 3950
564
- },
565
- {
566
- "epoch": 0.5194130632385404,
567
- "grad_norm": 0.6098693609237671,
568
- "learning_rate": 1.8158663333771874e-05,
569
- "loss": 0.2034,
570
- "step": 4000
571
- },
572
- {
573
- "epoch": 0.5259057265290222,
574
- "grad_norm": 0.7356197237968445,
575
- "learning_rate": 1.8132351006446522e-05,
576
- "loss": 0.2041,
577
- "step": 4050
578
- },
579
- {
580
- "epoch": 0.5323983898195039,
581
- "grad_norm": 0.7616349458694458,
582
- "learning_rate": 1.810603867912117e-05,
583
- "loss": 0.2121,
584
- "step": 4100
585
- },
586
- {
587
- "epoch": 0.5388910531099858,
588
- "grad_norm": 0.6561347842216492,
589
- "learning_rate": 1.807972635179582e-05,
590
- "loss": 0.2002,
591
- "step": 4150
592
- },
593
- {
594
- "epoch": 0.5453837164004675,
595
- "grad_norm": 0.6744963526725769,
596
- "learning_rate": 1.8053414024470467e-05,
597
- "loss": 0.2029,
598
- "step": 4200
599
- },
600
- {
601
- "epoch": 0.5518763796909493,
602
- "grad_norm": 0.7836194038391113,
603
- "learning_rate": 1.8027101697145112e-05,
604
- "loss": 0.2055,
605
- "step": 4250
606
- },
607
- {
608
- "epoch": 0.558369042981431,
609
- "grad_norm": 0.6983553767204285,
610
- "learning_rate": 1.800078936981976e-05,
611
- "loss": 0.2139,
612
- "step": 4300
613
- },
614
- {
615
- "epoch": 0.5648617062719128,
616
- "grad_norm": 0.6315485239028931,
617
- "learning_rate": 1.797447704249441e-05,
618
- "loss": 0.1978,
619
- "step": 4350
620
- },
621
- {
622
- "epoch": 0.5713543695623945,
623
- "grad_norm": 0.5928835868835449,
624
- "learning_rate": 1.7948164715169057e-05,
625
- "loss": 0.2027,
626
- "step": 4400
627
- },
628
- {
629
- "epoch": 0.5778470328528762,
630
- "grad_norm": 0.8622831702232361,
631
- "learning_rate": 1.7921852387843705e-05,
632
- "loss": 0.2104,
633
- "step": 4450
634
- },
635
- {
636
- "epoch": 0.584339696143358,
637
- "grad_norm": 0.7967308759689331,
638
- "learning_rate": 1.7895540060518354e-05,
639
- "loss": 0.2328,
640
- "step": 4500
641
- },
642
- {
643
- "epoch": 0.5908323594338397,
644
- "grad_norm": 0.8289620280265808,
645
- "learning_rate": 1.7869227733193002e-05,
646
- "loss": 0.2099,
647
- "step": 4550
648
- },
649
- {
650
- "epoch": 0.5973250227243215,
651
- "grad_norm": 0.7185404300689697,
652
- "learning_rate": 1.784291540586765e-05,
653
- "loss": 0.2045,
654
- "step": 4600
655
- },
656
- {
657
- "epoch": 0.6038176860148032,
658
- "grad_norm": 0.6304630637168884,
659
- "learning_rate": 1.78166030785423e-05,
660
- "loss": 0.2055,
661
- "step": 4650
662
- },
663
- {
664
- "epoch": 0.610310349305285,
665
- "grad_norm": 0.614983856678009,
666
- "learning_rate": 1.7790290751216947e-05,
667
- "loss": 0.1979,
668
- "step": 4700
669
- },
670
- {
671
- "epoch": 0.6168030125957668,
672
- "grad_norm": 0.568352460861206,
673
- "learning_rate": 1.7763978423891595e-05,
674
- "loss": 0.197,
675
- "step": 4750
676
- },
677
- {
678
- "epoch": 0.6232956758862486,
679
- "grad_norm": 0.8283701539039612,
680
- "learning_rate": 1.773766609656624e-05,
681
- "loss": 0.1986,
682
- "step": 4800
683
- },
684
- {
685
- "epoch": 0.6297883391767303,
686
- "grad_norm": 0.888967752456665,
687
- "learning_rate": 1.7711353769240892e-05,
688
- "loss": 0.2271,
689
- "step": 4850
690
- },
691
- {
692
- "epoch": 0.6362810024672121,
693
- "grad_norm": 0.5292450189590454,
694
- "learning_rate": 1.7685041441915537e-05,
695
- "loss": 0.1971,
696
- "step": 4900
697
- },
698
- {
699
- "epoch": 0.6427736657576938,
700
- "grad_norm": 0.5274189114570618,
701
- "learning_rate": 1.765872911459019e-05,
702
- "loss": 0.1963,
703
- "step": 4950
704
- },
705
- {
706
- "epoch": 0.6492663290481756,
707
- "grad_norm": 0.5699307322502136,
708
- "learning_rate": 1.7632416787264833e-05,
709
- "loss": 0.1984,
710
- "step": 5000
711
- },
712
- {
713
- "epoch": 0.6557589923386573,
714
- "grad_norm": 0.8022367358207703,
715
- "learning_rate": 1.7606104459939485e-05,
716
- "loss": 0.2053,
717
- "step": 5050
718
- },
719
- {
720
- "epoch": 0.6622516556291391,
721
- "grad_norm": 0.6432430148124695,
722
- "learning_rate": 1.757979213261413e-05,
723
- "loss": 0.1991,
724
- "step": 5100
725
- },
726
- {
727
- "epoch": 0.6687443189196208,
728
- "grad_norm": 0.7671304941177368,
729
- "learning_rate": 1.755347980528878e-05,
730
- "loss": 0.2064,
731
- "step": 5150
732
- },
733
- {
734
- "epoch": 0.6752369822101025,
735
- "grad_norm": 0.5107030272483826,
736
- "learning_rate": 1.7527167477963426e-05,
737
- "loss": 0.2049,
738
- "step": 5200
739
- },
740
- {
741
- "epoch": 0.6817296455005843,
742
- "grad_norm": 0.7239235639572144,
743
- "learning_rate": 1.7500855150638075e-05,
744
- "loss": 0.2001,
745
- "step": 5250
746
- },
747
- {
748
- "epoch": 0.688222308791066,
749
- "grad_norm": 0.6116129755973816,
750
- "learning_rate": 1.7474542823312723e-05,
751
- "loss": 0.2193,
752
- "step": 5300
753
- },
754
- {
755
- "epoch": 0.6947149720815479,
756
- "grad_norm": 0.5425911545753479,
757
- "learning_rate": 1.744823049598737e-05,
758
- "loss": 0.2001,
759
- "step": 5350
760
- },
761
- {
762
- "epoch": 0.7012076353720296,
763
- "grad_norm": 0.6464748382568359,
764
- "learning_rate": 1.742191816866202e-05,
765
- "loss": 0.1963,
766
- "step": 5400
767
- },
768
- {
769
- "epoch": 0.7077002986625114,
770
- "grad_norm": 0.8812252879142761,
771
- "learning_rate": 1.7395605841336668e-05,
772
- "loss": 0.196,
773
- "step": 5450
774
- },
775
- {
776
- "epoch": 0.7141929619529931,
777
- "grad_norm": 0.6928241848945618,
778
- "learning_rate": 1.7369293514011316e-05,
779
- "loss": 0.2011,
780
- "step": 5500
781
- },
782
- {
783
- "epoch": 0.7206856252434749,
784
- "grad_norm": 0.6892450451850891,
785
- "learning_rate": 1.7342981186685965e-05,
786
- "loss": 0.1949,
787
- "step": 5550
788
- },
789
- {
790
- "epoch": 0.7271782885339566,
791
- "grad_norm": 0.4782065749168396,
792
- "learning_rate": 1.7316668859360613e-05,
793
- "loss": 0.194,
794
- "step": 5600
795
- },
796
- {
797
- "epoch": 0.7336709518244384,
798
- "grad_norm": 0.6438505053520203,
799
- "learning_rate": 1.729035653203526e-05,
800
- "loss": 0.1967,
801
- "step": 5650
802
- },
803
- {
804
- "epoch": 0.7401636151149201,
805
- "grad_norm": 0.5797818899154663,
806
- "learning_rate": 1.726404420470991e-05,
807
- "loss": 0.2185,
808
- "step": 5700
809
- },
810
- {
811
- "epoch": 0.7466562784054019,
812
- "grad_norm": 0.6884586811065674,
813
- "learning_rate": 1.7237731877384554e-05,
814
- "loss": 0.1977,
815
- "step": 5750
816
- },
817
- {
818
- "epoch": 0.7531489416958836,
819
- "grad_norm": 0.648883581161499,
820
- "learning_rate": 1.7211419550059206e-05,
821
- "loss": 0.1964,
822
- "step": 5800
823
- },
824
- {
825
- "epoch": 0.7596416049863655,
826
- "grad_norm": 0.6440086960792542,
827
- "learning_rate": 1.718510722273385e-05,
828
- "loss": 0.2014,
829
- "step": 5850
830
- },
831
- {
832
- "epoch": 0.7661342682768472,
833
- "grad_norm": 0.5619300007820129,
834
- "learning_rate": 1.71587948954085e-05,
835
- "loss": 0.1909,
836
- "step": 5900
837
- },
838
- {
839
- "epoch": 0.7726269315673289,
840
- "grad_norm": 0.6859204769134521,
841
- "learning_rate": 1.7132482568083147e-05,
842
- "loss": 0.2049,
843
- "step": 5950
844
- },
845
- {
846
- "epoch": 0.7791195948578107,
847
- "grad_norm": 0.6132592558860779,
848
- "learning_rate": 1.7106170240757796e-05,
849
- "loss": 0.2,
850
- "step": 6000
851
- },
852
- {
853
- "epoch": 0.7856122581482924,
854
- "grad_norm": 0.7050901055335999,
855
- "learning_rate": 1.7079857913432444e-05,
856
- "loss": 0.189,
857
- "step": 6050
858
- },
859
- {
860
- "epoch": 0.7921049214387742,
861
- "grad_norm": 0.6752614974975586,
862
- "learning_rate": 1.7053545586107092e-05,
863
- "loss": 0.2248,
864
- "step": 6100
865
- },
866
- {
867
- "epoch": 0.7985975847292559,
868
- "grad_norm": 0.7186923623085022,
869
- "learning_rate": 1.702723325878174e-05,
870
- "loss": 0.1903,
871
- "step": 6150
872
- },
873
- {
874
- "epoch": 0.8050902480197377,
875
- "grad_norm": 0.5991400480270386,
876
- "learning_rate": 1.700092093145639e-05,
877
- "loss": 0.197,
878
- "step": 6200
879
- },
880
- {
881
- "epoch": 0.8115829113102194,
882
- "grad_norm": 0.9522245526313782,
883
- "learning_rate": 1.6974608604131037e-05,
884
- "loss": 0.1962,
885
- "step": 6250
886
- },
887
- {
888
- "epoch": 0.8180755746007012,
889
- "grad_norm": 0.8645381927490234,
890
- "learning_rate": 1.6948296276805686e-05,
891
- "loss": 0.1901,
892
- "step": 6300
893
- },
894
- {
895
- "epoch": 0.8245682378911829,
896
- "grad_norm": 0.5243034958839417,
897
- "learning_rate": 1.6921983949480334e-05,
898
- "loss": 0.194,
899
- "step": 6350
900
- },
901
- {
902
- "epoch": 0.8310609011816648,
903
- "grad_norm": 0.5842151641845703,
904
- "learning_rate": 1.6895671622154982e-05,
905
- "loss": 0.1917,
906
- "step": 6400
907
- },
908
- {
909
- "epoch": 0.8375535644721465,
910
- "grad_norm": 0.6111485362052917,
911
- "learning_rate": 1.6869359294829627e-05,
912
- "loss": 0.2042,
913
- "step": 6450
914
- },
915
- {
916
- "epoch": 0.8440462277626283,
917
- "grad_norm": 0.6515288949012756,
918
- "learning_rate": 1.684304696750428e-05,
919
- "loss": 0.2032,
920
- "step": 6500
921
- },
922
- {
923
- "epoch": 0.85053889105311,
924
- "grad_norm": 0.7596396207809448,
925
- "learning_rate": 1.6816734640178924e-05,
926
- "loss": 0.193,
927
- "step": 6550
928
- },
929
- {
930
- "epoch": 0.8570315543435918,
931
- "grad_norm": 0.6944254636764526,
932
- "learning_rate": 1.6790422312853575e-05,
933
- "loss": 0.1899,
934
- "step": 6600
935
- },
936
- {
937
- "epoch": 0.8635242176340735,
938
- "grad_norm": 0.6190508604049683,
939
- "learning_rate": 1.676410998552822e-05,
940
- "loss": 0.1987,
941
- "step": 6650
942
- },
943
- {
944
- "epoch": 0.8700168809245552,
945
- "grad_norm": 1.1515477895736694,
946
- "learning_rate": 1.673779765820287e-05,
947
- "loss": 0.196,
948
- "step": 6700
949
- },
950
- {
951
- "epoch": 0.876509544215037,
952
- "grad_norm": 0.5803254842758179,
953
- "learning_rate": 1.6711485330877517e-05,
954
- "loss": 0.1923,
955
- "step": 6750
956
- },
957
- {
958
- "epoch": 0.8830022075055187,
959
- "grad_norm": 0.8052871227264404,
960
- "learning_rate": 1.6685173003552165e-05,
961
- "loss": 0.1894,
962
- "step": 6800
963
- },
964
- {
965
- "epoch": 0.8894948707960005,
966
- "grad_norm": 0.9313941597938538,
967
- "learning_rate": 1.6658860676226813e-05,
968
- "loss": 0.1889,
969
- "step": 6850
970
- },
971
- {
972
- "epoch": 0.8959875340864822,
973
- "grad_norm": 0.5186671614646912,
974
- "learning_rate": 1.663254834890146e-05,
975
- "loss": 0.1895,
976
- "step": 6900
977
- },
978
- {
979
- "epoch": 0.902480197376964,
980
- "grad_norm": 0.7533177137374878,
981
- "learning_rate": 1.660623602157611e-05,
982
- "loss": 0.1956,
983
- "step": 6950
984
- },
985
- {
986
- "epoch": 0.9089728606674458,
987
- "grad_norm": 0.7142027020454407,
988
- "learning_rate": 1.6579923694250758e-05,
989
- "loss": 0.1921,
990
- "step": 7000
991
- },
992
- {
993
- "epoch": 0.9154655239579276,
994
- "grad_norm": 1.0748203992843628,
995
- "learning_rate": 1.6553611366925407e-05,
996
- "loss": 0.1847,
997
- "step": 7050
998
- },
999
- {
1000
- "epoch": 0.9219581872484093,
1001
- "grad_norm": 0.5605922341346741,
1002
- "learning_rate": 1.6527299039600055e-05,
1003
- "loss": 0.1887,
1004
- "step": 7100
1005
- },
1006
- {
1007
- "epoch": 0.9284508505388911,
1008
- "grad_norm": 0.5474116802215576,
1009
- "learning_rate": 1.6500986712274703e-05,
1010
- "loss": 0.1948,
1011
- "step": 7150
1012
- },
1013
- {
1014
- "epoch": 0.9349435138293728,
1015
- "grad_norm": 0.9507768154144287,
1016
- "learning_rate": 1.6474674384949348e-05,
1017
- "loss": 0.2126,
1018
- "step": 7200
1019
- },
1020
- {
1021
- "epoch": 0.9414361771198546,
1022
- "grad_norm": 0.694097101688385,
1023
- "learning_rate": 1.6448362057624e-05,
1024
- "loss": 0.1904,
1025
- "step": 7250
1026
- },
1027
- {
1028
- "epoch": 0.9479288404103363,
1029
- "grad_norm": 0.6153343915939331,
1030
- "learning_rate": 1.6422049730298645e-05,
1031
- "loss": 0.1921,
1032
- "step": 7300
1033
- }
1034
- ],
1035
- "logging_steps": 50,
1036
- "max_steps": 38505,
1037
- "num_input_tokens_seen": 0,
1038
- "num_train_epochs": 5,
1039
- "save_steps": 386,
1040
- "stateful_callbacks": {
1041
- "TrainerControl": {
1042
- "args": {
1043
- "should_epoch_stop": false,
1044
- "should_evaluate": false,
1045
- "should_log": false,
1046
- "should_save": true,
1047
- "should_training_stop": false
1048
- },
1049
- "attributes": {}
1050
- }
1051
- },
1052
- "total_flos": 6.133152775887913e+17,
1053
- "train_batch_size": 40,
1054
- "trial_name": null,
1055
- "trial_params": null
1056
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json DELETED
@@ -1,69 +0,0 @@
1
- {
2
- "architectures": [
3
- "DebertaV2ForTokenClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "hidden_act": "gelu",
7
- "hidden_dropout_prob": 0.1,
8
- "hidden_size": 768,
9
- "id2label": {
10
- "0": "O",
11
- "1": "B-CHA",
12
- "2": "I-CHA",
13
- "3": "B-LOC",
14
- "4": "I-LOC",
15
- "5": "B-FAC",
16
- "6": "I-FAC",
17
- "7": "B-OBJ",
18
- "8": "I-OBJ",
19
- "9": "B-EVT",
20
- "10": "I-EVT",
21
- "11": "B-ORG",
22
- "12": "I-ORG",
23
- "13": "B-MISC",
24
- "14": "I-MISC"
25
- },
26
- "initializer_range": 0.02,
27
- "intermediate_size": 3072,
28
- "label2id": {
29
- "B-CHA": 1,
30
- "B-EVT": 9,
31
- "B-FAC": 5,
32
- "B-LOC": 3,
33
- "B-MISC": 13,
34
- "B-OBJ": 7,
35
- "B-ORG": 11,
36
- "I-CHA": 2,
37
- "I-EVT": 10,
38
- "I-FAC": 6,
39
- "I-LOC": 4,
40
- "I-MISC": 14,
41
- "I-OBJ": 8,
42
- "I-ORG": 12,
43
- "O": 0
44
- },
45
- "layer_norm_eps": 1e-07,
46
- "legacy": true,
47
- "max_position_embeddings": 512,
48
- "max_relative_positions": -1,
49
- "model_type": "deberta-v2",
50
- "norm_rel_ebd": "layer_norm",
51
- "num_attention_heads": 12,
52
- "num_hidden_layers": 12,
53
- "pad_token_id": 0,
54
- "pooler_dropout": 0,
55
- "pooler_hidden_act": "gelu",
56
- "pooler_hidden_size": 768,
57
- "pos_att_type": [
58
- "p2c",
59
- "c2p"
60
- ],
61
- "position_biased_input": false,
62
- "position_buckets": 256,
63
- "relative_attention": true,
64
- "share_att_key": true,
65
- "torch_dtype": "float32",
66
- "transformers_version": "4.55.4",
67
- "type_vocab_size": 0,
68
- "vocab_size": 128100
69
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "[CLS]",
3
- "cls_token": "[CLS]",
4
- "eos_token": "[SEP]",
5
- "mask_token": "[MASK]",
6
- "pad_token": "[PAD]",
7
- "sep_token": "[SEP]",
8
- "unk_token": {
9
- "content": "[UNK]",
10
- "lstrip": false,
11
- "normalized": true,
12
- "rstrip": false,
13
- "single_word": false
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "[CLS]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "[SEP]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "[UNK]",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "128000": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "bos_token": "[CLS]",
45
- "clean_up_tokenization_spaces": false,
46
- "cls_token": "[CLS]",
47
- "do_lower_case": false,
48
- "eos_token": "[SEP]",
49
- "extra_special_tokens": {},
50
- "mask_token": "[MASK]",
51
- "model_max_length": 1000000000000000019884624838656,
52
- "pad_token": "[PAD]",
53
- "sep_token": "[SEP]",
54
- "sp_model_kwargs": {},
55
- "split_by_punct": false,
56
- "tokenizer_class": "DebertaV2Tokenizer",
57
- "unk_token": "[UNK]",
58
- "vocab_type": "spm"
59
- }