celalkartoglu commited on
Commit
c8b8422
·
verified ·
1 Parent(s): 996a9cf

Add multi-head XLM-R model (intent+NER) for TR

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # XLM-RoBERTa Multi-Head (TR) — Intent + NER
2
+
3
+ - Base: `xlm-roberta-base`
4
+ - Tasks: Intent classification (60 sınıf), NER (BIO)
5
+ - Dosyalar: `pytorch_model.bin`, `config.json`, `tokenizer.*`, `label_schemes.json`, `modeling_xlmr_multihead.py`
6
+
7
+ ## Kullanım
8
+
9
+ ```python
10
+ from transformers import AutoTokenizer
11
+ from modeling_xlmr_multihead import XLMRMultiHead
12
+ import torch, json
13
+
14
+ ckpt = "celalkartoglu/xlmr-multihead-tr"
15
+
16
+ tok = AutoTokenizer.from_pretrained(ckpt)
17
+ model = XLMRMultiHead(n_intent=60, n_ner=111)
18
+ model.load_state_dict(torch.load("pytorch_model.bin", map_location="cpu"))
19
+ model.eval()
checkpoint-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cebda0fd8fb43cdd7bfaed3ed802d2d765fa5a6e277ca051d6f4edd4c2acf95
3
+ size 1112724220
checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3d0c750103a8c782395706d4b04be165e37b41ca2b102a5a41c949cec4dfc6e
3
+ size 2220847627
checkpoint-2500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f132dbe4b36ef2ef5948850ab1d51b605a7a6fe61572f71d627f2cd76c02944
3
+ size 14645
checkpoint-2500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:756748015f02714454fc8b7c1609af89637c927d42c77e08d04e18d38ef0be30
3
+ size 1383
checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a739733499f8f4ac73e337930f1fa842e39bf811ec9e25a9c7583f40a9e61a1
3
+ size 1465
checkpoint-2500/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
checkpoint-2500/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
checkpoint-2500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0091a328b3441d754e481db5a390d7f3b8dabc6016869fd13ba350d23ddc4cd
3
+ size 17082832
checkpoint-2500/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 512,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2500,
3
+ "best_metric": 0.7024300779458964,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/NLP/Multi-Task/data/massive_tr/xlmr-multihead/checkpoint-2500",
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1,
14
+ "grad_norm": 31.571680068969727,
15
+ "learning_rate": 8.166666666666666e-06,
16
+ "loss": 9.5398,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.2,
21
+ "grad_norm": 20.519420623779297,
22
+ "learning_rate": 1.65e-05,
23
+ "loss": 6.6539,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.3,
28
+ "grad_norm": 14.259832382202148,
29
+ "learning_rate": 2.483333333333333e-05,
30
+ "loss": 6.1233,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.4,
35
+ "grad_norm": 28.014291763305664,
36
+ "learning_rate": 2.979787234042553e-05,
37
+ "loss": 5.6725,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.5,
42
+ "grad_norm": 28.722820281982422,
43
+ "learning_rate": 2.926595744680851e-05,
44
+ "loss": 5.3297,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.6,
49
+ "grad_norm": 38.051170349121094,
50
+ "learning_rate": 2.873404255319149e-05,
51
+ "loss": 4.5911,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.7,
56
+ "grad_norm": 26.39080810546875,
57
+ "learning_rate": 2.820212765957447e-05,
58
+ "loss": 4.017,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.8,
63
+ "grad_norm": 18.74272918701172,
64
+ "learning_rate": 2.7670212765957448e-05,
65
+ "loss": 3.6629,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.9,
70
+ "grad_norm": 18.32183074951172,
71
+ "learning_rate": 2.7138297872340427e-05,
72
+ "loss": 3.3781,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 1.0,
77
+ "grad_norm": 29.03256607055664,
78
+ "learning_rate": 2.6606382978723407e-05,
79
+ "loss": 2.9562,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_intent_accuracy": 0.704,
85
+ "eval_intent_macro_f1": 0.48933652636220204,
86
+ "eval_joint_score": 0.5881757105943153,
87
+ "eval_loss": 2.396956443786621,
88
+ "eval_ner_f1": 0.4723514211886305,
89
+ "eval_ner_precision": 0.5163841807909605,
90
+ "eval_ner_recall": 0.43523809523809526,
91
+ "eval_runtime": 0.5889,
92
+ "eval_samples_per_second": 1698.154,
93
+ "eval_steps_per_second": 54.341,
94
+ "step": 500
95
+ },
96
+ {
97
+ "epoch": 1.1,
98
+ "grad_norm": 40.97825241088867,
99
+ "learning_rate": 2.6074468085106382e-05,
100
+ "loss": 2.701,
101
+ "step": 550
102
+ },
103
+ {
104
+ "epoch": 1.2,
105
+ "grad_norm": 23.553442001342773,
106
+ "learning_rate": 2.554255319148936e-05,
107
+ "loss": 2.4746,
108
+ "step": 600
109
+ },
110
+ {
111
+ "epoch": 1.3,
112
+ "grad_norm": 26.10988426208496,
113
+ "learning_rate": 2.501063829787234e-05,
114
+ "loss": 2.3423,
115
+ "step": 650
116
+ },
117
+ {
118
+ "epoch": 1.4,
119
+ "grad_norm": 19.3520565032959,
120
+ "learning_rate": 2.447872340425532e-05,
121
+ "loss": 2.1157,
122
+ "step": 700
123
+ },
124
+ {
125
+ "epoch": 1.5,
126
+ "grad_norm": 34.30548858642578,
127
+ "learning_rate": 2.3946808510638296e-05,
128
+ "loss": 1.9679,
129
+ "step": 750
130
+ },
131
+ {
132
+ "epoch": 1.6,
133
+ "grad_norm": 15.152769088745117,
134
+ "learning_rate": 2.341489361702128e-05,
135
+ "loss": 1.9404,
136
+ "step": 800
137
+ },
138
+ {
139
+ "epoch": 1.7,
140
+ "grad_norm": 25.86590003967285,
141
+ "learning_rate": 2.2882978723404258e-05,
142
+ "loss": 1.97,
143
+ "step": 850
144
+ },
145
+ {
146
+ "epoch": 1.8,
147
+ "grad_norm": 18.924379348754883,
148
+ "learning_rate": 2.2351063829787237e-05,
149
+ "loss": 1.8073,
150
+ "step": 900
151
+ },
152
+ {
153
+ "epoch": 1.9,
154
+ "grad_norm": 20.5709171295166,
155
+ "learning_rate": 2.1819148936170213e-05,
156
+ "loss": 1.7987,
157
+ "step": 950
158
+ },
159
+ {
160
+ "epoch": 2.0,
161
+ "grad_norm": 31.26434898376465,
162
+ "learning_rate": 2.1287234042553192e-05,
163
+ "loss": 1.7652,
164
+ "step": 1000
165
+ },
166
+ {
167
+ "epoch": 2.0,
168
+ "eval_intent_accuracy": 0.833,
169
+ "eval_intent_macro_f1": 0.7032350726349212,
170
+ "eval_joint_score": 0.7102992125984251,
171
+ "eval_loss": 1.4518628120422363,
172
+ "eval_ner_f1": 0.5875984251968503,
173
+ "eval_ner_precision": 0.6079429735234216,
174
+ "eval_ner_recall": 0.5685714285714286,
175
+ "eval_runtime": 0.595,
176
+ "eval_samples_per_second": 1680.634,
177
+ "eval_steps_per_second": 53.78,
178
+ "step": 1000
179
+ },
180
+ {
181
+ "epoch": 2.1,
182
+ "grad_norm": 35.977622985839844,
183
+ "learning_rate": 2.075531914893617e-05,
184
+ "loss": 1.4852,
185
+ "step": 1050
186
+ },
187
+ {
188
+ "epoch": 2.2,
189
+ "grad_norm": 30.942049026489258,
190
+ "learning_rate": 2.0223404255319147e-05,
191
+ "loss": 1.3909,
192
+ "step": 1100
193
+ },
194
+ {
195
+ "epoch": 2.3,
196
+ "grad_norm": 29.61802101135254,
197
+ "learning_rate": 1.9691489361702126e-05,
198
+ "loss": 1.4372,
199
+ "step": 1150
200
+ },
201
+ {
202
+ "epoch": 2.4,
203
+ "grad_norm": 17.332490921020508,
204
+ "learning_rate": 1.915957446808511e-05,
205
+ "loss": 1.4184,
206
+ "step": 1200
207
+ },
208
+ {
209
+ "epoch": 2.5,
210
+ "grad_norm": 9.733820915222168,
211
+ "learning_rate": 1.8627659574468088e-05,
212
+ "loss": 1.3071,
213
+ "step": 1250
214
+ },
215
+ {
216
+ "epoch": 2.6,
217
+ "grad_norm": 22.356639862060547,
218
+ "learning_rate": 1.8095744680851064e-05,
219
+ "loss": 1.3381,
220
+ "step": 1300
221
+ },
222
+ {
223
+ "epoch": 2.7,
224
+ "grad_norm": 26.953872680664062,
225
+ "learning_rate": 1.7563829787234043e-05,
226
+ "loss": 1.347,
227
+ "step": 1350
228
+ },
229
+ {
230
+ "epoch": 2.8,
231
+ "grad_norm": 39.545013427734375,
232
+ "learning_rate": 1.7031914893617022e-05,
233
+ "loss": 1.2399,
234
+ "step": 1400
235
+ },
236
+ {
237
+ "epoch": 2.9,
238
+ "grad_norm": 29.881067276000977,
239
+ "learning_rate": 1.65e-05,
240
+ "loss": 1.2018,
241
+ "step": 1450
242
+ },
243
+ {
244
+ "epoch": 3.0,
245
+ "grad_norm": 34.384517669677734,
246
+ "learning_rate": 1.5968085106382977e-05,
247
+ "loss": 1.2898,
248
+ "step": 1500
249
+ },
250
+ {
251
+ "epoch": 3.0,
252
+ "eval_intent_accuracy": 0.843,
253
+ "eval_intent_macro_f1": 0.7342243232665543,
254
+ "eval_joint_score": 0.7419320432043204,
255
+ "eval_loss": 1.2403136491775513,
256
+ "eval_ner_f1": 0.6408640864086408,
257
+ "eval_ner_precision": 0.6075085324232082,
258
+ "eval_ner_recall": 0.6780952380952381,
259
+ "eval_runtime": 0.5931,
260
+ "eval_samples_per_second": 1686.094,
261
+ "eval_steps_per_second": 53.955,
262
+ "step": 1500
263
+ },
264
+ {
265
+ "epoch": 3.1,
266
+ "grad_norm": 40.15689468383789,
267
+ "learning_rate": 1.5436170212765956e-05,
268
+ "loss": 1.0928,
269
+ "step": 1550
270
+ },
271
+ {
272
+ "epoch": 3.2,
273
+ "grad_norm": 14.882131576538086,
274
+ "learning_rate": 1.4904255319148937e-05,
275
+ "loss": 0.971,
276
+ "step": 1600
277
+ },
278
+ {
279
+ "epoch": 3.3,
280
+ "grad_norm": 16.319412231445312,
281
+ "learning_rate": 1.4372340425531915e-05,
282
+ "loss": 1.1631,
283
+ "step": 1650
284
+ },
285
+ {
286
+ "epoch": 3.4,
287
+ "grad_norm": 26.943748474121094,
288
+ "learning_rate": 1.3840425531914896e-05,
289
+ "loss": 1.0528,
290
+ "step": 1700
291
+ },
292
+ {
293
+ "epoch": 3.5,
294
+ "grad_norm": 19.07010841369629,
295
+ "learning_rate": 1.3308510638297873e-05,
296
+ "loss": 1.0475,
297
+ "step": 1750
298
+ },
299
+ {
300
+ "epoch": 3.6,
301
+ "grad_norm": 16.704652786254883,
302
+ "learning_rate": 1.277659574468085e-05,
303
+ "loss": 1.0072,
304
+ "step": 1800
305
+ },
306
+ {
307
+ "epoch": 3.7,
308
+ "grad_norm": 20.118215560913086,
309
+ "learning_rate": 1.224468085106383e-05,
310
+ "loss": 1.0892,
311
+ "step": 1850
312
+ },
313
+ {
314
+ "epoch": 3.8,
315
+ "grad_norm": 25.932292938232422,
316
+ "learning_rate": 1.1712765957446809e-05,
317
+ "loss": 0.8766,
318
+ "step": 1900
319
+ },
320
+ {
321
+ "epoch": 3.9,
322
+ "grad_norm": 20.448410034179688,
323
+ "learning_rate": 1.1180851063829788e-05,
324
+ "loss": 0.8948,
325
+ "step": 1950
326
+ },
327
+ {
328
+ "epoch": 4.0,
329
+ "grad_norm": 13.030745506286621,
330
+ "learning_rate": 1.0648936170212766e-05,
331
+ "loss": 0.9611,
332
+ "step": 2000
333
+ },
334
+ {
335
+ "epoch": 4.0,
336
+ "eval_intent_accuracy": 0.865,
337
+ "eval_intent_macro_f1": 0.7845036463088115,
338
+ "eval_joint_score": 0.7702941855099215,
339
+ "eval_loss": 1.1503586769104004,
340
+ "eval_ner_f1": 0.675588371019843,
341
+ "eval_ner_precision": 0.6553267681289168,
342
+ "eval_ner_recall": 0.6971428571428572,
343
+ "eval_runtime": 0.5885,
344
+ "eval_samples_per_second": 1699.13,
345
+ "eval_steps_per_second": 54.372,
346
+ "step": 2000
347
+ },
348
+ {
349
+ "epoch": 4.1,
350
+ "grad_norm": 7.756314754486084,
351
+ "learning_rate": 1.0117021276595745e-05,
352
+ "loss": 0.7831,
353
+ "step": 2050
354
+ },
355
+ {
356
+ "epoch": 4.2,
357
+ "grad_norm": 15.828548431396484,
358
+ "learning_rate": 9.585106382978724e-06,
359
+ "loss": 0.7917,
360
+ "step": 2100
361
+ },
362
+ {
363
+ "epoch": 4.3,
364
+ "grad_norm": 17.722307205200195,
365
+ "learning_rate": 9.053191489361703e-06,
366
+ "loss": 0.7958,
367
+ "step": 2150
368
+ },
369
+ {
370
+ "epoch": 4.4,
371
+ "grad_norm": 31.272441864013672,
372
+ "learning_rate": 8.52127659574468e-06,
373
+ "loss": 0.7537,
374
+ "step": 2200
375
+ },
376
+ {
377
+ "epoch": 4.5,
378
+ "grad_norm": 37.739131927490234,
379
+ "learning_rate": 7.98936170212766e-06,
380
+ "loss": 0.8002,
381
+ "step": 2250
382
+ },
383
+ {
384
+ "epoch": 4.6,
385
+ "grad_norm": 29.077699661254883,
386
+ "learning_rate": 7.457446808510638e-06,
387
+ "loss": 0.8878,
388
+ "step": 2300
389
+ },
390
+ {
391
+ "epoch": 4.7,
392
+ "grad_norm": 43.24278259277344,
393
+ "learning_rate": 6.925531914893618e-06,
394
+ "loss": 0.8453,
395
+ "step": 2350
396
+ },
397
+ {
398
+ "epoch": 4.8,
399
+ "grad_norm": 94.07080841064453,
400
+ "learning_rate": 6.393617021276596e-06,
401
+ "loss": 0.7931,
402
+ "step": 2400
403
+ },
404
+ {
405
+ "epoch": 4.9,
406
+ "grad_norm": 20.592226028442383,
407
+ "learning_rate": 5.861702127659575e-06,
408
+ "loss": 0.7832,
409
+ "step": 2450
410
+ },
411
+ {
412
+ "epoch": 5.0,
413
+ "grad_norm": 36.34307098388672,
414
+ "learning_rate": 5.3297872340425535e-06,
415
+ "loss": 0.8036,
416
+ "step": 2500
417
+ },
418
+ {
419
+ "epoch": 5.0,
420
+ "eval_intent_accuracy": 0.868,
421
+ "eval_intent_macro_f1": 0.8051212440624268,
422
+ "eval_joint_score": 0.7852150389729482,
423
+ "eval_loss": 1.0913232564926147,
424
+ "eval_ner_f1": 0.7024300779458964,
425
+ "eval_ner_precision": 0.6772767462422635,
426
+ "eval_ner_recall": 0.7295238095238096,
427
+ "eval_runtime": 0.5824,
428
+ "eval_samples_per_second": 1717.074,
429
+ "eval_steps_per_second": 54.946,
430
+ "step": 2500
431
+ }
432
+ ],
433
+ "logging_steps": 50,
434
+ "max_steps": 3000,
435
+ "num_input_tokens_seen": 0,
436
+ "num_train_epochs": 6,
437
+ "save_steps": 500,
438
+ "stateful_callbacks": {
439
+ "EarlyStoppingCallback": {
440
+ "args": {
441
+ "early_stopping_patience": 2,
442
+ "early_stopping_threshold": 0.0005
443
+ },
444
+ "attributes": {
445
+ "early_stopping_patience_counter": 0
446
+ }
447
+ },
448
+ "TrainerControl": {
449
+ "args": {
450
+ "should_epoch_stop": false,
451
+ "should_evaluate": false,
452
+ "should_log": false,
453
+ "should_save": true,
454
+ "should_training_stop": false
455
+ },
456
+ "attributes": {}
457
+ }
458
+ },
459
+ "total_flos": 0.0,
460
+ "train_batch_size": 16,
461
+ "trial_name": null,
462
+ "trial_params": null
463
+ }
checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f4f62880a212beb81193a9861931b6524a6e34b8db381dd37db521a5f5c4365
3
+ size 5905
checkpoint-3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a8f9aa6c167366934527385354bfc97c0141c5aa07ebd9840679f44af003e05
3
+ size 1112724220
checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab521bdc935ebe8e3adbfe6efdc13f07767aaae1d887c688f07fc2ffa9bdaee1
3
+ size 2220847627
checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7fb16fe58d7a2babe8b210fc16add386e796fcbfaa7b1b7c8e812b5b7f5ba51
3
+ size 14645
checkpoint-3000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb991a0a0a68e475b29937e715b9a48479241229b17b1bcd57cfc33b936aaa3b
3
+ size 1383
checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b4ddfbbe4d25d106204e7026b316dfbe9c8fb4352b2ed1fb5bbb4c4276142d1
3
+ size 1465
checkpoint-3000/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
checkpoint-3000/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
checkpoint-3000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0091a328b3441d754e481db5a390d7f3b8dabc6016869fd13ba350d23ddc4cd
3
+ size 17082832
checkpoint-3000/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 512,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 3000,
3
+ "best_metric": 0.7091743119266055,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/NLP/Multi-Task/data/massive_tr/xlmr-multihead/checkpoint-3000",
5
+ "epoch": 6.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1,
14
+ "grad_norm": 31.571680068969727,
15
+ "learning_rate": 8.166666666666666e-06,
16
+ "loss": 9.5398,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.2,
21
+ "grad_norm": 20.519420623779297,
22
+ "learning_rate": 1.65e-05,
23
+ "loss": 6.6539,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.3,
28
+ "grad_norm": 14.259832382202148,
29
+ "learning_rate": 2.483333333333333e-05,
30
+ "loss": 6.1233,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.4,
35
+ "grad_norm": 28.014291763305664,
36
+ "learning_rate": 2.979787234042553e-05,
37
+ "loss": 5.6725,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.5,
42
+ "grad_norm": 28.722820281982422,
43
+ "learning_rate": 2.926595744680851e-05,
44
+ "loss": 5.3297,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.6,
49
+ "grad_norm": 38.051170349121094,
50
+ "learning_rate": 2.873404255319149e-05,
51
+ "loss": 4.5911,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.7,
56
+ "grad_norm": 26.39080810546875,
57
+ "learning_rate": 2.820212765957447e-05,
58
+ "loss": 4.017,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.8,
63
+ "grad_norm": 18.74272918701172,
64
+ "learning_rate": 2.7670212765957448e-05,
65
+ "loss": 3.6629,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.9,
70
+ "grad_norm": 18.32183074951172,
71
+ "learning_rate": 2.7138297872340427e-05,
72
+ "loss": 3.3781,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 1.0,
77
+ "grad_norm": 29.03256607055664,
78
+ "learning_rate": 2.6606382978723407e-05,
79
+ "loss": 2.9562,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_intent_accuracy": 0.704,
85
+ "eval_intent_macro_f1": 0.48933652636220204,
86
+ "eval_joint_score": 0.5881757105943153,
87
+ "eval_loss": 2.396956443786621,
88
+ "eval_ner_f1": 0.4723514211886305,
89
+ "eval_ner_precision": 0.5163841807909605,
90
+ "eval_ner_recall": 0.43523809523809526,
91
+ "eval_runtime": 0.5889,
92
+ "eval_samples_per_second": 1698.154,
93
+ "eval_steps_per_second": 54.341,
94
+ "step": 500
95
+ },
96
+ {
97
+ "epoch": 1.1,
98
+ "grad_norm": 40.97825241088867,
99
+ "learning_rate": 2.6074468085106382e-05,
100
+ "loss": 2.701,
101
+ "step": 550
102
+ },
103
+ {
104
+ "epoch": 1.2,
105
+ "grad_norm": 23.553442001342773,
106
+ "learning_rate": 2.554255319148936e-05,
107
+ "loss": 2.4746,
108
+ "step": 600
109
+ },
110
+ {
111
+ "epoch": 1.3,
112
+ "grad_norm": 26.10988426208496,
113
+ "learning_rate": 2.501063829787234e-05,
114
+ "loss": 2.3423,
115
+ "step": 650
116
+ },
117
+ {
118
+ "epoch": 1.4,
119
+ "grad_norm": 19.3520565032959,
120
+ "learning_rate": 2.447872340425532e-05,
121
+ "loss": 2.1157,
122
+ "step": 700
123
+ },
124
+ {
125
+ "epoch": 1.5,
126
+ "grad_norm": 34.30548858642578,
127
+ "learning_rate": 2.3946808510638296e-05,
128
+ "loss": 1.9679,
129
+ "step": 750
130
+ },
131
+ {
132
+ "epoch": 1.6,
133
+ "grad_norm": 15.152769088745117,
134
+ "learning_rate": 2.341489361702128e-05,
135
+ "loss": 1.9404,
136
+ "step": 800
137
+ },
138
+ {
139
+ "epoch": 1.7,
140
+ "grad_norm": 25.86590003967285,
141
+ "learning_rate": 2.2882978723404258e-05,
142
+ "loss": 1.97,
143
+ "step": 850
144
+ },
145
+ {
146
+ "epoch": 1.8,
147
+ "grad_norm": 18.924379348754883,
148
+ "learning_rate": 2.2351063829787237e-05,
149
+ "loss": 1.8073,
150
+ "step": 900
151
+ },
152
+ {
153
+ "epoch": 1.9,
154
+ "grad_norm": 20.5709171295166,
155
+ "learning_rate": 2.1819148936170213e-05,
156
+ "loss": 1.7987,
157
+ "step": 950
158
+ },
159
+ {
160
+ "epoch": 2.0,
161
+ "grad_norm": 31.26434898376465,
162
+ "learning_rate": 2.1287234042553192e-05,
163
+ "loss": 1.7652,
164
+ "step": 1000
165
+ },
166
+ {
167
+ "epoch": 2.0,
168
+ "eval_intent_accuracy": 0.833,
169
+ "eval_intent_macro_f1": 0.7032350726349212,
170
+ "eval_joint_score": 0.7102992125984251,
171
+ "eval_loss": 1.4518628120422363,
172
+ "eval_ner_f1": 0.5875984251968503,
173
+ "eval_ner_precision": 0.6079429735234216,
174
+ "eval_ner_recall": 0.5685714285714286,
175
+ "eval_runtime": 0.595,
176
+ "eval_samples_per_second": 1680.634,
177
+ "eval_steps_per_second": 53.78,
178
+ "step": 1000
179
+ },
180
+ {
181
+ "epoch": 2.1,
182
+ "grad_norm": 35.977622985839844,
183
+ "learning_rate": 2.075531914893617e-05,
184
+ "loss": 1.4852,
185
+ "step": 1050
186
+ },
187
+ {
188
+ "epoch": 2.2,
189
+ "grad_norm": 30.942049026489258,
190
+ "learning_rate": 2.0223404255319147e-05,
191
+ "loss": 1.3909,
192
+ "step": 1100
193
+ },
194
+ {
195
+ "epoch": 2.3,
196
+ "grad_norm": 29.61802101135254,
197
+ "learning_rate": 1.9691489361702126e-05,
198
+ "loss": 1.4372,
199
+ "step": 1150
200
+ },
201
+ {
202
+ "epoch": 2.4,
203
+ "grad_norm": 17.332490921020508,
204
+ "learning_rate": 1.915957446808511e-05,
205
+ "loss": 1.4184,
206
+ "step": 1200
207
+ },
208
+ {
209
+ "epoch": 2.5,
210
+ "grad_norm": 9.733820915222168,
211
+ "learning_rate": 1.8627659574468088e-05,
212
+ "loss": 1.3071,
213
+ "step": 1250
214
+ },
215
+ {
216
+ "epoch": 2.6,
217
+ "grad_norm": 22.356639862060547,
218
+ "learning_rate": 1.8095744680851064e-05,
219
+ "loss": 1.3381,
220
+ "step": 1300
221
+ },
222
+ {
223
+ "epoch": 2.7,
224
+ "grad_norm": 26.953872680664062,
225
+ "learning_rate": 1.7563829787234043e-05,
226
+ "loss": 1.347,
227
+ "step": 1350
228
+ },
229
+ {
230
+ "epoch": 2.8,
231
+ "grad_norm": 39.545013427734375,
232
+ "learning_rate": 1.7031914893617022e-05,
233
+ "loss": 1.2399,
234
+ "step": 1400
235
+ },
236
+ {
237
+ "epoch": 2.9,
238
+ "grad_norm": 29.881067276000977,
239
+ "learning_rate": 1.65e-05,
240
+ "loss": 1.2018,
241
+ "step": 1450
242
+ },
243
+ {
244
+ "epoch": 3.0,
245
+ "grad_norm": 34.384517669677734,
246
+ "learning_rate": 1.5968085106382977e-05,
247
+ "loss": 1.2898,
248
+ "step": 1500
249
+ },
250
+ {
251
+ "epoch": 3.0,
252
+ "eval_intent_accuracy": 0.843,
253
+ "eval_intent_macro_f1": 0.7342243232665543,
254
+ "eval_joint_score": 0.7419320432043204,
255
+ "eval_loss": 1.2403136491775513,
256
+ "eval_ner_f1": 0.6408640864086408,
257
+ "eval_ner_precision": 0.6075085324232082,
258
+ "eval_ner_recall": 0.6780952380952381,
259
+ "eval_runtime": 0.5931,
260
+ "eval_samples_per_second": 1686.094,
261
+ "eval_steps_per_second": 53.955,
262
+ "step": 1500
263
+ },
264
+ {
265
+ "epoch": 3.1,
266
+ "grad_norm": 40.15689468383789,
267
+ "learning_rate": 1.5436170212765956e-05,
268
+ "loss": 1.0928,
269
+ "step": 1550
270
+ },
271
+ {
272
+ "epoch": 3.2,
273
+ "grad_norm": 14.882131576538086,
274
+ "learning_rate": 1.4904255319148937e-05,
275
+ "loss": 0.971,
276
+ "step": 1600
277
+ },
278
+ {
279
+ "epoch": 3.3,
280
+ "grad_norm": 16.319412231445312,
281
+ "learning_rate": 1.4372340425531915e-05,
282
+ "loss": 1.1631,
283
+ "step": 1650
284
+ },
285
+ {
286
+ "epoch": 3.4,
287
+ "grad_norm": 26.943748474121094,
288
+ "learning_rate": 1.3840425531914896e-05,
289
+ "loss": 1.0528,
290
+ "step": 1700
291
+ },
292
+ {
293
+ "epoch": 3.5,
294
+ "grad_norm": 19.07010841369629,
295
+ "learning_rate": 1.3308510638297873e-05,
296
+ "loss": 1.0475,
297
+ "step": 1750
298
+ },
299
+ {
300
+ "epoch": 3.6,
301
+ "grad_norm": 16.704652786254883,
302
+ "learning_rate": 1.277659574468085e-05,
303
+ "loss": 1.0072,
304
+ "step": 1800
305
+ },
306
+ {
307
+ "epoch": 3.7,
308
+ "grad_norm": 20.118215560913086,
309
+ "learning_rate": 1.224468085106383e-05,
310
+ "loss": 1.0892,
311
+ "step": 1850
312
+ },
313
+ {
314
+ "epoch": 3.8,
315
+ "grad_norm": 25.932292938232422,
316
+ "learning_rate": 1.1712765957446809e-05,
317
+ "loss": 0.8766,
318
+ "step": 1900
319
+ },
320
+ {
321
+ "epoch": 3.9,
322
+ "grad_norm": 20.448410034179688,
323
+ "learning_rate": 1.1180851063829788e-05,
324
+ "loss": 0.8948,
325
+ "step": 1950
326
+ },
327
+ {
328
+ "epoch": 4.0,
329
+ "grad_norm": 13.030745506286621,
330
+ "learning_rate": 1.0648936170212766e-05,
331
+ "loss": 0.9611,
332
+ "step": 2000
333
+ },
334
+ {
335
+ "epoch": 4.0,
336
+ "eval_intent_accuracy": 0.865,
337
+ "eval_intent_macro_f1": 0.7845036463088115,
338
+ "eval_joint_score": 0.7702941855099215,
339
+ "eval_loss": 1.1503586769104004,
340
+ "eval_ner_f1": 0.675588371019843,
341
+ "eval_ner_precision": 0.6553267681289168,
342
+ "eval_ner_recall": 0.6971428571428572,
343
+ "eval_runtime": 0.5885,
344
+ "eval_samples_per_second": 1699.13,
345
+ "eval_steps_per_second": 54.372,
346
+ "step": 2000
347
+ },
348
+ {
349
+ "epoch": 4.1,
350
+ "grad_norm": 7.756314754486084,
351
+ "learning_rate": 1.0117021276595745e-05,
352
+ "loss": 0.7831,
353
+ "step": 2050
354
+ },
355
+ {
356
+ "epoch": 4.2,
357
+ "grad_norm": 15.828548431396484,
358
+ "learning_rate": 9.585106382978724e-06,
359
+ "loss": 0.7917,
360
+ "step": 2100
361
+ },
362
+ {
363
+ "epoch": 4.3,
364
+ "grad_norm": 17.722307205200195,
365
+ "learning_rate": 9.053191489361703e-06,
366
+ "loss": 0.7958,
367
+ "step": 2150
368
+ },
369
+ {
370
+ "epoch": 4.4,
371
+ "grad_norm": 31.272441864013672,
372
+ "learning_rate": 8.52127659574468e-06,
373
+ "loss": 0.7537,
374
+ "step": 2200
375
+ },
376
+ {
377
+ "epoch": 4.5,
378
+ "grad_norm": 37.739131927490234,
379
+ "learning_rate": 7.98936170212766e-06,
380
+ "loss": 0.8002,
381
+ "step": 2250
382
+ },
383
+ {
384
+ "epoch": 4.6,
385
+ "grad_norm": 29.077699661254883,
386
+ "learning_rate": 7.457446808510638e-06,
387
+ "loss": 0.8878,
388
+ "step": 2300
389
+ },
390
+ {
391
+ "epoch": 4.7,
392
+ "grad_norm": 43.24278259277344,
393
+ "learning_rate": 6.925531914893618e-06,
394
+ "loss": 0.8453,
395
+ "step": 2350
396
+ },
397
+ {
398
+ "epoch": 4.8,
399
+ "grad_norm": 94.07080841064453,
400
+ "learning_rate": 6.393617021276596e-06,
401
+ "loss": 0.7931,
402
+ "step": 2400
403
+ },
404
+ {
405
+ "epoch": 4.9,
406
+ "grad_norm": 20.592226028442383,
407
+ "learning_rate": 5.861702127659575e-06,
408
+ "loss": 0.7832,
409
+ "step": 2450
410
+ },
411
+ {
412
+ "epoch": 5.0,
413
+ "grad_norm": 36.34307098388672,
414
+ "learning_rate": 5.3297872340425535e-06,
415
+ "loss": 0.8036,
416
+ "step": 2500
417
+ },
418
+ {
419
+ "epoch": 5.0,
420
+ "eval_intent_accuracy": 0.868,
421
+ "eval_intent_macro_f1": 0.8051212440624268,
422
+ "eval_joint_score": 0.7852150389729482,
423
+ "eval_loss": 1.0913232564926147,
424
+ "eval_ner_f1": 0.7024300779458964,
425
+ "eval_ner_precision": 0.6772767462422635,
426
+ "eval_ner_recall": 0.7295238095238096,
427
+ "eval_runtime": 0.5824,
428
+ "eval_samples_per_second": 1717.074,
429
+ "eval_steps_per_second": 54.946,
430
+ "step": 2500
431
+ },
432
+ {
433
+ "epoch": 5.1,
434
+ "grad_norm": 13.311659812927246,
435
+ "learning_rate": 4.797872340425533e-06,
436
+ "loss": 0.6409,
437
+ "step": 2550
438
+ },
439
+ {
440
+ "epoch": 5.2,
441
+ "grad_norm": 17.807374954223633,
442
+ "learning_rate": 4.265957446808511e-06,
443
+ "loss": 0.7401,
444
+ "step": 2600
445
+ },
446
+ {
447
+ "epoch": 5.3,
448
+ "grad_norm": 8.320006370544434,
449
+ "learning_rate": 3.7340425531914894e-06,
450
+ "loss": 0.668,
451
+ "step": 2650
452
+ },
453
+ {
454
+ "epoch": 5.4,
455
+ "grad_norm": 20.279203414916992,
456
+ "learning_rate": 3.202127659574468e-06,
457
+ "loss": 0.6477,
458
+ "step": 2700
459
+ },
460
+ {
461
+ "epoch": 5.5,
462
+ "grad_norm": 23.9965763092041,
463
+ "learning_rate": 2.670212765957447e-06,
464
+ "loss": 0.7239,
465
+ "step": 2750
466
+ },
467
+ {
468
+ "epoch": 5.6,
469
+ "grad_norm": 38.03826904296875,
470
+ "learning_rate": 2.1382978723404258e-06,
471
+ "loss": 0.6816,
472
+ "step": 2800
473
+ },
474
+ {
475
+ "epoch": 5.7,
476
+ "grad_norm": 17.692941665649414,
477
+ "learning_rate": 1.6063829787234043e-06,
478
+ "loss": 0.6778,
479
+ "step": 2850
480
+ },
481
+ {
482
+ "epoch": 5.8,
483
+ "grad_norm": 33.07294464111328,
484
+ "learning_rate": 1.074468085106383e-06,
485
+ "loss": 0.694,
486
+ "step": 2900
487
+ },
488
+ {
489
+ "epoch": 5.9,
490
+ "grad_norm": 17.965429306030273,
491
+ "learning_rate": 5.425531914893617e-07,
492
+ "loss": 0.6421,
493
+ "step": 2950
494
+ },
495
+ {
496
+ "epoch": 6.0,
497
+ "grad_norm": 15.919111251831055,
498
+ "learning_rate": 1.0638297872340427e-08,
499
+ "loss": 0.6727,
500
+ "step": 3000
501
+ },
502
+ {
503
+ "epoch": 6.0,
504
+ "eval_intent_accuracy": 0.87,
505
+ "eval_intent_macro_f1": 0.8193369098618751,
506
+ "eval_joint_score": 0.7895871559633028,
507
+ "eval_loss": 1.0895804166793823,
508
+ "eval_ner_f1": 0.7091743119266055,
509
+ "eval_ner_precision": 0.684070796460177,
510
+ "eval_ner_recall": 0.7361904761904762,
511
+ "eval_runtime": 0.6107,
512
+ "eval_samples_per_second": 1637.488,
513
+ "eval_steps_per_second": 52.4,
514
+ "step": 3000
515
+ }
516
+ ],
517
+ "logging_steps": 50,
518
+ "max_steps": 3000,
519
+ "num_input_tokens_seen": 0,
520
+ "num_train_epochs": 6,
521
+ "save_steps": 500,
522
+ "stateful_callbacks": {
523
+ "EarlyStoppingCallback": {
524
+ "args": {
525
+ "early_stopping_patience": 2,
526
+ "early_stopping_threshold": 0.0005
527
+ },
528
+ "attributes": {
529
+ "early_stopping_patience_counter": 0
530
+ }
531
+ },
532
+ "TrainerControl": {
533
+ "args": {
534
+ "should_epoch_stop": false,
535
+ "should_evaluate": false,
536
+ "should_log": false,
537
+ "should_save": true,
538
+ "should_training_stop": true
539
+ },
540
+ "attributes": {}
541
+ }
542
+ },
543
+ "total_flos": 0.0,
544
+ "train_batch_size": 16,
545
+ "trial_name": null,
546
+ "trial_params": null
547
+ }
checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f4f62880a212beb81193a9861931b6524a6e34b8db381dd37db521a5f5c4365
3
+ size 5905
label_schemes.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "intents": [
3
+ "intent_0",
4
+ "intent_1",
5
+ "intent_10",
6
+ "intent_11",
7
+ "intent_12",
8
+ "intent_13",
9
+ "intent_14",
10
+ "intent_15",
11
+ "intent_16",
12
+ "intent_17",
13
+ "intent_18",
14
+ "intent_19",
15
+ "intent_2",
16
+ "intent_20",
17
+ "intent_21",
18
+ "intent_22",
19
+ "intent_23",
20
+ "intent_24",
21
+ "intent_25",
22
+ "intent_26",
23
+ "intent_27",
24
+ "intent_28",
25
+ "intent_29",
26
+ "intent_3",
27
+ "intent_30",
28
+ "intent_31",
29
+ "intent_32",
30
+ "intent_33",
31
+ "intent_34",
32
+ "intent_35",
33
+ "intent_36",
34
+ "intent_37",
35
+ "intent_38",
36
+ "intent_39",
37
+ "intent_4",
38
+ "intent_40",
39
+ "intent_41",
40
+ "intent_42",
41
+ "intent_43",
42
+ "intent_44",
43
+ "intent_45",
44
+ "intent_46",
45
+ "intent_47",
46
+ "intent_48",
47
+ "intent_49",
48
+ "intent_5",
49
+ "intent_50",
50
+ "intent_51",
51
+ "intent_52",
52
+ "intent_53",
53
+ "intent_54",
54
+ "intent_55",
55
+ "intent_56",
56
+ "intent_57",
57
+ "intent_58",
58
+ "intent_59",
59
+ "intent_6",
60
+ "intent_7",
61
+ "intent_8",
62
+ "intent_9"
63
+ ],
64
+ "sentiments": [
65
+ "pozitif",
66
+ "negatif",
67
+ "nötr"
68
+ ],
69
+ "dialogue_acts": [
70
+ "soru",
71
+ "istek",
72
+ "teşekkür",
73
+ "selamlama",
74
+ "onay",
75
+ "red",
76
+ "açıklama",
77
+ "cevap",
78
+ "düzeltme"
79
+ ],
80
+ "ner_tags": [
81
+ "O",
82
+ "B-ALARM_TYPE",
83
+ "I-ALARM_TYPE",
84
+ "B-APP_NAME",
85
+ "I-APP_NAME",
86
+ "B-ARTIST_NAME",
87
+ "I-ARTIST_NAME",
88
+ "B-AUDIOBOOK_AUTHOR",
89
+ "I-AUDIOBOOK_AUTHOR",
90
+ "B-AUDIOBOOK_NAME",
91
+ "I-AUDIOBOOK_NAME",
92
+ "B-BUSINESS_NAME",
93
+ "I-BUSINESS_NAME",
94
+ "B-BUSINESS_TYPE",
95
+ "I-BUSINESS_TYPE",
96
+ "B-CHANGE_AMOUNT",
97
+ "I-CHANGE_AMOUNT",
98
+ "B-COFFEE_TYPE",
99
+ "I-COFFEE_TYPE",
100
+ "B-COLOR_TYPE",
101
+ "I-COLOR_TYPE",
102
+ "B-COOKING_TYPE",
103
+ "I-COOKING_TYPE",
104
+ "B-CURRENCY_NAME",
105
+ "I-CURRENCY_NAME",
106
+ "B-DATE",
107
+ "I-DATE",
108
+ "B-DEFINITION_WORD",
109
+ "I-DEFINITION_WORD",
110
+ "B-DEVICE_TYPE",
111
+ "I-DEVICE_TYPE",
112
+ "B-DRINK_TYPE",
113
+ "I-DRINK_TYPE",
114
+ "B-EMAIL_ADDRESS",
115
+ "I-EMAIL_ADDRESS",
116
+ "B-EMAIL_FOLDER",
117
+ "I-EMAIL_FOLDER",
118
+ "B-EVENT_NAME",
119
+ "I-EVENT_NAME",
120
+ "B-FOOD_TYPE",
121
+ "I-FOOD_TYPE",
122
+ "B-GAME_NAME",
123
+ "I-GAME_NAME",
124
+ "B-GAME_TYPE",
125
+ "I-GAME_TYPE",
126
+ "B-GENERAL_FREQUENCY",
127
+ "I-GENERAL_FREQUENCY",
128
+ "B-HOUSE_PLACE",
129
+ "I-HOUSE_PLACE",
130
+ "B-INGREDIENT",
131
+ "I-INGREDIENT",
132
+ "B-JOKE_TYPE",
133
+ "I-JOKE_TYPE",
134
+ "B-LIST_NAME",
135
+ "I-LIST_NAME",
136
+ "B-MEAL_TYPE",
137
+ "I-MEAL_TYPE",
138
+ "B-MEDIA_TYPE",
139
+ "I-MEDIA_TYPE",
140
+ "B-MOVIE_NAME",
141
+ "I-MOVIE_NAME",
142
+ "B-MOVIE_TYPE",
143
+ "I-MOVIE_TYPE",
144
+ "B-MUSIC_ALBUM",
145
+ "I-MUSIC_ALBUM",
146
+ "B-MUSIC_DESCRIPTOR",
147
+ "I-MUSIC_DESCRIPTOR",
148
+ "B-MUSIC_GENRE",
149
+ "I-MUSIC_GENRE",
150
+ "B-NEWS_TOPIC",
151
+ "I-NEWS_TOPIC",
152
+ "B-ORDER_TYPE",
153
+ "I-ORDER_TYPE",
154
+ "B-PERSON",
155
+ "I-PERSON",
156
+ "B-PERSONAL_INFO",
157
+ "I-PERSONAL_INFO",
158
+ "B-PLACE_NAME",
159
+ "I-PLACE_NAME",
160
+ "B-PLAYER_SETTING",
161
+ "I-PLAYER_SETTING",
162
+ "B-PLAYLIST_NAME",
163
+ "I-PLAYLIST_NAME",
164
+ "B-PODCAST_DESCRIPTOR",
165
+ "I-PODCAST_DESCRIPTOR",
166
+ "B-PODCAST_NAME",
167
+ "I-PODCAST_NAME",
168
+ "B-RADIO_NAME",
169
+ "I-RADIO_NAME",
170
+ "B-RELATION",
171
+ "I-RELATION",
172
+ "B-SONG_NAME",
173
+ "I-SONG_NAME",
174
+ "B-SPORT_TYPE",
175
+ "I-SPORT_TYPE",
176
+ "B-TIME",
177
+ "I-TIME",
178
+ "B-TIME_ZONE",
179
+ "I-TIME_ZONE",
180
+ "B-TIMEOFDAY",
181
+ "I-TIMEOFDAY",
182
+ "B-TRANSPORT_AGENCY",
183
+ "I-TRANSPORT_AGENCY",
184
+ "B-TRANSPORT_DESCRIPTOR",
185
+ "I-TRANSPORT_DESCRIPTOR",
186
+ "B-TRANSPORT_NAME",
187
+ "I-TRANSPORT_NAME",
188
+ "B-TRANSPORT_TYPE",
189
+ "I-TRANSPORT_TYPE",
190
+ "B-WEATHER_DESCRIPTOR",
191
+ "I-WEATHER_DESCRIPTOR"
192
+ ]
193
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a8f9aa6c167366934527385354bfc97c0141c5aa07ebd9840679f44af003e05
3
+ size 1112724220
modeling_xlmr_multihead.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
from transformers import AutoModel

# Label id that masks padding / special-token positions out of the NER loss
# (matches the default ignore index used by HF token-classification collators).
IGNORE_IDX = -100


class XLMRMultiHead(nn.Module):
    """XLM-RoBERTa encoder with two task heads sharing one backbone:
    sentence-level intent classification and token-level NER (BIO tagging).

    Args:
        base: Hugging Face model id or local path of the encoder to load.
        n_intent: number of intent classes (output size of the intent head).
        n_ner: number of NER tag classes (output size of the token head).
        dropout: dropout probability applied before both heads.
        w_intent: weight of the intent loss in the joint objective (default 1.0,
            matching the original hard-coded value).
        w_ner: weight of the NER loss in the joint objective (default 0.8,
            matching the original hard-coded value).

    Note: ``w_intent``/``w_ner`` are plain floats, not parameters/buffers, so
    checkpoints saved by the original class still load via ``load_state_dict``.
    """

    def __init__(self, base="xlm-roberta-base", n_intent=0, n_ner=0, dropout=0.1,
                 w_intent=1.0, w_ner=0.8):
        super().__init__()
        self.enc = AutoModel.from_pretrained(base)
        h = self.enc.config.hidden_size
        self.drop = nn.Dropout(dropout)
        self.intent = nn.Linear(h, n_intent)
        self.ner = nn.Linear(h, n_ner)
        self.w_intent = w_intent
        self.w_ner = w_ner
        self.ce_int = nn.CrossEntropyLoss()
        self.ce_tok = nn.CrossEntropyLoss(ignore_index=IGNORE_IDX)

    def forward(self, input_ids, attention_mask, labels_intent=None, labels_ner=None):
        """Run the encoder and both heads.

        Args:
            input_ids: token ids, shape [B, T].
            attention_mask: attention mask, shape [B, T].
            labels_intent: optional intent class ids, shape [B].
            labels_ner: optional per-token tag ids, shape [B, T], with
                IGNORE_IDX at positions excluded from the loss.

        Returns:
            dict with "loss" (weighted sum of the losses whose labels were
            supplied; None when no labels are given), "logits_intent"
            [B, n_intent] and "logits_ner" [B, T, n_ner].
        """
        out = self.enc(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # <s> (CLS) position feeds the sentence-level head; full sequence feeds NER.
        cls = self.drop(out.last_hidden_state[:, 0])
        seq = self.drop(out.last_hidden_state)

        logits_intent = self.intent(cls)  # [B, n_intent]
        logits_ner = self.ner(seq)        # [B, T, n_ner]

        # Original code required BOTH label sets to produce a loss; computing
        # each loss independently also supports single-task batches while being
        # backward compatible (both-present gives the same 1.0*l_i + 0.8*l_n).
        loss = None
        if labels_intent is not None:
            loss = self.w_intent * self.ce_int(logits_intent, labels_intent)
        if labels_ner is not None:
            l_n = self.w_ner * self.ce_tok(
                logits_ner.reshape(-1, logits_ner.size(-1)), labels_ner.reshape(-1)
            )
            loss = l_n if loss is None else loss + l_n
        return {"loss": loss, "logits_intent": logits_intent, "logits_ner": logits_ner}
runs/Sep01_10-59-23_a3c5035f483f/events.out.tfevents.1756724394.a3c5035f483f.2331.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97df4cc4f8bd4a3821ed1e75af425693a017d06c268e7105099dbf8d00d26282
3
+ size 4388
runs/Sep01_11-21-03_a3c5035f483f/events.out.tfevents.1756725668.a3c5035f483f.2331.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0027b16012ae2f6b1f4d523cacc88ddc56c66aa277ad1689d26ebd78e95c954a
3
+ size 4390
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3af6aa7f169ffc6462ef897edb214dfc2de1217c2669749e57b6f94c096232f7
3
+ size 17082832
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 512,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f4f62880a212beb81193a9861931b6524a6e34b8db381dd37db521a5f5c4365
3
+ size 5905