franfj commited on
Commit
d40af78
·
verified ·
1 Parent(s): f5f90b5

Upload folder using huggingface_hub

Browse files
checkpoint-4869/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert/distilbert-base-multilingual-cased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "output_past": true,
17
+ "pad_token_id": 0,
18
+ "problem_type": "single_label_classification",
19
+ "qa_dropout": 0.1,
20
+ "seq_classif_dropout": 0.2,
21
+ "sinusoidal_pos_embds": false,
22
+ "tie_weights_": true,
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.48.3",
25
+ "vocab_size": 119547
26
+ }
checkpoint-4869/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2dc1a034fc94f3ba1ade1a879613bfe896c3069f8f2586e9b2dd85aabc8d687
3
+ size 541317368
checkpoint-4869/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89bff5bf1678b79658f22adcb363b05ee9952aa08dcd853472fc486982076e19
3
+ size 1082696890
checkpoint-4869/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5922254a568679b10af963b3b855e6ca274e9e9099d2e30915bca951a4bbd620
3
+ size 14244
checkpoint-4869/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eacc40b7c65dbbb072f5857b7428d653eaf2c4eab5e6a3120a3fd798961ec10e
3
+ size 1064
checkpoint-4869/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-4869/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4869/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-4869/trainer_state.json ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4869,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.030807147258163893,
13
+ "grad_norm": 0.8413804173469543,
14
+ "learning_rate": 4.948654754569727e-05,
15
+ "loss": 0.6254,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.061614294516327786,
20
+ "grad_norm": 0.8895371556282043,
21
+ "learning_rate": 4.8973095091394537e-05,
22
+ "loss": 0.6182,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.09242144177449169,
27
+ "grad_norm": 0.8498105406761169,
28
+ "learning_rate": 4.845964263709181e-05,
29
+ "loss": 0.6218,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.12322858903265557,
34
+ "grad_norm": 0.5705401301383972,
35
+ "learning_rate": 4.794619018278908e-05,
36
+ "loss": 0.6157,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.15403573629081946,
41
+ "grad_norm": 0.4433952271938324,
42
+ "learning_rate": 4.743273772848634e-05,
43
+ "loss": 0.62,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.18484288354898337,
48
+ "grad_norm": 0.6152193546295166,
49
+ "learning_rate": 4.691928527418361e-05,
50
+ "loss": 0.6155,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.21565003080714726,
55
+ "grad_norm": 0.6297628879547119,
56
+ "learning_rate": 4.640583281988088e-05,
57
+ "loss": 0.6052,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.24645717806531114,
62
+ "grad_norm": 1.396401047706604,
63
+ "learning_rate": 4.5892380365578146e-05,
64
+ "loss": 0.6109,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.27726432532347506,
69
+ "grad_norm": 0.8087674975395203,
70
+ "learning_rate": 4.537892791127542e-05,
71
+ "loss": 0.6189,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.3080714725816389,
76
+ "grad_norm": 0.6552234888076782,
77
+ "learning_rate": 4.486547545697269e-05,
78
+ "loss": 0.6038,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.33887861983980283,
83
+ "grad_norm": 0.705816388130188,
84
+ "learning_rate": 4.435202300266996e-05,
85
+ "loss": 0.5924,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.36968576709796674,
90
+ "grad_norm": 0.7386820912361145,
91
+ "learning_rate": 4.383857054836723e-05,
92
+ "loss": 0.6001,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.4004929143561306,
97
+ "grad_norm": 0.5741195678710938,
98
+ "learning_rate": 4.332511809406449e-05,
99
+ "loss": 0.605,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.4313000616142945,
104
+ "grad_norm": 1.0655230283737183,
105
+ "learning_rate": 4.281166563976176e-05,
106
+ "loss": 0.611,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.46210720887245843,
111
+ "grad_norm": 0.8722517490386963,
112
+ "learning_rate": 4.2298213185459026e-05,
113
+ "loss": 0.6034,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.4929143561306223,
118
+ "grad_norm": 1.067927360534668,
119
+ "learning_rate": 4.1784760731156296e-05,
120
+ "loss": 0.5969,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.5237215033887862,
125
+ "grad_norm": 0.6687670350074768,
126
+ "learning_rate": 4.127130827685357e-05,
127
+ "loss": 0.5942,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.5545286506469501,
132
+ "grad_norm": 0.5344964861869812,
133
+ "learning_rate": 4.075785582255083e-05,
134
+ "loss": 0.6074,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.585335797905114,
139
+ "grad_norm": 1.381353497505188,
140
+ "learning_rate": 4.02444033682481e-05,
141
+ "loss": 0.5867,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.6161429451632778,
146
+ "grad_norm": 1.152223825454712,
147
+ "learning_rate": 3.973095091394537e-05,
148
+ "loss": 0.5978,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.6469500924214417,
153
+ "grad_norm": 0.6478285193443298,
154
+ "learning_rate": 3.9217498459642635e-05,
155
+ "loss": 0.5964,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.6777572396796057,
160
+ "grad_norm": 0.5611660480499268,
161
+ "learning_rate": 3.8704046005339906e-05,
162
+ "loss": 0.6018,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.7085643869377696,
167
+ "grad_norm": 1.0561463832855225,
168
+ "learning_rate": 3.8190593551037176e-05,
169
+ "loss": 0.5934,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.7393715341959335,
174
+ "grad_norm": 0.6724879741668701,
175
+ "learning_rate": 3.767714109673444e-05,
176
+ "loss": 0.596,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.7701786814540974,
181
+ "grad_norm": 0.74288010597229,
182
+ "learning_rate": 3.716368864243171e-05,
183
+ "loss": 0.5967,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.8009858287122612,
188
+ "grad_norm": 0.5019825100898743,
189
+ "learning_rate": 3.665023618812898e-05,
190
+ "loss": 0.607,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.8317929759704251,
195
+ "grad_norm": 0.5708956718444824,
196
+ "learning_rate": 3.613678373382625e-05,
197
+ "loss": 0.5914,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.862600123228589,
202
+ "grad_norm": 0.7714687585830688,
203
+ "learning_rate": 3.562333127952352e-05,
204
+ "loss": 0.5956,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.893407270486753,
209
+ "grad_norm": 0.5472440123558044,
210
+ "learning_rate": 3.5109878825220786e-05,
211
+ "loss": 0.595,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.9242144177449169,
216
+ "grad_norm": 0.6139542460441589,
217
+ "learning_rate": 3.4596426370918056e-05,
218
+ "loss": 0.5856,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.9550215650030807,
223
+ "grad_norm": 0.9097696542739868,
224
+ "learning_rate": 3.4082973916615327e-05,
225
+ "loss": 0.5799,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 0.9858287122612446,
230
+ "grad_norm": 0.6651259660720825,
231
+ "learning_rate": 3.356952146231259e-05,
232
+ "loss": 0.6113,
233
+ "step": 1600
234
+ },
235
+ {
236
+ "epoch": 1.0,
237
+ "eval_accuracy": 0.6870184899845917,
238
+ "eval_f1": 0.3470224222454392,
239
+ "eval_loss": 0.5951406359672546,
240
+ "eval_runtime": 182.7287,
241
+ "eval_samples_per_second": 142.069,
242
+ "eval_steps_per_second": 2.222,
243
+ "step": 1623
244
+ },
245
+ {
246
+ "epoch": 1.0166358595194085,
247
+ "grad_norm": 0.8712685108184814,
248
+ "learning_rate": 3.305606900800986e-05,
249
+ "loss": 0.5899,
250
+ "step": 1650
251
+ },
252
+ {
253
+ "epoch": 1.0474430067775724,
254
+ "grad_norm": 0.8138744235038757,
255
+ "learning_rate": 3.254261655370713e-05,
256
+ "loss": 0.5791,
257
+ "step": 1700
258
+ },
259
+ {
260
+ "epoch": 1.0782501540357363,
261
+ "grad_norm": 0.6202666759490967,
262
+ "learning_rate": 3.2029164099404395e-05,
263
+ "loss": 0.5763,
264
+ "step": 1750
265
+ },
266
+ {
267
+ "epoch": 1.1090573012939002,
268
+ "grad_norm": 0.9712618589401245,
269
+ "learning_rate": 3.1515711645101665e-05,
270
+ "loss": 0.5755,
271
+ "step": 1800
272
+ },
273
+ {
274
+ "epoch": 1.1398644485520641,
275
+ "grad_norm": 0.75148606300354,
276
+ "learning_rate": 3.100225919079893e-05,
277
+ "loss": 0.5805,
278
+ "step": 1850
279
+ },
280
+ {
281
+ "epoch": 1.170671595810228,
282
+ "grad_norm": 0.9125542044639587,
283
+ "learning_rate": 3.04888067364962e-05,
284
+ "loss": 0.5953,
285
+ "step": 1900
286
+ },
287
+ {
288
+ "epoch": 1.201478743068392,
289
+ "grad_norm": 0.6646100878715515,
290
+ "learning_rate": 2.997535428219347e-05,
291
+ "loss": 0.5863,
292
+ "step": 1950
293
+ },
294
+ {
295
+ "epoch": 1.2322858903265557,
296
+ "grad_norm": 0.8849004507064819,
297
+ "learning_rate": 2.9461901827890737e-05,
298
+ "loss": 0.5774,
299
+ "step": 2000
300
+ },
301
+ {
302
+ "epoch": 1.2630930375847196,
303
+ "grad_norm": 0.615292489528656,
304
+ "learning_rate": 2.8948449373588004e-05,
305
+ "loss": 0.5721,
306
+ "step": 2050
307
+ },
308
+ {
309
+ "epoch": 1.2939001848428835,
310
+ "grad_norm": 0.605387806892395,
311
+ "learning_rate": 2.8434996919285278e-05,
312
+ "loss": 0.5841,
313
+ "step": 2100
314
+ },
315
+ {
316
+ "epoch": 1.3247073321010474,
317
+ "grad_norm": 0.579868495464325,
318
+ "learning_rate": 2.7921544464982545e-05,
319
+ "loss": 0.5777,
320
+ "step": 2150
321
+ },
322
+ {
323
+ "epoch": 1.3555144793592113,
324
+ "grad_norm": 0.9773715138435364,
325
+ "learning_rate": 2.7408092010679816e-05,
326
+ "loss": 0.5824,
327
+ "step": 2200
328
+ },
329
+ {
330
+ "epoch": 1.3863216266173752,
331
+ "grad_norm": 0.7113815546035767,
332
+ "learning_rate": 2.6894639556377083e-05,
333
+ "loss": 0.5898,
334
+ "step": 2250
335
+ },
336
+ {
337
+ "epoch": 1.4171287738755391,
338
+ "grad_norm": 0.8966727256774902,
339
+ "learning_rate": 2.638118710207435e-05,
340
+ "loss": 0.5761,
341
+ "step": 2300
342
+ },
343
+ {
344
+ "epoch": 1.447935921133703,
345
+ "grad_norm": 0.8858876824378967,
346
+ "learning_rate": 2.5867734647771617e-05,
347
+ "loss": 0.5828,
348
+ "step": 2350
349
+ },
350
+ {
351
+ "epoch": 1.478743068391867,
352
+ "grad_norm": 0.8401291966438293,
353
+ "learning_rate": 2.5354282193468888e-05,
354
+ "loss": 0.5934,
355
+ "step": 2400
356
+ },
357
+ {
358
+ "epoch": 1.5095502156500307,
359
+ "grad_norm": 0.6061798930168152,
360
+ "learning_rate": 2.4840829739166155e-05,
361
+ "loss": 0.5764,
362
+ "step": 2450
363
+ },
364
+ {
365
+ "epoch": 1.5403573629081948,
366
+ "grad_norm": 0.7705461382865906,
367
+ "learning_rate": 2.4327377284863422e-05,
368
+ "loss": 0.5822,
369
+ "step": 2500
370
+ },
371
+ {
372
+ "epoch": 1.5711645101663585,
373
+ "grad_norm": 0.8041555881500244,
374
+ "learning_rate": 2.3813924830560692e-05,
375
+ "loss": 0.5922,
376
+ "step": 2550
377
+ },
378
+ {
379
+ "epoch": 1.6019716574245226,
380
+ "grad_norm": 0.48748356103897095,
381
+ "learning_rate": 2.330047237625796e-05,
382
+ "loss": 0.5859,
383
+ "step": 2600
384
+ },
385
+ {
386
+ "epoch": 1.6327788046826863,
387
+ "grad_norm": 0.6026128530502319,
388
+ "learning_rate": 2.2787019921955226e-05,
389
+ "loss": 0.5857,
390
+ "step": 2650
391
+ },
392
+ {
393
+ "epoch": 1.6635859519408502,
394
+ "grad_norm": 0.8907531499862671,
395
+ "learning_rate": 2.2273567467652497e-05,
396
+ "loss": 0.5753,
397
+ "step": 2700
398
+ },
399
+ {
400
+ "epoch": 1.6943930991990142,
401
+ "grad_norm": 1.0833817720413208,
402
+ "learning_rate": 2.1760115013349767e-05,
403
+ "loss": 0.5872,
404
+ "step": 2750
405
+ },
406
+ {
407
+ "epoch": 1.725200246457178,
408
+ "grad_norm": 0.5048693418502808,
409
+ "learning_rate": 2.1246662559047035e-05,
410
+ "loss": 0.5808,
411
+ "step": 2800
412
+ },
413
+ {
414
+ "epoch": 1.756007393715342,
415
+ "grad_norm": 0.6286513209342957,
416
+ "learning_rate": 2.07332101047443e-05,
417
+ "loss": 0.5826,
418
+ "step": 2850
419
+ },
420
+ {
421
+ "epoch": 1.786814540973506,
422
+ "grad_norm": 0.9262789487838745,
423
+ "learning_rate": 2.021975765044157e-05,
424
+ "loss": 0.5818,
425
+ "step": 2900
426
+ },
427
+ {
428
+ "epoch": 1.8176216882316698,
429
+ "grad_norm": 0.7718830704689026,
430
+ "learning_rate": 1.970630519613884e-05,
431
+ "loss": 0.5878,
432
+ "step": 2950
433
+ },
434
+ {
435
+ "epoch": 1.8484288354898335,
436
+ "grad_norm": 0.8999438881874084,
437
+ "learning_rate": 1.9192852741836106e-05,
438
+ "loss": 0.5746,
439
+ "step": 3000
440
+ },
441
+ {
442
+ "epoch": 1.8792359827479976,
443
+ "grad_norm": 0.5566578507423401,
444
+ "learning_rate": 1.8679400287533373e-05,
445
+ "loss": 0.5778,
446
+ "step": 3050
447
+ },
448
+ {
449
+ "epoch": 1.9100431300061613,
450
+ "grad_norm": 1.171489953994751,
451
+ "learning_rate": 1.8165947833230644e-05,
452
+ "loss": 0.5806,
453
+ "step": 3100
454
+ },
455
+ {
456
+ "epoch": 1.9408502772643255,
457
+ "grad_norm": 0.6422250866889954,
458
+ "learning_rate": 1.7652495378927914e-05,
459
+ "loss": 0.5975,
460
+ "step": 3150
461
+ },
462
+ {
463
+ "epoch": 1.9716574245224892,
464
+ "grad_norm": 1.3185786008834839,
465
+ "learning_rate": 1.713904292462518e-05,
466
+ "loss": 0.574,
467
+ "step": 3200
468
+ },
469
+ {
470
+ "epoch": 2.0,
471
+ "eval_accuracy": 0.6854006163328197,
472
+ "eval_f1": 0.37727792603888677,
473
+ "eval_loss": 0.5936052799224854,
474
+ "eval_runtime": 183.2044,
475
+ "eval_samples_per_second": 141.7,
476
+ "eval_steps_per_second": 2.216,
477
+ "step": 3246
478
+ },
479
+ {
480
+ "epoch": 2.0024645717806533,
481
+ "grad_norm": 0.6023927330970764,
482
+ "learning_rate": 1.662559047032245e-05,
483
+ "loss": 0.5813,
484
+ "step": 3250
485
+ },
486
+ {
487
+ "epoch": 2.033271719038817,
488
+ "grad_norm": 0.7293614745140076,
489
+ "learning_rate": 1.611213801601972e-05,
490
+ "loss": 0.5615,
491
+ "step": 3300
492
+ },
493
+ {
494
+ "epoch": 2.064078866296981,
495
+ "grad_norm": 0.6881064772605896,
496
+ "learning_rate": 1.5598685561716986e-05,
497
+ "loss": 0.566,
498
+ "step": 3350
499
+ },
500
+ {
501
+ "epoch": 2.094886013555145,
502
+ "grad_norm": 1.230314016342163,
503
+ "learning_rate": 1.5085233107414253e-05,
504
+ "loss": 0.567,
505
+ "step": 3400
506
+ },
507
+ {
508
+ "epoch": 2.1256931608133085,
509
+ "grad_norm": 1.237333059310913,
510
+ "learning_rate": 1.4571780653111522e-05,
511
+ "loss": 0.5694,
512
+ "step": 3450
513
+ },
514
+ {
515
+ "epoch": 2.1565003080714726,
516
+ "grad_norm": 0.7606936693191528,
517
+ "learning_rate": 1.405832819880879e-05,
518
+ "loss": 0.5794,
519
+ "step": 3500
520
+ },
521
+ {
522
+ "epoch": 2.1873074553296363,
523
+ "grad_norm": 0.811718225479126,
524
+ "learning_rate": 1.3544875744506058e-05,
525
+ "loss": 0.5684,
526
+ "step": 3550
527
+ },
528
+ {
529
+ "epoch": 2.2181146025878005,
530
+ "grad_norm": 0.8104314804077148,
531
+ "learning_rate": 1.3031423290203328e-05,
532
+ "loss": 0.5698,
533
+ "step": 3600
534
+ },
535
+ {
536
+ "epoch": 2.248921749845964,
537
+ "grad_norm": 1.2725473642349243,
538
+ "learning_rate": 1.2517970835900597e-05,
539
+ "loss": 0.5701,
540
+ "step": 3650
541
+ },
542
+ {
543
+ "epoch": 2.2797288971041283,
544
+ "grad_norm": 0.8804383277893066,
545
+ "learning_rate": 1.2004518381597864e-05,
546
+ "loss": 0.5632,
547
+ "step": 3700
548
+ },
549
+ {
550
+ "epoch": 2.310536044362292,
551
+ "grad_norm": 0.6673324108123779,
552
+ "learning_rate": 1.1491065927295133e-05,
553
+ "loss": 0.5719,
554
+ "step": 3750
555
+ },
556
+ {
557
+ "epoch": 2.341343191620456,
558
+ "grad_norm": 0.7855513691902161,
559
+ "learning_rate": 1.0977613472992402e-05,
560
+ "loss": 0.5786,
561
+ "step": 3800
562
+ },
563
+ {
564
+ "epoch": 2.37215033887862,
565
+ "grad_norm": 0.730636477470398,
566
+ "learning_rate": 1.0464161018689669e-05,
567
+ "loss": 0.5698,
568
+ "step": 3850
569
+ },
570
+ {
571
+ "epoch": 2.402957486136784,
572
+ "grad_norm": 0.8906323909759521,
573
+ "learning_rate": 9.95070856438694e-06,
574
+ "loss": 0.5727,
575
+ "step": 3900
576
+ },
577
+ {
578
+ "epoch": 2.4337646333949476,
579
+ "grad_norm": 0.689626932144165,
580
+ "learning_rate": 9.437256110084207e-06,
581
+ "loss": 0.5616,
582
+ "step": 3950
583
+ },
584
+ {
585
+ "epoch": 2.4645717806531113,
586
+ "grad_norm": 1.2123029232025146,
587
+ "learning_rate": 8.923803655781475e-06,
588
+ "loss": 0.5435,
589
+ "step": 4000
590
+ },
591
+ {
592
+ "epoch": 2.4953789279112755,
593
+ "grad_norm": 0.9665613770484924,
594
+ "learning_rate": 8.410351201478742e-06,
595
+ "loss": 0.5723,
596
+ "step": 4050
597
+ },
598
+ {
599
+ "epoch": 2.526186075169439,
600
+ "grad_norm": 0.673569917678833,
601
+ "learning_rate": 7.896898747176013e-06,
602
+ "loss": 0.5641,
603
+ "step": 4100
604
+ },
605
+ {
606
+ "epoch": 2.5569932224276033,
607
+ "grad_norm": 0.8823952078819275,
608
+ "learning_rate": 7.383446292873281e-06,
609
+ "loss": 0.5749,
610
+ "step": 4150
611
+ },
612
+ {
613
+ "epoch": 2.587800369685767,
614
+ "grad_norm": 0.755225658416748,
615
+ "learning_rate": 6.869993838570549e-06,
616
+ "loss": 0.5698,
617
+ "step": 4200
618
+ },
619
+ {
620
+ "epoch": 2.618607516943931,
621
+ "grad_norm": 0.7433829307556152,
622
+ "learning_rate": 6.356541384267817e-06,
623
+ "loss": 0.5728,
624
+ "step": 4250
625
+ },
626
+ {
627
+ "epoch": 2.649414664202095,
628
+ "grad_norm": 0.7045505046844482,
629
+ "learning_rate": 5.843088929965086e-06,
630
+ "loss": 0.5788,
631
+ "step": 4300
632
+ },
633
+ {
634
+ "epoch": 2.6802218114602585,
635
+ "grad_norm": 0.8876537084579468,
636
+ "learning_rate": 5.3296364756623535e-06,
637
+ "loss": 0.5548,
638
+ "step": 4350
639
+ },
640
+ {
641
+ "epoch": 2.7110289587184226,
642
+ "grad_norm": 0.9181818962097168,
643
+ "learning_rate": 4.816184021359622e-06,
644
+ "loss": 0.5681,
645
+ "step": 4400
646
+ },
647
+ {
648
+ "epoch": 2.741836105976587,
649
+ "grad_norm": 0.6883150339126587,
650
+ "learning_rate": 4.30273156705689e-06,
651
+ "loss": 0.5623,
652
+ "step": 4450
653
+ },
654
+ {
655
+ "epoch": 2.7726432532347505,
656
+ "grad_norm": 0.8438022136688232,
657
+ "learning_rate": 3.789279112754159e-06,
658
+ "loss": 0.5549,
659
+ "step": 4500
660
+ },
661
+ {
662
+ "epoch": 2.803450400492914,
663
+ "grad_norm": 0.7179750204086304,
664
+ "learning_rate": 3.2758266584514275e-06,
665
+ "loss": 0.574,
666
+ "step": 4550
667
+ },
668
+ {
669
+ "epoch": 2.8342575477510783,
670
+ "grad_norm": 0.8070858716964722,
671
+ "learning_rate": 2.7623742041486962e-06,
672
+ "loss": 0.5529,
673
+ "step": 4600
674
+ },
675
+ {
676
+ "epoch": 2.865064695009242,
677
+ "grad_norm": 1.1926172971725464,
678
+ "learning_rate": 2.2489217498459646e-06,
679
+ "loss": 0.5781,
680
+ "step": 4650
681
+ },
682
+ {
683
+ "epoch": 2.895871842267406,
684
+ "grad_norm": 0.6364499926567078,
685
+ "learning_rate": 1.735469295543233e-06,
686
+ "loss": 0.5745,
687
+ "step": 4700
688
+ },
689
+ {
690
+ "epoch": 2.92667898952557,
691
+ "grad_norm": 1.107398509979248,
692
+ "learning_rate": 1.2220168412405011e-06,
693
+ "loss": 0.5785,
694
+ "step": 4750
695
+ },
696
+ {
697
+ "epoch": 2.957486136783734,
698
+ "grad_norm": 1.8869876861572266,
699
+ "learning_rate": 7.085643869377696e-07,
700
+ "loss": 0.5667,
701
+ "step": 4800
702
+ },
703
+ {
704
+ "epoch": 2.9882932840418976,
705
+ "grad_norm": 0.6402796506881714,
706
+ "learning_rate": 1.9511193263503802e-07,
707
+ "loss": 0.5626,
708
+ "step": 4850
709
+ },
710
+ {
711
+ "epoch": 3.0,
712
+ "eval_accuracy": 0.6842064714946071,
713
+ "eval_f1": 0.3815630657815329,
714
+ "eval_loss": 0.5975381731987,
715
+ "eval_runtime": 178.1316,
716
+ "eval_samples_per_second": 145.735,
717
+ "eval_steps_per_second": 2.279,
718
+ "step": 4869
719
+ }
720
+ ],
721
+ "logging_steps": 50,
722
+ "max_steps": 4869,
723
+ "num_input_tokens_seen": 0,
724
+ "num_train_epochs": 3,
725
+ "save_steps": 500,
726
+ "stateful_callbacks": {
727
+ "TrainerControl": {
728
+ "args": {
729
+ "should_epoch_stop": false,
730
+ "should_evaluate": false,
731
+ "should_log": false,
732
+ "should_save": true,
733
+ "should_training_stop": true
734
+ },
735
+ "attributes": {}
736
+ }
737
+ },
738
+ "total_flos": 4.126624402931712e+16,
739
+ "train_batch_size": 64,
740
+ "trial_name": null,
741
+ "trial_params": null
742
+ }
checkpoint-4869/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:216272edf196d9a6abebbc3a50312cbe271d02b69b3998f776cc104790711eae
3
+ size 5304
checkpoint-4869/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert/distilbert-base-multilingual-cased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "output_past": true,
17
+ "pad_token_id": 0,
18
+ "problem_type": "single_label_classification",
19
+ "qa_dropout": 0.1,
20
+ "seq_classif_dropout": 0.2,
21
+ "sinusoidal_pos_embds": false,
22
+ "tie_weights_": true,
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.48.3",
25
+ "vocab_size": 119547
26
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2dc1a034fc94f3ba1ade1a879613bfe896c3069f8f2586e9b2dd85aabc8d687
3
+ size 541317368
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff