Samuael committed on
Commit
3870d96
·
verified ·
1 Parent(s): 167a244

Upload tokenizer

Browse files
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- base_model: Samuael/amBART_1000
3
  tags:
4
  - generated_from_trainer
5
  metrics:
6
  - wer
7
  - bleu
 
8
  model-index:
9
  - name: amBART_261
10
  results: []
 
1
  ---
 
2
  tags:
3
  - generated_from_trainer
4
  metrics:
5
  - wer
6
  - bleu
7
+ base_model: Samuael/amBART_1000
8
  model-index:
9
  - name: amBART_261
10
  results: []
sentencepiece.bpe.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4e4c09bd68c20916dfa8472b3c979527234d7b22afa71e1ea0bb36ee79a1bbd
3
- size 253571
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a801c63cf0822cc3a880177fd5895196337d7e3813edde88c428061c263354a4
3
+ size 240461
special_tokens_map.json CHANGED
@@ -1,71 +1,12 @@
1
  {
2
  "additional_special_tokens": [
3
- "ar_AR",
4
- "cs_CZ",
5
- "de_DE",
6
- "en_XX",
7
- "es_XX",
8
- "et_EE",
9
- "fi_FI",
10
- "fr_XX",
11
- "gu_IN",
12
- "hi_IN",
13
- "it_IT",
14
- "ja_XX",
15
- "kk_KZ",
16
- "ko_KR",
17
- "lt_LT",
18
- "lv_LV",
19
- "my_MM",
20
- "ne_NP",
21
- "nl_XX",
22
- "ro_RO",
23
- "ru_RU",
24
- "si_LK",
25
- "tr_TR",
26
- "vi_VN",
27
- "zh_CN"
28
  ],
29
- "bos_token": {
30
- "content": "<s>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false
35
- },
36
- "cls_token": {
37
- "content": "<s>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": false,
41
- "single_word": false
42
- },
43
- "eos_token": {
44
- "content": "</s>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false
49
- },
50
- "pad_token": {
51
- "content": "<pad>",
52
- "lstrip": false,
53
- "normalized": false,
54
- "rstrip": false,
55
- "single_word": false
56
- },
57
- "sep_token": {
58
- "content": "</s>",
59
- "lstrip": false,
60
- "normalized": false,
61
- "rstrip": false,
62
- "single_word": false
63
- },
64
- "unk_token": {
65
- "content": "<unk>",
66
- "lstrip": false,
67
- "normalized": false,
68
- "rstrip": false,
69
- "single_word": false
70
- }
71
  }
 
1
  {
2
  "additional_special_tokens": [
3
+ "",
4
+ "ar_AR"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
+ "bos_token": "<s>",
7
+ "cls_token": "<s>",
8
+ "eos_token": "</s>",
9
+ "pad_token": "<pad>",
10
+ "sep_token": "</s>",
11
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  }
tokenizer_config.json CHANGED
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "1001": {
36
  "content": "ar_AR",
37
  "lstrip": false,
38
  "normalized": false,
@@ -40,7 +40,7 @@
40
  "single_word": false,
41
  "special": true
42
  },
43
- "1002": {
44
  "content": "cs_CZ",
45
  "lstrip": false,
46
  "normalized": false,
@@ -48,7 +48,7 @@
48
  "single_word": false,
49
  "special": true
50
  },
51
- "1003": {
52
  "content": "de_DE",
53
  "lstrip": false,
54
  "normalized": false,
@@ -56,7 +56,7 @@
56
  "single_word": false,
57
  "special": true
58
  },
59
- "1004": {
60
  "content": "en_XX",
61
  "lstrip": false,
62
  "normalized": false,
@@ -64,7 +64,7 @@
64
  "single_word": false,
65
  "special": true
66
  },
67
- "1005": {
68
  "content": "es_XX",
69
  "lstrip": false,
70
  "normalized": false,
@@ -72,7 +72,7 @@
72
  "single_word": false,
73
  "special": true
74
  },
75
- "1006": {
76
  "content": "et_EE",
77
  "lstrip": false,
78
  "normalized": false,
@@ -80,7 +80,7 @@
80
  "single_word": false,
81
  "special": true
82
  },
83
- "1007": {
84
  "content": "fi_FI",
85
  "lstrip": false,
86
  "normalized": false,
@@ -88,7 +88,7 @@
88
  "single_word": false,
89
  "special": true
90
  },
91
- "1008": {
92
  "content": "fr_XX",
93
  "lstrip": false,
94
  "normalized": false,
@@ -96,7 +96,7 @@
96
  "single_word": false,
97
  "special": true
98
  },
99
- "1009": {
100
  "content": "gu_IN",
101
  "lstrip": false,
102
  "normalized": false,
@@ -104,7 +104,7 @@
104
  "single_word": false,
105
  "special": true
106
  },
107
- "1010": {
108
  "content": "hi_IN",
109
  "lstrip": false,
110
  "normalized": false,
@@ -112,7 +112,7 @@
112
  "single_word": false,
113
  "special": true
114
  },
115
- "1011": {
116
  "content": "it_IT",
117
  "lstrip": false,
118
  "normalized": false,
@@ -120,7 +120,7 @@
120
  "single_word": false,
121
  "special": true
122
  },
123
- "1012": {
124
  "content": "ja_XX",
125
  "lstrip": false,
126
  "normalized": false,
@@ -128,7 +128,7 @@
128
  "single_word": false,
129
  "special": true
130
  },
131
- "1013": {
132
  "content": "kk_KZ",
133
  "lstrip": false,
134
  "normalized": false,
@@ -136,7 +136,7 @@
136
  "single_word": false,
137
  "special": true
138
  },
139
- "1014": {
140
  "content": "ko_KR",
141
  "lstrip": false,
142
  "normalized": false,
@@ -144,7 +144,7 @@
144
  "single_word": false,
145
  "special": true
146
  },
147
- "1015": {
148
  "content": "lt_LT",
149
  "lstrip": false,
150
  "normalized": false,
@@ -152,7 +152,7 @@
152
  "single_word": false,
153
  "special": true
154
  },
155
- "1016": {
156
  "content": "lv_LV",
157
  "lstrip": false,
158
  "normalized": false,
@@ -160,7 +160,7 @@
160
  "single_word": false,
161
  "special": true
162
  },
163
- "1017": {
164
  "content": "my_MM",
165
  "lstrip": false,
166
  "normalized": false,
@@ -168,7 +168,7 @@
168
  "single_word": false,
169
  "special": true
170
  },
171
- "1018": {
172
  "content": "ne_NP",
173
  "lstrip": false,
174
  "normalized": false,
@@ -176,7 +176,7 @@
176
  "single_word": false,
177
  "special": true
178
  },
179
- "1019": {
180
  "content": "nl_XX",
181
  "lstrip": false,
182
  "normalized": false,
@@ -184,7 +184,7 @@
184
  "single_word": false,
185
  "special": true
186
  },
187
- "1020": {
188
  "content": "ro_RO",
189
  "lstrip": false,
190
  "normalized": false,
@@ -192,7 +192,7 @@
192
  "single_word": false,
193
  "special": true
194
  },
195
- "1021": {
196
  "content": "ru_RU",
197
  "lstrip": false,
198
  "normalized": false,
@@ -200,7 +200,7 @@
200
  "single_word": false,
201
  "special": true
202
  },
203
- "1022": {
204
  "content": "si_LK",
205
  "lstrip": false,
206
  "normalized": false,
@@ -208,7 +208,7 @@
208
  "single_word": false,
209
  "special": true
210
  },
211
- "1023": {
212
  "content": "tr_TR",
213
  "lstrip": false,
214
  "normalized": false,
@@ -216,7 +216,7 @@
216
  "single_word": false,
217
  "special": true
218
  },
219
- "1024": {
220
  "content": "vi_VN",
221
  "lstrip": false,
222
  "normalized": false,
@@ -224,7 +224,7 @@
224
  "single_word": false,
225
  "special": true
226
  },
227
- "1025": {
228
  "content": "zh_CN",
229
  "lstrip": false,
230
  "normalized": false,
@@ -234,31 +234,8 @@
234
  }
235
  },
236
  "additional_special_tokens": [
237
- "ar_AR",
238
- "cs_CZ",
239
- "de_DE",
240
- "en_XX",
241
- "es_XX",
242
- "et_EE",
243
- "fi_FI",
244
- "fr_XX",
245
- "gu_IN",
246
- "hi_IN",
247
- "it_IT",
248
- "ja_XX",
249
- "kk_KZ",
250
- "ko_KR",
251
- "lt_LT",
252
- "lv_LV",
253
- "my_MM",
254
- "ne_NP",
255
- "nl_XX",
256
- "ro_RO",
257
- "ru_RU",
258
- "si_LK",
259
- "tr_TR",
260
- "vi_VN",
261
- "zh_CN"
262
  ],
263
  "bos_token": "<s>",
264
  "clean_up_tokenization_spaces": true,
@@ -272,5 +249,6 @@
272
  "src_lang": "ar_AR",
273
  "tgt_lang": "cs_CZ",
274
  "tokenizer_class": "MBartTokenizer",
 
275
  "unk_token": "<unk>"
276
  }
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "235": {
36
  "content": "ar_AR",
37
  "lstrip": false,
38
  "normalized": false,
 
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "236": {
44
  "content": "cs_CZ",
45
  "lstrip": false,
46
  "normalized": false,
 
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "237": {
52
  "content": "de_DE",
53
  "lstrip": false,
54
  "normalized": false,
 
56
  "single_word": false,
57
  "special": true
58
  },
59
+ "238": {
60
  "content": "en_XX",
61
  "lstrip": false,
62
  "normalized": false,
 
64
  "single_word": false,
65
  "special": true
66
  },
67
+ "239": {
68
  "content": "es_XX",
69
  "lstrip": false,
70
  "normalized": false,
 
72
  "single_word": false,
73
  "special": true
74
  },
75
+ "240": {
76
  "content": "et_EE",
77
  "lstrip": false,
78
  "normalized": false,
 
80
  "single_word": false,
81
  "special": true
82
  },
83
+ "241": {
84
  "content": "fi_FI",
85
  "lstrip": false,
86
  "normalized": false,
 
88
  "single_word": false,
89
  "special": true
90
  },
91
+ "242": {
92
  "content": "fr_XX",
93
  "lstrip": false,
94
  "normalized": false,
 
96
  "single_word": false,
97
  "special": true
98
  },
99
+ "243": {
100
  "content": "gu_IN",
101
  "lstrip": false,
102
  "normalized": false,
 
104
  "single_word": false,
105
  "special": true
106
  },
107
+ "244": {
108
  "content": "hi_IN",
109
  "lstrip": false,
110
  "normalized": false,
 
112
  "single_word": false,
113
  "special": true
114
  },
115
+ "245": {
116
  "content": "it_IT",
117
  "lstrip": false,
118
  "normalized": false,
 
120
  "single_word": false,
121
  "special": true
122
  },
123
+ "246": {
124
  "content": "ja_XX",
125
  "lstrip": false,
126
  "normalized": false,
 
128
  "single_word": false,
129
  "special": true
130
  },
131
+ "247": {
132
  "content": "kk_KZ",
133
  "lstrip": false,
134
  "normalized": false,
 
136
  "single_word": false,
137
  "special": true
138
  },
139
+ "248": {
140
  "content": "ko_KR",
141
  "lstrip": false,
142
  "normalized": false,
 
144
  "single_word": false,
145
  "special": true
146
  },
147
+ "249": {
148
  "content": "lt_LT",
149
  "lstrip": false,
150
  "normalized": false,
 
152
  "single_word": false,
153
  "special": true
154
  },
155
+ "250": {
156
  "content": "lv_LV",
157
  "lstrip": false,
158
  "normalized": false,
 
160
  "single_word": false,
161
  "special": true
162
  },
163
+ "251": {
164
  "content": "my_MM",
165
  "lstrip": false,
166
  "normalized": false,
 
168
  "single_word": false,
169
  "special": true
170
  },
171
+ "252": {
172
  "content": "ne_NP",
173
  "lstrip": false,
174
  "normalized": false,
 
176
  "single_word": false,
177
  "special": true
178
  },
179
+ "253": {
180
  "content": "nl_XX",
181
  "lstrip": false,
182
  "normalized": false,
 
184
  "single_word": false,
185
  "special": true
186
  },
187
+ "254": {
188
  "content": "ro_RO",
189
  "lstrip": false,
190
  "normalized": false,
 
192
  "single_word": false,
193
  "special": true
194
  },
195
+ "255": {
196
  "content": "ru_RU",
197
  "lstrip": false,
198
  "normalized": false,
 
200
  "single_word": false,
201
  "special": true
202
  },
203
+ "256": {
204
  "content": "si_LK",
205
  "lstrip": false,
206
  "normalized": false,
 
208
  "single_word": false,
209
  "special": true
210
  },
211
+ "257": {
212
  "content": "tr_TR",
213
  "lstrip": false,
214
  "normalized": false,
 
216
  "single_word": false,
217
  "special": true
218
  },
219
+ "258": {
220
  "content": "vi_VN",
221
  "lstrip": false,
222
  "normalized": false,
 
224
  "single_word": false,
225
  "special": true
226
  },
227
+ "259": {
228
  "content": "zh_CN",
229
  "lstrip": false,
230
  "normalized": false,
 
234
  }
235
  },
236
  "additional_special_tokens": [
237
+ "",
238
+ "ar_AR"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  ],
240
  "bos_token": "<s>",
241
  "clean_up_tokenization_spaces": true,
 
249
  "src_lang": "ar_AR",
250
  "tgt_lang": "cs_CZ",
251
  "tokenizer_class": "MBartTokenizer",
252
+ "tokenizer_file": null,
253
  "unk_token": "<unk>"
254
  }