mariamffatima committed
Commit cf39ac7 · verified · 1 Parent(s): b5adbd1

Upload tokenizer

Files changed (2):
1. tokenizer.json +31 -60
2. tokenizer_config.json +4 -3
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 1024,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
@@ -139,85 +144,51 @@
       "special": false
     }
   ],
-  "normalizer": {
-    "type": "Sequence",
-    "normalizers": []
-  },
+  "normalizer": null,
   "pre_tokenizer": {
+    "type": "Metaspace",
+    "replacement": "▁",
+    "prepend_scheme": "always",
+    "split": false
+  },
+  "post_processor": {
+    "type": "ByteLevel",
+    "add_prefix_space": true,
+    "trim_offsets": false,
+    "use_regex": true
+  },
+  "decoder": {
     "type": "Sequence",
-    "pretokenizers": [
-      {
-        "type": "Split",
-        "pattern": {
-          "Regex": "[\r\n]"
-        },
-        "behavior": "Isolated",
-        "invert": false
-      },
-      {
-        "type": "Split",
-        "pattern": {
-          "Regex": "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+"
-        },
-        "behavior": "Isolated",
-        "invert": false
-      },
+    "decoders": [
       {
-        "type": "Split",
+        "type": "Replace",
         "pattern": {
-          "Regex": "\\s?[!-/:-~!-/:-~‘-‟ -。]+"
+          "String": "▁"
         },
-        "behavior": "Isolated",
-        "invert": false
+        "content": " "
       },
       {
-        "type": "Split",
-        "pattern": {
-          "Regex": "\\s+$"
-        },
-        "behavior": "Isolated",
-        "invert": false
+        "type": "ByteFallback"
       },
       {
-        "type": "Split",
-        "pattern": {
-          "Regex": "[一-龥ࠀ-一가-퟿]+"
-        },
-        "behavior": "Isolated",
-        "invert": false
+        "type": "Fuse"
       },
       {
-        "type": "Digits",
-        "individual_digits": true
-      },
-      {
-        "type": "ByteLevel",
-        "add_prefix_space": false,
-        "trim_offsets": true,
-        "use_regex": false
+        "type": "Strip",
+        "content": " ",
+        "start": 1,
+        "stop": 0
       }
     ]
   },
-  "post_processor": {
-    "type": "ByteLevel",
-    "add_prefix_space": true,
-    "trim_offsets": false,
-    "use_regex": true
-  },
-  "decoder": {
-    "type": "ByteLevel",
-    "add_prefix_space": true,
-    "trim_offsets": true,
-    "use_regex": true
-  },
   "model": {
     "type": "BPE",
     "dropout": null,
     "unk_token": null,
     "continuing_subword_prefix": null,
     "end_of_word_suffix": null,
-    "fuse_unk": false,
-    "byte_fallback": false,
+    "fuse_unk": true,
+    "byte_fallback": true,
     "ignore_merges": false,
     "vocab": {
       "!": 0,
tokenizer_config.json CHANGED
@@ -1,13 +1,14 @@
 {
+  "add_prefix_space": null,
   "backend": "tokenizers",
   "bos_token": "<|begin▁of▁sentence|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|end▁of▁sentence|>",
   "is_local": false,
-  "legacy": true,
   "model_max_length": 4096,
   "pad_token": "<|end▁of▁sentence|>",
   "sp_model_kwargs": {},
-  "tokenizer_class": "TokenizersBackend",
-  "unk_token": null
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": null,
+  "use_default_system_prompt": false
 }
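
On the config side, tokenizer_class moves from the generic TokenizersBackend to LlamaTokenizer, matching the SentencePiece-style pipeline above, and the now-unused legacy flag is dropped. A hedged loading sketch with transformers (the local directory is a placeholder; the commit does not name a repo id):

```python
# Minimal sketch: load both updated files from a local directory.
# "./tokenizer" is a placeholder path, not taken from the commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer")

print(type(tok).__name__)    # resolved via "tokenizer_class": "LlamaTokenizer"
print(tok.model_max_length)  # 4096, per tokenizer_config.json
print(tok.bos_token)         # <|begin▁of▁sentence|>
print(tok.eos_token)         # <|end▁of▁sentence|> (also used as pad_token)
```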