eeshaAI committed on
Commit
dbe4dbf
·
verified ·
1 Parent(s): 0dc703b

Tokenizer with visual tokens

Browse files
Files changed (2) hide show
  1. tokenizer.json +25 -26
  2. tokenizer_config.json +7 -3
tokenizer.json CHANGED
@@ -1,21 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 384,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": {
11
- "Fixed": 384
12
- },
13
- "direction": "Right",
14
- "pad_to_multiple_of": null,
15
- "pad_id": 100277,
16
- "pad_type_id": 0,
17
- "pad_token": "<|pad|>"
18
- },
19
  "added_tokens": [
20
  {
21
  "id": 100256,
@@ -221,8 +207,8 @@
221
  "single_word": false,
222
  "lstrip": false,
223
  "rstrip": false,
224
- "normalized": true,
225
- "special": false
226
  },
227
  {
228
  "id": 100279,
@@ -230,8 +216,8 @@
230
  "single_word": false,
231
  "lstrip": false,
232
  "rstrip": false,
233
- "normalized": true,
234
- "special": false
235
  },
236
  {
237
  "id": 100280,
@@ -239,8 +225,8 @@
239
  "single_word": false,
240
  "lstrip": false,
241
  "rstrip": false,
242
- "normalized": true,
243
- "special": false
244
  },
245
  {
246
  "id": 100281,
@@ -9461,10 +9447,23 @@
9461
  ],
9462
  "normalizer": null,
9463
  "pre_tokenizer": {
9464
- "type": "ByteLevel",
9465
- "add_prefix_space": false,
9466
- "trim_offsets": true,
9467
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
9468
  },
9469
  "post_processor": {
9470
  "type": "TemplateProcessing",
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 100256,
 
207
  "single_word": false,
208
  "lstrip": false,
209
  "rstrip": false,
210
+ "normalized": false,
211
+ "special": true
212
  },
213
  {
214
  "id": 100279,
 
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
219
+ "normalized": false,
220
+ "special": true
221
  },
222
  {
223
  "id": 100280,
 
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
228
+ "normalized": false,
229
+ "special": true
230
  },
231
  {
232
  "id": 100281,
 
9447
  ],
9448
  "normalizer": null,
9449
  "pre_tokenizer": {
9450
+ "type": "Sequence",
9451
+ "pretokenizers": [
9452
+ {
9453
+ "type": "Split",
9454
+ "pattern": {
9455
+ "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
9456
+ },
9457
+ "behavior": "Removed",
9458
+ "invert": true
9459
+ },
9460
+ {
9461
+ "type": "ByteLevel",
9462
+ "add_prefix_space": false,
9463
+ "trim_offsets": true,
9464
+ "use_regex": false
9465
+ }
9466
+ ]
9467
  },
9468
  "post_processor": {
9469
  "type": "TemplateProcessing",
tokenizer_config.json CHANGED
@@ -4,11 +4,15 @@
4
  "bos_token": "<|endoftext|>",
5
  "clean_up_tokenization_spaces": false,
6
  "eos_token": "<|endoftext|>",
7
- "errors": "replace",
 
 
 
 
8
  "is_local": false,
9
- "local_files_only": false,
10
  "model_max_length": 1000000000000000019884624838656,
 
11
  "pad_token": "<|pad|>",
12
- "tokenizer_class": "GPT2Tokenizer",
13
  "unk_token": "<|endoftext|>"
14
  }
 
4
  "bos_token": "<|endoftext|>",
5
  "clean_up_tokenization_spaces": false,
6
  "eos_token": "<|endoftext|>",
7
+ "extra_special_tokens": [
8
+ "<video_start>",
9
+ "<video_end>",
10
+ "<video_pad>"
11
+ ],
12
  "is_local": false,
 
13
  "model_max_length": 1000000000000000019884624838656,
14
+ "model_specific_special_tokens": {},
15
  "pad_token": "<|pad|>",
16
+ "tokenizer_class": "TokenizersBackend",
17
  "unk_token": "<|endoftext|>"
18
  }