win2win committed on
Commit
efc1e48
·
verified ·
1 Parent(s): 1ae9149

Create tokenizer_config.json

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +29 -0
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": { "content": "[PAD]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
4
+ "100": { "content": "[UNK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
5
+ "101": { "content": "[CLS]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
6
+ "102": { "content": "[SEP]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
7
+ "103": { "content": "[MASK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true }
8
+ },
9
+ "clean_up_tokenization_spaces": false,
10
+ "cls_token": "[CLS]",
11
+ "do_lower_case": true,
12
+ "extra_special_tokens": {},
13
+ "mask_token": "[MASK]",
14
+ "max_length": 512,
15
+ "model_max_length": 512,
16
+ "pad_to_multiple_of": null,
17
+ "pad_token": "[PAD]",
18
+ "pad_token_type_id": 0,
19
+ "padding_side": "right",
20
+ "sep_token": "[SEP]",
21
+ "stride": 0,
22
+ "strip_accents": null,
23
+ "tokenize_chinese_chars": true,
24
+ "tokenizer_class": "DistilBertTokenizerFast",
25
+ "truncation_side": "right",
26
+ "truncation_strategy": "longest_first",
27
+ "unk_token": "[UNK]",
28
+ "add_token_type_ids": false
29
+ }