keiwoo committed on
Commit
61dacb3
·
verified ·
1 Parent(s): 0089656

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +153 -0
tokenizer_config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens": [
3
+ {
4
+ "content": "[PAD]",
5
+ "id": 0,
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ {
13
+ "content": "[UNK]",
14
+ "id": 1,
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ {
22
+ "content": "[CLS]",
23
+ "id": 2,
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ {
31
+ "content": "[SEP]",
32
+ "id": 3,
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false,
37
+ "special": true
38
+ },
39
+ {
40
+ "content": "[MASK]",
41
+ "id": 4,
42
+ "lstrip": false,
43
+ "normalized": false,
44
+ "rstrip": false,
45
+ "single_word": false,
46
+ "special": true
47
+ }
48
+ ],
49
+ "added_tokens_decoder": {
50
+ "0": {
51
+ "content": "[PAD]",
52
+ "lstrip": false,
53
+ "normalized": false,
54
+ "rstrip": false,
55
+ "single_word": false,
56
+ "special": true
57
+ },
58
+ "1": {
59
+ "content": "[UNK]",
60
+ "lstrip": false,
61
+ "normalized": false,
62
+ "rstrip": false,
63
+ "single_word": false,
64
+ "special": true
65
+ },
66
+ "2": {
67
+ "content": "[CLS]",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false,
72
+ "special": true
73
+ },
74
+ "3": {
75
+ "content": "[SEP]",
76
+ "lstrip": false,
77
+ "normalized": false,
78
+ "rstrip": false,
79
+ "single_word": false,
80
+ "special": true
81
+ },
82
+ "4": {
83
+ "content": "[MASK]",
84
+ "lstrip": false,
85
+ "normalized": false,
86
+ "rstrip": false,
87
+ "single_word": false,
88
+ "special": true
89
+ }
90
+ },
91
+ "clean_up_tokenization_spaces": true,
92
+ "cls_token": "[CLS]",
93
+ "decoder": "WordPiece",
94
+ "do_basic_tokenize": true,
95
+ "do_lower_case": false,
96
+ "extra_special_tokens": {},
97
+ "mask_token": "[MASK]",
98
+ "model": {
99
+ "continuing_subword_prefix": "##",
100
+ "max_input_chars_per_word": 8,
101
+ "type": "WordPiece",
102
+ "unk_token": "[UNK]",
103
+ "vocab": {
104
+ "A": 5,
105
+ "C": 6,
106
+ "D": 7,
107
+ "E": 8,
108
+ "F": 9,
109
+ "G": 10,
110
+ "H": 11,
111
+ "I": 12,
112
+ "K": 13,
113
+ "L": 14,
114
+ "M": 15,
115
+ "N": 16,
116
+ "P": 17,
117
+ "Q": 18,
118
+ "R": 19,
119
+ "S": 20,
120
+ "T": 21,
121
+ "V": 22,
122
+ "W": 23,
123
+ "Y": 24,
124
+ "[CLS]": 2,
125
+ "[MASK]": 4,
126
+ "[PAD]": 0,
127
+ "[SEP]": 3,
128
+ "[UNK]": 1
129
+ }
130
+ },
131
+ "model_max_length": 1000000000000000019884624838656,
132
+ "never_split": null,
133
+ "normalizer": {
134
+ "clean_text": true,
135
+ "handle_chinese_chars": true,
136
+ "lowercase": false,
137
+ "strip_accents": null,
138
+ "type": "BertNormalizer"
139
+ },
140
+ "pad_token": "[PAD]",
141
+ "padding": null,
142
+ "post_processor": null,
143
+ "pre_tokenizer": {
144
+ "type": "BertPreTokenizer"
145
+ },
146
+ "sep_token": "[SEP]",
147
+ "strip_accents": null,
148
+ "tokenize_chinese_chars": true,
149
+ "tokenizer_class": "BertTokenizer",
150
+ "truncation": null,
151
+ "unk_token": "[UNK]",
152
+ "version": "1.0"
153
+ }