goabonga committed on
Commit
9505c31
·
verified ·
1 Parent(s): 79f5789

Upload tokenizer files (vocab, config, README)

Browse files
Files changed (1) hide show
  1. tokenizer.json +111 -6
tokenizer.json CHANGED
@@ -9,7 +9,8 @@
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
- "normalized": false
 
13
  },
14
  {
15
  "id": 1,
@@ -17,7 +18,8 @@
17
  "single_word": false,
18
  "lstrip": false,
19
  "rstrip": false,
20
- "normalized": false
 
21
  },
22
  {
23
  "id": 2,
@@ -25,7 +27,8 @@
25
  "single_word": false,
26
  "lstrip": false,
27
  "rstrip": false,
28
- "normalized": false
 
29
  },
30
  {
31
  "id": 3,
@@ -33,7 +36,8 @@
33
  "single_word": false,
34
  "lstrip": false,
35
  "rstrip": false,
36
- "normalized": false
 
37
  }
38
  ],
39
  "added_tokens_decoder": {
@@ -79,13 +83,114 @@
79
  ]
80
  },
81
  "pre_tokenizer": {
82
- "type": "Whitespace"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  },
84
- "post_processor": null,
85
  "decoder": {
86
  "type": "WordPiece",
87
  "unk_token": "<unk>"
88
  },
 
 
 
 
 
 
89
  "special_tokens": {
90
  "pad_token": 0,
91
  "unk_token": 1,
 
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
  },
15
  {
16
  "id": 1,
 
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
  },
24
  {
25
  "id": 2,
 
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
  },
33
  {
34
  "id": 3,
 
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
  }
42
  ],
43
  "added_tokens_decoder": {
 
83
  ]
84
  },
85
  "pre_tokenizer": {
86
+ "type": "Sequence",
87
+ "pretokenizers": [
88
+ {
89
+ "type": "Whitespace"
90
+ },
91
+ {
92
+ "type": "Punctuation",
93
+ "behavior": "Isolated"
94
+ }
95
+ ]
96
+ },
97
+ "post_processor": {
98
+ "type": "TemplateProcessing",
99
+ "single": [
100
+ {
101
+ "SpecialToken": {
102
+ "id": "<bos>",
103
+ "type_id": 0
104
+ }
105
+ },
106
+ {
107
+ "Sequence": {
108
+ "id": "A",
109
+ "type_id": 0
110
+ }
111
+ },
112
+ {
113
+ "SpecialToken": {
114
+ "id": "<eos>",
115
+ "type_id": 0
116
+ }
117
+ }
118
+ ],
119
+ "pair": [
120
+ {
121
+ "SpecialToken": {
122
+ "id": "<bos>",
123
+ "type_id": 0
124
+ }
125
+ },
126
+ {
127
+ "Sequence": {
128
+ "id": "A",
129
+ "type_id": 0
130
+ }
131
+ },
132
+ {
133
+ "Sequence": {
134
+ "id": "B",
135
+ "type_id": 0
136
+ }
137
+ },
138
+ {
139
+ "SpecialToken": {
140
+ "id": "<eos>",
141
+ "type_id": 0
142
+ }
143
+ }
144
+ ],
145
+ "special_tokens": {
146
+ "<pad>": {
147
+ "id": "<pad>",
148
+ "ids": [
149
+ 0
150
+ ],
151
+ "tokens": [
152
+ "<pad>"
153
+ ]
154
+ },
155
+ "<unk>": {
156
+ "id": "<unk>",
157
+ "ids": [
158
+ 1
159
+ ],
160
+ "tokens": [
161
+ "<unk>"
162
+ ]
163
+ },
164
+ "<bos>": {
165
+ "id": "<bos>",
166
+ "ids": [
167
+ 2
168
+ ],
169
+ "tokens": [
170
+ "<bos>"
171
+ ]
172
+ },
173
+ "<eos>": {
174
+ "id": "<eos>",
175
+ "ids": [
176
+ 3
177
+ ],
178
+ "tokens": [
179
+ "<eos>"
180
+ ]
181
+ }
182
+ }
183
  },
 
184
  "decoder": {
185
  "type": "WordPiece",
186
  "unk_token": "<unk>"
187
  },
188
+ "special": [
189
+ 0,
190
+ 1,
191
+ 2,
192
+ 3
193
+ ],
194
  "special_tokens": {
195
  "pad_token": 0,
196
  "unk_token": 1,