nosuchjihyun committed on
Commit
f839f6a
·
verified ·
1 Parent(s): 3e7d3c1

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +0 -1
  2. special_tokens_map.json +5 -3
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +35 -25
  5. vocab.json +0 -0
merges.txt CHANGED
@@ -32483,4 +32483,3 @@ ason ic
32483
  ple ted
32484
  cent ered
32485
  ĠCl ose
32486
- ek t
 
32483
  ple ted
32484
  cent ered
32485
  ĠCl ose
 
special_tokens_map.json CHANGED
@@ -1,5 +1,7 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "unk_token": "<|endoftext|>"
 
 
5
  }
 
1
  {
2
+ "bos_token": "<bos>",
3
+ "cls_token": "<classification>",
4
+ "eos_token": "<eos>",
5
+ "pad_token": "<pad>",
6
+ "unk_token": "<unk>"
7
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -34,7 +34,7 @@
34
  "special": true
35
  },
36
  "4": {
37
- "content": "<mask>",
38
  "lstrip": false,
39
  "normalized": false,
40
  "rstrip": false,
@@ -42,7 +42,7 @@
42
  "special": true
43
  },
44
  "5": {
45
- "content": "<general><semantic_similarity>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
@@ -50,7 +50,7 @@
50
  "special": true
51
  },
52
  "6": {
53
- "content": "<classification>",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
@@ -58,7 +58,7 @@
58
  "special": true
59
  },
60
  "7": {
61
- "content": "<clustering>",
62
  "lstrip": false,
63
  "normalized": false,
64
  "rstrip": false,
@@ -66,7 +66,7 @@
66
  "special": true
67
  },
68
  "8": {
69
- "content": "<retrieval_query>",
70
  "lstrip": false,
71
  "normalized": false,
72
  "rstrip": false,
@@ -74,7 +74,7 @@
74
  "special": true
75
  },
76
  "9": {
77
- "content": "<retrieval_document>",
78
  "lstrip": false,
79
  "normalized": false,
80
  "rstrip": false,
@@ -82,7 +82,7 @@
82
  "special": true
83
  },
84
  "10": {
85
- "content": "<code_retrieval_query>",
86
  "lstrip": false,
87
  "normalized": false,
88
  "rstrip": false,
@@ -90,7 +90,7 @@
90
  "special": true
91
  },
92
  "11": {
93
- "content": "<reserved0>",
94
  "lstrip": false,
95
  "normalized": false,
96
  "rstrip": false,
@@ -98,7 +98,7 @@
98
  "special": true
99
  },
100
  "12": {
101
- "content": "<reserved1>",
102
  "lstrip": false,
103
  "normalized": false,
104
  "rstrip": false,
@@ -106,7 +106,7 @@
106
  "special": true
107
  },
108
  "13": {
109
- "content": "<reserved2>",
110
  "lstrip": false,
111
  "normalized": false,
112
  "rstrip": false,
@@ -114,7 +114,7 @@
114
  "special": true
115
  },
116
  "14": {
117
- "content": "<reserved3>",
118
  "lstrip": false,
119
  "normalized": false,
120
  "rstrip": false,
@@ -122,7 +122,7 @@
122
  "special": true
123
  },
124
  "15": {
125
- "content": "<reserved4>",
126
  "lstrip": false,
127
  "normalized": false,
128
  "rstrip": false,
@@ -130,7 +130,7 @@
130
  "special": true
131
  },
132
  "16": {
133
- "content": "<reserved5>",
134
  "lstrip": false,
135
  "normalized": false,
136
  "rstrip": false,
@@ -138,7 +138,7 @@
138
  "special": true
139
  },
140
  "17": {
141
- "content": "<reserved6>",
142
  "lstrip": false,
143
  "normalized": false,
144
  "rstrip": false,
@@ -146,7 +146,7 @@
146
  "special": true
147
  },
148
  "18": {
149
- "content": "<reserved7>",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,
@@ -154,7 +154,7 @@
154
  "special": true
155
  },
156
  "19": {
157
- "content": "<reserved8>",
158
  "lstrip": false,
159
  "normalized": false,
160
  "rstrip": false,
@@ -162,7 +162,7 @@
162
  "special": true
163
  },
164
  "20": {
165
- "content": "<reserved9>",
166
  "lstrip": false,
167
  "normalized": false,
168
  "rstrip": false,
@@ -170,7 +170,7 @@
170
  "special": true
171
  },
172
  "21": {
173
- "content": "<reserved10>",
174
  "lstrip": false,
175
  "normalized": false,
176
  "rstrip": false,
@@ -178,7 +178,7 @@
178
  "special": true
179
  },
180
  "22": {
181
- "content": "<reserved11>",
182
  "lstrip": false,
183
  "normalized": false,
184
  "rstrip": false,
@@ -186,7 +186,7 @@
186
  "special": true
187
  },
188
  "23": {
189
- "content": "<reserved12>",
190
  "lstrip": false,
191
  "normalized": false,
192
  "rstrip": false,
@@ -194,7 +194,7 @@
194
  "special": true
195
  },
196
  "24": {
197
- "content": "<reserved13>",
198
  "lstrip": false,
199
  "normalized": false,
200
  "rstrip": false,
@@ -202,7 +202,7 @@
202
  "special": true
203
  },
204
  "25": {
205
- "content": "<reserved14>",
206
  "lstrip": false,
207
  "normalized": false,
208
  "rstrip": false,
@@ -210,6 +210,14 @@
210
  "special": true
211
  },
212
  "26": {
 
 
 
 
 
 
 
 
213
  "content": "<reserved15>",
214
  "lstrip": false,
215
  "normalized": false,
@@ -226,11 +234,13 @@
226
  "special": true
227
  }
228
  },
229
- "bos_token": "<|endoftext|>",
230
  "clean_up_tokenization_spaces": false,
231
- "eos_token": "<|endoftext|>",
 
232
  "extra_special_tokens": {},
233
  "model_max_length": 1000000000000000019884624838656,
 
234
  "tokenizer_class": "GPT2Tokenizer",
235
- "unk_token": "<|endoftext|>"
236
  }
 
34
  "special": true
35
  },
36
  "4": {
37
+ "content": "<unk>",
38
  "lstrip": false,
39
  "normalized": false,
40
  "rstrip": false,
 
42
  "special": true
43
  },
44
  "5": {
45
+ "content": "<mask>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
 
50
  "special": true
51
  },
52
  "6": {
53
+ "content": "<general><semantic_similarity>",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
 
58
  "special": true
59
  },
60
  "7": {
61
+ "content": "<classification>",
62
  "lstrip": false,
63
  "normalized": false,
64
  "rstrip": false,
 
66
  "special": true
67
  },
68
  "8": {
69
+ "content": "<clustering>",
70
  "lstrip": false,
71
  "normalized": false,
72
  "rstrip": false,
 
74
  "special": true
75
  },
76
  "9": {
77
+ "content": "<retrieval_query>",
78
  "lstrip": false,
79
  "normalized": false,
80
  "rstrip": false,
 
82
  "special": true
83
  },
84
  "10": {
85
+ "content": "<retrieval_document>",
86
  "lstrip": false,
87
  "normalized": false,
88
  "rstrip": false,
 
90
  "special": true
91
  },
92
  "11": {
93
+ "content": "<code_retrieval_query>",
94
  "lstrip": false,
95
  "normalized": false,
96
  "rstrip": false,
 
98
  "special": true
99
  },
100
  "12": {
101
+ "content": "<reserved0>",
102
  "lstrip": false,
103
  "normalized": false,
104
  "rstrip": false,
 
106
  "special": true
107
  },
108
  "13": {
109
+ "content": "<reserved1>",
110
  "lstrip": false,
111
  "normalized": false,
112
  "rstrip": false,
 
114
  "special": true
115
  },
116
  "14": {
117
+ "content": "<reserved2>",
118
  "lstrip": false,
119
  "normalized": false,
120
  "rstrip": false,
 
122
  "special": true
123
  },
124
  "15": {
125
+ "content": "<reserved3>",
126
  "lstrip": false,
127
  "normalized": false,
128
  "rstrip": false,
 
130
  "special": true
131
  },
132
  "16": {
133
+ "content": "<reserved4>",
134
  "lstrip": false,
135
  "normalized": false,
136
  "rstrip": false,
 
138
  "special": true
139
  },
140
  "17": {
141
+ "content": "<reserved5>",
142
  "lstrip": false,
143
  "normalized": false,
144
  "rstrip": false,
 
146
  "special": true
147
  },
148
  "18": {
149
+ "content": "<reserved6>",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,
 
154
  "special": true
155
  },
156
  "19": {
157
+ "content": "<reserved7>",
158
  "lstrip": false,
159
  "normalized": false,
160
  "rstrip": false,
 
162
  "special": true
163
  },
164
  "20": {
165
+ "content": "<reserved8>",
166
  "lstrip": false,
167
  "normalized": false,
168
  "rstrip": false,
 
170
  "special": true
171
  },
172
  "21": {
173
+ "content": "<reserved9>",
174
  "lstrip": false,
175
  "normalized": false,
176
  "rstrip": false,
 
178
  "special": true
179
  },
180
  "22": {
181
+ "content": "<reserved10>",
182
  "lstrip": false,
183
  "normalized": false,
184
  "rstrip": false,
 
186
  "special": true
187
  },
188
  "23": {
189
+ "content": "<reserved11>",
190
  "lstrip": false,
191
  "normalized": false,
192
  "rstrip": false,
 
194
  "special": true
195
  },
196
  "24": {
197
+ "content": "<reserved12>",
198
  "lstrip": false,
199
  "normalized": false,
200
  "rstrip": false,
 
202
  "special": true
203
  },
204
  "25": {
205
+ "content": "<reserved13>",
206
  "lstrip": false,
207
  "normalized": false,
208
  "rstrip": false,
 
210
  "special": true
211
  },
212
  "26": {
213
+ "content": "<reserved14>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
  "content": "<reserved15>",
222
  "lstrip": false,
223
  "normalized": false,
 
234
  "special": true
235
  }
236
  },
237
+ "bos_token": "<bos>",
238
  "clean_up_tokenization_spaces": false,
239
+ "cls_token": "<classification>",
240
+ "eos_token": "<eos>",
241
  "extra_special_tokens": {},
242
  "model_max_length": 1000000000000000019884624838656,
243
+ "pad_token": "<pad>",
244
  "tokenizer_class": "GPT2Tokenizer",
245
+ "unk_token": "<unk>"
246
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff