vojtam commited on
Commit
cc59e26
·
verified ·
1 Parent(s): f919f24

Upload tokenizer.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.json +231 -0
tokenizer.json ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[PAD]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|endoftext|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "ByteLevel",
37
+ "add_prefix_space": false,
38
+ "trim_offsets": true,
39
+ "use_regex": true
40
+ },
41
+ "post_processor": {
42
+ "type": "ByteLevel",
43
+ "add_prefix_space": true,
44
+ "trim_offsets": false,
45
+ "use_regex": true
46
+ },
47
+ "decoder": {
48
+ "type": "ByteLevel",
49
+ "add_prefix_space": true,
50
+ "trim_offsets": true,
51
+ "use_regex": true
52
+ },
53
+ "model": {
54
+ "type": "BPE",
55
+ "dropout": null,
56
+ "unk_token": null,
57
+ "continuing_subword_prefix": "",
58
+ "end_of_word_suffix": "",
59
+ "fuse_unk": false,
60
+ "byte_fallback": false,
61
+ "ignore_merges": false,
62
+ "vocab": {
63
+ "<unk>": 0,
64
+ "[PAD]": 1,
65
+ "<|endoftext|>": 2,
66
+ "AA": 3,
67
+ "TT": 4,
68
+ "TG": 5,
69
+ "CA": 6,
70
+ "CC": 7,
71
+ "TA": 8,
72
+ "GG": 9,
73
+ "TC": 10,
74
+ "GA": 11,
75
+ "AAA": 12,
76
+ "GC": 13,
77
+ "TAA": 14,
78
+ "TTTT": 15,
79
+ "TCA": 16,
80
+ "TGA": 17,
81
+ "TTA": 18,
82
+ "GAA": 19,
83
+ "TCC": 20,
84
+ "CAA": 21,
85
+ "CTG": 22,
86
+ "CTT": 23,
87
+ "GTG": 24,
88
+ "GTT": 25,
89
+ "GCA": 26,
90
+ "A": 27,
91
+ "T": 28,
92
+ "G": 29,
93
+ "C": 30,
94
+ "▁": 31
95
+ },
96
+ "merges": [
97
+ [
98
+ "A",
99
+ "A"
100
+ ],
101
+ [
102
+ "T",
103
+ "T"
104
+ ],
105
+ [
106
+ "T",
107
+ "G"
108
+ ],
109
+ [
110
+ "C",
111
+ "A"
112
+ ],
113
+ [
114
+ "C",
115
+ "C"
116
+ ],
117
+ [
118
+ "T",
119
+ "A"
120
+ ],
121
+ [
122
+ "G",
123
+ "G"
124
+ ],
125
+ [
126
+ "T",
127
+ "C"
128
+ ],
129
+ [
130
+ "G",
131
+ "A"
132
+ ],
133
+ [
134
+ "A",
135
+ "AA"
136
+ ],
137
+ [
138
+ "AA",
139
+ "A"
140
+ ],
141
+ [
142
+ "G",
143
+ "C"
144
+ ],
145
+ [
146
+ "T",
147
+ "AA"
148
+ ],
149
+ [
150
+ "TA",
151
+ "A"
152
+ ],
153
+ [
154
+ "TT",
155
+ "TT"
156
+ ],
157
+ [
158
+ "T",
159
+ "CA"
160
+ ],
161
+ [
162
+ "TC",
163
+ "A"
164
+ ],
165
+ [
166
+ "T",
167
+ "GA"
168
+ ],
169
+ [
170
+ "TG",
171
+ "A"
172
+ ],
173
+ [
174
+ "T",
175
+ "TA"
176
+ ],
177
+ [
178
+ "TT",
179
+ "A"
180
+ ],
181
+ [
182
+ "G",
183
+ "AA"
184
+ ],
185
+ [
186
+ "GA",
187
+ "A"
188
+ ],
189
+ [
190
+ "T",
191
+ "CC"
192
+ ],
193
+ [
194
+ "TC",
195
+ "C"
196
+ ],
197
+ [
198
+ "C",
199
+ "AA"
200
+ ],
201
+ [
202
+ "CA",
203
+ "A"
204
+ ],
205
+ [
206
+ "C",
207
+ "TG"
208
+ ],
209
+ [
210
+ "C",
211
+ "TT"
212
+ ],
213
+ [
214
+ "G",
215
+ "TG"
216
+ ],
217
+ [
218
+ "G",
219
+ "TT"
220
+ ],
221
+ [
222
+ "G",
223
+ "CA"
224
+ ],
225
+ [
226
+ "GC",
227
+ "A"
228
+ ]
229
+ ]
230
+ }
231
+ }