jonno1984 committed on
Commit
fe7415e
·
1 Parent(s): e9373b1

Upload ./ with huggingface_hub

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +190 -0
  3. tokenizer_config.json +10 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "</s>",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "</s>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<s>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "[CLS]",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "[SEP]",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ }
69
+ ],
70
+ "normalizer": null,
71
+ "pre_tokenizer": {
72
+ "type": "Whitespace"
73
+ },
74
+ "post_processor": {
75
+ "type": "TemplateProcessing",
76
+ "single": [
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[CLS]",
80
+ "type_id": 0
81
+ }
82
+ },
83
+ {
84
+ "Sequence": {
85
+ "id": "A",
86
+ "type_id": 0
87
+ }
88
+ },
89
+ {
90
+ "SpecialToken": {
91
+ "id": "[SEP]",
92
+ "type_id": 0
93
+ }
94
+ }
95
+ ],
96
+ "pair": [
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[CLS]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "A",
106
+ "type_id": 0
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 0
113
+ }
114
+ },
115
+ {
116
+ "Sequence": {
117
+ "id": "B",
118
+ "type_id": 1
119
+ }
120
+ },
121
+ {
122
+ "SpecialToken": {
123
+ "id": "[SEP]",
124
+ "type_id": 1
125
+ }
126
+ }
127
+ ],
128
+ "special_tokens": {
129
+ "[CLS]": {
130
+ "id": "[CLS]",
131
+ "ids": [
132
+ 5
133
+ ],
134
+ "tokens": [
135
+ "[CLS]"
136
+ ]
137
+ },
138
+ "[SEP]": {
139
+ "id": "[SEP]",
140
+ "ids": [
141
+ 6
142
+ ],
143
+ "tokens": [
144
+ "[SEP]"
145
+ ]
146
+ }
147
+ }
148
+ },
149
+ "decoder": {
150
+ "type": "WordPiece",
151
+ "prefix": "##",
152
+ "cleanup": true
153
+ },
154
+ "model": {
155
+ "type": "WordPiece",
156
+ "unk_token": "<unk>",
157
+ "continuing_subword_prefix": "##",
158
+ "max_input_chars_per_word": 100,
159
+ "vocab": {
160
+ "<pad>": 0,
161
+ "</s>": 1,
162
+ "<s>": 2,
163
+ "<unk>": 3,
164
+ "[MASK]": 4,
165
+ "[CLS]": 5,
166
+ "[SEP]": 6,
167
+ "1": 7,
168
+ "2": 8,
169
+ "3": 9,
170
+ "4": 10,
171
+ "5": 11,
172
+ "A": 12,
173
+ "B": 13,
174
+ "C": 14,
175
+ "D": 15,
176
+ "E": 16,
177
+ "F": 17,
178
+ "G": 18,
179
+ "H": 19,
180
+ "I": 20,
181
+ "J": 21,
182
+ "K": 22,
183
+ "L": 23,
184
+ "M": 24,
185
+ "N": 25,
186
+ "O": 26,
187
+ "P": 27
188
+ }
189
+ }
190
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "</s>",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "[SEP]",
8
+ "tokenizer_class": "PreTrainedTokenizerFast",
9
+ "unk_token": "<unk>"
10
+ }