AlonKellner-Jounce committed on
Commit
f9d6e33
·
verified ·
1 Parent(s): ec0e62c

working tokenizer

Browse files
config.json CHANGED
@@ -24,7 +24,7 @@
24
  "sliding_window": null,
25
  "tie_word_embeddings": false,
26
  "torch_dtype": "bfloat16",
27
- "transformers_version": "4.45.1",
28
  "use_cache": true,
29
  "vocab_size": 25
30
  }
 
24
  "sliding_window": null,
25
  "tie_word_embeddings": false,
26
  "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.46.1",
28
  "use_cache": true,
29
  "vocab_size": 25
30
  }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
  "pad_token_id": 3,
6
- "transformers_version": "4.45.1"
7
  }
 
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
  "pad_token_id": 3,
6
+ "transformers_version": "4.46.1"
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:926d327bcb02cde99d75d29fb7e3d2656623e4ba0ed69fe910f759dbf1692e8a
3
  size 142920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619fe89397d93fd0a9d08c5ac59c8a0f3235948849bad255c9f4a570f15860b7
3
  size 142920
special_tokens_map.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "bos_token": "1",
3
- "cls_token": "1",
4
- "eos_token": "2",
5
- "mask_token": "4",
6
- "pad_token": "3",
7
- "sep_token": "2",
8
- "unk_token": "0"
9
  }
 
1
  {
2
+ "bos_token": "CLS",
3
+ "cls_token": "CLS",
4
+ "eos_token": "SEP",
5
+ "mask_token": "MASK",
6
+ "pad_token": "PAD",
7
+ "sep_token": "SEP",
8
+ "unk_token": "UNK"
9
  }
tokenizer.json CHANGED
@@ -5,93 +5,48 @@
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
- "content": "[UNK]",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
- "normalized": false,
13
- "special": true
14
  },
15
  {
16
  "id": 1,
17
- "content": "[CLS]",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
- "normalized": false,
22
- "special": true
23
  },
24
  {
25
  "id": 2,
26
- "content": "[SEP]",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
- "normalized": false,
31
- "special": true
32
  },
33
  {
34
  "id": 3,
35
- "content": "[PAD]",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
- "normalized": false,
40
- "special": true
41
  },
42
  {
43
  "id": 4,
44
- "content": "[MASK]",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
- "normalized": false,
49
- "special": true
50
- },
51
- {
52
- "id": 5,
53
- "content": "1",
54
- "single_word": false,
55
- "lstrip": false,
56
- "rstrip": false,
57
- "normalized": false,
58
- "special": true
59
- },
60
- {
61
- "id": 6,
62
- "content": "2",
63
- "single_word": false,
64
- "lstrip": false,
65
- "rstrip": false,
66
- "normalized": false,
67
- "special": true
68
- },
69
- {
70
- "id": 7,
71
- "content": "3",
72
- "single_word": false,
73
- "lstrip": false,
74
- "rstrip": false,
75
- "normalized": false,
76
- "special": true
77
- },
78
- {
79
- "id": 8,
80
- "content": "4",
81
- "single_word": false,
82
- "lstrip": false,
83
- "rstrip": false,
84
- "normalized": false,
85
- "special": true
86
- },
87
- {
88
- "id": 14,
89
- "content": "0",
90
- "single_word": false,
91
- "lstrip": false,
92
- "rstrip": false,
93
- "normalized": false,
94
- "special": true
95
  }
96
  ],
97
  "normalizer": {
@@ -125,7 +80,7 @@
125
  "single": [
126
  {
127
  "SpecialToken": {
128
- "id": "[CLS]",
129
  "type_id": 0
130
  }
131
  },
@@ -137,7 +92,7 @@
137
  },
138
  {
139
  "SpecialToken": {
140
- "id": "[SEP]",
141
  "type_id": 0
142
  }
143
  }
@@ -145,7 +100,7 @@
145
  "pair": [
146
  {
147
  "SpecialToken": {
148
- "id": "[CLS]",
149
  "type_id": 0
150
  }
151
  },
@@ -157,7 +112,7 @@
157
  },
158
  {
159
  "SpecialToken": {
160
- "id": "[SEP]",
161
  "type_id": 0
162
  }
163
  },
@@ -169,28 +124,28 @@
169
  },
170
  {
171
  "SpecialToken": {
172
- "id": "[SEP]",
173
  "type_id": 1
174
  }
175
  }
176
  ],
177
  "special_tokens": {
178
- "[CLS]": {
179
- "id": "[CLS]",
180
  "ids": [
181
  1
182
  ],
183
  "tokens": [
184
- "[CLS]"
185
  ]
186
  },
187
- "[SEP]": {
188
- "id": "[SEP]",
189
  "ids": [
190
  2
191
  ],
192
  "tokens": [
193
- "[SEP]"
194
  ]
195
  }
196
  }
@@ -199,23 +154,32 @@
199
  "model": {
200
  "type": "WordLevel",
201
  "vocab": {
202
- "[UNK]": 0,
203
- "[CLS]": 1,
204
- "[SEP]": 2,
205
- "[PAD]": 3,
206
- "[MASK]": 4,
207
- "1": 5,
208
- "2": 6,
209
- "3": 7,
210
- "4": 8,
211
- "5": 9,
212
- "6": 10,
213
- "7": 11,
214
- "8": 12,
215
- "9": 13,
216
- "0": 14,
217
- "token": 15
 
 
 
 
 
 
 
 
 
218
  },
219
- "unk_token": "[UNK]"
220
  }
221
  }
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
+ "content": "UNK",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": true,
13
+ "special": false
14
  },
15
  {
16
  "id": 1,
17
+ "content": "CLS",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": true,
22
+ "special": false
23
  },
24
  {
25
  "id": 2,
26
+ "content": "SEP",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": true,
31
+ "special": false
32
  },
33
  {
34
  "id": 3,
35
+ "content": "PAD",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
+ "normalized": true,
40
+ "special": false
41
  },
42
  {
43
  "id": 4,
44
+ "content": "MASK",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
+ "normalized": true,
49
+ "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
  ],
52
  "normalizer": {
 
80
  "single": [
81
  {
82
  "SpecialToken": {
83
+ "id": "CLS",
84
  "type_id": 0
85
  }
86
  },
 
92
  },
93
  {
94
  "SpecialToken": {
95
+ "id": "SEP",
96
  "type_id": 0
97
  }
98
  }
 
100
  "pair": [
101
  {
102
  "SpecialToken": {
103
+ "id": "CLS",
104
  "type_id": 0
105
  }
106
  },
 
112
  },
113
  {
114
  "SpecialToken": {
115
+ "id": "SEP",
116
  "type_id": 0
117
  }
118
  },
 
124
  },
125
  {
126
  "SpecialToken": {
127
+ "id": "SEP",
128
  "type_id": 1
129
  }
130
  }
131
  ],
132
  "special_tokens": {
133
+ "CLS": {
134
+ "id": "CLS",
135
  "ids": [
136
  1
137
  ],
138
  "tokens": [
139
+ "CLS"
140
  ]
141
  },
142
+ "SEP": {
143
+ "id": "SEP",
144
  "ids": [
145
  2
146
  ],
147
  "tokens": [
148
+ "SEP"
149
  ]
150
  }
151
  }
 
154
  "model": {
155
  "type": "WordLevel",
156
  "vocab": {
157
+ "UNK": 0,
158
+ "CLS": 1,
159
+ "SEP": 2,
160
+ "PAD": 3,
161
+ "MASK": 4,
162
+ "boken": 5,
163
+ "coken": 6,
164
+ "doken": 7,
165
+ "foken": 8,
166
+ "goken": 9,
167
+ "hoken": 10,
168
+ "joken": 11,
169
+ "koken": 12,
170
+ "loken": 13,
171
+ "moken": 14,
172
+ "noken": 15,
173
+ "poken": 16,
174
+ "qoken": 17,
175
+ "roken": 18,
176
+ "soken": 19,
177
+ "token": 20,
178
+ "voken": 21,
179
+ "woken": 22,
180
+ "xoken": 23,
181
+ "yoken": 24
182
  },
183
+ "unk_token": "UNK"
184
  }
185
  }
tokenizer_config.json CHANGED
@@ -1,97 +1,57 @@
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "[UNK]",
5
  "lstrip": false,
6
- "normalized": false,
7
  "rstrip": false,
8
  "single_word": false,
9
- "special": true
10
  },
11
  "1": {
12
- "content": "[CLS]",
13
  "lstrip": false,
14
- "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
- "special": true
18
  },
19
  "2": {
20
- "content": "[SEP]",
21
  "lstrip": false,
22
- "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
- "special": true
26
  },
27
  "3": {
28
- "content": "[PAD]",
29
  "lstrip": false,
30
- "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
- "special": true
34
  },
35
  "4": {
36
- "content": "[MASK]",
37
  "lstrip": false,
38
- "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
- "special": true
42
- },
43
- "5": {
44
- "content": "1",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "6": {
52
- "content": "2",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "7": {
60
- "content": "3",
61
- "lstrip": false,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- },
67
- "8": {
68
- "content": "4",
69
- "lstrip": false,
70
- "normalized": false,
71
- "rstrip": false,
72
- "single_word": false,
73
- "special": true
74
- },
75
- "14": {
76
- "content": "0",
77
- "lstrip": false,
78
- "normalized": false,
79
- "rstrip": false,
80
- "single_word": false,
81
- "special": true
82
  }
83
  },
84
- "bos_token": "1",
85
  "chat_template": "{% for message in messages %}{{ message['content'] }}{% endfor %}",
86
  "clean_up_tokenization_spaces": false,
87
- "cls_token": "1",
88
- "eos_token": "2",
89
- "mask_token": "4",
90
  "max_new_tokens": 1048576,
91
  "model_max_length": 1048576,
92
- "pad_token": "3",
93
  "padding_side": "right",
94
- "sep_token": "2",
95
  "tokenizer_class": "PreTrainedTokenizerFast",
96
- "unk_token": "0"
97
  }
 
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "UNK",
5
  "lstrip": false,
6
+ "normalized": true,
7
  "rstrip": false,
8
  "single_word": false,
9
+ "special": false
10
  },
11
  "1": {
12
+ "content": "CLS",
13
  "lstrip": false,
14
+ "normalized": true,
15
  "rstrip": false,
16
  "single_word": false,
17
+ "special": false
18
  },
19
  "2": {
20
+ "content": "SEP",
21
  "lstrip": false,
22
+ "normalized": true,
23
  "rstrip": false,
24
  "single_word": false,
25
+ "special": false
26
  },
27
  "3": {
28
+ "content": "PAD",
29
  "lstrip": false,
30
+ "normalized": true,
31
  "rstrip": false,
32
  "single_word": false,
33
+ "special": false
34
  },
35
  "4": {
36
+ "content": "MASK",
37
  "lstrip": false,
38
+ "normalized": true,
39
  "rstrip": false,
40
  "single_word": false,
41
+ "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  },
44
+ "bos_token": "CLS",
45
  "chat_template": "{% for message in messages %}{{ message['content'] }}{% endfor %}",
46
  "clean_up_tokenization_spaces": false,
47
+ "cls_token": "CLS",
48
+ "eos_token": "SEP",
49
+ "mask_token": "MASK",
50
  "max_new_tokens": 1048576,
51
  "model_max_length": 1048576,
52
+ "pad_token": "PAD",
53
  "padding_side": "right",
54
+ "sep_token": "SEP",
55
  "tokenizer_class": "PreTrainedTokenizerFast",
56
+ "unk_token": "UNK"
57
  }