AlonKellner-Jounce committed on
Commit
ca2f1bc
·
verified ·
1 Parent(s): b15d526

simpler tokenizer

Browse files
config.json CHANGED
@@ -26,5 +26,5 @@
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.46.1",
28
  "use_cache": true,
29
- "vocab_size": 25
30
  }
 
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.46.1",
28
  "use_cache": true,
29
+ "vocab_size": 6
30
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b43fba3a61c14ef075dbe086c7b9239b13b0d843d936e8b3bba923824ad4433
3
- size 142920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56b706c1b3d4fff35a09250fce0a43595ec3fe4829184f029f39881699899daf
3
+ size 138048
special_tokens_map.json CHANGED
@@ -1,51 +1,9 @@
1
  {
2
- "bos_token": {
3
- "content": "[CLS]",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "[CLS]",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "[SEP]",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "[MASK]",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "[PAD]",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "[SEP]",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
- "unk_token": {
45
- "content": "[UNK]",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": false,
49
- "single_word": false
50
- }
51
  }
 
1
  {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  }
tokenizer.json CHANGED
@@ -1,16 +1,7 @@
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
- "padding": {
5
- "strategy": {
6
- "Fixed": 10
7
- },
8
- "direction": "Right",
9
- "pad_to_multiple_of": null,
10
- "pad_id": 3,
11
- "pad_type_id": 0,
12
- "pad_token": "[PAD]"
13
- },
14
  "added_tokens": [
15
  {
16
  "id": 0,
@@ -168,26 +159,7 @@
168
  "[SEP]": 2,
169
  "[PAD]": 3,
170
  "[MASK]": 4,
171
- ".": 5,
172
- "this": 6,
173
- "sentence": 7,
174
- "is": 8,
175
- "one": 9,
176
- "the": 10,
177
- "2": 11,
178
- "#)": 12,
179
- "3": 13,
180
- "numbers": 14,
181
- "symbols": 15,
182
- "first": 16,
183
- "contains": 17,
184
- "1": 18,
185
- "not": 19,
186
- "but": 20,
187
- "second": 21,
188
- "(": 22,
189
- "and": 23,
190
- "over": 24
191
  },
192
  "unk_token": "[UNK]"
193
  }
 
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
159
  "[SEP]": 2,
160
  "[PAD]": 3,
161
  "[MASK]": 4,
162
+ "token": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  },
164
  "unk_token": "[UNK]"
165
  }
tokenizer_config.json CHANGED
@@ -47,11 +47,9 @@
47
  "cls_token": "[CLS]",
48
  "eos_token": "[SEP]",
49
  "mask_token": "[MASK]",
50
- "max_length": 1048576,
51
  "model_max_length": 1048576,
52
- "pad_to_multiple_of": null,
53
  "pad_token": "[PAD]",
54
- "pad_token_type_id": 0,
55
  "padding_side": "right",
56
  "sep_token": "[SEP]",
57
  "tokenizer_class": "PreTrainedTokenizerFast",
 
47
  "cls_token": "[CLS]",
48
  "eos_token": "[SEP]",
49
  "mask_token": "[MASK]",
50
+ "max_new_tokens": 1048576,
51
  "model_max_length": 1048576,
 
52
  "pad_token": "[PAD]",
 
53
  "padding_side": "right",
54
  "sep_token": "[SEP]",
55
  "tokenizer_class": "PreTrainedTokenizerFast",