JakeOh committed
Commit 3426f14 · verified · 1 Parent(s): b9ae91c

Upload folder using huggingface_hub

Files changed (4):
  1. char_tokenizer.py +21 -1
  2. special_tokens_map.json +7 -0
  3. tokenizer_config.json +21 -8
  4. vocab.json +22 -16
char_tokenizer.py CHANGED
@@ -42,6 +42,11 @@ class CharTokenizer(PreTrainedTokenizer):
         unk_token = kwargs.pop("unk_token", "<unk>")
         bos_token = kwargs.pop("bos_token", "<s>")
         eos_token = kwargs.pop("eos_token", "</s>")
+        user_token = kwargs.pop("user_token", "<|user|>")
+        assistant_token = kwargs.pop("assistant_token", "<|assistant|>")
+        system_token = kwargs.pop("system_token", "<|system|>")
+        eot_token = kwargs.pop("eot_token", "<|end|>")
+        mask_token = kwargs.pop("mask_token", "<|mdm_mask|>")
 
         # Initialize vocab dictionaries first
         self.char_to_id = {}
@@ -60,7 +65,17 @@ class CharTokenizer(PreTrainedTokenizer):
             }
         elif characters is not None:
             # Build vocabulary from characters
-            special_tokens = [pad_token, unk_token, bos_token, eos_token]
+            special_tokens = [
+                pad_token,
+                unk_token,
+                bos_token,
+                eos_token,
+                user_token,
+                assistant_token,
+                system_token,
+                eot_token,
+                mask_token,
+            ]
             unique_chars = []
             for char in characters:
                 if char not in unique_chars and char not in special_tokens:
@@ -74,6 +89,11 @@ class CharTokenizer(PreTrainedTokenizer):
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
+            user_token=user_token,
+            assistant_token=assistant_token,
+            system_token=system_token,
+            eot_token=eot_token,
+            mask_token=mask_token,
             model_max_length=model_max_length,
             padding_side=padding_side,
             **kwargs,
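These defaults register the chat and mask tokens before the character vocabulary is built, so each one keeps a dedicated id. A minimal usage sketch (the repo id below is hypothetical; loading the custom class goes through the auto_map entry, so trust_remote_code=True is required):

from transformers import AutoTokenizer

# Hypothetical repo id; substitute this repo's actual Hub path.
tok = AutoTokenizer.from_pretrained("JakeOh/char-tokenizer", trust_remote_code=True)

# The role tokens now resolve to single ids (4-8 per vocab.json)
# instead of being split into individual characters.
print(tok.convert_tokens_to_ids("<|user|>"))      # expected: 4
print(tok.convert_tokens_to_ids("<|mdm_mask|>"))  # expected: 8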
special_tokens_map.json CHANGED
@@ -27,6 +27,13 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false
+    },
+    {
+      "content": "<|mdm_mask|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
     }
   ],
   "bos_token": "<|startoftext|>",
tokenizer_config.json CHANGED
@@ -16,55 +16,68 @@
       "single_word": false,
       "special": true
     },
-    "19": {
-      "content": "<|system|>",
+    "4": {
+      "content": "<|user|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "20": {
-      "content": "<|user|>",
+    "5": {
+      "content": "<|assistant|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "21": {
-      "content": "<|assistant|>",
+    "6": {
+      "content": "<|system|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "22": {
+    "7": {
       "content": "<|end|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "8": {
+      "content": "<|mdm_mask|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
     "<|system|>",
     "<|user|>",
     "<|assistant|>",
-    "<|end|>"
+    "<|end|>",
+    "<|mdm_mask|>"
   ],
+  "assistant_token": "<|assistant|>",
   "bos_token": "<|startoftext|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
+  "eot_token": "<|end|>",
   "extra_special_tokens": {},
   "mask_token": "<|mdm_mask|>",
   "model_max_length": 4096,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
+  "system_token": "<|system|>",
   "tokenizer_class": "CharTokenizer",
   "unk_token": null,
+  "user_token": "<|user|>",
  "auto_map": {
     "AutoTokenizer": [
       "char_tokenizer.CharTokenizer",
vocab.json CHANGED
@@ -2,20 +2,26 @@
   "<|endoftext|>": 3,
   "null": 1,
   "<|startoftext|>": 2,
-  "*": 4,
-  "+": 5,
-  "-": 6,
-  "/": 7,
-  "0": 8,
-  "1": 9,
-  "2": 10,
-  "3": 11,
-  "4": 12,
-  "5": 13,
-  "6": 14,
-  "7": 15,
-  "8": 16,
-  "9": 17,
-  "=": 18,
-  "?": 19
+  "<|user|>": 4,
+  "<|assistant|>": 5,
+  "<|system|>": 6,
+  "<|end|>": 7,
+  "<|mdm_mask|>": 8,
+  "\n": 9,
+  "*": 10,
+  "+": 11,
+  "-": 12,
+  "/": 13,
+  "0": 14,
+  "1": 15,
+  "2": 16,
+  "3": 17,
+  "4": 18,
+  "5": 19,
+  "6": 20,
+  "7": 21,
+  "8": 22,
+  "9": 23,
+  "=": 24,
+  "?": 25
 }
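Every pre-existing character id shifts up by six to make room for the five special tokens and the new "\n" entry, so a consistency check against the shipped file is cheap. A minimal sketch, assuming vocab.json sits in the working directory:

import json

with open("vocab.json") as f:
    vocab = json.load(f)

# Special tokens occupy the new low ids; characters follow contiguously.
assert vocab["<|user|>"] == 4 and vocab["<|mdm_mask|>"] == 8
assert vocab["\n"] == 9 and vocab["?"] == 25
assert len(set(vocab.values())) == len(vocab)  # no duplicate ids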