fix: re-ordering special tokens
Browse files- tokenization_arcade100k.py +12 -10
tokenization_arcade100k.py
CHANGED
|
@@ -42,12 +42,14 @@ def _arcade100k(vocab_file: str):
|
|
| 42 |
mergeable_ranks = _load_tiktoken_bpe(vocab_file)
|
| 43 |
|
| 44 |
ENDOFTEXT = "<|endoftext|>"
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
CODE = [
|
| 47 |
-
"<fim_prefix>",
|
| 48 |
-
"<fim_middle>",
|
| 49 |
-
"<fim_suffix>",
|
| 50 |
-
"<fim_pad>",
|
| 51 |
"<gh_stars>",
|
| 52 |
"<filename>",
|
| 53 |
"<issue_start>",
|
|
@@ -68,10 +70,9 @@ def _arcade100k(vocab_file: str):
|
|
| 68 |
"<|im_end|>", # Chat: Input message end
|
| 69 |
]
|
| 70 |
PAUSE = "<|pause|>" # Think before you speak (https://arxiv.org/abs/2310.02226)
|
| 71 |
-
REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)] # Register
|
| 72 |
ENDOFPROMPT = "<|endofprompt|>"
|
| 73 |
-
|
| 74 |
-
SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS
|
| 75 |
START_ID = len(mergeable_ranks) + 1
|
| 76 |
SPECIAL_TOKENS = {
|
| 77 |
t: START_ID + i
|
|
@@ -110,8 +111,9 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
|
|
| 110 |
**kwargs,
|
| 111 |
):
|
| 112 |
super().__init__(errors=errors, **kwargs)
|
| 113 |
-
self._tiktoken_config = _arcade100k(vocab_file)
|
| 114 |
self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
|
|
|
|
| 115 |
# TODO: Remove this assertion
|
| 116 |
assert (
|
| 117 |
len(self.tokenizer._mergeable_ranks)
|
|
@@ -174,7 +176,7 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
|
|
| 174 |
Returns:
|
| 175 |
`Tuple(str)`: Paths to the files saved.
|
| 176 |
"""
|
| 177 |
-
file_path = os.path.join(save_directory, "arcade100k.tiktoken")
|
| 178 |
with open(file_path, "w", encoding="utf8") as w:
|
| 179 |
for k, v in self.tokenizer._mergeable_ranks.items():
|
| 180 |
line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
|
|
|
|
| 42 |
mergeable_ranks = _load_tiktoken_bpe(vocab_file)
|
| 43 |
|
| 44 |
ENDOFTEXT = "<|endoftext|>"
|
| 45 |
+
FIM = [
|
| 46 |
+
"<|fim_prefix|>",
|
| 47 |
+
"<|fim_middle|>",
|
| 48 |
+
"<|fim_suffix|>",
|
| 49 |
+
"<|fim_pad|>",
|
| 50 |
+
]
|
| 51 |
+
# `StarCoder` Tokens
|
| 52 |
CODE = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
"<gh_stars>",
|
| 54 |
"<filename>",
|
| 55 |
"<issue_start>",
|
|
|
|
| 70 |
"<|im_end|>", # Chat: Input message end
|
| 71 |
]
|
| 72 |
PAUSE = "<|pause|>" # Think before you speak (https://arxiv.org/abs/2310.02226)
|
| 73 |
+
REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)] # Register 0 sink token (https://arxiv.org/abs/2309.17453)
|
| 74 |
ENDOFPROMPT = "<|endofprompt|>"
|
| 75 |
+
SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + FIM + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS + ["<|extra0|>"]
|
|
|
|
| 76 |
START_ID = len(mergeable_ranks) + 1
|
| 77 |
SPECIAL_TOKENS = {
|
| 78 |
t: START_ID + i
|
|
|
|
| 111 |
**kwargs,
|
| 112 |
):
|
| 113 |
super().__init__(errors=errors, **kwargs)
|
| 114 |
+
self._tiktoken_config = _arcade100k(vocab_file)
|
| 115 |
self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
|
| 116 |
+
|
| 117 |
# TODO: Remove this assertion
|
| 118 |
assert (
|
| 119 |
len(self.tokenizer._mergeable_ranks)
|
|
|
|
| 176 |
Returns:
|
| 177 |
`Tuple(str)`: Paths to the files saved.
|
| 178 |
"""
|
| 179 |
+
file_path = os.path.join(save_directory, "arcade100k.tiktoken")
|
| 180 |
with open(file_path, "w", encoding="utf8") as w:
|
| 181 |
for k, v in self.tokenizer._mergeable_ranks.items():
|
| 182 |
line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
|