Commit 6c7419a
1 Parent(s): 9921aee
add pad token and default eos token
Browse files
- tokenization_xgen.py +14 -5
tokenization_xgen.py
CHANGED
|
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
|
|
| 25 |
}
|
| 26 |
|
| 27 |
|
| 28 |
-
def tiktoken_tokenizer(base="gpt2", add_special=True):
|
| 29 |
if not add_special:
|
| 30 |
return tiktoken.get_encoding(base)
|
| 31 |
|
|
@@ -83,6 +83,9 @@ def tiktoken_tokenizer(base="gpt2", add_special=True):
|
|
| 83 |
special_tokens[sp] = idx
|
| 84 |
idx += 1
|
| 85 |
|
|
|
|
|
|
|
|
|
|
| 86 |
# In production, load the arguments directly instead of accessing private attributes
|
| 87 |
# See openai_public.py for examples of arguments for specific encodings
|
| 88 |
enc = tiktoken.Encoding(
|
|
@@ -112,19 +115,22 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
| 112 |
def __init__(
|
| 113 |
self,
|
| 114 |
pad_token=None,
|
|
|
|
| 115 |
add_eos_token=False,
|
| 116 |
add_special_tokens=True,
|
| 117 |
**kwargs,
|
| 118 |
):
|
| 119 |
-
|
|
|
|
| 120 |
super().__init__(
|
| 121 |
-
pad_token=
|
|
|
|
| 122 |
add_eos_token=add_eos_token,
|
| 123 |
add_special_tokens=add_special_tokens,
|
| 124 |
**kwargs,
|
| 125 |
)
|
| 126 |
self.add_eos_token = add_eos_token
|
| 127 |
-
self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
|
| 128 |
|
| 129 |
@property
|
| 130 |
def vocab_size(self):
|
|
@@ -142,6 +148,9 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
| 142 |
|
| 143 |
def _convert_token_to_id(self, token):
|
| 144 |
"""Converts a token (str) in an id using the vocab."""
|
|
|
|
|
|
|
|
|
|
| 145 |
return token
|
| 146 |
|
| 147 |
def _convert_id_to_token(self, index):
|
|
@@ -216,4 +225,4 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
| 216 |
if token_ids_1 is not None:
|
| 217 |
output += [1] * len(token_ids_1 + eos_token_id)
|
| 218 |
|
| 219 |
-
return output
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
|
| 28 |
+
def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
| 29 |
if not add_special:
|
| 30 |
return tiktoken.get_encoding(base)
|
| 31 |
|
|
|
|
| 83 |
special_tokens[sp] = idx
|
| 84 |
idx += 1
|
| 85 |
|
| 86 |
+
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
| 87 |
+
special_tokens[pad_token] = idx
|
| 88 |
+
idx += 1
|
| 89 |
# In production, load the arguments directly instead of accessing private attributes
|
| 90 |
# See openai_public.py for examples of arguments for specific encodings
|
| 91 |
enc = tiktoken.Encoding(
|
|
|
|
| 115 |
def __init__(
|
| 116 |
self,
|
| 117 |
pad_token=None,
|
| 118 |
+
eos_token="<|endoftext|>",
|
| 119 |
add_eos_token=False,
|
| 120 |
add_special_tokens=True,
|
| 121 |
**kwargs,
|
| 122 |
):
|
| 123 |
+
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
| 124 |
+
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
| 125 |
super().__init__(
|
| 126 |
+
pad_token=pad_token_added,
|
| 127 |
+
eos_token=eos_token_added,
|
| 128 |
add_eos_token=add_eos_token,
|
| 129 |
add_special_tokens=add_special_tokens,
|
| 130 |
**kwargs,
|
| 131 |
)
|
| 132 |
self.add_eos_token = add_eos_token
|
| 133 |
+
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
| 134 |
|
| 135 |
@property
|
| 136 |
def vocab_size(self):
|
|
|
|
| 148 |
|
| 149 |
def _convert_token_to_id(self, token):
|
| 150 |
"""Converts a token (str) in an id using the vocab."""
|
| 151 |
+
if isinstance(token, str):
|
| 152 |
+
ids = self._tokenize(token)
|
| 153 |
+
return ids[0]
|
| 154 |
return token
|
| 155 |
|
| 156 |
def _convert_id_to_token(self, index):
|
|
|
|
| 225 |
if token_ids_1 is not None:
|
| 226 |
output += [1] * len(token_ids_1 + eos_token_id)
|
| 227 |
|
| 228 |
+
return output
|