Update tokenization_xgen.py
Browse files — tokenization_xgen.py: +12 lines, −0 lines
tokenization_xgen.py
CHANGED
|
@@ -60,9 +60,18 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
|
| 60 |
]
|
| 61 |
return fim_tokens
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
add_whitespaces = include_whitespace(n_min=2, n_max=32)
|
| 64 |
add_tabs = include_tabs(n_min=2, n_max=10)
|
| 65 |
fim_tokens = include_fim_tokens()
|
|
|
|
| 66 |
|
| 67 |
tokenizer = tiktoken.get_encoding(base)
|
| 68 |
|
|
@@ -82,6 +91,9 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
|
| 82 |
for sp in fim_tokens:
|
| 83 |
special_tokens[sp] = idx
|
| 84 |
idx += 1
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
| 87 |
special_tokens[pad_token] = idx
|
|
|
|
| 60 |
]
|
| 61 |
return fim_tokens
|
| 62 |
|
| 63 |
+
def include_additional_tokens():
    """Return the extra special tokens appended after the FIM tokens.

    The list holds, in order: four placeholder tokens, a separator token,
    an end-of-mask token, and the span-mask tokens in descending numeric
    order (``<mask_881>`` first, ``<mask_1>`` last).
    """
    extra = [f"<dummy_{idx}>" for idx in range(4)]
    extra.append("<sep>")  # 50317
    extra.append("<eom>")  # 50318
    # Mask ids count down so that <mask_1> receives the highest vocabulary
    # slot — presumably 51199, per the id comments above; verify against
    # the caller's id-assignment loop.
    extra.extend(f"<mask_{idx}>" for idx in range(51199 - 50318, 0, -1))
    return extra
|
| 70 |
+
|
| 71 |
add_whitespaces = include_whitespace(n_min=2, n_max=32)
|
| 72 |
add_tabs = include_tabs(n_min=2, n_max=10)
|
| 73 |
fim_tokens = include_fim_tokens()
|
| 74 |
+
additional_tokens = include_additional_tokens()
|
| 75 |
|
| 76 |
tokenizer = tiktoken.get_encoding(base)
|
| 77 |
|
|
|
|
| 91 |
for sp in fim_tokens:
|
| 92 |
special_tokens[sp] = idx
|
| 93 |
idx += 1
|
| 94 |
+
for sp in additional_tokens:
|
| 95 |
+
special_tokens[sp] = idx
|
| 96 |
+
idx += 1
|
| 97 |
|
| 98 |
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
| 99 |
special_tokens[pad_token] = idx
|