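"""Export a tiktoken BPE tokenizer model to a flat binary file.

Loads a Llama 3 style tiktoken model (base BPE vocabulary plus 256 reserved
special tokens) and writes a .bin file next to it: a uint32 max token length
header, then one (float32 score, uint32 length, raw bytes) record per token.
Pass -t/--tokenizer-model to use a custom model path; otherwise
"tokenizer.model" in the working directory is assumed.
"""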
import os
import struct
import argparse
from pathlib import Path
from typing import List

import tiktoken
from tiktoken.load import load_tiktoken_bpe

TOKENIZER_MODEL = "tokenizer.model"  # default path to the tiktoken BPE model file


class Tokenizer:
    # pre-tokenization regex that tiktoken uses to split text before BPE merging
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        mergeable_ranks = load_tiktoken_bpe(model_path)
        self.model_path = model_path

        # the full vocabulary is the base BPE tokens plus a block of reserved special tokens
        num_base_tokens = len(mergeable_ranks)
        num_reserved_special_tokens = 256

        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, num_reserved_special_tokens - 5)
        ]
        # special tokens get the ids immediately after the base vocabulary
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.n_words = self.model.n_vocab
        # BOS / EOS token ids
        self.bos_id = self.special_tokens["<|begin_of_text|>"]
        self.eos_id = self.special_tokens["<|end_of_text|>"]
        self.pad_id = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }

    def encode(
        self, s: str, bos: bool, eos: bool, allowed_special, disallowed_special
    ) -> List[int]:
        assert type(s) is str
        t = self.model.encode(
            s,
            allowed_special=allowed_special,
            disallowed_special=disallowed_special,
        )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: List[int]) -> str:
        return self.model.decode(t)
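    # A quick round-trip sketch (assumes a valid tokenizer model on disk):
    #   tok = Tokenizer()
    #   ids = tok.encode("hello world", bos=False, eos=False,
    #                    allowed_special=set(), disallowed_special=())
    #   assert tok.decode(ids) == "hello world"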

    def export(self):
        # collect each token's raw bytes; the token id (its BPE rank) serves as the score
        tokens, scores = [], []
        for i in range(self.n_words):
            t = self.model.decode_single_token_bytes(i)
            s = i
            tokens.append(t)
            scores.append(s)

        # record the longest token so readers can size their buffers up front
        max_token_length = max(len(t) for t in tokens)

        # write the binary file: same path as the .model file, but with a .bin suffix
        tokenizer_bin = self.model_path.replace(".model", ".bin")
        with open(tokenizer_bin, "wb") as f:
            # header: the max token length as a uint32
            f.write(struct.pack("I", max_token_length))
            # body: one record per token: float32 score, uint32 byte length, raw bytes
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
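

# A minimal read-back sketch for the .bin format written by export() above; the
# function name and its presence here are illustrative, not part of the export flow.
def read_tokenizer_bin(path: str):
    tokens, scores = [], []
    with open(path, "rb") as f:
        # header: uint32 max token length
        (max_token_length,) = struct.unpack("I", f.read(4))
        while True:
            # each record: float32 score, uint32 byte length, then the raw bytes
            header = f.read(8)
            if not header:
                break
            score, length = struct.unpack("fI", header)
            scores.append(score)
            tokens.append(f.read(length))
    return max_token_length, tokens, scores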


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        "--tokenizer-model",
        type=str,
        help="optional path to a custom tokenizer .model file",
    )
    args = parser.parse_args()

    t = Tokenizer(args.tokenizer_model)
    t.export()