import os
import pickle

from transformers import PreTrainedTokenizer


class CustomTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer backed by the stoi/itos maps in meta.pkl."""

    def __init__(self):
        meta_path = "data/babylm2024/meta.pkl"
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        # Character-to-id and id-to-character maps built at data-prep time.
        self.stoi, self.itos = meta["stoi"], meta["itos"]
        # stoi/itos must be set before super().__init__(), which consults the
        # tokenizer's vocabulary during initialization.
        super().__init__()

    # Note: overriding the public tokenize/encode/decode bypasses the
    # PreTrainedTokenizer machinery (special tokens, truncation, padding).
    def tokenize(self, text):
        # Character-level tokenization: every character is its own token.
        return list(text)

    def encode(self, text):
        return [self.stoi[c] for c in text]

    def decode(self, ids):
        # `ids` is a sequence of token ids.
        return "".join(self.itos[i] for i in ids)
    def convert_tokens_to_ids(self, tokens):
        return [self.stoi[t] for t in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.itos[i] for i in ids]

    def get_vocab(self):
        # Return a copy so callers cannot mutate the internal map.
        return dict(self.stoi)
    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return len(self.stoi)
| |
| def save_vocabulary(self, save_directory, **kwargs): |
| vocab_file_path = os.path.join(save_directory, "vocab.txt") |
| with open(vocab_file_path, "w") as f: |
| for token in self.get_vocab(): |
| f.write(token + "\n") |
| return (vocab_file_path,) |
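

# Hypothetical sketch of how a compatible meta.pkl could be produced
# (nanoGPT-style character-level prep); the real pipeline behind
# data/babylm2024 may differ, and build_meta is not part of this module.
def build_meta(corpus_path, meta_path):
    with open(corpus_path, "r", encoding="utf-8") as f:
        text = f.read()
    chars = sorted(set(text))
    meta = {
        "stoi": {ch: i for i, ch in enumerate(chars)},
        "itos": {i: ch for i, ch in enumerate(chars)},
        "vocab_size": len(chars),
    }
    with open(meta_path, "wb") as f:
        pickle.dump(meta, f)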
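

# Usage sketch: a minimal round-trip check. Assumes data/babylm2024/meta.pkl
# exists with 'stoi'/'itos' maps and that its character set covers the
# sample string below.
if __name__ == "__main__":
    tokenizer = CustomTokenizer()
    ids = tokenizer.encode("hello world")
    print("ids:", ids)
    print("decoded:", tokenizer.decode(ids))
    print("vocab size:", tokenizer.vocab_size)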