# ESCM / hf_tokenizer.py
# Uploaded tokenizer by sayonarawildhearts (commit 1c3b9d4, verified)
from transformers import PreTrainedTokenizer
import pickle
import os
class CustomTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer backed by pickled ``stoi``/``itos`` tables.

    Each character of the input is one token; ids come straight from the
    ``stoi`` (char -> id) and ``itos`` (id -> char) mappings stored in the
    meta pickle produced by the project's preprocessing step.
    """

    def __init__(self, meta_path="data/babylm2024/meta.pkl", **kwargs):
        """Load the vocabulary tables and initialize the base tokenizer.

        Args:
            meta_path: Path to a pickle containing a dict with ``'stoi'``
                and ``'itos'`` keys. Defaults to the original hard-coded
                location, so existing call sites are unaffected.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__`` (the
                original swallowed these, breaking hub loading machinery
                that passes tokenizer kwargs through).
        """
        # NOTE(review): pickle.load is only safe on trusted files; meta.pkl
        # is produced locally by this project, so that holds here.
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        self.stoi, self.itos = meta["stoi"], meta["itos"]
        # The tables must exist before super().__init__, which may query
        # the vocabulary (e.g. via vocab_size) during setup.
        super().__init__(**kwargs)

    def tokenize(self, text):
        """Split *text* into a list of single-character tokens."""
        return list(text)

    def encode(self, text):
        """Map each character of *text* to its integer id.

        Raises:
            KeyError: If a character is not in the vocabulary.
        """
        return [self.stoi[c] for c in text]

    def decode(self, text):
        """Join the characters for a sequence of ids back into a string.

        Note: the parameter is kept as ``text`` for interface stability,
        but it is a sequence of integer token ids.
        """
        return "".join(self.itos[i] for i in text)

    def convert_tokens_to_ids(self, tokens):
        """Convert a list of single-character tokens to their ids."""
        return [self.stoi[t] for t in tokens]

    def convert_ids_to_tokens(self, ids):
        """Convert a list of ids back to single-character tokens."""
        return [self.itos[token_id] for token_id in ids]

    def get_vocab(self):
        """Return the token -> id mapping.

        Returns a copy (HF convention) so callers cannot mutate the
        tokenizer's internal table.
        """
        return dict(self.stoi)

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).

        Declared as a property to match ``PreTrainedTokenizer``, whose
        ``__len__`` reads ``self.vocab_size`` as an attribute — the
        original plain-method form broke ``len(tokenizer)``.
        """
        return len(self.stoi)

    def save_vocabulary(self, save_directory, filename_prefix=None, **kwargs):
        """Write one token per line to ``vocab.txt`` in *save_directory*.

        Args:
            save_directory: Target directory (must exist).
            filename_prefix: Optional prefix transformers passes through
                ``save_pretrained``; previously it was silently ignored.

        Returns:
            A 1-tuple with the path of the written vocab file.
        """
        prefix = "" if filename_prefix is None else filename_prefix + "-"
        vocab_file_path = os.path.join(save_directory, prefix + "vocab.txt")
        # Explicit encoding: vocab characters may be non-ASCII, and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(vocab_file_path, "w", encoding="utf-8") as f:
            for token in self.get_vocab():
                f.write(token + "\n")
        return (vocab_file_path,)