import json
import os

from transformers import PreTrainedTokenizer


class I3Tokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        # Load the vocab *before* calling super().__init__(): recent
        # transformers versions touch the vocab (via vocab_size / get_vocab)
        # during base-class initialization.
        with open(vocab_file, "r") as f:
            vocab_data = json.load(f)
        self.chunk_to_idx = vocab_data["chunk_to_idx"]
        self.idx_to_chunk = {int(k): v for k, v in vocab_data["idx_to_chunk"].items()}
        # Note: do NOT assign self.vocab_size here; it would clash with the
        # read-only property below and raise AttributeError. The size is
        # derived from the mapping instead.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        return len(self.chunk_to_idx)

    def get_vocab(self):
        # Required by PreTrainedTokenizer (the base method raises
        # NotImplementedError).
        return dict(self.chunk_to_idx)

    def _tokenize(self, text):
        # Replicate ChunkTokenizer.encode: greedily take two-character chunks
        # that exist in the vocab, skipping characters that start no chunk.
        text = text.lower()
        pos = 0
        tokens = []
        while pos < len(text):
            chunk = text[pos:pos + 2]
            if chunk in self.chunk_to_idx:
                tokens.append(chunk)
                pos += 2
            else:
                pos += 1
        return tokens

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to id 0.
        return self.chunk_to_idx.get(token, 0)

    def _convert_id_to_token(self, index):
        return self.idx_to_chunk.get(index, "")

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Match the base-class signature, which includes filename_prefix.
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, f"{prefix}tokenizer.json")
        with open(vocab_file, "w") as f:
            json.dump({
                "chunk_to_idx": self.chunk_to_idx,
                "idx_to_chunk": self.idx_to_chunk,
                "vocab_size": self.vocab_size,
            }, f)
        return (vocab_file,)
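

# A minimal usage sketch. "tokenizer.json" is an assumed path to a vocab file
# containing the chunk_to_idx / idx_to_chunk fields this class expects;
# encode() and decode() come from the PreTrainedTokenizer base class and
# route through the _tokenize / _convert_* hooks defined above.
if __name__ == "__main__":
    tokenizer = I3Tokenizer("tokenizer.json")
    ids = tokenizer.encode("hello world")
    print(ids)                                  # chunk ids
    print(tokenizer.convert_ids_to_tokens(ids)) # two-character chunks
    print(tokenizer.decode(ids))                # round-trips via convert_tokens_to_string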