# tokenization_khmerocr.py
import json
import os
from typing import List, Optional, Tuple, Union, Dict

from transformers import PreTrainedTokenizer


class KhmerOCRTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for Khmer OCR.

    Text is split with ``list(text)``, i.e. into individual Unicode code
    points, and each one is looked up in a JSON vocabulary that maps token
    strings to integer ids.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="",
        pad_token="",
        bos_token="",
        eos_token="",
        **kwargs,
    ):
        """Load the vocabulary (if given) and initialise the parent tokenizer.

        Args:
            vocab_file: Path to a UTF-8 JSON file holding a ``{token: id}``
                mapping. When falsy, the vocabulary starts empty.
            unk_token: Unknown-token string, forwarded to the parent class.
            pad_token: Padding-token string, forwarded to the parent class.
            bos_token: Beginning-of-sequence token string, forwarded to the
                parent class.
            eos_token: End-of-sequence token string, forwarded to the parent
                class.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.

        Raises:
            OSError: If ``vocab_file`` is given but cannot be opened.
            json.JSONDecodeError: If ``vocab_file`` is not valid JSON.
        """
        # NOTE(review): the empty-string defaults above look like
        # angle-bracket tokens (e.g. "<unk>") lost during extraction --
        # confirm against the published vocab before relying on them.

        # The lookup tables must exist BEFORE super().__init__() runs, so the
        # parent class can safely call get_vocab() during its own setup.
        self.vocab = {}
        self.decoder = {}

        if vocab_file:
            with open(vocab_file, encoding="utf-8") as f:
                self.vocab = json.load(f)
            # Reverse mapping: id -> token.
            self.decoder = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Use the ids from the vocab when the special tokens are present,
        # otherwise fall back to the fixed ids 0 / 1 / 2.
        # NOTE(review): these assignments go through the parent's attribute
        # handling -- confirm the effect on your transformers version.
        self.pad_token_id = self.vocab.get(pad_token, 0)
        self.bos_token_id = self.vocab.get(bos_token, 1)
        self.eos_token_id = self.vocab.get(eos_token, 2)

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary loaded from the file."""
        return len(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into a list with one element per code point."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id of ``token``, or the id of ``unk_token`` if unknown.

        The result is ``None`` when neither ``token`` nor ``unk_token`` is a
        key of the base vocabulary.
        """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``, or ``unk_token`` if unmapped."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the base vocabulary as JSON into ``save_directory``.

        Args:
            save_directory: Directory the vocabulary file is written into.
            filename_prefix: When truthy, the file is named
                ``"{filename_prefix}-vocab.json"`` instead of ``"vocab.json"``.

        Returns:
            A 1-tuple containing the path of the written file.

        Raises:
            OSError: If the file cannot be opened for writing.
        """
        if filename_prefix:
            vocab_file = os.path.join(save_directory, f"{filename_prefix}-vocab.json")
        else:
            vocab_file = os.path.join(save_directory, "vocab.json")

        with open(vocab_file, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII characters readable on disk
            # instead of writing \uXXXX escapes.
            json.dump(self.vocab, f, ensure_ascii=False)
        return (vocab_file,)

    def get_vocab(self) -> Dict[str, int]:
        """Return a new ``{token: id}`` dict of base vocab plus added tokens.

        A fresh dict is built on every call, so callers cannot mutate the
        tokenizer's internal table through the return value. Entries from
        ``added_tokens_encoder`` are merged on top of the base vocabulary so
        that tokens registered after loading are part of the result.
        """
        merged = dict(self.vocab)
        # The parent may call get_vocab() while it is still being
        # constructed, so tolerate the attribute not being available yet.
        merged.update(getattr(self, "added_tokens_encoder", {}))
        return merged