Image-to-Text
Transformers
Safetensors
Khmer
khmer-ocr
feature-extraction
transformer
text-recognition
crnn
khmer-text-recognition
custom_code
khmer-text-recognition / tokenization_khmerocr.py
Darayut's picture
Upload tokenization_khmerocr.py with huggingface_hub
10cc428 verified
# tokenization_khmerocr.py
import json
import os
from typing import List, Optional, Tuple, Union, Dict
from transformers import PreTrainedTokenizer
class KhmerOCRTokenizer(PreTrainedTokenizer):
    """
    Character-level tokenizer for Khmer OCR.

    Each character of the input text is one token; the vocabulary is a plain
    char -> id mapping loaded from ``vocab.json``.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<sos>",
        eos_token="<eos>",
        **kwargs
    ):
        # 1. Initialize empty vocab/decoder BEFORE calling super(): the parent
        #    constructor may call get_vocab()/vocab_size during its own setup.
        self.vocab = {}
        self.decoder = {}

        # 2. Load the vocabulary immediately if a file is provided.
        if vocab_file:
            with open(vocab_file, encoding="utf-8") as f:
                self.vocab = json.load(f)
            self.decoder = {v: k for k, v in self.vocab.items()}

        # 3. NOW call super() (parent class can safely call get_vocab()).
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

        # Resolve special-token ids from the vocab, falling back to the
        # conventional slots 0/1/2 when the token is absent from the file.
        self.pad_token_id = self.vocab.get(pad_token, 0)
        self.bos_token_id = self.vocab.get(bos_token, 1)
        self.eos_token_id = self.vocab.get(eos_token, 2)

    @property
    def vocab_size(self) -> int:
        """Number of entries in the character vocabulary."""
        return len(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split *text* into individual characters (one token per char)."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        # Fall back to the unk id; if even the unk token is missing from the
        # vocab file, return 0 rather than None so callers never receive a
        # non-integer id (the original could return None here).
        unk_id = self.vocab.get(self.unk_token, 0)
        return self.vocab.get(token, unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its character, or the unk token if unknown."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the vocabulary to ``save_directory`` and return the path.

        Creates the directory if it does not yet exist, matching the behavior
        of the stock Hugging Face tokenizer implementations.
        """
        os.makedirs(save_directory, exist_ok=True)
        if filename_prefix:
            vocab_file = os.path.join(save_directory, f"{filename_prefix}-vocab.json")
        else:
            vocab_file = os.path.join(save_directory, "vocab.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)
        return (vocab_file,)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the char -> id vocabulary.

        A copy is returned so callers cannot mutate the tokenizer's internal
        state through the returned dict.
        """
        return dict(self.vocab)