|
|
|
|
|
import json
|
|
|
import os
|
|
|
from typing import List, Optional, Tuple, Union, Dict
|
|
|
from transformers import PreTrainedTokenizer
|
|
|
|
|
|
class KhmerOCRTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for Khmer OCR.

    Text is split into its individual characters (one token per element of
    the Python ``str``) and each character is mapped to an integer id
    through a ``{token: id}`` mapping loaded from a JSON vocabulary file.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<sos>",
        eos_token="<eos>",
        **kwargs
    ):
        """Load the vocabulary and register the special tokens.

        Args:
            vocab_file: Path to a UTF-8 JSON file holding a ``{token: id}``
                mapping. When falsy, the tokenizer starts with an empty
                vocabulary.
            unk_token: Token substituted for characters missing from the
                vocabulary.
            pad_token: Padding token.
            bos_token: Start-of-sequence token.
            eos_token: End-of-sequence token.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Both lookup tables are assigned before the base initializer runs so
        # that ``vocab_size`` / ``get_vocab`` / the ``_convert_*`` hooks are
        # usable if the base class calls them while initializing.
        self.vocab = {}
        self.decoder = {}

        if vocab_file:
            with open(vocab_file, encoding="utf-8") as f:
                self.vocab = json.load(f)
            # Reverse mapping (id -> token) used for decoding.
            self.decoder = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

        # Pin the special-token ids to the vocabulary entries, falling back to
        # 0 / 1 / 2 when a token is absent from the vocabulary.
        # NOTE(review): in transformers, assigning a ``*_token_id`` appears to
        # translate the id back into a token, so when a special token is
        # missing from the vocabulary these fallbacks may rebind it to
        # whatever token owns id 0 / 1 / 2 -- confirm this is intended.
        self.pad_token_id = self.vocab.get(pad_token, 0)
        self.bos_token_id = self.vocab.get(bos_token, 1)
        self.eos_token_id = self.vocab.get(eos_token, 2)

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into a list of its individual characters."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id of ``token``, or the unknown-token id if it is absent.

        Returns ``None`` when neither ``token`` nor the unknown token is
        present in the vocabulary.
        """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token stored for ``index``, or the unknown token."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the base vocabulary to a JSON file.

        Args:
            save_directory: Directory that receives the file. It is created
                if it does not exist yet.
            filename_prefix: Optional prefix; when given the file is named
                ``"<prefix>-vocab.json"`` instead of ``"vocab.json"``.

        Returns:
            A one-element tuple containing the path of the written file.
        """
        # An empty ``save_directory`` means "current directory"; makedirs("")
        # would raise, so only create the directory when a name was given.
        if save_directory:
            os.makedirs(save_directory, exist_ok=True)

        file_name = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_file = os.path.join(save_directory, file_name)

        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of escaping them as \uXXXX sequences.
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)

    def get_vocab(self) -> Dict[str, int]:
        """Return a ``{token: id}`` mapping of the base vocabulary plus added tokens.

        A new dict is built on every call, so callers can modify the result
        without altering the tokenizer's internal vocabulary.
        """
        merged = dict(self.vocab)
        merged.update(self.added_tokens_encoder)
        return merged