Image-to-Text
Transformers
Safetensors
Khmer
khmer-ocr
feature-extraction
transformer
text-recognition
crnn
khmer-text-recognition
custom_code
Darayut committed on
Commit
10cc428
·
verified ·
1 Parent(s): 010ffc8

Upload tokenization_khmerocr.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_khmerocr.py +72 -0
tokenization_khmerocr.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tokenization_khmerocr.py
2
+ import json
3
+ import os
4
+ from typing import List, Optional, Tuple, Union, Dict
5
+ from transformers import PreTrainedTokenizer
6
+
7
class KhmerOCRTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for Khmer OCR.

    The input string is split into its individual characters (one token per
    Unicode code point, see ``_tokenize``). The token-to-id mapping is read
    from a JSON file containing a single ``{token: id}`` object.

    NOTE(review): this class does not override ``convert_tokens_to_string``;
    the base-class default is believed to join tokens with spaces, which
    would put a space between every character in ``decode()`` output --
    confirm against the installed ``transformers`` version.
    """

    # File name the Hugging Face loading machinery maps to ``vocab_file``.
    vocab_files_names = {"vocab_file": "vocab.json"}
    # Names of the model inputs produced when encoding.
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<sos>",
        eos_token="<eos>",
        **kwargs
    ):
        """Build the tokenizer, optionally loading a vocabulary file.

        Args:
            vocab_file: Path to a UTF-8 JSON file holding ``{token: id}``.
                When falsy, the tokenizer starts with an empty vocabulary.
            unk_token: Token used for characters missing from the vocabulary.
            pad_token: Padding token.
            bos_token: Start-of-sequence token.
            eos_token: End-of-sequence token.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer``.
        """
        # Both mappings must exist BEFORE super().__init__(): the parent
        # constructor may call get_vocab() / _convert_token_to_id().
        self.vocab: Dict[str, int] = {}
        self.decoder: Dict[int, str] = {}

        if vocab_file:
            with open(vocab_file, encoding="utf-8") as handle:
                self.vocab = json.load(handle)
            # Reverse lookup (id -> token) used when decoding.
            self.decoder = {idx: tok for tok, idx in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

        # Deliberately no ``self.pad_token_id = ...`` style assignments here.
        # ``pad_token_id`` / ``bos_token_id`` / ``eos_token_id`` are derived
        # by the base class from the token strings. Assigning an id instead
        # rewrites the *token string* from that id, so the former hard-coded
        # fallbacks (0, 1, 2) silently rebound the special tokens to
        # unrelated entries whenever a special token was missing from the
        # vocabulary file (e.g. when no vocab file was supplied).

    @property
    def vocab_size(self) -> int:
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into a list of single characters."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the id of ``unk_token``.

        Returns ``None`` when neither ``token`` nor ``unk_token`` is present
        in the vocabulary.
        """
        unk_id = self.vocab.get(self.unk_token)
        return self.vocab.get(token, unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token; unknown ids yield ``unk_token``."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the base vocabulary as JSON into ``save_directory``.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix; the file is then named
                ``"<prefix>-vocab.json"`` instead of ``"vocab.json"``.

        Returns:
            A one-element tuple with the path of the written file.
        """
        file_name = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_file = os.path.join(save_directory, file_name)

        with open(vocab_file, "w", encoding="utf-8") as handle:
            # ensure_ascii=False keeps Khmer characters human-readable.
            json.dump(self.vocab, handle, ensure_ascii=False)

        return (vocab_file,)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the full token-to-id mapping.

        The result contains the base vocabulary plus any tokens added at
        runtime. The base class sizes new token ids from
        ``len(self.get_vocab())``; leaving added tokens out would let a later
        ``add_tokens`` call reuse ids that are already taken. A fresh dict is
        returned so callers cannot mutate the internal vocabulary.
        """
        merged = dict(self.vocab)
        # ``added_tokens_encoder`` is provided by the base class; fall back to
        # an empty mapping if it is not available yet during construction.
        merged.update(getattr(self, "added_tokens_encoder", None) or {})
        return merged