AnthonyDi committed on
Commit 13c8827 · verified · 1 Parent(s): 6bf8f29

Upload tokenizer.py with huggingface_hub

Files changed (1)
  1. tokenizer.py +99 -0
tokenizer.py ADDED
@@ -0,0 +1,99 @@
+ import json
+ import os
+ from transformers import PreTrainedTokenizer
+
+ class CharacterTokenizer(PreTrainedTokenizer):
+     """
+     Character-level tokenizer for OCR tasks.
+     Each character becomes a separate token.
+     """
+
+     def __init__(
+         self,
+         vocab_file=None,
+         unk_token="<unk>",
+         pad_token="<pad>",
+         bos_token="<s>",
+         eos_token="</s>",
+         max_length=256,
+         **kwargs
+     ):
+         if vocab_file is None or not os.path.isfile(vocab_file):
+             raise ValueError("`vocab_file` must be provided or exist.")
+
+         # Load vocabulary FIRST
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             self.token_to_id = json.load(f)
+         self.id_to_token = {v: k for k, v in self.token_to_id.items()}
+
+         self.max_length = max_length
+
+         # Initialize parent class
+         super().__init__(
+             unk_token=unk_token,
+             pad_token=pad_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             **kwargs
+         )
+
+     @classmethod
+     def register_for_auto_class(cls, auto_class="AutoTokenizer"):
+         """Register this tokenizer for AutoTokenizer"""
+         return cls
+
+     @property
+     def vocab_size(self):
+         return len(self.token_to_id)
+
+     def get_vocab(self):
+         return self.token_to_id
+
+     def _tokenize(self, text):
+         return list(text)
+
+     def _convert_token_to_id(self, token):
+         return self.token_to_id.get(token, self.unk_token_id)
+
+     def _convert_id_to_token(self, index):
+         return self.id_to_token.get(index, self.unk_token)
+
+     def save_vocabulary(self, save_directory, filename_prefix=None):
+         os.makedirs(save_directory, exist_ok=True)
+
+         vocab_path = os.path.join(save_directory, "vocab.json")
+         with open(vocab_path, "w", encoding="utf-8") as f:
+             json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
+
+         config_path = os.path.join(save_directory, "tokenizer_config.json")
+         with open(config_path, "w", encoding="utf-8") as f:
+             json.dump({
+                 "tokenizer_class": "CharacterTokenizer",
+                 "auto_map": {
+                     "AutoTokenizer": "tokenizer.CharacterTokenizer"
+                 },
+                 "bos_token": self.bos_token,
+                 "eos_token": self.eos_token,
+                 "unk_token": self.unk_token,
+                 "pad_token": self.pad_token,
+                 "vocab_file": "vocab.json",
+             }, f, indent=2)
+
+         return (vocab_path,)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         if token_ids_1 is None:
+             return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+         else:
+             return (
+                 [self.bos_token_id]
+                 + token_ids_0
+                 + [self.eos_token_id]
+                 + token_ids_1
+                 + [self.eos_token_id]
+             )
+
+     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+         return [0] * len(
+             self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
+         )
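
For reference, a minimal usage sketch (not part of the uploaded file): it writes a toy vocab.json, instantiates the tokenizer directly from tokenizer.py, and encodes a short string. The file names, the "ocr-tokenizer" output directory, and the toy character set are illustrative assumptions; the auto_map that save_vocabulary writes into tokenizer_config.json is what is meant to let a published repo be loaded through AutoTokenizer with trust_remote_code=True.

import json

from tokenizer import CharacterTokenizer  # the file added in this commit

# Toy character vocabulary for illustration only; a real OCR vocab would be larger.
chars = ["<pad>", "<unk>", "<s>", "</s>"] + list("abc 0123")
vocab = {tok: idx for idx, tok in enumerate(chars)}
with open("vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

tokenizer = CharacterTokenizer(vocab_file="vocab.json")
# "ocr-tokenizer" is a placeholder directory; this writes vocab.json + tokenizer_config.json
tokenizer.save_vocabulary("ocr-tokenizer")

enc = tokenizer("abc 123")  # every character becomes one token, wrapped in <s> ... </s>
print(enc["input_ids"])
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# ['<s>', 'a', 'b', 'c', ' ', '1', '2', '3', '</s>']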