# File size: 7,100 Bytes
import json
import os
from transformers import PreTrainedTokenizer
class CharacterTokenizer(PreTrainedTokenizer):
    """
    Character-level tokenizer for OCR tasks that follows HuggingFace conventions.

    Each character of the input becomes a separate token. Decoding joins the
    characters back together without separators, producing continuous text.

    Attributes:
        token_to_id: Mapping loaded from the JSON vocabulary file
            (token string -> integer id).
        id_to_token: Inverse mapping of ``token_to_id``.
        max_length: Value passed to the constructor; stored as-is and not
            otherwise used by this class.
    """

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
        max_length=256,
        **kwargs
    ):
        """
        Load the vocabulary and initialize the parent tokenizer.

        Args:
            vocab_file: Path to a JSON file mapping token strings to ids.
            unk_token: String used for unknown characters.
            pad_token: String used for padding.
            bos_token: String marking the beginning of a sequence.
            eos_token: String marking the end of a sequence.
            max_length: Stored on the instance as ``self.max_length``.
            **kwargs: Forwarded unchanged to the parent initializer.

        Raises:
            ValueError: If ``vocab_file`` is ``None`` or is not an existing file.
        """
        if vocab_file is None or not os.path.isfile(vocab_file):
            raise ValueError("`vocab_file` must be provided or exist.")

        # Load vocabulary FIRST, before the parent initializer runs
        # (presumably because the parent may query the vocabulary while
        # registering special tokens -- confirm against the installed
        # transformers version).
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.token_to_id = json.load(f)
        self.id_to_token = {v: k for k, v in self.token_to_id.items()}
        self.max_length = max_length

        # Initialize parent class
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """
        Return the class unchanged.

        This override performs no registration itself; ``auto_class`` is
        accepted only for signature compatibility.
        """
        return cls

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load a tokenizer from a local directory or from the Hub.

        ``vocab.json`` is required. ``tokenizer_config.json`` is optional: when
        it can be read, its entries are used as default constructor arguments.
        Arguments passed explicitly by the caller take precedence over values
        stored in the config file.

        Args:
            pretrained_model_name_or_path: A local directory, or otherwise a
                value used as the Hub ``repo_id``.
            **kwargs: Forwarded to the constructor (any ``vocab_file`` entry is
                discarded in favour of the resolved ``vocab.json`` path).

        Returns:
            A new instance of ``cls``.
        """
        is_local = os.path.isdir(pretrained_model_name_or_path)

        if is_local:
            vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        else:
            # Download from Hub (imported lazily so local loading does not
            # require huggingface_hub to be importable).
            from huggingface_hub import hf_hub_download

            vocab_file = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename="vocab.json"
            )

        # Try to load config if it exists
        config = {}
        try:
            if is_local:
                config_file = os.path.join(
                    pretrained_model_name_or_path, "tokenizer_config.json"
                )
            else:
                config_file = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="tokenizer_config.json"
                )
            if os.path.exists(config_file):
                with open(config_file, "r", encoding="utf-8") as f:
                    config = json.load(f)
        except Exception:
            # Config file is optional: a missing, unreachable or malformed
            # config must not prevent loading. Unlike a bare ``except``, this
            # lets KeyboardInterrupt / SystemExit propagate.
            config = {}

        if not isinstance(config, dict):
            # Anything other than a JSON object cannot supply keyword arguments.
            config = {}

        # Saved config provides defaults; explicit caller arguments win.
        init_kwargs = dict(config)
        init_kwargs.update(kwargs)

        # Remove vocab_file from kwargs if it exists to avoid duplicate argument
        init_kwargs.pop('vocab_file', None)
        return cls(vocab_file=vocab_file, **init_kwargs)

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self.token_to_id)

    def get_vocab(self):
        """Return a copy of the token -> id mapping.

        A copy is returned so that callers cannot mutate the tokenizer's
        internal vocabulary by accident.
        """
        return dict(self.token_to_id)

    def _special_token_strings(self):
        """Return the set of special-token strings removed while decoding."""
        return {self.pad_token, self.bos_token, self.eos_token, self.unk_token}

    def _tokenize(self, text):
        """Tokenize text into individual characters."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Convert a token (character) to its id, or the unknown-token id."""
        try:
            return self.token_to_id[token]
        except KeyError:
            # Resolve the fallback only when it is actually needed.
            return self.unk_token_id

    def _convert_id_to_token(self, index):
        """Convert an id to its token (character), or the unknown token."""
        return self.id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """
        Convert a sequence of tokens to a single string.

        For character-level tokenization the tokens are joined without spaces.
        The pad, bos, eos and unk tokens are always removed from the result.

        Args:
            tokens: Iterable of token strings.

        Returns:
            The concatenated text with special tokens removed.
        """
        specials = self._special_token_strings()
        # Join characters directly without spaces
        return ''.join(token for token in tokens if token not in specials)

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True, **kwargs):
        """
        Decode token ids into continuous text.

        Args:
            token_ids: A single int id, a sequence of int ids, or any object
                exposing ``tolist()`` (for example a tensor or array).
            skip_special_tokens: When true, special tokens are filtered before
                joining.
            clean_up_tokenization_spaces: Accepted for interface compatibility
                and ignored; space clean-up is not applied to character-level
                output.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            The decoded string.

        NOTE(review): ``convert_tokens_to_string`` strips special tokens
        unconditionally, so ``skip_special_tokens=False`` currently yields the
        same output as ``True``. Kept as-is because callers may rely on the
        clean output.
        """
        # Convert tensor to list if needed
        if hasattr(token_ids, 'tolist'):
            token_ids = token_ids.tolist()
        # A bare id (or a 0-d tensor converted above) is a one-element sequence.
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        # Convert IDs to tokens
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]

        # Filter special tokens if requested
        if skip_special_tokens:
            specials = self._special_token_strings()
            tokens = [token for token in tokens if token not in specials]

        # For character-level, we don't want clean_up_tokenization_spaces
        # since we're not using word-level tokenization
        return self.convert_tokens_to_string(tokens)

    def batch_decode(self, sequences, skip_special_tokens=False, clean_up_tokenization_spaces=True, **kwargs):
        """
        Decode each sequence in ``sequences`` with :meth:`decode`.

        Returns:
            A list of decoded strings, one per input sequence.
        """
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs
            )
            for seq in sequences
        ]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Write the vocabulary and a minimal tokenizer config to a directory.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional prefix; when given, files are written as
                ``<prefix>-vocab.json`` and ``<prefix>-tokenizer_config.json``.

        Returns:
            A one-element tuple containing the path of the vocabulary file.
        """
        os.makedirs(save_directory, exist_ok=True)
        prefix = (filename_prefix + "-") if filename_prefix else ""

        vocab_name = prefix + "vocab.json"
        vocab_path = os.path.join(save_directory, vocab_name)
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)

        config = {
            "tokenizer_class": "CharacterTokenizer",
            "auto_map": {
                "AutoTokenizer": ["tokenizer.CharacterTokenizer", None]
            },
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "vocab_file": vocab_name,
            "clean_up_tokenization_spaces": False,  # Important for character-level
        }
        config_path = os.path.join(save_directory, prefix + "tokenizer_config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        return (vocab_path,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one or two id sequences with bos/eos ids.

        Single sequence: ``bos + ids_0 + eos``.
        Pair: ``bos + ids_0 + eos + ids_1 + eos``.
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        return (
            [self.bos_token_id]
            + token_ids_0
            + [self.eos_token_id]
            + token_ids_1
            + [self.eos_token_id]
        )

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Return all-zero token type ids.

        The result has the same length as the output of
        :meth:`build_inputs_with_special_tokens` for the same arguments.
        """
        return [0] * len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
        )
|