# tokenizer.py — CharacterTokenizer
# Uploaded by AnthonyDi with huggingface_hub (commit c402548, verified).
import json
import os
from transformers import PreTrainedTokenizer
class CharacterTokenizer(PreTrainedTokenizer):
    """
    Character-level tokenizer for OCR tasks following HuggingFace conventions.

    Every character of the input becomes one token; decoding joins tokens
    back into continuous text with no separators. Special tokens are always
    stripped during decoding (see ``convert_tokens_to_string``).
    """

    # HF convention: logical name -> on-disk file name for the vocabulary.
    vocab_files_names = {"vocab_file": "vocab.json"}

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
        max_length=256,
        **kwargs
    ):
        """
        Args:
            vocab_file: Path to a JSON file mapping character -> integer id.
            unk_token, pad_token, bos_token, eos_token: Special token strings.
            max_length: Stored for reference only; not enforced by this class.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.

        Raises:
            ValueError: If ``vocab_file`` is None or not an existing file.
        """
        if vocab_file is None or not os.path.isfile(vocab_file):
            raise ValueError("`vocab_file` must be provided or exist.")
        # Load the vocabulary BEFORE calling the parent constructor: the base
        # class may invoke vocab_size / _convert_token_to_id during __init__,
        # which require these mappings to exist already.
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.token_to_id = json.load(f)
        self.id_to_token = {v: k for k, v in self.token_to_id.items()}
        self.max_length = max_length
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """No-op registration hook kept for AutoTokenizer compatibility."""
        return cls

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load the tokenizer from a local directory or the HuggingFace Hub.

        Looks for ``vocab.json`` (required) and ``tokenizer_config.json``
        (optional). Caller-supplied ``kwargs`` take precedence over values
        read from the persisted config file.
        """
        if os.path.isdir(pretrained_model_name_or_path):
            vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        else:
            # Not a local directory: fetch the vocabulary from the Hub.
            from huggingface_hub import hf_hub_download
            vocab_file = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename="vocab.json"
            )
        # The config file is optional; any failure to obtain or parse it is
        # deliberately ignored (best-effort), but never a bare `except:` so
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            if os.path.isdir(pretrained_model_name_or_path):
                config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
            else:
                config_file = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="tokenizer_config.json"
                )
            if os.path.exists(config_file):
                with open(config_file, "r") as f:
                    config = json.load(f)
                # Explicit kwargs win over persisted config values.
                kwargs = {**config, **kwargs}
        except Exception:
            pass  # Config file is optional
        # Remove vocab_file from kwargs if it exists to avoid duplicate argument
        kwargs.pop('vocab_file', None)
        return cls(vocab_file=vocab_file, **kwargs)

    @property
    def vocab_size(self):
        """Number of entries in the character vocabulary."""
        return len(self.token_to_id)

    def get_vocab(self):
        """Return the token -> id mapping as a copy so callers cannot mutate state."""
        return dict(self.token_to_id)

    def _tokenize(self, text):
        """Split text into a list of single-character tokens."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Map a character token to its id, falling back to the unk id."""
        return self.token_to_id.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index):
        """Map an id back to its character token, falling back to unk."""
        return self.id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """
        Join character tokens into one continuous string (no spaces).

        This is the KEY method HuggingFace uses for decoding. Special tokens
        (pad/bos/eos/unk) are always removed here, since for character-level
        output they are never part of the text.
        """
        specials = {self.pad_token, self.bos_token, self.eos_token, self.unk_token}
        return ''.join(token for token in tokens if token not in specials)

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True, **kwargs):
        """
        Decode a sequence of ids into text.

        Accepts a tensor, a list of ints, or a single int (HF convention).
        ``clean_up_tokenization_spaces`` is ignored because character-level
        output has no tokenization spaces to clean. Note that
        ``convert_tokens_to_string`` strips special tokens regardless of
        ``skip_special_tokens``.
        """
        # Convert tensor-like inputs (anything with .tolist()) to a list.
        if hasattr(token_ids, 'tolist'):
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            # A single id is valid input per the HF decode contract.
            token_ids = [token_ids]
        tokens = [self._convert_id_to_token(i) for i in token_ids]
        if skip_special_tokens:
            specials = {self.pad_token, self.bos_token, self.eos_token, self.unk_token}
            tokens = [token for token in tokens if token not in specials]
        return self.convert_tokens_to_string(tokens)

    def batch_decode(self, sequences, skip_special_tokens=False, clean_up_tokenization_spaces=True, **kwargs):
        """Decode each sequence in ``sequences``; see ``decode``."""
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences
        ]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Write the vocabulary (and a tokenizer config) to ``save_directory``.

        Honors ``filename_prefix`` per the HuggingFace ``save_vocabulary``
        contract: when given, the vocab file is named ``{prefix}-vocab.json``.
        With the default ``filename_prefix=None`` the output is identical to
        previous versions of this class.

        Returns:
            Tuple containing the path of the saved vocabulary file.
        """
        os.makedirs(save_directory, exist_ok=True)
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_name = prefix + "vocab.json"
        vocab_path = os.path.join(save_directory, vocab_name)
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
        config_path = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump({
                "tokenizer_class": "CharacterTokenizer",
                "auto_map": {
                    "AutoTokenizer": ["tokenizer.CharacterTokenizer", None]
                },
                "bos_token": self.bos_token,
                "eos_token": self.eos_token,
                "unk_token": self.unk_token,
                "pad_token": self.pad_token,
                "vocab_file": vocab_name,
                "clean_up_tokenization_spaces": False,  # Important for character-level
            }, f, indent=2)
        return (vocab_path,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences with bos/eos per HF conventions."""
        ids = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is not None:
            ids = ids + token_ids_1 + [self.eos_token_id]
        return ids

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """All-zero token type ids covering the full special-token layout."""
        return [0] * len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
        )