import json
import os
from transformers import PreTrainedTokenizer

class CharacterTokenizer(PreTrainedTokenizer):
    """
    Character-level tokenizer for OCR tasks that follows HuggingFace conventions.
    Each character becomes a separate token, but decoding produces continuous text.
    """

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
        max_length=256,
        **kwargs
    ):
        if vocab_file is None or not os.path.isfile(vocab_file):
            raise ValueError(f"`vocab_file` must point to an existing file, got {vocab_file!r}.")

        # Load the vocabulary BEFORE calling the parent __init__: the parent
        # resolves special-token IDs through methods that need `token_to_id`.
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.token_to_id = json.load(f)
        self.id_to_token = {v: k for k, v in self.token_to_id.items()}

        self.max_length = max_length

        # Initialize parent class
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """Register this tokenizer class so AutoTokenizer can resolve it."""
        cls._auto_class = auto_class
        return cls

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load tokenizer from a directory or Hub"""
        # Check if it's a local path
        if os.path.isdir(pretrained_model_name_or_path):
            vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        else:
            # Download from Hub
            from huggingface_hub import hf_hub_download
            vocab_file = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename="vocab.json"
            )

        # Try to load config if it exists
        try:
            if os.path.isdir(pretrained_model_name_or_path):
                config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
            else:
                config_file = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="tokenizer_config.json"
                )

            if os.path.exists(config_file):
                with open(config_file, "r") as f:
                    config = json.load(f)
                    kwargs.update(config)
        except Exception:
            pass  # tokenizer_config.json is optional; ignore missing/unreadable config

        # Remove vocab_file from kwargs if it exists to avoid duplicate argument
        kwargs.pop('vocab_file', None)

        return cls(vocab_file=vocab_file, **kwargs)

    @property
    def vocab_size(self):
        return len(self.token_to_id)

    def get_vocab(self):
        return self.token_to_id

    def _tokenize(self, text):
        """Tokenize text into individual characters"""
        return list(text)

    def _convert_token_to_id(self, token):
        """Convert a token (character) to its ID.

        Note: the vocabulary must contain `unk_token` itself, otherwise the
        `unk_token_id` fallback below cannot resolve.
        """
        return self.token_to_id.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index):
        """Convert an ID to its token (character)"""
        return self.id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """
        Convert a sequence of tokens to a single string.
        This is the KEY method that HuggingFace uses for decoding!
        For character-level tokenization, we join without spaces.
        """
        # Filter out special tokens, then join the characters directly
        # (no spaces) so decoding yields continuous text.
        special_tokens = {self.pad_token, self.bos_token, self.eos_token, self.unk_token}
        return ''.join(token for token in tokens if token not in special_tokens)

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True, **kwargs):
        """
        Override decode to ensure proper character-level decoding.
        This follows HuggingFace conventions but handles character-level properly.
        """
        # Convert a tensor to a plain list if needed
        if hasattr(token_ids, 'tolist'):
            token_ids = token_ids.tolist()

        # Convert IDs to tokens
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]

        # Filter special tokens if requested
        if skip_special_tokens:
            tokens = [token for token in tokens if token not in {
                self.pad_token, self.bos_token, self.eos_token, self.unk_token
            }]

        # convert_tokens_to_string joins characters without spaces, so
        # clean_up_tokenization_spaces is deliberately ignored here: it only
        # applies to word-level tokenization.
        return self.convert_tokens_to_string(tokens)

    def batch_decode(self, sequences, skip_special_tokens=False, clean_up_tokenization_spaces=True, **kwargs):
        """
        Batch decode following HuggingFace conventions
        """
        return [
            self.decode(seq, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
            for seq in sequences
        ]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save vocabulary following HuggingFace conventions"""
        os.makedirs(save_directory, exist_ok=True)

        vocab_path = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)

        config_path = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump({
                "tokenizer_class": "CharacterTokenizer",
                "auto_map": {
                    "AutoTokenizer": ["tokenizer.CharacterTokenizer", None]
                },
                "bos_token": self.bos_token,
                "eos_token": self.eos_token,
                "unk_token": self.unk_token,
                "pad_token": self.pad_token,
                "vocab_file": "vocab.json",
                "clean_up_tokenization_spaces": False,  # Important for character-level
            }, f, indent=2)

        return (vocab_path,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Build inputs with special tokens following HuggingFace conventions"""
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        else:
            return (
                [self.bos_token_id]
                + token_ids_0
                + [self.eos_token_id]
                + token_ids_1
                + [self.eos_token_id]
            )

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """Create token type IDs following HuggingFace conventions"""
        return [0] * len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
        )
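

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the tokenizer): build a
    # toy vocab.json in a temp directory, instantiate the tokenizer, and
    # round-trip a string. The vocabulary layout below (special tokens first,
    # then characters) is an assumption; any {token: id} mapping that includes
    # the special tokens works.
    import string
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        tokens = ["<pad>", "<s>", "</s>", "<unk>"] + list(
            string.ascii_letters + string.digits + string.punctuation + " "
        )
        vocab_file = os.path.join(tmp_dir, "vocab.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump({tok: i for i, tok in enumerate(tokens)}, f, ensure_ascii=False)

        tokenizer = CharacterTokenizer(vocab_file=vocab_file)

        ids = tokenizer("Hello, OCR!")["input_ids"]  # wrapped in <s> ... </s>
        text = tokenizer.decode(ids, skip_special_tokens=True)
        print(ids)
        print(text)  # -> "Hello, OCR!" (characters re-joined without spaces)

        # Save and reload: save_vocabulary writes vocab.json plus a
        # tokenizer_config.json, so the directory is loadable again.
        tokenizer.save_vocabulary(tmp_dir)
        reloaded = CharacterTokenizer.from_pretrained(tmp_dir)
        assert reloaded.decode(ids, skip_special_tokens=True) == text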