File size: 770 Bytes
3f80d0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
"""Custom Telugu SentencePiece tokenizer with clean decode output."""
from transformers import LlamaTokenizer


class TeluguSPTokenizer(LlamaTokenizer):
    """LlamaTokenizer subclass whose ``decode`` emits natural Telugu text.

    SentencePiece marks word boundaries with the ▁ character (U+2581).
    This subclass strips those markers after the base decode and
    normalizes whitespace so the result reads like ordinary text.
    """

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode ``token_ids`` and clean SentencePiece boundary markers.

        Returns the base-class decode output with every ▁ (U+2581)
        converted to a space and runs of whitespace collapsed.
        """
        raw = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        # Word-boundary marker -> ordinary space.
        spaced = raw.replace("\u2581", " ")
        # split()/join collapses repeated whitespace and trims both ends.
        return " ".join(spaced.split())