"""Custom Telugu SentencePiece tokenizer with clean decode output."""

from transformers import LlamaTokenizer


class TeluguSPTokenizer(LlamaTokenizer):
    """LlamaTokenizer subclass whose ``decode`` returns tidied text.

    SentencePiece marks word boundaries with ▁ (U+2581). This class
    overrides ``decode`` so those markers come back as ordinary spaces
    and the surrounding whitespace is normalised.
    """

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode ``token_ids`` and strip SentencePiece boundary markers.

        Args:
            token_ids: Passed through unchanged to the parent ``decode``.
            skip_special_tokens: Passed through unchanged to the parent
                ``decode``.
            **kwargs: Any further keyword arguments, forwarded as-is to
                the parent ``decode``.

        Returns:
            The parent's decoded text with every U+2581 replaced by a
            space and whitespace normalised. Note that *all* whitespace
            runs (including newlines and tabs) collapse to one space, and
            leading/trailing whitespace is dropped.
        """
        raw = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )

        # U+2581 is not whitespace as far as str.split() is concerned, so
        # it has to become a real space before the text is split.
        spaced = raw.replace("\u2581", " ")

        # Splitting on arbitrary whitespace and re-joining with one space
        # removes duplicate, leading and trailing whitespace in one pass.
        words = spaced.split()
        return " ".join(words)