"""Custom Telugu SentencePiece tokenizer with clean decode output."""

from transformers import LlamaTokenizer
class TeluguSPTokenizer(LlamaTokenizer):
    """Telugu SentencePiece tokenizer that produces clean decoded text.

    Extends LlamaTokenizer to clean up SentencePiece's ▁ (U+2581) word
    boundary markers, producing natural text output.
    """

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode *token_ids* and normalize whitespace in the result.

        Delegates to ``LlamaTokenizer.decode``, then replaces every
        SentencePiece word-boundary marker (U+2581) with a plain space
        and collapses runs of whitespace into single spaces.
        """
        decoded = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        # ▁ (U+2581) marks word starts in SentencePiece output; turn each
        # marker into an ordinary space.
        without_markers = decoded.replace("\u2581", " ")
        # split() with no argument drops leading/trailing whitespace and
        # splits on any whitespace run, so the join yields single spaces.
        return " ".join(without_markers.split())