# pothana-sp-base-300M — tokenizer_class.py
# Uploaded by neshkatrapati via huggingface_hub (commit 3f80d0c, verified).
"""Custom Telugu SentencePiece tokenizer with clean decode output."""
from transformers import LlamaTokenizer
class TeluguSPTokenizer(LlamaTokenizer):
    """Telugu SentencePiece tokenizer that yields naturally spaced decoded text.

    Subclasses LlamaTokenizer so the SentencePiece word-boundary marker
    ▁ (U+2581) never leaks into strings returned by ``decode``.
    """

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode *token_ids* and normalize SentencePiece artifacts.

        The ▁ (U+2581) marker is replaced with an ordinary space, then all
        whitespace runs are collapsed to single spaces and the ends trimmed.
        Returns the cleaned string.
        """
        raw = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        # ▁ is SentencePiece's word-boundary marker — turn it into a space.
        spaced = raw.replace("\u2581", " ")
        # Collapse consecutive whitespace and strip leading/trailing space.
        return " ".join(spaced.split())