pothana-chat-300M / tokenizer_class.py
neshkatrapati's picture
Upload folder using huggingface_hub
ebb32fd verified
"""Custom Telugu tokenizer that handles @@ continuation marker stripping."""
import re

from transformers import PreTrainedTokenizerFast
class TeluguTokenizer(PreTrainedTokenizerFast):
    """Telugu tokenizer with Morfessor ``@@`` continuation marker support.

    Tokens ending with ``@@`` are continuation pieces that join to the next
    token. This class overrides :meth:`decode` to strip the markers and join
    morphemes::

        "రెడ్డి@@ గారు" → "రెడ్డిగారు"

    It also strips the chat special tokens (<|system|>, <|user|>,
    <|assistant|>, <|end|>) from decoded output for clean text.
    """

    # Chat special tokens to strip from output. Kept as a tuple so this
    # shared class-level constant cannot be mutated by accident.
    _CHAT_SPECIALS = ("<|system|>", "<|user|>", "<|assistant|>", "<|end|>")

    # Runs of ASCII spaces left behind once markers / specials are removed.
    # Compiled once here instead of on every decode() call.
    _SPACE_RUN = re.compile(r" +")

    def _strip_markers(self, text):
        """Return *text* without ``@@`` markers, chat specials, or extra spaces."""
        # "@@ " between tokens means "join to next token" (no space).
        # This must run before the bare "@@" pass, otherwise the separating
        # space would survive and the morphemes would not be joined.
        text = text.replace("@@ ", "")
        # Remaining "@@" (before punctuation, at end of string, etc.).
        text = text.replace("@@", "")
        # Chat special tokens are removed unconditionally, i.e. regardless of
        # the skip_special_tokens flag passed to decode().
        for special in self._CHAT_SPECIALS:
            text = text.replace(special, "")
        # Removing tokens can leave doubled spaces; collapse them and trim
        # leading/trailing whitespace.
        return self._SPACE_RUN.sub(" ", text).strip()

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode token ids to clean text.

        Delegates to the parent ``decode`` and then post-processes the result:
        ``@@`` continuation markers are removed (joining the pieces), the chat
        special tokens listed in ``_CHAT_SPECIALS`` are removed, runs of
        spaces are collapsed to one, and surrounding whitespace is stripped.

        Args:
            token_ids: Token ids, forwarded unchanged to the parent ``decode``.
            skip_special_tokens: Forwarded unchanged to the parent ``decode``.
            **kwargs: Any further keyword arguments for the parent ``decode``.

        Returns:
            The cleaned string. If the parent returns a list or tuple of
            strings, a list with each element cleaned the same way.
        """
        decoded = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        # NOTE(review): some transformers releases appear to return a list of
        # strings from decode() for batched input -- confirm against the
        # pinned version. Handling it here avoids an AttributeError from
        # calling str methods on a list.
        if isinstance(decoded, (list, tuple)):
            return [self._strip_markers(item) for item in decoded]
        return self._strip_markers(decoded)