"""Custom Telugu tokenizer that handles @@ continuation marker stripping.""" from transformers import PreTrainedTokenizerFast class TeluguTokenizer(PreTrainedTokenizerFast): """Telugu tokenizer with Morfessor @@ continuation marker support. Tokens ending with @@ are continuation pieces that join to the next token. This class overrides decode() to strip @@ markers and join morphemes: "రెడ్డి@@ గారు" → "రెడ్డిగారు" Also strips chat special tokens (<|system|>, <|user|>, <|assistant|>, <|end|>) from decoded output for clean text. """ # Chat special tokens to strip from output _CHAT_SPECIALS = ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"] def decode(self, token_ids, skip_special_tokens=False, **kwargs): text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs) # Strip @@ continuation markers: # "@@ " between tokens means "join to next token" (no space) text = text.replace("@@ ", "") # Handle remaining @@ (before punctuation, end of string, etc.) text = text.replace("@@", "") # Strip chat special tokens for special in self._CHAT_SPECIALS: text = text.replace(special, "") # Clean up extra whitespace from removed tokens import re text = re.sub(r" +", " ", text).strip() return text