"""Custom Telugu tokenizer that handles @@ continuation marker stripping.""" from transformers import PreTrainedTokenizerFast class TeluguTokenizer(PreTrainedTokenizerFast): """Telugu tokenizer with Morfessor @@ continuation marker support. Tokens ending with @@ are continuation pieces that join to the next token. This class overrides decode() to strip @@ markers and join morphemes: "రెడ్డి@@ గారు" → "రెడ్డిగారు" """ def decode(self, token_ids, skip_special_tokens=False, **kwargs): text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs) # Strip @@ continuation markers: # "@@ " between tokens means "join to next token" (no space) text = text.replace("@@ ", "") # Handle remaining @@ (before punctuation, end of string, etc.) text = text.replace("@@", "") return text