| """Custom Telugu tokenizer that handles @@ continuation marker stripping.""" | |
| from transformers import PreTrainedTokenizerFast | |
class TeluguTokenizer(PreTrainedTokenizerFast):
    """Telugu tokenizer with Morfessor @@ continuation marker support.

    Tokens ending with @@ are continuation pieces that join to the next token.
    This class overrides decode() to strip @@ markers and join morphemes:
        "రెడ్డి@@ గారు" → "రెడ్డిగారు"
    """

    # Suffix carried by a piece that must be glued to the piece that follows it.
    _CONTINUATION_MARKER = "@@"

    @classmethod
    def _strip_continuation_markers(cls, text):
        """Return ``text`` with every continuation marker removed.

        The two passes run in this order on purpose: the marker together with
        the single space after it is removed first (so the two pieces join
        without a gap), then any marker that is still present is removed on
        its own.
        """
        marker = cls._CONTINUATION_MARKER
        # "@@ " between tokens means "join to next token" (no space).
        text = text.replace(marker + " ", "")
        # Remaining markers: before punctuation, at the end of the string, etc.
        return text.replace(marker, "")

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode ``token_ids`` and join continuation pieces.

        Args:
            token_ids: Ids to decode; forwarded unchanged to the parent
                ``decode``.
            skip_special_tokens: Forwarded unchanged to the parent ``decode``.
            **kwargs: Any further keyword arguments, forwarded unchanged.

        Returns:
            The decoded text with all ``@@`` markers stripped. When the parent
            ``decode`` returns a list or tuple of strings (batched input on
            Transformers releases whose ``decode`` accepts batches), a list
            with the markers stripped from each string.
        """
        decoded = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        # The parent may hand back a batch of strings instead of a single one;
        # calling str.replace on that would raise AttributeError.
        if isinstance(decoded, (list, tuple)):
            return [self._strip_continuation_markers(item) for item in decoded]
        return self._strip_continuation_markers(decoded)