| | """Custom Telugu tokenizer that handles @@ continuation marker stripping.""" |
| | from transformers import PreTrainedTokenizerFast |
| |
|
| |
|
class TeluguTokenizer(PreTrainedTokenizerFast):
    """Fast Telugu tokenizer that undoes Morfessor ``@@`` segmentation on decode.

    A token that ends in ``@@`` is a continuation piece: it attaches to the
    token that follows it. ``decode()`` removes those markers so the morphemes
    are glued back together:
        "రెడ్డి@@ గారు" → "రెడ్డిగారు"

    The chat markers listed in ``_CHAT_SPECIALS`` (<|system|>, <|user|>,
    <|assistant|>, <|end|>) are removed from the decoded text as well.
    """

    # Chat-template markers that are deleted from every decoded string.
    _CHAT_SPECIALS = ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"]

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode ``token_ids`` and return cleaned-up text.

        The ids are first decoded by the parent class (``skip_special_tokens``
        and any extra keyword arguments are forwarded unchanged). The result
        is then post-processed: continuation markers and chat markers are
        deleted, runs of spaces are collapsed to a single space, and
        surrounding whitespace is stripped.

        Note that the chat markers are removed unconditionally, i.e. also
        when ``skip_special_tokens`` is False.
        """
        import re

        decoded = super().decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            **kwargs,
        )

        # Order matters: "@@ " goes first so the space separating a
        # continuation piece from the next token disappears with the marker;
        # the bare "@@" pass then removes markers not followed by a space.
        for fragment in ("@@ ", "@@", *self._CHAT_SPECIALS):
            decoded = decoded.replace(fragment, "")

        # Deleting markers can leave several adjacent spaces behind.
        collapsed = re.sub(r" +", " ", decoded)
        return collapsed.strip()
| |
|