File size: 1,428 Bytes
bc96c4b
 
 
 
 
 
 
 
 
 
ebb32fd
 
 
bc96c4b
 
ebb32fd
 
 
bc96c4b
 
 
 
 
 
 
ebb32fd
 
 
 
 
 
bc96c4b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
"""Custom Telugu tokenizer that handles @@ continuation marker stripping."""
import re

from transformers import PreTrainedTokenizerFast


class TeluguTokenizer(PreTrainedTokenizerFast):
    """Telugu tokenizer with Morfessor @@ continuation marker support.

    Tokens ending with @@ are continuation pieces that join to the next token.
    This class overrides decode() to strip @@ markers and join morphemes:
        "రెడ్డి@@ గారు" → "రెడ్డిగారు"

    Also strips chat special tokens (<|system|>, <|user|>, <|assistant|>, <|end|>)
    from decoded output for clean text. These four literals are removed whether
    or not ``skip_special_tokens`` is set.
    """

    # Chat special tokens to strip from output
    _CHAT_SPECIALS = ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"]

    # Runs of two or more spaces left behind once markers/tokens are removed.
    # Compiled once here instead of importing `re` on every decode() call.
    _SPACE_RUN = re.compile(r"  +")

    def _clean_decoded(self, text):
        """Return *text* with @@ markers and chat special tokens removed.

        Args:
            text: A single decoded string.

        Returns:
            The string with "@@ " / "@@" removed, every entry of
            ``_CHAT_SPECIALS`` removed, runs of spaces collapsed to one
            space, and leading/trailing whitespace stripped.
        """
        # "@@ " between tokens means "join to next token" (no space), so the
        # marker and the following space are dropped together first.
        text = text.replace("@@ ", "")
        # Any remaining @@ (before punctuation, end of string, etc.).
        text = text.replace("@@", "")
        for special in self._CHAT_SPECIALS:
            text = text.replace(special, "")
        # Removing tokens can leave doubled spaces; collapse and trim them.
        return self._SPACE_RUN.sub(" ", text).strip()

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode token ids and post-process the text for display.

        Args:
            token_ids: Ids forwarded unchanged to the parent ``decode()``.
            skip_special_tokens: Forwarded unchanged to the parent ``decode()``.
            **kwargs: Extra keyword arguments forwarded to the parent ``decode()``.

        Returns:
            The cleaned string when the parent returns a string; otherwise a
            list with each element of the parent's result cleaned.
        """
        decoded = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        if isinstance(decoded, str):
            return self._clean_decoded(decoded)
        # NOTE(review): newer transformers releases appear to return a list of
        # strings from decode() for batched ids — confirm against the installed
        # version. Previously this path raised AttributeError on `.replace`.
        return [self._clean_decoded(piece) for piece in decoded]