"""

HuggingFace-compatible wrapper for PaniniTokenizer.



This file enables:

    tokenizer = AutoTokenizer.from_pretrained("ArthaLabs/panini-tokenizer", trust_remote_code=True)

"""

import os
import json
from typing import List, Optional
from transformers import PreTrainedTokenizer


class PaniniTokenizerHF(PreTrainedTokenizer):
    """

    HuggingFace-compatible Panini Tokenizer.

    

    A grammar-first Sanskrit tokenizer based on Pāṇinian morphological analysis.

    Uses Monier-Williams dictionary stems and Sandhi reversal for tokenization.

    """
    
    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]
    
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        bos_token: str = "<bos>",
        eos_token: str = "<eos>",
        **kwargs
    ):
        # Load vocabulary
        self._vocab = {}
        self._id_to_token = {}
        
        if vocab_file and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
            self._id_to_token = {v: k for k, v in self._vocab.items()}
        
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )
        
        # Lazy-load the morphological splitter
        self._splitter = None
        self._stems = None
    
    def _load_splitter(self):
        """Lazy-load the morphological splitter."""
        if self._splitter is None:
            # Try to import from src directory
            import sys
            src_dir = os.path.join(os.path.dirname(__file__), "src")
            if src_dir not in sys.path:
                sys.path.insert(0, src_dir)
            
            try:
                from splitter import SamasaSplitter
                self._splitter = SamasaSplitter()
            except ImportError:
                self._splitter = None
    
    @property
    def vocab_size(self) -> int:
        """Number of entries in the loaded vocabulary."""
        return len(self._vocab)
    
    def get_vocab(self):
        """Return a copy of the token-to-id mapping."""
        return self._vocab.copy()
    
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize using morphological analysis."""
        self._load_splitter()
        
        tokens = []
        words = text.split()
        
        for word in words:
            # SentencePiece-style word-boundary marker: the first token of each
            # word gets "▁" so convert_tokens_to_string can restore the spaces.
            prefix = "▁"
            
            if self._splitter:
                # Use morphological splitting
                split_result = self._splitter.split_v4(word)  # V1.5: Sandhi expansion
                if split_result.is_compound and len(split_result.components) > 1:
                    for j, comp in enumerate(split_result.components):
                        if j == 0:
                            tokens.append(prefix + comp)
                        else:
                            tokens.append(comp)
                else:
                    tokens.append(prefix + word)
            else:
                # Fallback: simple tokenization
                tokens.append(prefix + word)
        
        return tokens
    
    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its vocabulary id, falling back to the unk id."""
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))
    
    def _convert_id_to_token(self, index: int) -> str:
        """Map a vocabulary id back to its token, falling back to unk."""
        return self._id_to_token.get(index, self.unk_token)
    
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert tokens back to string."""
        text = ""
        for token in tokens:
            if token.startswith("▁"):
                text += " " + token[1:]
            else:
                text += token
        return text.strip()
    
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        """Save vocabulary to file."""
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)
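

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original wrapper): a minimal
# smoke test assuming a local vocab.json next to this file and, optionally,
# src/splitter.py providing SamasaSplitter. Both locations are assumptions;
# without them the tokenizer falls back to unk ids and whitespace splitting.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    vocab_path = os.path.join(os.path.dirname(__file__), "vocab.json")
    tok = PaniniTokenizerHF(
        vocab_file=vocab_path if os.path.exists(vocab_path) else None
    )

    sample = "dharmakṣetre kurukṣetre"
    tokens = tok.tokenize(sample)
    print("tokens:     ", tokens)
    print("ids:        ", tok.convert_tokens_to_ids(tokens))
    print("round-trip: ", tok.convert_tokens_to_string(tokens))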