""" HuggingFace-compatible wrapper for PaniniTokenizer. This file enables: tokenizer = AutoTokenizer.from_pretrained("ArthaLabs/panini-tokenizer", trust_remote_code=True) """ import os import json from typing import List, Optional, Union from transformers import PreTrainedTokenizer class PaniniTokenizerHF(PreTrainedTokenizer): """ HuggingFace-compatible Panini Tokenizer. A grammar-first Sanskrit tokenizer based on Pāṇinian morphological analysis. Uses Monier-Williams dictionary stems and Sandhi reversal for tokenization. """ vocab_files_names = {"vocab_file": "vocab.json"} model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file: Optional[str] = None, unk_token: str = "", pad_token: str = "", bos_token: str = "", eos_token: str = "", **kwargs ): # Load vocabulary self._vocab = {} self._id_to_token = {} if vocab_file and os.path.exists(vocab_file): with open(vocab_file, "r", encoding="utf-8") as f: self._vocab = json.load(f) self._id_to_token = {v: k for k, v in self._vocab.items()} super().__init__( unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs ) # Lazy-load the morphological splitter self._splitter = None self._stems = None def _load_splitter(self): """Lazy-load the morphological splitter.""" if self._splitter is None: # Try to import from src directory import sys src_dir = os.path.join(os.path.dirname(__file__), "src") if src_dir not in sys.path: sys.path.insert(0, src_dir) try: from splitter import SamasaSplitter self._splitter = SamasaSplitter() except ImportError: self._splitter = None @property def vocab_size(self) -> int: return len(self._vocab) def get_vocab(self): return self._vocab.copy() def _tokenize(self, text: str) -> List[str]: """Tokenize using morphological analysis.""" self._load_splitter() tokens = [] words = text.split() for i, word in enumerate(words): prefix = "▁" if i == 0 or not tokens else "" if self._splitter: # Use morphological splitting split_result = self._splitter.split_v4(word) # V1.5: Sandhi expansion if split_result.is_compound and len(split_result.components) > 1: for j, comp in enumerate(split_result.components): if j == 0: tokens.append(prefix + comp) else: tokens.append(comp) else: tokens.append(prefix + word) else: # Fallback: simple tokenization tokens.append(prefix + word) return tokens def _convert_token_to_id(self, token: str) -> int: return self._vocab.get(token, self._vocab.get(self.unk_token, 0)) def _convert_id_to_token(self, index: int) -> str: return self._id_to_token.get(index, self.unk_token) def convert_tokens_to_string(self, tokens: List[str]) -> str: """Convert tokens back to string.""" text = "" for token in tokens: if token.startswith("▁"): text += " " + token[1:] else: text += token return text.strip() def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None): """Save vocabulary to file.""" vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json" ) with open(vocab_file, "w", encoding="utf-8") as f: json.dump(self._vocab, f, ensure_ascii=False, indent=2) return (vocab_file,)