""" TouchGrass tokenizer for HuggingFace. Wraps extended Qwen tokenizer for HF compatibility. """ from typing import List, Optional, Dict, Any import json import os class TouchGrassTokenizer: """ HuggingFace-compatible tokenizer for TouchGrass. Wraps the extended Qwen tokenizer. """ def __init__( self, tokenizer_file: Optional[str] = None, config: Optional[Dict] = None, **kwargs, ): """ Initialize tokenizer. Args: tokenizer_file: Path to tokenizer JSON config: Tokenizer configuration """ from .tokenizer.music_token_extension import MusicTokenizerExtension self.config = config or {} self.special_tokens = self.config.get("special_tokens", {}) if tokenizer_file and os.path.exists(tokenizer_file): self.tokenizer_ext = MusicTokenizerExtension.from_pretrained( os.path.dirname(tokenizer_file) ) self.tokenizer = self.tokenizer_ext.get_tokenizer() else: # Initialize empty - needs training or loading self.tokenizer_ext = None self.tokenizer = None # HF compatibility attributes self.pad_token = "[PAD]" self.unk_token = "[UNK]" self.bos_token = "[BOS]" self.eos_token = "[EOS]" self.pad_token_id = self.special_tokens.get("[PAD]", 0) self.unk_token_id = self.special_tokens.get("[UNK]", 1) self.bos_token_id = self.special_tokens.get("[BOS]", 2) self.eos_token_id = self.special_tokens.get("[EOS]", 3) @classmethod def from_pretrained( cls, pretrained_model_name_or_path: str, **kwargs, ): """Load tokenizer from pretrained model.""" tokenizer_path = os.path.join(pretrained_model_name_or_path, "tokenizer.json") config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json") config = {} if os.path.exists(config_path): with open(config_path, "r") as f: config = json.load(f) return cls(tokenizer_file=tokenizer_path, config=config, **kwargs) def __call__( self, text: str | List[str], padding: bool = False, truncation: bool = False, max_length: Optional[int] = None, return_tensors: str = "pt", **kwargs, ) -> Dict[str, Any]: """ Tokenize text. Args: text: Input text or list of texts padding: Pad to same length truncation: Truncate to max_length max_length: Maximum length return_tensors: "pt" for PyTorch, "np" for numpy, None for list Returns: Dictionary with input_ids, attention_mask """ if self.tokenizer is None: raise ValueError("Tokenizer not initialized. Load from pretrained or extend a base tokenizer.") if isinstance(text, str): text = [text] if max_length is None: max_length = self.config.get("max_seq_len", 4096) # Use tokenizer result = self.tokenizer( text, padding=padding, truncation=truncation, max_length=max_length, return_tensors=return_tensors, **kwargs ) return result def encode( self, text: str, add_special_tokens: bool = True, **kwargs, ) -> List[int]: """Encode text to token IDs.""" result = self.tokenizer.encode( text, add_special_tokens=add_special_tokens, return_tensors=None, ) return result["input_ids"] def decode( self, token_ids: List[int], skip_special_tokens: bool = True, **kwargs, ) -> str: """Decode token IDs to text.""" return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) def save_pretrained(self, save_directory: str): """Save tokenizer to directory.""" os.makedirs(save_directory, exist_ok=True) # Save base tokenizer self.tokenizer.save_pretrained(save_directory) # Save tokenizer config config_path = os.path.join(save_directory, "tokenizer_config.json") with open(config_path, "w") as f: json.dump({ "model_type": "touchgrass", "special_tokens": self.special_tokens, }, f, indent=2) @property def vocab_size(self) -> int: """Get vocabulary size.""" return self.tokenizer.vocab_size if self.tokenizer else 0