"""
Vortex tokenizer for HuggingFace.

Wraps VortexScienceTokenizer for HF compatibility.
"""

import json
import os
from typing import Any, Dict, List, Optional, Union

__all__ = ["VortexTokenizer"]


class VortexTokenizer:
    """
    HuggingFace-compatible tokenizer for Vortex.

    Thin wrapper around a ``VortexScienceTokenizer`` instance stored in
    ``self.tokenizer``; encoding, decoding, saving and vocabulary queries
    are all delegated to that backend object.
    """

    def __init__(
        self,
        tokenizer_file: Optional[str] = None,
        config: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Initialize tokenizer.

        Args:
            tokenizer_file: Path to tokenizer JSON. When it is missing or
                does not exist on disk, the backend is built from ``config``
                alone (empty - needs training).
            config: Tokenizer configuration. ``None`` is treated as ``{}``.
            **kwargs: Accepted for call compatibility; not used here.
        """
        # Imported at call time rather than at module top, so importing this
        # module does not itself import the backend package.
        from .tokenizer.vortex_tokenizer import VortexScienceTokenizer

        self.config = config or {}
        self.special_tokens = self.config.get("special_tokens", {})

        if tokenizer_file and os.path.exists(tokenizer_file):
            self.tokenizer = VortexScienceTokenizer(
                self.config,
                tokenizer_path=tokenizer_file,
            )
        else:
            # Initialize empty - needs training
            self.tokenizer = VortexScienceTokenizer(self.config)

        # HF compatibility attributes
        self.pad_token = "[PAD]"
        self.unk_token = "[UNK]"
        self.bos_token = "[BOS]"
        self.eos_token = "[EOS]"
        # IDs come from the configured special-token mapping and fall back
        # to 0..3 when a token is not listed there.
        self.pad_token_id = self.special_tokens.get("[PAD]", 0)
        self.unk_token_id = self.special_tokens.get("[UNK]", 1)
        self.bos_token_id = self.special_tokens.get("[BOS]", 2)
        self.eos_token_id = self.special_tokens.get("[EOS]", 3)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs,
    ):
        """
        Load tokenizer from pretrained model.

        Looks for ``vortex_tokenizer.json`` and ``tokenizer_config.json``
        inside the given directory. A missing config file yields an empty
        configuration; a missing tokenizer file is handled by ``__init__``.

        Args:
            pretrained_model_name_or_path: Directory containing the files.
            **kwargs: Forwarded to the constructor.

        Returns:
            A new ``VortexTokenizer`` instance.
        """
        tokenizer_path = os.path.join(
            pretrained_model_name_or_path, "vortex_tokenizer.json"
        )
        config_path = os.path.join(
            pretrained_model_name_or_path, "tokenizer_config.json"
        )

        config = {}
        if os.path.exists(config_path):
            # Explicit UTF-8: the platform default encoding may not be able
            # to decode a JSON file containing non-ASCII characters.
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)

        return cls(tokenizer_file=tokenizer_path, config=config, **kwargs)

    def __call__(
        self,
        text: Union[str, List[str]],
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Tokenize text.

        Args:
            text: Input text or list of texts. A single string is wrapped
                into a one-element list before encoding.
            padding: Pad to same length
            truncation: Truncate to max_length
            max_length: Maximum length. Defaults to the ``max_seq_len``
                config entry, or 16384 when that entry is absent.
            return_tensors: "pt" for PyTorch, "np" for numpy, None for list
            **kwargs: Accepted for call compatibility; not forwarded.

        Returns:
            Dictionary with input_ids, attention_mask
        """
        if isinstance(text, str):
            text = [text]

        if max_length is None:
            max_length = self.config.get("max_seq_len", 16384)

        # Use batch_encode
        return self.tokenizer.batch_encode(
            text,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
        )

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        **kwargs,
    ) -> List[int]:
        """
        Encode text to token IDs.

        Args:
            text: Text to encode.
            add_special_tokens: Forwarded to the backend encoder.
            **kwargs: Accepted for call compatibility; not forwarded.

        Returns:
            The ``"input_ids"`` entry of the backend's encode result.
        """
        result = self.tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            return_tensors=None,
        )
        return result["input_ids"]

    def decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        """
        Decode token IDs to text.

        Args:
            token_ids: Token IDs to decode.
            skip_special_tokens: Forwarded to the backend decoder.
            **kwargs: Accepted for call compatibility; not forwarded.

        Returns:
            Whatever the backend's ``decode`` returns for these IDs.
        """
        return self.tokenizer.decode(
            token_ids, skip_special_tokens=skip_special_tokens
        )

    def save_pretrained(self, save_directory: str) -> None:
        """
        Save tokenizer to directory.

        Writes ``vortex_tokenizer.json`` (via the backend's ``save``) and
        ``tokenizer_config.json`` into ``save_directory``, creating the
        directory if needed.

        Args:
            save_directory: Target directory.
        """
        os.makedirs(save_directory, exist_ok=True)
        tokenizer_path = os.path.join(save_directory, "vortex_tokenizer.json")
        self.tokenizer.save(tokenizer_path)

        # Save tokenizer config
        saved_config = {
            "model_type": "vortex",
            "special_tokens": self.special_tokens,
        }
        # __call__ reads max_seq_len from the config, so it has to be
        # persisted; otherwise a reloaded tokenizer silently falls back to
        # the 16384 default.
        if "max_seq_len" in self.config:
            saved_config["max_seq_len"] = self.config["max_seq_len"]

        config_path = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(saved_config, f, indent=2)

    @property
    def vocab_size(self) -> int:
        """Get vocabulary size (as reported by the backend tokenizer)."""
        return self.tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Get vocabulary dictionary (as returned by the backend tokenizer)."""
        return self.tokenizer.get_vocab()


def test_vortex_tokenizer():
    """Test VortexTokenizer.

    Smoke test: builds a tokenizer from VORTEX_7B_CONFIG, encodes one sample
    sentence, decodes it again and prints a preview of both results.
    """
    from configs.vortex_7b_config import VORTEX_7B_CONFIG

    tokenizer = VortexTokenizer(config=VORTEX_7B_CONFIG)

    text = "The equation is $E = mc^2$ and the reaction is H2O."
    encoded = tokenizer(text, padding=False, truncation=True, max_length=128)
    print(f"Encoded: {encoded['input_ids'][0][:10]}...")

    decoded = tokenizer.decode(encoded["input_ids"][0])
    print(f"Decoded: {decoded[:50]}...")

    print("VortexTokenizer test passed!")


if __name__ == "__main__":
    test_vortex_tokenizer()