"""
Vortex tokenizer for HuggingFace.

Wraps VortexScienceTokenizer for HF compatibility.
"""
|
| |
|
| | from typing import List, Optional, Dict, Any
|
| | import json
|
| | import os
|
| |
|
| |
|
| | class VortexTokenizer:
|
| | """
|
| | HuggingFace-compatible tokenizer for Vortex.
|
| | Wraps VortexScienceTokenizer.
|
| | """
|
| |
|
| | def __init__(
|
| | self,
|
| | tokenizer_file: Optional[str] = None,
|
| | config: Optional[Dict] = None,
|
| | **kwargs,
|
| | ):
|
| | """
|
| | Initialize tokenizer.
|
| |
|
| | Args:
|
| | tokenizer_file: Path to tokenizer JSON
|
| | config: Tokenizer configuration
|
| | """
|
| | from .tokenizer.vortex_tokenizer import VortexScienceTokenizer
|
| |
|
| | self.config = config or {}
|
| | self.special_tokens = self.config.get("special_tokens", {})
|
| |
|
| | if tokenizer_file and os.path.exists(tokenizer_file):
|
| | self.tokenizer = VortexScienceTokenizer(
|
| | self.config,
|
| | tokenizer_path=tokenizer_file,
|
| | )
|
| | else:
|
| |
|
| | self.tokenizer = VortexScienceTokenizer(self.config)
|
| |
|
| |
|
| | self.pad_token = "[PAD]"
|
| | self.unk_token = "[UNK]"
|
| | self.bos_token = "[BOS]"
|
| | self.eos_token = "[EOS]"
|
| | self.pad_token_id = self.special_tokens.get("[PAD]", 0)
|
| | self.unk_token_id = self.special_tokens.get("[UNK]", 1)
|
| | self.bos_token_id = self.special_tokens.get("[BOS]", 2)
|
| | self.eos_token_id = self.special_tokens.get("[EOS]", 3)
|
| |
|
| | @classmethod
|
| | def from_pretrained(
|
| | cls,
|
| | pretrained_model_name_or_path: str,
|
| | **kwargs,
|
| | ):
|
| | """Load tokenizer from pretrained model."""
|
| | tokenizer_path = os.path.join(pretrained_model_name_or_path, "vortex_tokenizer.json")
|
| | config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
|
| |
|
| | config = {}
|
| | if os.path.exists(config_path):
|
| | with open(config_path, "r") as f:
|
| | config = json.load(f)
|
| |
|
| | return cls(tokenizer_file=tokenizer_path, config=config, **kwargs)
|
| |
|
| | def __call__(
|
| | self,
|
| | text: str | List[str],
|
| | padding: bool = False,
|
| | truncation: bool = False,
|
| | max_length: Optional[int] = None,
|
| | return_tensors: str = "pt",
|
| | **kwargs,
|
| | ) -> Dict[str, Any]:
|
| | """
|
| | Tokenize text.
|
| |
|
| | Args:
|
| | text: Input text or list of texts
|
| | padding: Pad to same length
|
| | truncation: Truncate to max_length
|
| | max_length: Maximum length
|
| | return_tensors: "pt" for PyTorch, "np" for numpy, None for list
|
| |
|
| | Returns:
|
| | Dictionary with input_ids, attention_mask
|
| | """
|
| | if isinstance(text, str):
|
| | text = [text]
|
| |
|
| | if max_length is None:
|
| | max_length = self.config.get("max_seq_len", 16384)
|
| |
|
| |
|
| | result = self.tokenizer.batch_encode(
|
| | text,
|
| | padding=padding,
|
| | truncation=truncation,
|
| | max_length=max_length,
|
| | return_tensors=return_tensors,
|
| | )
|
| |
|
| | return result
|
| |
|
| | def encode(
|
| | self,
|
| | text: str,
|
| | add_special_tokens: bool = True,
|
| | **kwargs,
|
| | ) -> List[int]:
|
| | """Encode text to token IDs."""
|
| | result = self.tokenizer.encode(
|
| | text,
|
| | add_special_tokens=add_special_tokens,
|
| | return_tensors=None,
|
| | )
|
| | return result["input_ids"]
|
| |
|
| | def decode(
|
| | self,
|
| | token_ids: List[int],
|
| | skip_special_tokens: bool = True,
|
| | **kwargs,
|
| | ) -> str:
|
| | """Decode token IDs to text."""
|
| | return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
|
| |
|
| | def save_pretrained(self, save_directory: str):
|
| | """Save tokenizer to directory."""
|
| | os.makedirs(save_directory, exist_ok=True)
|
| | tokenizer_path = os.path.join(save_directory, "vortex_tokenizer.json")
|
| | self.tokenizer.save(tokenizer_path)
|
| |
|
| |
|
| | config_path = os.path.join(save_directory, "tokenizer_config.json")
|
| | with open(config_path, "w") as f:
|
| | json.dump({
|
| | "model_type": "vortex",
|
| | "special_tokens": self.special_tokens,
|
| | }, f, indent=2)
|
| |
|
| | @property
|
| | def vocab_size(self) -> int:
|
| | """Get vocabulary size."""
|
| | return self.tokenizer.vocab_size
|
| |
|
| | def get_vocab(self) -> Dict[str, int]:
|
| | """Get vocabulary dictionary."""
|
| | return self.tokenizer.get_vocab()
|
| |
|
| |
|
def test_vortex_tokenizer():
    """
    Smoke-test VortexTokenizer.

    Builds a tokenizer from the 7B config, tokenizes one sample sentence,
    decodes the first sequence again, and prints a preview of each step.
    Nothing is asserted; the test "passes" if no call raises.
    """
    from configs.vortex_7b_config import VORTEX_7B_CONFIG

    sample = "The equation is $E = mc^2$ and the reaction is H2O."
    tok = VortexTokenizer(config=VORTEX_7B_CONFIG)

    batch = tok(sample, padding=False, truncation=True, max_length=128)
    first_ids = batch["input_ids"][0]
    print(f"Encoded: {first_ids[:10]}...")

    round_trip = tok.decode(first_ids)
    print(f"Decoded: {round_trip[:50]}...")

    print("VortexTokenizer test passed!")
|
| |
|
| |
|
# Run the smoke test when this file is executed directly.
if __name__ == "__main__":
    test_vortex_tokenizer()
|
| |
|