# Source: Vortex-7b-V1 / tokenization_vortex.py
# Uploaded by Zandy-Wandy in commit bf64b03 ("Upload Vortex model", verified).
"""
Vortex tokenizer for HuggingFace.
Wraps VortexScienceTokenizer for HF compatibility.
"""
import json
import os
from typing import Any, Dict, List, Optional, Union
class VortexTokenizer:
    """
    HuggingFace-compatible tokenizer for Vortex.

    Thin wrapper that delegates all tokenization work to a
    ``VortexScienceTokenizer`` instance (stored as ``self.tokenizer``) and
    exposes HF-style attributes (``pad_token``, ``pad_token_id``, ...) and
    methods (``from_pretrained``, ``save_pretrained``, ``encode``, ``decode``).
    """

    def __init__(
        self,
        tokenizer_file: Optional[str] = None,
        config: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Initialize tokenizer.

        Args:
            tokenizer_file: Path to tokenizer JSON. If it is ``None`` or does
                not exist on disk, an empty tokenizer is created instead (a
                warning is emitted when a path was given but is missing).
            config: Tokenizer configuration. ``special_tokens`` (token -> id)
                is read here and ``max_seq_len`` is read by ``__call__``; the
                whole dict is also handed to ``VortexScienceTokenizer``.
            **kwargs: Accepted for call-site compatibility; not used.
        """
        # Imported at construction time rather than at module import time.
        from .tokenizer.vortex_tokenizer import VortexScienceTokenizer

        self.config = config or {}
        self.special_tokens = self.config.get("special_tokens", {})

        if tokenizer_file and os.path.exists(tokenizer_file):
            self.tokenizer = VortexScienceTokenizer(
                self.config,
                tokenizer_path=tokenizer_file,
            )
        else:
            if tokenizer_file:
                # A path was requested but is not there: keep the original
                # fallback, but do not hide the fact from the caller.
                import warnings

                warnings.warn(
                    f"Tokenizer file not found: {tokenizer_file!r}; "
                    "creating an empty tokenizer that still needs training.",
                    stacklevel=2,
                )
            # Initialize empty - needs training
            self.tokenizer = VortexScienceTokenizer(self.config)

        # HF compatibility attributes
        self.pad_token = "[PAD]"
        self.unk_token = "[UNK]"
        self.bos_token = "[BOS]"
        self.eos_token = "[EOS]"
        # IDs come from the config's special_tokens map, with fixed defaults.
        self.pad_token_id = self.special_tokens.get("[PAD]", 0)
        self.unk_token_id = self.special_tokens.get("[UNK]", 1)
        self.bos_token_id = self.special_tokens.get("[BOS]", 2)
        self.eos_token_id = self.special_tokens.get("[EOS]", 3)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs,
    ):
        """
        Load tokenizer from a local directory.

        Looks for ``vortex_tokenizer.json`` and ``tokenizer_config.json``
        inside ``pretrained_model_name_or_path``. A missing config file yields
        an empty config.

        Args:
            pretrained_model_name_or_path: Directory holding the saved files.
            **kwargs: Forwarded to the constructor.

        Returns:
            A new ``VortexTokenizer`` instance.
        """
        tokenizer_path = os.path.join(pretrained_model_name_or_path, "vortex_tokenizer.json")
        config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")

        config = {}
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)

        return cls(tokenizer_file=tokenizer_path, config=config, **kwargs)

    def __call__(
        self,
        text: Union[str, List[str]],
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Tokenize text.

        Args:
            text: Input text or list of texts. A single string is wrapped in
                a one-element list before encoding.
            padding: Pad to same length
            truncation: Truncate to max_length
            max_length: Maximum length. Defaults to the config's
                ``max_seq_len``, or 16384 when that key is absent.
            return_tensors: "pt" for PyTorch, "np" for numpy, None for list
            **kwargs: Accepted for call-site compatibility; not used.

        Returns:
            Dictionary with input_ids, attention_mask (whatever the wrapped
            tokenizer's ``batch_encode`` returns, passed through unchanged).
        """
        if isinstance(text, str):
            text = [text]

        if max_length is None:
            max_length = self.config.get("max_seq_len", 16384)

        return self.tokenizer.batch_encode(
            text,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
        )

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        **kwargs,
    ) -> List[int]:
        """
        Encode text to token IDs.

        Args:
            text: Text to encode.
            add_special_tokens: Forwarded to the wrapped tokenizer.
            **kwargs: Accepted for call-site compatibility; not used.

        Returns:
            The ``input_ids`` entry of the wrapped tokenizer's result.
        """
        result = self.tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            return_tensors=None,
        )
        return result["input_ids"]

    def decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        """
        Decode token IDs to text.

        Args:
            token_ids: Token IDs to decode.
            skip_special_tokens: Forwarded to the wrapped tokenizer.
            **kwargs: Accepted for call-site compatibility; not used.
        """
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def save_pretrained(self, save_directory: str):
        """
        Save tokenizer to directory.

        Writes ``vortex_tokenizer.json`` (via the wrapped tokenizer's
        ``save``) and ``tokenizer_config.json``. The config file records the
        settings this wrapper itself reads back in ``from_pretrained``:
        ``special_tokens`` and, when configured, ``max_seq_len``.

        Args:
            save_directory: Target directory; created if it does not exist.
        """
        os.makedirs(save_directory, exist_ok=True)

        tokenizer_path = os.path.join(save_directory, "vortex_tokenizer.json")
        self.tokenizer.save(tokenizer_path)

        # Save tokenizer config
        saved_config = {
            "model_type": "vortex",
            "special_tokens": self.special_tokens,
        }
        # Without this, a save/load round trip silently resets the default
        # max_length used by __call__ to 16384.
        if "max_seq_len" in self.config:
            saved_config["max_seq_len"] = self.config["max_seq_len"]

        config_path = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(saved_config, f, indent=2)

    @property
    def vocab_size(self) -> int:
        """Get vocabulary size."""
        return self.tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Get vocabulary dictionary."""
        return self.tokenizer.get_vocab()
def test_vortex_tokenizer():
    """Smoke-test VortexTokenizer: encode one sample sentence, decode it, print both."""
    from configs.vortex_7b_config import VORTEX_7B_CONFIG

    tok = VortexTokenizer(config=VORTEX_7B_CONFIG)
    sample = "The equation is $E = mc^2$ and the reaction is H2O."

    # Encode a single sentence; the result is batched, so take row 0.
    batch = tok(sample, padding=False, truncation=True, max_length=128)
    first_ids = batch["input_ids"][0]
    print(f"Encoded: {first_ids[:10]}...")

    # Decode the same IDs back and show the first 50 characters.
    round_trip = tok.decode(first_ids)
    print(f"Decoded: {round_trip[:50]}...")

    print("VortexTokenizer test passed!")
# Run the smoke test when this file is executed directly as a script.
if __name__ == "__main__":
    test_vortex_tokenizer()