tokenization_vortex.py · Matrix-Corp/Vortex-13b-V1 at main

Vortex-13b-V1 / tokenization_vortex.py

Upload Vortex model

5c43f61 verified 8 days ago

5.36 kB

	"""
	Vortex tokenizer for HuggingFace.
	Wraps VortexScienceTokenizer for HF compatibility.
	"""

	import json
	import os
	from typing import Any, Dict, List, Optional, Union


	class VortexTokenizer:
	    """
	    HuggingFace-compatible tokenizer for Vortex.

	    Thin wrapper that forwards encoding, decoding, saving and vocabulary
	    queries to a wrapped ``VortexScienceTokenizer`` while exposing
	    HuggingFace-style attributes (``pad_token``, ``eos_token_id``, ...).
	    """

	    def __init__(
	        self,
	        tokenizer_file: Optional[str] = None,
	        config: Optional[Dict] = None,
	        **kwargs,
	    ):
	        """
	        Initialize tokenizer.

	        Args:
	            tokenizer_file: Path to tokenizer JSON. If it is omitted or does
	                not exist on disk, the wrapped tokenizer is built from
	                ``config`` alone (empty - needs training); a warning is
	                emitted when a path was given but could not be found.
	            config: Tokenizer configuration. ``special_tokens`` (token ->
	                id mapping) and ``max_seq_len`` are the keys read by this
	                wrapper; the whole dict is also handed to the wrapped
	                tokenizer.
	            **kwargs: Accepted for call compatibility; not used here.
	        """
	        # The wrapped implementation lives in the sibling ``tokenizer`` package.
	        from .tokenizer.vortex_tokenizer import VortexScienceTokenizer

	        self.config = config or {}
	        self.special_tokens = self.config.get("special_tokens", {})

	        if tokenizer_file and os.path.exists(tokenizer_file):
	            self.tokenizer = VortexScienceTokenizer(
	                self.config,
	                tokenizer_path=tokenizer_file,
	            )
	        else:
	            if tokenizer_file:
	                # A path was requested but is missing: keep the best-effort
	                # fallback, but do not hide it from the caller.
	                import warnings

	                warnings.warn(
	                    f"Tokenizer file not found: {tokenizer_file!r}; "
	                    "initializing an empty tokenizer that needs training.",
	                    stacklevel=2,
	                )
	            # Initialize empty - needs training
	            self.tokenizer = VortexScienceTokenizer(self.config)

	        # HF compatibility attributes
	        self.pad_token = "[PAD]"
	        self.unk_token = "[UNK]"
	        self.bos_token = "[BOS]"
	        self.eos_token = "[EOS]"
	        # IDs come from the configured special-token map, with fixed fallbacks.
	        self.pad_token_id = self.special_tokens.get("[PAD]", 0)
	        self.unk_token_id = self.special_tokens.get("[UNK]", 1)
	        self.bos_token_id = self.special_tokens.get("[BOS]", 2)
	        self.eos_token_id = self.special_tokens.get("[EOS]", 3)

	    @classmethod
	    def from_pretrained(
	        cls,
	        pretrained_model_name_or_path: str,
	        **kwargs,
	    ):
	        """
	        Load tokenizer from pretrained model.

	        Args:
	            pretrained_model_name_or_path: Directory expected to contain
	                ``vortex_tokenizer.json`` and, optionally,
	                ``tokenizer_config.json``.
	            **kwargs: Forwarded to the constructor.

	        Returns:
	            A new instance of ``cls``. A missing config file results in an
	            empty configuration.
	        """
	        tokenizer_path = os.path.join(pretrained_model_name_or_path, "vortex_tokenizer.json")
	        config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")

	        config = {}
	        if os.path.exists(config_path):
	            # Explicit encoding so token strings load identically on every platform.
	            with open(config_path, "r", encoding="utf-8") as f:
	                config = json.load(f)

	        return cls(tokenizer_file=tokenizer_path, config=config, **kwargs)

	    def __call__(
	        self,
	        text: Union[str, List[str]],
	        padding: bool = False,
	        truncation: bool = False,
	        max_length: Optional[int] = None,
	        return_tensors: Optional[str] = "pt",
	        **kwargs,
	    ) -> Dict[str, Any]:
	        """
	        Tokenize text.

	        Args:
	            text: Input text or list of texts
	            padding: Pad to same length
	            truncation: Truncate to max_length
	            max_length: Maximum length. Defaults to the configured
	                ``max_seq_len``, or 16384 when that is not configured.
	            return_tensors: "pt" for PyTorch, "np" for numpy, None for list
	            **kwargs: Accepted for call compatibility; not forwarded.

	        Returns:
	            The result of the wrapped tokenizer's ``batch_encode``
	            (dictionary with input_ids, attention_mask).
	        """
	        # A single string is treated as a batch of one.
	        if isinstance(text, str):
	            text = [text]

	        if max_length is None:
	            max_length = self.config.get("max_seq_len", 16384)

	        return self.tokenizer.batch_encode(
	            text,
	            padding=padding,
	            truncation=truncation,
	            max_length=max_length,
	            return_tensors=return_tensors,
	        )

	    def encode(
	        self,
	        text: str,
	        add_special_tokens: bool = True,
	        **kwargs,
	    ) -> List[int]:
	        """
	        Encode text to token IDs.

	        Args:
	            text: Input text.
	            add_special_tokens: Forwarded to the wrapped tokenizer.
	            **kwargs: Accepted for call compatibility; not forwarded.

	        Returns:
	            The ``input_ids`` entry of the wrapped tokenizer's result,
	            requested with ``return_tensors=None``.
	        """
	        result = self.tokenizer.encode(
	            text,
	            add_special_tokens=add_special_tokens,
	            return_tensors=None,
	        )
	        return result["input_ids"]

	    def decode(
	        self,
	        token_ids: List[int],
	        skip_special_tokens: bool = True,
	        **kwargs,
	    ) -> str:
	        """
	        Decode token IDs to text.

	        Args:
	            token_ids: Token IDs to decode.
	            skip_special_tokens: Forwarded to the wrapped tokenizer.
	            **kwargs: Accepted for call compatibility; not forwarded.
	        """
	        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

	    def save_pretrained(self, save_directory: str):
	        """
	        Save tokenizer to directory.

	        Writes ``vortex_tokenizer.json`` (via the wrapped tokenizer's
	        ``save``) and ``tokenizer_config.json`` into ``save_directory``,
	        creating the directory if needed.

	        Args:
	            save_directory: Target directory.
	        """
	        os.makedirs(save_directory, exist_ok=True)
	        tokenizer_path = os.path.join(save_directory, "vortex_tokenizer.json")
	        self.tokenizer.save(tokenizer_path)

	        # Save tokenizer config
	        tokenizer_config = {
	            "model_type": "vortex",
	            "special_tokens": self.special_tokens,
	        }
	        # __call__ takes its default max_length from this key; persist it so
	        # a save/load round trip keeps the same default.
	        if "max_seq_len" in self.config:
	            tokenizer_config["max_seq_len"] = self.config["max_seq_len"]

	        config_path = os.path.join(save_directory, "tokenizer_config.json")
	        with open(config_path, "w", encoding="utf-8") as f:
	            json.dump(tokenizer_config, f, indent=2)

	    @property
	    def vocab_size(self) -> int:
	        """Get vocabulary size, as reported by the wrapped tokenizer."""
	        return self.tokenizer.vocab_size

	    def get_vocab(self) -> Dict[str, int]:
	        """Get vocabulary dictionary, as reported by the wrapped tokenizer."""
	        return self.tokenizer.get_vocab()


	def test_vortex_tokenizer():
	    """
	    Smoke-test VortexTokenizer.

	    Builds a tokenizer from the 7B config, encodes one sample sentence,
	    decodes the first encoded row, and prints a preview of both results.
	    """
	    from configs.vortex_7b_config import VORTEX_7B_CONFIG

	    vortex = VortexTokenizer(config=VORTEX_7B_CONFIG)
	    sample = "The equation is $E = mc^2$ and the reaction is H2O."

	    # Encode: a single string goes in, so the batch has exactly one row.
	    encoded = vortex(
	        sample,
	        padding=False,
	        truncation=True,
	        max_length=128,
	    )
	    print(f"Encoded: {encoded['input_ids'][0][:10]}...")

	    # Decode that same row back to text.
	    first_row = encoded["input_ids"][0]
	    decoded = vortex.decode(first_row)
	    print(f"Decoded: {decoded[:50]}...")

	    print("VortexTokenizer test passed!")


	# Run the smoke test when this file is executed directly as a script.
	if __name__ == "__main__":
	    test_vortex_tokenizer()