# tokenization_vortex.py · Matrix-Corp/Vortex-7b-V1 at main
#
# File size: 5,355 Bytes
#
# bf64b03

"""

Vortex tokenizer for HuggingFace.

Wraps VortexScienceTokenizer for HF compatibility.

"""

import json
import os
import warnings
from typing import Any, Dict, List, Optional, Union


class VortexTokenizer:
    """
    HuggingFace-style tokenizer facade for Vortex.

    Wraps a ``VortexScienceTokenizer`` instance (stored on ``self.tokenizer``)
    and forwards encoding, decoding, saving and vocabulary queries to it.
    """

    # File names shared by from_pretrained() and save_pretrained() so that the
    # two methods cannot drift apart.
    _TOKENIZER_FILE = "vortex_tokenizer.json"
    _CONFIG_FILE = "tokenizer_config.json"

    def __init__(
        self,
        tokenizer_file: Optional[str] = None,
        config: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Initialize tokenizer.

        Args:
            tokenizer_file: Path to tokenizer JSON. When the path is missing or
                does not exist, the wrapped tokenizer is built from ``config``
                alone (a warning is emitted if a path was given but not found).
            config: Tokenizer configuration. The keys read by this class are
                ``"special_tokens"`` and ``"max_seq_len"``; the whole mapping
                is also handed to ``VortexScienceTokenizer``.
            **kwargs: Accepted for call compatibility; not used.
        """
        # Local import: resolved only when a tokenizer is actually constructed.
        from .tokenizer.vortex_tokenizer import VortexScienceTokenizer

        self.config = config or {}
        # `or {}` also covers an explicit `"special_tokens": null` in the config,
        # which would otherwise break the `.get` lookups below.
        self.special_tokens = self.config.get("special_tokens") or {}

        if tokenizer_file and os.path.exists(tokenizer_file):
            self.tokenizer = VortexScienceTokenizer(
                self.config,
                tokenizer_path=tokenizer_file,
            )
        else:
            if tokenizer_file:
                # A path was requested explicitly; falling back silently would
                # hide a wrong path behind an empty tokenizer.
                warnings.warn(
                    f"Tokenizer file not found: {tokenizer_file!r}; "
                    "initializing an empty tokenizer instead.",
                    stacklevel=2,
                )
            # Initialize empty - needs training
            self.tokenizer = VortexScienceTokenizer(self.config)

        # HF compatibility attributes
        self.pad_token = "[PAD]"
        self.unk_token = "[UNK]"
        self.bos_token = "[BOS]"
        self.eos_token = "[EOS]"
        # IDs come from the configured special-token map, defaulting to 0..3.
        self.pad_token_id = self.special_tokens.get("[PAD]", 0)
        self.unk_token_id = self.special_tokens.get("[UNK]", 1)
        self.bos_token_id = self.special_tokens.get("[BOS]", 2)
        self.eos_token_id = self.special_tokens.get("[EOS]", 3)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs,
    ):
        """
        Load tokenizer from a local directory.

        Args:
            pretrained_model_name_or_path: Directory expected to contain
                ``vortex_tokenizer.json`` and, optionally,
                ``tokenizer_config.json``. The value is only used as a local
                path; nothing is downloaded here.
            **kwargs: Forwarded to the constructor.

        Returns:
            A new tokenizer instance. A missing config file yields an empty
            configuration.
        """
        tokenizer_path = os.path.join(pretrained_model_name_or_path, cls._TOKENIZER_FILE)
        config_path = os.path.join(pretrained_model_name_or_path, cls._CONFIG_FILE)

        config = {}
        if os.path.exists(config_path):
            # Explicit encoding: JSON is UTF-8, independent of the platform locale.
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)

        return cls(tokenizer_file=tokenizer_path, config=config, **kwargs)

    def __call__(
        self,
        text: Union[str, List[str]],
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Tokenize text.

        Args:
            text: Input text or list of texts; a single string is treated as a
                batch of one.
            padding: Pad to same length
            truncation: Truncate to max_length
            max_length: Maximum length; defaults to the config's
                ``"max_seq_len"`` (16384 when that key is absent).
            return_tensors: "pt" for PyTorch, "np" for numpy, None for list
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            The result of the wrapped tokenizer's ``batch_encode`` (expected to
            be a dictionary with input_ids, attention_mask).
        """
        if isinstance(text, str):
            text = [text]

        if max_length is None:
            max_length = self.config.get("max_seq_len", 16384)

        return self.tokenizer.batch_encode(
            text,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
        )

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        **kwargs,
    ) -> List[int]:
        """
        Encode text to token IDs.

        Args:
            text: Text to encode.
            add_special_tokens: Forwarded to the wrapped tokenizer.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            The ``"input_ids"`` entry of the wrapped tokenizer's ``encode``
            result, requested with ``return_tensors=None``.
        """
        result = self.tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            return_tensors=None,
        )
        return result["input_ids"]

    def decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        """
        Decode token IDs to text.

        Args:
            token_ids: Token IDs. Objects exposing ``tolist()`` (e.g. the
                tensors/arrays returned by ``__call__`` with its default
                ``return_tensors="pt"``) are converted to a plain list first.
            skip_special_tokens: Forwarded to the wrapped tokenizer.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            The decoded text produced by the wrapped tokenizer.
        """
        if hasattr(token_ids, "tolist"):
            token_ids = token_ids.tolist()
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def save_pretrained(self, save_directory: str):
        """
        Save tokenizer to directory.

        Writes the wrapped tokenizer to ``vortex_tokenizer.json`` and a small
        ``tokenizer_config.json`` next to it. The directory is created if
        needed.

        Args:
            save_directory: Target directory.
        """
        os.makedirs(save_directory, exist_ok=True)
        tokenizer_path = os.path.join(save_directory, self._TOKENIZER_FILE)
        self.tokenizer.save(tokenizer_path)

        # Save tokenizer config
        saved_config = {
            "model_type": "vortex",
            "special_tokens": self.special_tokens,
        }
        # __call__ reads "max_seq_len" from the config, so persist it too;
        # otherwise a save/from_pretrained round trip falls back to the default.
        if "max_seq_len" in self.config:
            saved_config["max_seq_len"] = self.config["max_seq_len"]

        config_path = os.path.join(save_directory, self._CONFIG_FILE)
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(saved_config, f, indent=2)

    @property
    def vocab_size(self) -> int:
        """Get vocabulary size as reported by the wrapped tokenizer."""
        return self.tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Get vocabulary dictionary from the wrapped tokenizer."""
        return self.tokenizer.get_vocab()


def test_vortex_tokenizer():
    """Smoke-test VortexTokenizer: encode one sample sentence, decode it, print both."""
    from configs.vortex_7b_config import VORTEX_7B_CONFIG

    sample = "The equation is $E = mc^2$ and the reaction is H2O."
    tok = VortexTokenizer(config=VORTEX_7B_CONFIG)

    # Encode a single string; the tokenizer treats it as a batch of one.
    batch = tok(sample, padding=False, truncation=True, max_length=128)
    first_ids = batch["input_ids"][0]
    print(f"Encoded: {first_ids[:10]}...")

    # Decode the same sequence back and show the start of the text.
    round_trip = tok.decode(first_ids)
    print(f"Decoded: {round_trip[:50]}...")

    print("VortexTokenizer test passed!")


# Script entry point: run the smoke test when this file is executed directly.
# NOTE(review): VortexTokenizer.__init__ uses a relative import, so this
# presumably has to be launched as a module from within its package
# (python -m ...) rather than as a bare script — confirm.
if __name__ == "__main__":
    test_vortex_tokenizer()