"""
Vortex tokenizer for HuggingFace.
Wraps VortexScienceTokenizer for HF compatibility.
"""

from typing import List, Optional, Dict, Any
import json
import os


class VortexTokenizer:
    """
    HuggingFace-compatible tokenizer for Vortex.
    Wraps VortexScienceTokenizer.
    """

    def __init__(
        self,
        tokenizer_file: Optional[str] = None,
        config: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Initialize tokenizer.

        Args:
            tokenizer_file: Path to tokenizer JSON
            config: Tokenizer configuration
        """
        from .tokenizer.vortex_tokenizer import VortexScienceTokenizer

        self.config = config or {}
        self.special_tokens = self.config.get("special_tokens", {})

        if tokenizer_file and os.path.exists(tokenizer_file):
            self.tokenizer = VortexScienceTokenizer(
                self.config,
                tokenizer_path=tokenizer_file,
            )
        else:
            # Initialize empty - needs training
            self.tokenizer = VortexScienceTokenizer(self.config)

        # HF compatibility attributes
        self.pad_token = "[PAD]"
        self.unk_token = "[UNK]"
        self.bos_token = "[BOS]"
        self.eos_token = "[EOS]"
        self.pad_token_id = self.special_tokens.get("[PAD]", 0)
        self.unk_token_id = self.special_tokens.get("[UNK]", 1)
        self.bos_token_id = self.special_tokens.get("[BOS]", 2)
        self.eos_token_id = self.special_tokens.get("[EOS]", 3)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs,
    ):
        """Load tokenizer from pretrained model."""
        tokenizer_path = os.path.join(pretrained_model_name_or_path, "vortex_tokenizer.json")
        config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")

        config = {}
        if os.path.exists(config_path):
            with open(config_path, "r") as f:
                config = json.load(f)

        return cls(tokenizer_file=tokenizer_path, config=config, **kwargs)

    def __call__(
        self,
        text: str | List[str],
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Tokenize text.

        Args:
            text: Input text or list of texts
            padding: Pad to same length
            truncation: Truncate to max_length
            max_length: Maximum length
            return_tensors: "pt" for PyTorch, "np" for numpy, None for list

        Returns:
            Dictionary with input_ids, attention_mask
        """
        if isinstance(text, str):
            text = [text]

        if max_length is None:
            max_length = self.config.get("max_seq_len", 16384)

        # Use batch_encode
        result = self.tokenizer.batch_encode(
            text,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
        )

        return result

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        **kwargs,
    ) -> List[int]:
        """Encode text to token IDs."""
        result = self.tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            return_tensors=None,
        )
        return result["input_ids"]

    def decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        """Decode token IDs to text."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def save_pretrained(self, save_directory: str):
        """Save tokenizer to directory."""
        os.makedirs(save_directory, exist_ok=True)
        tokenizer_path = os.path.join(save_directory, "vortex_tokenizer.json")
        self.tokenizer.save(tokenizer_path)

        # Save tokenizer config
        config_path = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_path, "w") as f:
            json.dump({
                "model_type": "vortex",
                "special_tokens": self.special_tokens,
            }, f, indent=2)

    @property
    def vocab_size(self) -> int:
        """Get vocabulary size."""
        return self.tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Get vocabulary dictionary."""
        return self.tokenizer.get_vocab()


def test_vortex_tokenizer():
    """Smoke-test VortexTokenizer: tokenize a sample sentence, decode it, print both."""
    from configs.vortex_7b_config import VORTEX_7B_CONFIG

    sample = "The equation is $E = mc^2$ and the reaction is H2O."
    tok = VortexTokenizer(config=VORTEX_7B_CONFIG)

    # Encode a single string; the first row of the batch is the sample itself.
    batch = tok(sample, padding=False, truncation=True, max_length=128)
    first_row = batch["input_ids"][0]
    print(f"Encoded: {first_row[:10]}...")

    # Decode the same row back to text and show its beginning.
    round_trip = tok.decode(first_row)
    print(f"Decoded: {round_trip[:50]}...")

    print("VortexTokenizer test passed!")


# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test_vortex_tokenizer()