import json
import re
from pathlib import Path
from typing import Dict, List, Optional

from transformers import PreTrainedTokenizer

# Single source of truth for the chess-move regex. The class and the builder
# script previously carried two identical copies of this literal; keeping one
# constant prevents them from drifting apart.
MOVE_PATTERN = r"w\.|b\.|[♔♕♖♗♘♙♚♛♜♝♞♟][a-h][1-8]|[a-h][1-8]|\.\.[+#x\.]*|\.x\.[+#]*|\.\.x\.[+#]*|\+#|\+|x|\."

# NOTE(review): every special-token string in the source as received was the
# empty string "" — the builder's special_tokens dict literal even had four
# duplicate "" keys, which Python collapses to a single {"": 3} entry.  The
# angle-bracket literals below are restored conventional values; confirm them
# against the published vocab.json before release.
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"


class ChessTokenizer(PreTrainedTokenizer):
    """
    Chess move tokenizer compatible with HuggingFace transformers.

    Can be loaded with:
        AutoTokenizer.from_pretrained("ankanmbz/chess-tok")
    """

    vocab_files_names = {
        "vocab_file": "vocab.json",
    }

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token=UNK_TOKEN,
        pad_token=PAD_TOKEN,
        bos_token=BOS_TOKEN,  # bos (HuggingFace standard) replaces the older sos naming
        eos_token=EOS_TOKEN,
        **kwargs
    ):
        # Load the vocabulary.  encoder/decoder must exist BEFORE calling
        # super().__init__, because the parent constructor resolves special
        # token ids through _convert_token_to_id.
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.encoder = json.load(f)
        self.decoder = {v: k for k, v in self.encoder.items()}

        # Regex pattern for tokenization (shared with the builder script).
        self.pattern = MOVE_PATTERN

        # Initialize parent class - this sets up special token IDs.
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,  # bos = beginning of sequence (same as sos)
            eos_token=eos_token,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the base vocabulary."""
        return len(self.encoder)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        return dict(self.encoder)

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a chess move string into tokens via the move regex."""
        return re.findall(self.pattern, text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token to an id using the vocab; unknown tokens map to unk."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an id to a token using the vocab; unknown ids map to unk."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Concatenate tokens back into a move string (no separators)."""
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs by adding special tokens.

        Single sequence: <bos> X <eos>
        Pair:            <bos> A <eos> B <eos>
        """
        bos = [self.bos_token_id] if self.bos_token_id is not None else []
        eos = [self.eos_token_id] if self.eos_token_id is not None else []

        if token_ids_1 is None:
            return bos + token_ids_0 + eos
        return bos + token_ids_0 + eos + token_ids_1 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask with 1 at special-token positions, 0 elsewhere."""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        bos = [1] if self.bos_token_id is not None else []
        eos = [1] if self.eos_token_id is not None else []

        if token_ids_1 is None:
            return bos + ([0] * len(token_ids_0)) + eos
        return bos + ([0] * len(token_ids_0)) + eos + ([0] * len(token_ids_1)) + eos

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs (not used for chess, but required by interface).

        Sequence 0 (and its surrounding special tokens) gets type 0; a second
        sequence gets type 1.
        """
        bos = [0] if self.bos_token_id is not None else []
        eos = [0] if self.eos_token_id is not None else []

        if token_ids_1 is None:
            return bos + ([0] * len(token_ids_0)) + eos
        return bos + ([0] * len(token_ids_0)) + eos + ([1] * len(token_ids_1)) + eos

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """
        Save the vocabulary to a directory.

        Returns a 1-tuple with the written file path, or None if the target
        is not a directory (matching the HF slow-tokenizer convention of
        logging and bailing out rather than raising).
        """
        if not Path(save_directory).is_dir():
            print(f"Vocabulary path {save_directory} should be a directory")
            return

        vocab_file = Path(save_directory) / (
            (filename_prefix + "-" if filename_prefix else "")
            + self.vocab_files_names["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.encoder, f, ensure_ascii=False, indent=2)

        return (str(vocab_file),)

    def prepare_for_tokenization(self, text: str, **kwargs):
        """Prepare text before tokenization - must return a (text, kwargs) tuple."""
        return (text, kwargs)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        **kwargs
    ) -> str:
        """Decode token ids to a string, optionally dropping special tokens."""
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]

        if skip_special_tokens:
            specials = {self.pad_token, self.bos_token, self.eos_token, self.unk_token}
            tokens = [token for token in tokens if token not in specials]

        return self.convert_tokens_to_string(tokens)


# ============================================================================
# Builder script to create HuggingFace-compatible tokenizer
# ============================================================================

def build_hf_tokenizer(dataset_path, output_dir="chess-tok-hf"):
    """
    Build a HuggingFace-compatible tokenizer directory from a parquet dataset.

    Writes vocab.json, tokenizer_config.json, special_tokens_map.json and a
    copy of this module into ``output_dir``.
    """
    import pandas as pd
    from collections import Counter

    print("Building HuggingFace-compatible tokenizer...")
    print(f"Dataset: {dataset_path}")
    print(f"Output: {output_dir}")

    # Load dataset (first 1M rows).
    df = pd.read_parquet(dataset_path)
    df = df.head(1_000_000)
    print(f"✓ Loaded {len(df):,} rows")

    # Extract tokens and their frequencies from every move string.
    token_freq = Counter()
    pattern = MOVE_PATTERN

    for moves_list in df["moves_custom"]:
        for move in moves_list:
            token_freq.update(re.findall(pattern, move))

    print(f"✓ Found {len(token_freq)} unique tokens")

    # Build vocabulary.  Special tokens occupy the lowest ids; the original
    # source had four duplicate ""-keys here, which silently collapsed the
    # dict to one entry — distinct literals are required.
    special_tokens = {
        PAD_TOKEN: 0,
        BOS_TOKEN: 1,
        EOS_TOKEN: 2,
        UNK_TOKEN: 3,
    }

    vocab = special_tokens.copy()
    current_id = len(vocab)

    # Add printable ASCII characters as a fallback alphabet.
    for i in range(32, 127):
        char = chr(i)
        if char not in vocab:
            vocab[char] = current_id
            current_id += 1

    # Add dataset tokens, most frequent first.
    for token, _freq in token_freq.most_common():
        if token not in vocab:
            vocab[token] = current_id
            current_id += 1

    print(f"✓ Vocabulary size: {len(vocab)}")

    # Create output directory.
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True, parents=True)

    # 1. Save vocab.json
    with open(output_path / "vocab.json", "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved: vocab.json")

    # 2. Save tokenizer_config.json.  Key fixed from "sos_token" to
    # "bos_token" so it agrees with the class and special_tokens_map.json.
    tokenizer_config = {
        "tokenizer_class": "ChessTokenizer",
        "auto_map": {
            "AutoTokenizer": ["tokenizer.ChessTokenizer", None]
        },
        "model_max_length": 512,
        "pad_token": PAD_TOKEN,
        "bos_token": BOS_TOKEN,
        "eos_token": EOS_TOKEN,
        "unk_token": UNK_TOKEN,
        "clean_up_tokenization_spaces": True
    }
    with open(output_path / "tokenizer_config.json", "w") as f:
        json.dump(tokenizer_config, f, indent=2)
    print(f"✓ Saved: tokenizer_config.json")

    # 3. Save special_tokens_map.json
    special_tokens_map = {
        "pad_token": PAD_TOKEN,
        "bos_token": BOS_TOKEN,  # bos = beginning of sequence
        "eos_token": EOS_TOKEN,
        "unk_token": UNK_TOKEN,
    }
    with open(output_path / "special_tokens_map.json", "w") as f:
        json.dump(special_tokens_map, f, indent=2)
    print(f"✓ Saved: special_tokens_map.json")

    # 4. Save this tokenizer.py file alongside the artifacts so that
    # AutoTokenizer's auto_map can import the class.
    import shutil
    shutil.copy(__file__, output_path / "tokenizer.py")
    print(f"✓ Saved: tokenizer.py")


if __name__ == "__main__":
    # Build the tokenizer
    dataset_path = "/vast/users/ankan.deria/Document/TinyRecursiveModels/data/chees_data/dataset.parquet"
    build_hf_tokenizer(dataset_path, output_dir="chess-tok-hf")