"""Lichess data preparation for FiLM behavioral cloning.

Parses a PGN file (or loads a local Parquet file / HuggingFace dataset repo),
tokenizes via the Rust engine, and produces PyTorch tensors ready for
training.  Legal move grids are computed per-batch during training (not
precomputed) to keep memory independent of dataset size.
"""

from __future__ import annotations

from pathlib import Path

import numpy as np
import torch
import torch.utils.data

import chess_engine as engine

from pawn.config import (
    WHITE_CHECKMATES,
    BLACK_CHECKMATES,
    DRAW_BY_RULE,
    PLY_LIMIT,
)


# ---------------------------------------------------------------------------
# PGN result → outcome token
# ---------------------------------------------------------------------------

# Maps the value of a PGN ``Result`` string to a coarse winner label.
# Strings that are not listed here (for example "*") have no entry, so a
# ``.get()`` lookup returns None for them; ``_result_to_outcome`` then leaves
# the game at its PLY_LIMIT default.
_RESULT_MAP = {
    "1-0": "white",
    "0-1": "black",
    "1/2-1/2": "draw",
}


def _result_to_outcome(results: list[str]) -> torch.Tensor:
    """Translate PGN result strings into a 1-D long tensor of outcome tokens.

    Decisive results are encoded with the checkmate tokens even though most
    such games really ended by resignation or on time; the move prefix is
    still valid play and the approximation is accepted by the spec (§3.4).
    Draws map to DRAW_BY_RULE.  Anything that ``_RESULT_MAP`` does not know
    (for example "*") falls back to PLY_LIMIT.

    Args:
        results: One PGN result string per game.

    Returns:
        Tensor of shape ``(len(results),)`` and dtype ``torch.long``.
    """
    token_for_label = {
        "white": WHITE_CHECKMATES,
        "black": BLACK_CHECKMATES,
        "draw": DRAW_BY_RULE,
    }
    token_ids = [
        token_for_label.get(_RESULT_MAP.get(raw), PLY_LIMIT)
        for raw in results
    ]
    return torch.tensor(token_ids, dtype=torch.long)


# ---------------------------------------------------------------------------
# Legal token mask via fused Rust computation
# ---------------------------------------------------------------------------


def compute_legal_indices(
    move_ids: np.ndarray,
    game_lengths: np.ndarray,
    seq_len: int,
    vocab_size: int = 4278,
) -> np.ndarray:
    """Return flat sparse indices of the legal-token mask (CPU only).

    The inputs are normalised to int16 (``move_ids`` additionally to a
    C-contiguous layout) and handed to the Rust engine, which replays the
    games.  Per the engine contract the result is a flat i64 index array
    meant to be scattered into a ``(B, seq_len, vocab_size)`` bool mask.

    Args:
        move_ids: Tokenized moves, one row per game.
        game_lengths: Number of plies per game.
        seq_len: Sequence length of the target mask.
        vocab_size: Vocabulary size of the target mask.

    Returns:
        Whatever ``engine.compute_legal_token_masks_sparse`` produces.
    """
    ids_i16 = np.ascontiguousarray(move_ids, dtype=np.int16)
    lengths_i16 = np.asarray(game_lengths, dtype=np.int16)
    flat_indices = engine.compute_legal_token_masks_sparse(
        ids_i16, lengths_i16, seq_len, vocab_size,
    )
    return flat_indices


class LegalMaskBuilder:
    """Legal token mask via sparse Rust computation + device-side scatter.

    Calls engine.compute_legal_token_masks_sparse which replays games and
    returns flat i64 indices (~2 MB) instead of a dense bool mask (~70 MB).
    Indices are transferred to ``device`` and scattered into a pre-allocated
    buffer.

    Two usage modes:
      1. ``scatter(indices, B)`` — fast path for pre-computed indices
         (from ``LegalMaskCollate`` or precomputation).
      2. ``__call__(batch)`` — legacy path that computes indices inline.

    Both modes return a *view* of an internal buffer; its contents are
    overwritten by the next call, so clone it if it must outlive the step.
    """

    def __init__(self, batch_size: int, max_ply: int, vocab_size: int = 4278,
                 device: str = "cpu", max_index_buf: int = 4_000_000):
        """Allocate the reusable mask and index buffers.

        Args:
            batch_size: Largest batch that ``scatter`` will be asked to build.
            max_ply: Number of move slots per game.
            vocab_size: Size of the token vocabulary (last mask dimension).
            device: Device on which both buffers live.
            max_index_buf: Capacity (in indices) of the reusable index
                buffer; larger index sets fall back to a fresh allocation.
        """
        self.vocab_size = vocab_size
        self.max_ply = max_ply
        self.T = max_ply + 1  # seq_len = outcome token + max_ply move slots
        self.device = device

        # Pre-allocated output buffer (named "_gpu" but lives on `device`).
        self._mask_gpu = torch.zeros(batch_size, self.T, vocab_size,
                                     dtype=torch.bool, device=device)
        # Pre-allocated index buffer to avoid per-batch allocation.
        self._idx_buf = torch.empty(max_index_buf, dtype=torch.long, device=device)

    def scatter(self, legal_indices: torch.Tensor, B: int) -> torch.Tensor:
        """Scatter pre-computed flat indices into the mask buffer.

        Uses the pre-allocated index buffer when the indices fit and falls
        back to a fresh allocation otherwise.  Both paths end up with int64
        indices on ``self.device``, so they accept the same inputs.

        Args:
            legal_indices: 1-D tensor of flat indices into a
                ``(B, T, vocab_size)`` mask.
            B: Number of games in the current batch.

        Returns:
            A ``(B, T, vocab_size)`` bool view of the internal buffer with
            the given positions set to True and everything else False.

        Raises:
            ValueError: If ``B`` exceeds the pre-allocated batch size.
        """
        if B > self._mask_gpu.shape[0]:
            raise ValueError(
                f"B={B} exceeds pre-allocated batch_size={self._mask_gpu.shape[0]}"
            )
        mask_view = self._mask_gpu[:B]
        mask_view.zero_()  # clear whatever the previous batch left behind

        n = legal_indices.shape[0]
        if n == 0:
            return mask_view

        if n <= self._idx_buf.shape[0]:
            # copy_ converts device and dtype into the int64 buffer.
            idx = self._idx_buf[:n]
            idx.copy_(legal_indices)
        else:
            # Match the buffered path: index_fill_ needs int64 indices.
            idx = legal_indices.to(device=self.device, dtype=torch.long)
        mask_view.view(-1).index_fill_(0, idx, True)
        return mask_view

    def __call__(self, batch: dict) -> torch.Tensor:
        """Build (B, T, V) legal mask from batch move_ids + game_lengths.

        Computes sparse indices via Rust and scatters them into the buffer.
        For better performance, use ``LegalMaskCollate`` with DataLoader
        workers to compute indices off the critical path, then call
        ``scatter()`` directly.

        Args:
            batch: Mapping with ``"move_ids"`` (tensor or array-like, one row
                per game) and ``"game_length"`` (per-game ply counts).

        Returns:
            The same kind of buffer view that ``scatter`` returns.
        """
        move_ids = batch["move_ids"]
        game_lengths = batch["game_length"]
        B = move_ids.shape[0] if hasattr(move_ids, "shape") else len(move_ids)

        # The Rust engine works on host memory; bring tensors back to the CPU
        # first so a batch that was already moved to an accelerator (or that
        # tracks gradients) does not make ``.numpy()`` raise.
        if isinstance(move_ids, torch.Tensor):
            move_ids = move_ids.detach().cpu().numpy()
        if isinstance(game_lengths, torch.Tensor):
            game_lengths = game_lengths.detach().cpu().numpy()
        move_ids = np.ascontiguousarray(move_ids, dtype=np.int16)
        game_lengths = np.asarray(game_lengths, dtype=np.int16)

        indices = engine.compute_legal_token_masks_sparse(
            move_ids, game_lengths, self.T, self.vocab_size,
        )

        return self.scatter(torch.from_numpy(indices), B)


class LegalMaskCollate:
    """DataLoader collate function that also emits legal-mask indices.

    Runs the default collation and then attaches a ``legal_indices`` CPU
    tensor to the batch, so the Rust replay happens inside the DataLoader
    worker instead of on the training critical path.
    """

    def __init__(self, seq_len: int, vocab_size: int = 4278):
        """Remember the mask geometry passed on to ``compute_legal_indices``."""
        self.vocab_size = vocab_size
        self.seq_len = seq_len

    def __call__(self, items: list[dict]) -> dict:
        """Collate ``items`` and add the ``"legal_indices"`` entry.

        Each item must provide ``"move_ids"`` and ``"game_length"``; the
        collated ``"move_ids"`` has to be a CPU tensor because it is
        converted with ``.numpy()``.
        """
        collated = torch.utils.data.default_collate(items)
        ids_np = collated["move_ids"].numpy()
        lengths_np = np.asarray(collated["game_length"], dtype=np.int16)
        collated["legal_indices"] = torch.from_numpy(
            compute_legal_indices(
                ids_np, lengths_np, self.seq_len, self.vocab_size,
            )
        )
        return collated


# ---------------------------------------------------------------------------
# PGN / Parquet → tokenized dataset (legal masks are built per batch, not here)
# ---------------------------------------------------------------------------


def prepare_lichess_dataset(
    pgn_path: str | Path,
    max_ply: int = 255,
    max_games: int = 50_000,
    min_ply: int = 10,
) -> dict:
    """Turn a PGN file, Parquet file, or HF dataset repo into training tensors.

    Dispatch on ``pgn_path``:
      * ends with ``.parquet`` → ``prepare_lichess_parquet(parquet_path=...)``
      * contains ``/`` and nothing exists at that path → treated as a
        HuggingFace repo id → ``prepare_lichess_parquet(hf_repo=...)``
      * anything else → parsed as a PGN file by the Rust engine.

    Returns dict with:
        move_ids:       (N, max_ply) int16 — tokenized moves
        game_lengths:   (N,) int16
        input_ids:      (N, seq_len) long — [outcome, move_0, ..., PAD]
        targets:        (N, seq_len) long — shifted left
        loss_mask:      (N, seq_len) bool
        outcome_tokens: (N,) long — one outcome token per game
        n_games:        int
    """
    source = str(pgn_path)
    shared_kwargs = {
        "max_ply": max_ply,
        "max_games": max_games,
        "min_ply": min_ply,
    }

    if source.endswith(".parquet"):
        return prepare_lichess_parquet(parquet_path=source, **shared_kwargs)
    # "user/dataset"-looking strings that are not local paths are HF repo IDs.
    if "/" in source and not Path(source).exists():
        return prepare_lichess_parquet(hf_repo=source, **shared_kwargs)

    pgn_file = Path(pgn_path)

    # The engine is asked for min_ply=1 so that every parseable game shows up
    # in its output; that keeps the separately extracted results aligned.
    # The caller's min_ply is enforced further down, in Python.
    print(f"Parsing PGN: {pgn_file}")
    move_ids, game_lengths, n_parsed = engine.parse_pgn_file(
        str(pgn_file), max_ply=max_ply, max_games=max_games, min_ply=1,
    )
    n_games = move_ids.shape[0]
    print(f"  Parsed {n_parsed} PGN games, {n_games} tokenized")

    move_ids = move_ids[:n_games]
    game_lengths = game_lengths[:n_games]

    # One result per game, in file order, trimmed to the tokenized count.
    results = _extract_results(pgn_file, n_parsed)[:n_games]

    if min_ply > 1:
        long_enough = game_lengths >= min_ply
        move_ids, game_lengths = move_ids[long_enough], game_lengths[long_enough]
        results = [res for res, ok in zip(results, long_enough) if ok]
        n_games = len(results)
        print(f"  After min_ply={min_ply} filter: {n_games} games")

    outcome_tokens = _result_to_outcome(results)

    # One slot for the outcome token plus max_ply move slots.
    seq_len = max_ply + 1

    from pawn.data import pack_clm_sequences
    packed = pack_clm_sequences(move_ids, game_lengths, outcome_tokens, seq_len)

    return {
        "move_ids": move_ids,
        "game_lengths": game_lengths,
        "input_ids": packed["input_ids"],
        "targets": packed["targets"],
        "loss_mask": packed["loss_mask"],
        "outcome_tokens": outcome_tokens,
        "n_games": n_games,
    }


def prepare_lichess_parquet(
    parquet_path: str | Path | None = None,
    hf_repo: str | None = None,
    max_ply: int = 255,
    max_games: int = 50_000,
    min_ply: int = 10,
) -> dict:
    """Load a Lichess Parquet dataset and produce training-ready tensors.

    Reads from a local Parquet file or from every ``.parquet`` file of a
    HuggingFace dataset repo (``hf_repo`` wins when both are given).
    Expects columns: pgn (SAN move text), result (1-0/0-1/1/2-1/2).

    Args:
        parquet_path: Local Parquet file to read.
        hf_repo: HuggingFace dataset repo id to download Parquet files from.
        max_ply: Maximum number of plies kept per game.
        max_games: Maximum number of rows read (applied before ``min_ply``).
        min_ply: Games shorter than this are dropped after tokenization.

    Returns:
        The same dict format as prepare_lichess_dataset(): ``move_ids``,
        ``game_lengths``, ``input_ids``, ``targets``, ``loss_mask``,
        ``outcome_tokens`` and ``n_games``.

    Raises:
        ValueError: If neither source is given, or the HF repo contains no
            ``.parquet`` file.
    """
    import polars as pl

    if hf_repo is not None:
        from huggingface_hub import hf_hub_download, HfApi
        api = HfApi()
        files = api.list_repo_files(hf_repo, repo_type="dataset")
        parquet_files = [f for f in files if f.endswith(".parquet")]
        if not parquet_files:
            # Fail with a clear message instead of an opaque scan error.
            raise ValueError(
                f"No .parquet files found in HuggingFace dataset repo {hf_repo!r}"
            )
        local_files = [hf_hub_download(hf_repo, pf, repo_type="dataset")
                       for pf in parquet_files]
        lf = pl.scan_parquet(local_files)
    elif parquet_path is not None:
        lf = pl.scan_parquet(str(parquet_path))
    else:
        raise ValueError("Either parquet_path or hf_repo must be provided")

    # Lazy: select only needed columns, limit rows, then collect
    df = (
        lf.select(["pgn", "result"])
        .head(max_games)
        .collect()
    )
    n_to_use = len(df)
    print(f"Loaded {n_to_use} games from Parquet")

    results = df["result"].to_list()

    # One move list per row (row order preserved so results stay aligned).
    games = _pgn_texts_to_move_lists(df["pgn"].to_list())

    # Tokenize via Rust engine (batch)
    print(f"  Tokenizing {len(games)} games...")
    move_ids, game_lengths = engine.pgn_to_tokens(games, max_ply=max_ply)
    N = move_ids.shape[0]

    # Apply min_ply filter
    if min_ply > 1:
        keep = game_lengths >= min_ply
        move_ids = move_ids[keep]
        game_lengths = game_lengths[keep]
        results = [r for r, k in zip(results, keep) if k]
        # Report the count of the arrays that are actually returned.
        N = move_ids.shape[0]
        print(f"  After min_ply={min_ply} filter: {N} games")

    outcome_tokens = _result_to_outcome(results)

    seq_len = max_ply + 1  # outcome token + max_ply move slots
    from pawn.data import pack_clm_sequences
    batch = pack_clm_sequences(move_ids, game_lengths, outcome_tokens, seq_len)

    return {
        "move_ids": move_ids,
        "game_lengths": game_lengths,
        "input_ids": batch["input_ids"],
        "targets": batch["targets"],
        "loss_mask": batch["loss_mask"],
        "outcome_tokens": outcome_tokens,
        "n_games": N,
    }


def _pgn_texts_to_move_lists(pgn_texts) -> list[list[str]]:
    """Split raw PGN movetext strings into per-game lists of move tokens.

    For every input string this strips ``{ ... }`` comments, drops move
    numbers (``1.``, ``12...``, also when glued to the move as in ``1.e4``)
    and stops at the first result marker.  A ``None`` or empty entry yields
    an empty move list so the output stays aligned with the input rows.
    """
    import re

    # Compiled once for the whole batch instead of once per game.
    comment_re = re.compile(r"\{[^}]*\}")
    move_number_re = re.compile(r"^[\d.]*\.")
    terminators = {"1-0", "0-1", "1/2-1/2", "*"}

    games: list[list[str]] = []
    for text in pgn_texts:
        moves: list[str] = []
        # Comments become a space so "Nf3{note}Nc6" still splits in two.
        for tok in comment_re.sub(" ", text or "").split():
            if tok in terminators:
                break
            tok = move_number_re.sub("", tok)
            # Nothing left, or a bare number without a dot: not a move.
            if not tok or tok.isdigit():
                continue
            moves.append(tok)
        games.append(moves)
    return games


def _extract_results(pgn_path: Path, max_games: int) -> list[str]:
    """Extract game results from PGN headers.

    Uses [Event header to delimit games, matching the Rust parser's
    game-boundary detection.  The previous approach (one result per
    [Result] header) could miscount when headers were malformed.

    Args:
        pgn_path: PGN file to scan.
        max_games: Upper bound on the number of results returned.

    Returns:
        One result string per game in file order; ``"*"`` for games without
        a usable ``[Result "..."]`` header.  Empty when ``max_games <= 0``.
    """
    import re

    if max_games <= 0:
        return []

    quoted_value = re.compile(r'"([^"]+)"')
    results: list[str] = []
    current_result = "*"
    in_game = False

    # utf-8-sig drops a leading BOM, which would otherwise hide the first
    # "[Event " line and shift every result by one game.  Undecodable bytes
    # (e.g. Latin-1 player names) are replaced instead of aborting the scan;
    # the result strings themselves are plain ASCII.
    with open(pgn_path, encoding="utf-8-sig", errors="replace") as f:
        for line in f:
            line = line.strip()
            if line.startswith("[Event "):
                # A new game starts: emit the result of the previous one.
                if in_game:
                    results.append(current_result)
                    if len(results) >= max_games:
                        break
                current_result = "*"
                in_game = True
            elif line.startswith('[Result "'):
                m = quoted_value.search(line)
                if m:
                    current_result = m.group(1)
        # Flush last game
        if in_game and len(results) < max_games:
            results.append(current_result)

    return results


# ---------------------------------------------------------------------------
# Dataset class
# ---------------------------------------------------------------------------


class LichessDataset(torch.utils.data.Dataset):
    """Map-style dataset for Lichess behavioral cloning.

    Wraps a ``[start, end)`` window of the dict returned by the
    ``prepare_lichess_*`` functions and serves one game per index.
    """

    def __init__(self, data: dict, start: int = 0, end: int | None = None):
        """Slice the prepared arrays down to ``[start, end)``.

        Args:
            data: Dict with ``input_ids``, ``targets``, ``loss_mask``,
                ``move_ids``, ``game_lengths`` and ``n_games``.
            start: Index of the first game in the window.
            end: One past the last game; ``None`` means ``data["n_games"]``.
                An explicit ``0`` is honoured and yields an empty dataset.
        """
        # `end or n_games` would silently turn end=0 into "everything".
        if end is None:
            end = data["n_games"]
        window = slice(start, end)
        self.input_ids = data["input_ids"][window]
        self.targets = data["targets"][window]
        self.loss_mask = data["loss_mask"][window]
        self.move_ids = data["move_ids"][window]
        self.game_lengths = data["game_lengths"][window]

    def share_memory(self):
        """Move tensors to shared memory so spawn workers avoid copies.

        ``move_ids`` and ``game_lengths`` are copied into tensors first.
        Returns ``self`` so the call can be chained.
        """
        self.input_ids = self.input_ids.share_memory_()
        self.targets = self.targets.share_memory_()
        self.loss_mask = self.loss_mask.share_memory_()
        self.move_ids = torch.from_numpy(np.array(self.move_ids)).share_memory_()
        self.game_lengths = torch.from_numpy(np.array(self.game_lengths)).share_memory_()
        return self

    def __len__(self) -> int:
        """Number of games in the window."""
        return len(self.input_ids)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor | int]:
        """Return the per-game fields for ``idx``.

        ``game_length`` is converted to a plain ``int``; the other entries
        are whatever indexing the stored arrays yields.
        """
        return {
            "input_ids": self.input_ids[idx],
            "targets": self.targets[idx],
            "loss_mask": self.loss_mask[idx],
            "move_ids": self.move_ids[idx],
            "game_length": int(self.game_lengths[idx]),
        }