#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = ["chess", "datasets>=2.19", "tqdm"]
# ///
"""Deterministic chess→English converter for puzzles.

Generates a standardized English-readable description of each puzzle WITHOUT
any LLM. Uses python-chess for UCI→SAN conversion (with board context), regex
for decamelizing themes, and a fixed template.

For each puzzle, produces a doc like:

    "White to move. Short middlegame puzzle with crushing, fork, hanging
    piece motifs. Opening: Kings Pawn Game. Moves: Bxg3 Rxe7 Qb1+ Nc1 Qxc1+
    Qxc1"

Pretrained English cross-encoders have seen SAN notation in chess web content
during pretraining, so this doc is semantically meaningful to them — unlike
the raw UCI form (`f2g3`) which gets fragmented into character pieces.

Output: parquet at models/puzzles_english.parquet with columns:
    PuzzleId, anchor_en (themes + opening rendered in English), doc_en

Run:
    SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 convert_to_english.py
    uv run --exclude-newer=2026-05-12 convert_to_english.py
"""
from __future__ import annotations

import os
import re
import sys

import chess
from datasets import Dataset, load_dataset
from tqdm import tqdm

# Flush stdout at every newline so progress messages show up promptly
# (e.g. when the output is piped into a log file).
sys.stdout.reconfigure(line_buffering=True)

# Where the converted dataset is written.
OUTPUT_PATH = "models/puzzles_english.parquet"
# Export SMOKE_TEST=1 to run on a small slice of the dataset only.
SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"

# Theme tags that name the game phase; they are used verbatim in the output.
PHASE_TAGS = {"opening", "middlegame", "endgame"}
# Theme tags that describe puzzle length, mapped to their English wording.
LENGTH_MAP = dict(
    oneMove="single-move",
    short="short",
    long="long",
    veryLong="very long",
)
LENGTH_TAGS = set(LENGTH_MAP)
# Matches `mateIn1`, `mateIn2`, ...; group 1 captures the move count.
MATE_IN_PATTERN = re.compile(r"^mateIn(\d+)$")
# Motif and mate-pattern theme names become English simply by splitting the
# camelCase tag at its word boundaries. A boundary is:
#   * lowercase followed by uppercase          ("backRank"  -> "back Rank")
#   * the last capital of an acronym + a word  ("GMStyle"   -> "GM Style")
#   * a digit followed by uppercase            ("F2F7"      -> "F2 F7")
_CAMEL_BOUNDARY = re.compile(
    r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[0-9])(?=[A-Z])"
)


def decamelize(tag: str) -> str:
    """`backRankMate` → 'back rank mate'. `attackingF2F7` → 'attacking f2 f7'.

    Inserts a space at every camelCase word boundary and lowercases the
    result. A digit followed by a capital letter counts as a boundary so that
    consecutive square names are separated; a letter followed by a digit does
    not, so a square name such as `F2` stays in one piece.
    """
    return _CAMEL_BOUNDARY.sub(" ", tag).lower()


def themes_to_english(themes: list[str]) -> tuple[str, str, str, list[str]]:
    """Split puzzle theme tags into structural parts and motif phrases.

    Returns the 4-tuple ``(phase, length_phrase, side_phrase, motifs)``:

    * ``phase`` – the phase tag found in ``themes`` (a member of
      ``PHASE_TAGS``), or ``""``. If several are present the last one wins.
    * ``length_phrase`` – the ``LENGTH_MAP`` wording of the length tag, or
      ``""``. If several are present the last one wins.
    * ``side_phrase`` – always ``""``; a placeholder, because the side to
      move is derived from the FEN by the caller.
    * ``motifs`` – every remaining tag, decamelized, in input order. When a
      ``mateInN`` tag is present, the phrase ``"mate in N"`` is appended
      after all other motifs.

    An empty (or falsy) ``themes`` yields ``("", "", "", [])``.
    """
    if not themes:
        return ("", "", "", [])

    phase, length = "", ""
    mate_in = None
    motifs = []
    for tag in themes:
        if tag in PHASE_TAGS:
            phase = tag
            continue
        if tag in LENGTH_TAGS:
            length = LENGTH_MAP[tag]
            continue
        mate_match = MATE_IN_PATTERN.match(tag)
        if mate_match:
            mate_in = int(mate_match.group(1))
            continue
        # Anything that is not structural is a motif.
        motifs.append(decamelize(tag))

    # The mate distance reads best as a trailing natural-language motif.
    if mate_in is not None:
        motifs.append(f"mate in {mate_in}")
    return phase, length, "", motifs


def opening_tags_to_english(opening_tags: list[str]) -> str:
    """Render a puzzle's opening tags as one human-readable opening name.

    `['Kings_Pawn_Game', 'Kings_Pawn_Game_Leonardis_Variation']` →
    'Kings Pawn Game Leonardis Variation'.

    Only the longest tag is kept (treated as the most specific one; on a tie
    the first such tag wins) and its underscores are turned into spaces.
    No other rewriting is done, so no apostrophes are restored. Returns ``""``
    when there are no tags.
    """
    if opening_tags:
        most_specific = max(opening_tags, key=len)
        return " ".join(most_specific.split("_"))
    return ""


def uci_to_san_sequence(fen: str, uci_moves: str) -> str:
    """Convert a whitespace-separated UCI move string to SAN, replayed from `fen`.

    Each move is validated against the current position, rendered in SAN
    (which needs the board for disambiguation, captures and check marks) and
    then played on the board so the next move sees the updated position.

    Returns the SAN moves joined by single spaces. Conversion is best-effort:

    * at the first malformed or illegal move the remaining moves are dropped
      and the SAN produced so far is returned (possibly ``""``);
    * if the position cannot be set up at all (e.g. an invalid FEN), the raw
      `uci_moves` value is returned unchanged.
    """
    try:
        board = chess.Board(fen)
        san_moves = []
        for uci in uci_moves.split():
            try:
                # parse_uci() checks both syntax and legality and raises a
                # ValueError subclass otherwise. Board.san() alone only
                # guards legality with an `assert`, which `python -O` strips.
                move = board.parse_uci(uci)
                san_moves.append(board.san(move))
                board.push(move)
            except ValueError:
                # Invalid move — skip rest
                break
        return " ".join(san_moves)
    except Exception:
        # Deliberate catch-all: whatever went wrong while setting up the
        # board or reading the input, fall back to the raw UCI.
        return uci_moves


def side_to_move(fen: str) -> str:
    """Return "White" or "Black" according to the FEN's active-colour field.

    The second whitespace-separated field of the FEN is inspected: ``"w"``
    means White. Anything else — including a FEN with fewer than two fields —
    is reported as "Black".
    """
    fields = fen.split()
    white_to_move = len(fields) > 1 and fields[1] == "w"
    return "White" if white_to_move else "Black"


def build_english_doc(row: dict) -> str:
    """Render one puzzle row as a fixed-template English description.

    Reads ``row["FEN"]``, ``row["Themes"]``, ``row["Moves"]`` and, when
    present, ``row["OpeningTags"]``. The result is up to four space-joined
    pieces, in this order:

    1. ``"<Side> to move."``
    2. ``"<Length> <phase> puzzle with <m1>, <m2> motifs."`` — length, phase
       and the ``with ... motifs`` tail are each omitted when empty.
    3. ``"Opening: <name>."`` — only when an opening is known.
    4. ``"Moves: <SAN sequence>"`` — only when the move conversion produced
       something.

    NOTE(review): the side is taken straight from the FEN, i.e. it is the
    side that plays the first move listed in ``Moves``. In the Lichess puzzle
    export that first move is reportedly the opponent's setup move rather
    than the solver's — confirm this is the intended wording.
    """
    side = side_to_move(row["FEN"])
    phase, length, _, motifs = themes_to_english(row["Themes"] or [])
    opening = opening_tags_to_english(row.get("OpeningTags") or [])
    san = uci_to_san_sequence(row["FEN"], row["Moves"])

    # e.g. "short middlegame puzzle with crushing, fork motifs"
    headline_words = [word for word in (length, phase) if word]
    headline_words.append("puzzle")
    headline = " ".join(headline_words)
    if motifs:
        headline = f"{headline} with {', '.join(motifs)} motifs"

    sentences = [f"{side} to move.", headline.capitalize() + "."]
    if opening:
        sentences.append(f"Opening: {opening}.")
    if san:
        sentences.append(f"Moves: {san}")
    return " ".join(sentences)


def build_english_anchor(row: dict) -> str:
    """Build the anchor (query) text for a puzzle row: themes plus opening.

    Reads ``row["Themes"]`` and, when present, ``row["OpeningTags"]``. The
    anchor is the space-joined sequence of: the comma-separated motifs, the
    length phrase, the phase and the opening name, with empty pieces left
    out. Intended as the query side for retrieval/reranker training.
    """
    phase, length, _, motifs = themes_to_english(row["Themes"] or [])
    opening = opening_tags_to_english(row.get("OpeningTags") or [])

    pieces = (", ".join(motifs), length, phase, opening)
    return " ".join(piece for piece in pieces if piece).strip()


def main():
    """Convert the Lichess puzzle dataset to English text and save it as parquet.

    Steps:

    1. Load the ``train`` split of ``Lichess/chess-puzzles`` (only the first
       2,000 rows when ``SMOKE_TEST`` is set).
    2. Map every row that has themes to ``PuzzleId`` / ``anchor_en`` /
       ``doc_en``; rows without themes are dropped.
    3. Print a few sample conversions.
    4. Write the result to ``OUTPUT_PATH``, creating its directory if needed.
    """
    print("Loading puzzles...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    if SMOKE_TEST:
        # Clamp so a dataset smaller than the smoke sample does not fail.
        puzzles = puzzles.select(range(min(2_000, len(puzzles))))
    print(f"  {len(puzzles):,} rows")

    print("Converting to English (deterministic)...")

    def proc(batch):
        """Turn one column-oriented batch into the three output columns."""
        columns = list(batch)
        ids, anchors, docs = [], [], []
        # Transpose the batch (dict of columns) into per-row dicts.
        for values in zip(*(batch[name] for name in columns)):
            row = dict(zip(columns, values))
            if not row["Themes"]:
                continue
            ids.append(row["PuzzleId"])
            anchors.append(build_english_anchor(row))
            docs.append(build_english_doc(row))
        return {"PuzzleId": ids, "anchor_en": anchors, "doc_en": docs}

    out = puzzles.map(
        proc, batched=True, batch_size=10_000,
        remove_columns=puzzles.column_names,
        num_proc=4,
    )
    print(f"  produced {len(out):,} English-converted rows")

    print("\n=== Sample conversions ===")
    for i in (0, 100, 1000):
        # The output can be shorter than the sample indices (small or heavily
        # filtered input); stop instead of raising IndexError.
        if i >= len(out):
            break
        r = out[i]
        print(f"\nPuzzleId: {r['PuzzleId']}")
        print(f"  anchor: {r['anchor_en']!r}")
        print(f"  doc:    {r['doc_en'][:200]!r}")

    # Make sure the target directory exists so the finished conversion is
    # not lost to a missing folder at write time.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out.to_parquet(OUTPUT_PATH)
    print(f"\nSaved to {OUTPUT_PATH} ({os.path.getsize(OUTPUT_PATH) / 1e6:.1f} MB)")


if __name__ == "__main__":
    main()