"""CUTE — Compact Unicode Token Encoding.

Public API:
    build_cute      — train a CUTE tokenizer from a corpus directory.
    CUTEConfig      — all knobs for the build pipeline.
    CUTETokenizerFast — HuggingFace-compatible inference wrapper.
    PUAMapping      — word ↔ PUA character mapping.
    load_default_tokenizer — load the bundled production-ready tokenizer.
"""

from __future__ import annotations

import os as _os
from pathlib import Path

# Silence the "None of PyTorch, TensorFlow, or Flax have been found" warning
# that `transformers` emits at import. CUTE only needs the tokenizer layer, not
# the model layer, so this warning is irrelevant. Must be set BEFORE the first
# `transformers` import — putting it here covers both library and CLI paths.
# `setdefault` is used deliberately: if the caller has already exported either
# variable, their value is left untouched and only missing ones are filled in.
_os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
_os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

from ._version import __version__
from .baseline import BaselineTokenizer, Cl100kBaseline, NullBaseline, get_default_baseline
from .config import CUTEConfig
from .pua import PUAMapping
from .selection import compute_candidate_score, select_by_savings
from .tokenizer import CUTETokenizerFast
from .trainer import build_cute, load_mapping, save_mapping


def _bundled_data_dir() -> Path:
    """Locate the directory holding the bundled tokenizer artifacts.

    A directory qualifies only when it contains both ``tokenizer.json`` and
    ``cute_mapping.json`` as regular files. Candidates are probed in order:

    1. ``data/`` next to this module (wheel / after prepare copies into the
       package tree).
    2. ``model/`` at the repository root — considered only when the package's
       parent directory is named ``src`` (editable install from a checkout
       with a ``src/`` layout).

    Returns
    -------
    Path
        The first candidate directory that contains both files.

    Raises
    ------
    RuntimeError
        If no candidate directory contains both files.
    """
    required = ("tokenizer.json", "cute_mapping.json")

    def _is_complete(folder: Path) -> bool:
        # Both artifacts must be present; a half-populated directory is skipped.
        return all((folder / name).is_file() for name in required)

    here = Path(__file__).resolve().parent

    # Ordered by priority: the packaged copy always wins over the checkout copy.
    candidates = [here / "data"]
    if here.parent.name == "src":
        # <repo>/src/<package>/__init__.py  ->  <repo>/model
        candidates.append(here.parent.parent / "model")

    for folder in candidates:
        if _is_complete(folder):
            return folder

    raise RuntimeError(
        "Bundled tokenizer files not found. Expected tokenizer.json and cute_mapping.json in "
        "the package data directory, or in ./model at the repository root. "
        "For releases, ensure model/ is populated before building the wheel."
    )


def load_default_tokenizer() -> CUTETokenizerFast:
    """Return the pre-trained CUTE tokenizer that ships with the package.

    The bundled tokenizer was trained on a large code corpus (The Stack
    subset) with 80k vocab and 90% coverage target, so it works right after
    `pip install cute-tokenizer` — no training required.

    Returns
    -------
    CUTETokenizerFast
        Tokenizer built from the bundled ``tokenizer.json`` and
        ``cute_mapping.json`` files.

    Raises
    ------
    RuntimeError
        If the bundled files cannot be located (raised by
        ``_bundled_data_dir``).

    Example
    -------
        >>> from cute_tokenizer import load_default_tokenizer
        >>> tok = load_default_tokenizer()
        >>> ids = tok("def hello(): return 42", add_special_tokens=False).input_ids
        >>> len(ids)
        6
    """
    bundle = _bundled_data_dir()
    # The wrapper is handed plain string paths rather than Path objects.
    return CUTETokenizerFast(
        tokenizer_file=str(bundle / "tokenizer.json"),
        cute_mapping_file=str(bundle / "cute_mapping.json"),
    )


# Names exported by a star-import of this package. Every entry is bound at
# module level above (via the relative imports or the `def` in this file).
# Entries are kept in ASCII (code-point) order: capitalised names first, then
# `__version__`, then lowercase function names.
__all__ = [
    "BaselineTokenizer",
    "CUTEConfig",
    "CUTETokenizerFast",
    "Cl100kBaseline",
    "NullBaseline",
    "PUAMapping",
    "__version__",
    "build_cute",
    "compute_candidate_score",
    "get_default_baseline",
    "load_default_tokenizer",
    "load_mapping",
    "save_mapping",
    "select_by_savings",
]