"""PUA candidate selection.

Two strategies are provided:

* `select_by_savings` — the production path. Ranks candidates by
  `frequency * (baseline_token_cost - 1)` so PUA slots go to tokens that
  actually save tokens vs the baseline (cl100k). Tokens whose baseline
  cost is 1 (single byte/char) score 0 — byte fallback already handles
  them optimally.

* `select_by_coverage` — the legacy v0 strategy. Ranks by raw frequency.
  Kept for callers that haven't migrated; the
  `select_by_coverage_deprecated` shim emits a `DeprecationWarning` and
  then delegates to it.

The savings strategy also applies a tier-aware penalty: tokens beyond
`PUA_BMP_SIZE` would be assigned a 4-byte supplementary-plane PUA char
(vs 3 bytes for BMP). For tokens whose token-savings == 1, the byte cost
of substitution can erase the saving, so by default we cap the budget at
`PUA_BMP_SIZE` unless the caller opts into supplementary planes.
"""

from __future__ import annotations

import warnings
from collections import Counter

import regex as _re

from .baseline import BaselineTokenizer
from .pua import PUA_BMP_SIZE

# Default cap on token length, in characters, for PUA eligibility.
DEFAULT_MAX_LEN = 50

# Shape detectors for hashes, UUIDs, and base64-style blobs, which would
# waste PUA slots. They are deliberately loose: a false positive only sends
# the token to byte fallback instead of giving it a PUA slot, which is fine.
_HEX_HASH_RE = _re.compile(r"^[0-9a-f]{16,}$", flags=_re.IGNORECASE)
_UUID_RE = _re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
    flags=_re.IGNORECASE,
)
# Matches 24+ characters drawn from the base64 / base64url alphabet (plus
# `=` padding). Shape alone does not make a token a blob: the caller,
# `_looks_like_secret_or_hash`, also counts lowercase-letter runs so that
# long wordy identifiers are not rejected.
_BASE64_BLOB_RE = _re.compile(r"^[A-Za-z0-9+/=_\-]{24,}$")


def _looks_like_secret_or_hash(tok: str) -> bool:
    """Return True when `tok` has the shape of a hash, UUID, or base64 blob.

    UUID-shaped and hex-only (16+ characters) tokens are flagged outright.
    A token of 24+ base64-alphabet characters is flagged only when it
    contains fewer than two runs of three or more consecutive lowercase
    letters; two or more such runs mark it as a "wordy" identifier.
    """
    if _UUID_RE.match(tok) or _HEX_HASH_RE.match(tok):
        return True

    if len(tok) < 24 or not _BASE64_BLOB_RE.match(tok):
        return False

    # The token has blob shape. Real identifiers tend to contain several
    # lowercase word fragments, while base64 interleaves upper/lower/digit.
    # Note that one unbroken lowercase stretch counts as a single run, so
    # such a token is still reported as a blob.
    wordy_runs = _re.findall(r"[a-z]{3,}", tok)
    return len(wordy_runs) < 2


def is_good_token(tok: str, max_len: int = DEFAULT_MAX_LEN) -> bool:
    """Decide whether `tok` is eligible for a PUA slot.

    A token is rejected when any of the following holds:
    - it is empty, consists only of whitespace, or has more than
      `max_len` characters;
    - it encodes to a single UTF-8 byte (byte fallback already handles
      it optimally, so a PUA assignment is at best a wash);
    - it looks like a hash, UUID, or long base64 blob (these waste PUA
      slots and have near-zero cross-document reuse).
    """
    unusable = not tok or tok.isspace() or len(tok) > max_len
    if unusable:
        return False

    # The secret/hash heuristic is only consulted for multi-byte tokens.
    single_byte = len(tok.encode("utf-8")) < 2
    return not (single_byte or _looks_like_secret_or_hash(tok))


def compute_candidate_score(
    token: str,
    frequency: int,
    baseline: BaselineTokenizer,
) -> float:
    """Score a candidate as `frequency` times tokens saved per occurrence.

    Tokens saved per occurrence is `baseline.count_tokens(token) - 1`,
    floored at zero: replacing the token with one PUA char leaves a single
    token behind. A baseline cost of 1 therefore scores 0 (the substitution
    would spend a PUA slot without shortening the sequence), while a
    baseline cost of 5 at frequency 1000 scores 4000.

    The score is tier-agnostic. Whether a candidate would land in the BMP
    or a supplementary plane depends on how many candidates outrank it, so
    that concern is left to `select_by_savings`, which caps the budget.
    """
    tokens_saved = baseline.count_tokens(token) - 1
    per_occurrence = tokens_saved if tokens_saved > 0 else 0
    return frequency * per_occurrence


def select_by_savings(
    freq: Counter[str],
    baseline: BaselineTokenizer,
    *,
    vocab_budget: int,
    max_len: int = DEFAULT_MAX_LEN,
    min_score: float = 1.0,
    allow_supplementary_pua: bool = False,
) -> list[str]:
    """Pick the PUA candidates that save the most baseline tokens.

    Parameters
    ----------
    freq
        Token frequency counter.
    baseline
        Baseline tokenizer passed to `compute_candidate_score`.
    vocab_budget
        Maximum number of tokens to return. A value of zero or less
        returns an empty list. Clamped to `PUA_BMP_SIZE` unless
        `allow_supplementary_pua` is True.
    max_len
        Tokens longer than this are rejected by `is_good_token`.
    min_score
        Candidates scoring below this are dropped. The default of 1.0
        requires at least one token saved across the whole corpus.
    allow_supplementary_pua
        When True, `vocab_budget` is used as given and no BMP clamp is
        applied. Off by default because supplementary-plane PUA chars are
        4 bytes each, so net savings can go negative for candidates whose
        baseline cost is 2.
        NOTE(review): this function applies no `PUA_TOTAL` limit itself;
        presumably the caller enforces it — confirm.

    Returns
    -------
    Tokens in deterministic order: descending score, ascending lex for ties.
    """
    if vocab_budget <= 0:
        return []

    limit = vocab_budget
    if not allow_supplementary_pua:
        limit = min(limit, PUA_BMP_SIZE)

    candidates: list[tuple[float, str]] = []
    for token, count in freq.items():
        if is_good_token(token, max_len=max_len):
            score = compute_candidate_score(token, count, baseline)
            if not score < min_score:
                candidates.append((score, token))

    # Highest score first; the token itself breaks ties so the result does
    # not depend on the counter's iteration order.
    ranked = sorted(candidates, key=lambda pair: (-pair[0], pair[1]))
    return [token for _, token in ranked[:limit]]


def select_by_coverage(
    freq: Counter[str],
    coverage_target: float = 0.90,
    max_len: int = DEFAULT_MAX_LEN,
    max_tokens: int | None = None,
) -> list[str]:
    """Frequency-based selection (deprecated, retained for backward compat).

    Walks tokens in descending frequency order (ascending lex order for
    ties), skips any that fail `is_good_token`, and stops at the first of:
    cumulative coverage ≥ `coverage_target`, `max_tokens` selected, or
    input exhausted. New code should use `select_by_savings`.

    Parameters
    ----------
    freq
        Token frequency counter.
    coverage_target
        Fraction in (0, 1] of the total frequency to cover. The total
        includes tokens rejected by `is_good_token`, so the target may be
        unreachable; every eligible token is then returned (subject to
        `max_tokens`).
    max_len
        Reject tokens longer than this.
    max_tokens
        Optional cap on the number of selected tokens. Zero or a negative
        value selects nothing.

    Returns
    -------
    Selected tokens, most frequent first.

    Raises
    ------
    ValueError
        If `coverage_target` is outside (0, 1].
    """
    if not 0.0 < coverage_target <= 1.0:
        raise ValueError(f"coverage_target must be in (0,1], got {coverage_target}")

    # The loop below appends before it checks the cap, so a non-positive
    # cap has to be handled up front or one token would slip through.
    if max_tokens is not None and max_tokens <= 0:
        return []

    total = sum(freq.values())
    if total == 0:
        return []

    sorted_items = sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))

    selected: list[str] = []
    cumulative = 0
    threshold = coverage_target * total

    for token, count in sorted_items:
        if not is_good_token(token, max_len=max_len):
            continue
        selected.append(token)
        cumulative += count
        if max_tokens is not None and len(selected) >= max_tokens:
            break
        if cumulative >= threshold:
            break

    return selected


def coverage_of(freq: Counter[str], tokens: list[str]) -> float:
    """Fraction of total frequency covered by `tokens`. Used for diagnostics.

    Each distinct token is counted once, so duplicate entries in `tokens`
    cannot inflate the result. Tokens absent from `freq` contribute
    nothing. Returns 0.0 when the counts in `freq` sum to zero.
    """
    total = sum(freq.values())
    if total == 0:
        return 0.0
    # dict.fromkeys drops duplicates while keeping first-seen order.
    covered = sum(freq.get(tok, 0) for tok in dict.fromkeys(tokens))
    return covered / total


def select_by_coverage_deprecated(
    freq: Counter[str],
    coverage_target: float = 0.90,
    max_len: int = DEFAULT_MAX_LEN,
    max_tokens: int | None = None,
) -> list[str]:
    """Emit a `DeprecationWarning`, then run `select_by_coverage`.

    All arguments are forwarded unchanged and the result of
    `select_by_coverage` is returned as-is.
    """
    message = (
        "select_by_coverage is deprecated; use select_by_savings with a "
        "BaselineTokenizer for production builds."
    )
    # stacklevel=2 attributes the warning to the code that called this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return select_by_coverage(
        freq,
        coverage_target=coverage_target,
        max_len=max_len,
        max_tokens=max_tokens,
    )


# Names exported by `from <module> import *`. The underscore-prefixed
# regex constants and `_looks_like_secret_or_hash` are not exported.
__all__ = [
    "DEFAULT_MAX_LEN",
    "compute_candidate_score",
    "coverage_of",
    "is_good_token",
    "select_by_coverage",
    "select_by_coverage_deprecated",
    "select_by_savings",
]