"""Core type definitions for NedoTurkishTokenizer.

Defines the Token dataclass, SegmentationCandidate for the candidate-based
segmentation engine, token type constants, and punctuation character sets.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


# ── Token type constants ──────────────────────────────────────────────────────

# Each constant's value is its own name.

# Types that are not members of SPECIAL_TYPES
ROOT = "ROOT"
SUFFIX = "SUFFIX"
FOREIGN = "FOREIGN"
PUNCT = "PUNCT"

# Types collected in SPECIAL_TYPES below
NUM = "NUM"
DATE = "DATE"
UNIT = "UNIT"
URL = "URL"
MENTION = "MENTION"
HASHTAG = "HASHTAG"
EMOJI = "EMOJI"
ACRONYM = "ACRONYM"

# Token types standing for special, non-textual entities
SPECIAL_TYPES: frozenset[str] = frozenset(
    (NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM)
)

# Every recognized token type: the special ones plus the four remaining types
ALL_TYPES: frozenset[str] = SPECIAL_TYPES | frozenset((ROOT, SUFFIX, FOREIGN, PUNCT))

# ── Punctuation character set ────────────────────────────────────────────────

PUNCT_CHARS: frozenset[str] = frozenset().union(
    # ASCII punctuation and symbols, plus the en dash and em dash
    "'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~",
    # Curly single/double quotes, prime, and double/single guillemets
    "\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a",
    # Bullet, ellipsis, middle dot, degree, plus-minus, multiplication, division
    "\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7",
)

# ASCII digits; is_punct_token accepts these in addition to PUNCT_CHARS
_DIGITS: frozenset[str] = frozenset("0123456789")


def is_punct_token(text: str) -> bool:
    """Return True if *text* holds only punctuation-like or digit characters.

    Leading and trailing whitespace is ignored.  Each remaining character
    must be a member of ``PUNCT_CHARS``, an ASCII digit, or a
    non-alphabetic character above U+02FF.  Empty or whitespace-only
    input yields False.
    """
    core = text.strip()
    if not core:
        return False
    for ch in core:
        if ch in PUNCT_CHARS or ch in _DIGITS:
            continue
        # Outside the explicit sets, only non-letters beyond U+02FF pass.
        if ord(ch) > 0x02FF and not ch.isalpha():
            continue
        return False
    return True


# ── Token dataclass ──────────────────────────────────────────────────────────


@dataclass
class Token:
    """A single token as handled inside the tokenizer.

    Word boundaries are encoded in *text* itself: a token whose text
    begins with a space opens a new word, while a suffix continuing the
    current word carries no leading space.

    Optional annotations live in *metadata*; its keys are all prefixed
    with ``_`` (for example ``_caps``, ``_foreign``, ``_canonical``).
    """

    text: str
    token_type: str
    morph_pos: int = 0
    # default_factory gives every instance its own dict
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the token in the public API dict format.

        The core fields appear under the keys ``"token"``,
        ``"token_type"`` and ``"morph_pos"``.  Metadata entries are merged
        in after them, so a metadata key equal to a core key replaces
        that core value.
        """
        return {
            "token": self.text,
            "token_type": self.token_type,
            "morph_pos": self.morph_pos,
            **self.metadata,
        }


# ── Segmentation candidate ───────────────────────────────────────────────────


@dataclass
class SegmentationCandidate:
    """A scored proposal for splitting one word into tokens.

    For each word the candidate-generation engine emits several of these;
    the selection step then keeps the one with the highest score.

    *source* is a short, human-readable label naming the strategy that
    produced the candidate (e.g. ``"tdk_root"``, ``"suffix_chain"``,
    ``"foreign"``).
    """

    # The tokens making up this segmentation
    tokens: list[Token]
    # Score compared by the selection step; the highest wins
    score: float
    # Tag of the strategy that generated this candidate
    source: str