"""Word-level segmentation with candidate generation and selection.

This is the core of the tokenizer.  For each word it:
1. Generates multiple segmentation candidates (whole-word ROOT, suffix
   chains, foreign root, etc.)
2. Scores each candidate deterministically
3. Selects the highest-scoring segmentation

The scoring rules are transparent and tunable:
- TDK root match gives a large bonus
- Domain vocabulary match gives a moderate bonus
- Longer roots are preferred over shorter ones
- Each recognised suffix adds a small bonus
- Unknown / unvalidated roots get a low base score
"""

from __future__ import annotations

import re
from typing import Any

from ._domain_vocab import ALL_DOMAIN_ROOTS
from ._suffix_table import (
    SHORT_AMBIGUOUS_SUFFIXES,
    SUFFIX_ENTRIES,
    SUFFIX_MAP,
)
from .normalization import has_turkish_chars, turkish_lower
from .resources import load_proper_nouns, load_tdk_words
from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token

# ── Scoring constants ────────────────────────────────────────────────────────
# Why these values: TDK_BONUS dominates so that a TDK-validated root almost
# always wins over an unvalidated one.  SUFFIX_BONUS is small enough that
# over-segmentation (many tiny suffixes) doesn't beat a valid longer root.

_TDK_BONUS = 10         # Root found in TDK dictionary
_DOMAIN_BONUS = 8        # Root found in domain vocabulary
_SUFFIX_BONUS = 2        # Each recognised suffix
_ROOT_LEN_WEIGHT = 2     # Per-character bonus for root length (prefer longer roots)
# Extra bonus for the *unsplit* whole-word candidate when the entire word is
# a TDK entry, a proper noun, or a domain-vocabulary root.  KNOWN_INTACT
# words receive it as well.
_WHOLE_WORD_BONUS = 5
# Base score for a FOREIGN candidate (intentionally low).  One point per
# character is added on top of it, instead of _ROOT_LEN_WEIGHT.
_FOREIGN_BASE = 3
_UNKNOWN_BASE = 1        # Base score for unrecognised root
# Penalty for a suffix-stripped root of only _MIN_ROOT_LEN chars.  Shorter
# remainders are skipped before scoring, so "at most" and "exactly"
# _MIN_ROOT_LEN coincide in practice.
_SHORT_ROOT_PENALTY = 4
_MIN_ROOT_LEN = 2        # Minimum root length for suffix stripping
_MAX_SUFFIX_DEPTH = 5    # Maximum number of suffixes to strip

# ── Known-intact words ───────────────────────────────────────────────────────
# Common Turkish words that *look* like root+suffix but must stay whole.
# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
# (past tense suffix) because both are individually valid.
#
# This set covers inflected forms of very short verb stems (de-, ye-) and
# common discourse particles that happen to end in suffix-like sequences.
#
# Entries are compared against the turkish_lower() form of a word, so they
# must be written in lower case.  A match short-circuits generate_candidates:
# the word is returned as a single ROOT candidate and no suffix stripping or
# FOREIGN candidate is attempted.

KNOWN_INTACT: frozenset[str] = frozenset({
    # Forms of "demek" (to say) — stem "de" is a TDK conjunction,
    # causing false splits like de+di, de+miş, de+se, etc.
    "dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
    "demiş", "demişti", "demiştir",
    "dese", "desem", "desen", "desek",
    "der", "derim", "dersin", "deriz",
    "denir", "dendi", "denmiş",
    # Forms of "yemek" (to eat) — stem "ye" is in TDK
    "yemiş", "yese", "yesem", "yesen",
    "yer", "yerim", "yersin", "yeriz",
    "yenir", "yendi", "yenmiş",
    # Common particles / conjunctions that end in suffix-like sequences
    # (most already protected by TDK WHOLE_WORD_BONUS, but double-guarding)
    "diye", "niye", "nice",
})


# ── Punctuation splitting ────────────────────────────────────────────────────

# Matches one apostrophe: ASCII ' or the typographic U+2019.  Because the
# pattern is a capture group, re.split() with it keeps the apostrophe as its
# own list element.
# NOTE(review): not referenced in the visible part of this module — confirm
# it is used elsewhere before removing it.
_APOSTROPHE_RE = re.compile(r"(['\u2019])")

# Runs of non-word characters anchored at the start / end of a token.
# For str patterns "\w" is Unicode-aware, so Turkish letters, digits and "_"
# all count as word characters and are never stripped as punctuation.
_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")


def _split_punctuation(word: str) -> list[tuple[str, str]]:
    """Break a raw token into ``(text, type)`` pairs.

    Punctuation hugging either edge of the token is peeled off one character
    at a time and tagged ``"PUNCT"``; whatever is left in the middle is
    emitted as a single ``"WORD"`` pair.  Characters inside the core (for
    example an apostrophe between letters) are left untouched.

    Example: ``'"hello,'`` → ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``

    Args:
        word: One whitespace-delimited token.

    Returns:
        The pairs in reading order.  Empty input yields an empty list, and a
        token that ``is_punct_token`` accepts is returned as one PUNCT pair
        without being split per character.
    """
    if not word:
        return []
    if is_punct_token(word):
        return [(word, "PUNCT")]

    # Peel the leading run of non-word characters, if any.
    head_match = _LEADING_PUNCT_RE.match(word)
    head = head_match.group(1) if head_match else ""
    core = word[head_match.end():] if head_match else word

    # Peel the trailing run from what is left after the leading strip.
    tail_match = _TRAILING_PUNCT_RE.search(core)
    tail = ""
    if tail_match:
        tail = tail_match.group(1)
        core = core[:tail_match.start()]

    pieces: list[tuple[str, str]] = [(ch, "PUNCT") for ch in head]
    if core:
        pieces.append((core, "WORD"))
    pieces.extend((ch, "PUNCT") for ch in tail)
    return pieces


# ── Word splitting ───────────────────────────────────────────────────────────

def split_into_words(text: str) -> list[str]:
    """Return the whitespace-delimited tokens of *text* in order.

    Runs of whitespace act as a single separator and leading/trailing
    whitespace produces no empty tokens.  Casing and any punctuation attached
    to a token are left exactly as they appear in the input.
    """
    words: list[str] = text.split()
    return words


# ── Candidate generation ────────────────────────────────────────────────────

def _generate_suffix_candidates(
    word_lower: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    depth: int = 0,
) -> list[SegmentationCandidate]:
    """Build ROOT + SUFFIX... candidates by peeling suffixes off the end.

    Every entry of ``SUFFIX_ENTRIES`` that ``word_lower`` ends with is tried
    in table order.  When the remaining stem is a TDK or domain root, a
    two-token candidate is emitted.  Independently of that, the stem is fed
    back into this function so that longer suffix chains can be found; each
    chain returned from the recursion gets the current suffix appended.

    Args:
        word_lower: Lower-cased word (or stem, when recursing).
        tdk: TDK dictionary entries.
        domain_roots: Domain vocabulary roots.
        depth: Number of suffixes already stripped by enclosing calls.

    Returns:
        Candidates in discovery order (unsorted).  Empty when ``depth`` has
        reached ``_MAX_SUFFIX_DEPTH`` or the word is shorter than
        ``_MIN_ROOT_LEN``.
    """
    if depth >= _MAX_SUFFIX_DEPTH or len(word_lower) < _MIN_ROOT_LEN:
        return []

    # The callee would bail out at the depth limit anyway; checking here
    # just avoids the pointless call.
    may_recurse = depth < _MAX_SUFFIX_DEPTH - 1
    found: list[SegmentationCandidate] = []

    for surface, label in SUFFIX_ENTRIES:
        if not word_lower.endswith(surface):
            continue

        stem = word_lower[: -len(surface)]
        stem_len = len(stem)
        if stem_len < _MIN_ROOT_LEN:
            continue

        # Very short / ambiguous suffixes need at least a 3-char stem.
        if stem_len < 3 and surface in SHORT_AMBIGUOUS_SUFFIXES:
            continue

        # One suffix token per table entry, shared by every candidate that
        # ends in this suffix.
        suffix_tok = Token(
            text=surface,
            token_type="SUFFIX",
            metadata={"_suffix_label": label},
        )

        stem_in_tdk = stem in tdk
        stem_in_domain = stem in domain_roots

        if stem_in_tdk or stem_in_domain:
            # TDK takes precedence over the domain bonus when both apply.
            score = stem_len * _ROOT_LEN_WEIGHT
            score += _TDK_BONUS if stem_in_tdk else _DOMAIN_BONUS
            # 2-char roots such as "de", "ye", "al" are valid TDK entries
            # but cause many false splits ("dedi" → de+di); the penalty
            # makes it harder for them to beat the whole-word hypothesis.
            if stem_len <= _MIN_ROOT_LEN:
                score -= _SHORT_ROOT_PENALTY
            score += _SUFFIX_BONUS

            if stem_in_domain:
                root_meta = {"_tdk": stem_in_tdk, "_domain": stem_in_domain}
            else:
                root_meta = {}
            found.append(SegmentationCandidate(
                tokens=[
                    Token(text=stem, token_type="ROOT", metadata=root_meta),
                    suffix_tok,
                ],
                score=score,
                source="suffix_chain",
            ))

        if may_recurse:
            # A deeper chain is kept only if its score clears this bar.
            floor = stem_len + _UNKNOWN_BASE
            found.extend(
                SegmentationCandidate(
                    tokens=chain.tokens + [suffix_tok],
                    score=chain.score + _SUFFIX_BONUS,
                    source="suffix_chain",
                )
                for chain in _generate_suffix_candidates(
                    stem, tdk, domain_roots, depth + 1
                )
                if chain.score > floor
            )

    return found


def generate_candidates(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[SegmentationCandidate]:
    """Produce every plausible segmentation of one punctuation-free word.

    The word is lower-cased with ``turkish_lower`` and then considered as:

    * a single whole-word ROOT (always present),
    * each ROOT + suffix chain found by ``_generate_suffix_candidates``,
    * a single FOREIGN token, when the word is neither in TDK nor a proper
      noun, contains no Turkish-specific characters and has 2+ characters.

    Words listed in ``KNOWN_INTACT`` skip all of that and come back as one
    ROOT candidate.

    Args:
        word: The word to segment.
        tdk: TDK dictionary entries.
        domain_roots: Domain vocabulary roots.
        caps_set: Lower-cased forms of words that were originally ALL CAPS.

    Returns:
        Candidates ordered by score, highest first.
    """
    lowered = turkish_lower(word)
    caps = lowered in caps_set
    turkish_chars = has_turkish_chars(lowered)

    # ── Fast path: known-intact words are never split ────────────────────
    if lowered in KNOWN_INTACT:
        intact_meta: dict[str, Any] = {"_caps": True} if caps else {}
        intact_root = Token(
            text=lowered, token_type="ROOT", metadata=intact_meta
        )
        return [SegmentationCandidate(
            tokens=[intact_root],
            score=len(lowered) * _ROOT_LEN_WEIGHT + _TDK_BONUS + _WHOLE_WORD_BONUS,
            source="known_intact",
        )]

    # ── Whole word as a single ROOT ──────────────────────────────────────
    known_tdk = lowered in tdk
    known_proper = lowered in load_proper_nouns()
    known_domain = lowered in domain_roots

    if known_tdk or known_proper:
        # The extra whole-word bonus keeps valid dictionary words such as
        # "dünya" from being over-segmented into "dün" + "ya".
        recognition_bonus = _TDK_BONUS + _WHOLE_WORD_BONUS
    elif known_domain:
        recognition_bonus = _DOMAIN_BONUS + _WHOLE_WORD_BONUS
    else:
        recognition_bonus = _UNKNOWN_BASE

    whole_meta: dict[str, Any] = {}
    if caps:
        whole_meta["_caps"] = True
    if known_domain:
        whole_meta["_domain"] = True

    results: list[SegmentationCandidate] = [SegmentationCandidate(
        tokens=[Token(text=lowered, token_type="ROOT", metadata=whole_meta)],
        score=len(lowered) * _ROOT_LEN_WEIGHT + recognition_bonus,
        source="whole_word",
    )]

    # ── Suffix-stripped chains ───────────────────────────────────────────
    chains = _generate_suffix_candidates(lowered, tdk, domain_roots)
    if caps:
        # The first token of a chain is its root; carry the caps flag on it.
        for chain in chains:
            if chain.tokens:
                chain.tokens[0].metadata["_caps"] = True
    results.extend(chains)

    # ── Foreign root ─────────────────────────────────────────────────────
    if not (known_tdk or known_proper or turkish_chars) and len(lowered) >= 2:
        # Flat weight of 1 per character (not _ROOT_LEN_WEIGHT) so that a
        # valid suffix chain with a TDK root always beats FOREIGN.
        results.append(SegmentationCandidate(
            tokens=[Token(
                text=lowered, token_type="FOREIGN",
                metadata={"_foreign": True},
            )],
            score=_FOREIGN_BASE + len(lowered),
            source="foreign",
        ))

    # Stable sort: equal scores keep their generation order.
    return sorted(results, key=lambda cand: cand.score, reverse=True)


# ── Candidate selection ──────────────────────────────────────────────────────

def select_best_candidate(
    candidates: list[SegmentationCandidate],
) -> SegmentationCandidate:
    """Select the best segmentation among candidates.

    Picks the highest-scoring candidate.  Ties are broken by:
    1. Fewer tokens (less fragmentation)
    2. Longer root token
    If candidates are still tied after both rules, the one that appears
    earliest in ``candidates`` wins.

    The input does not need to be sorted: the maximum score is computed
    here rather than being read off the first element.

    Args:
        candidates: Segmentation hypotheses for one word, in any order.

    Returns:
        The winning candidate (the same object that was passed in).  For an
        empty list, a fallback candidate holding a single empty ROOT token
        with score 0.0 and source ``"fallback"``.
    """
    if not candidates:
        # Fallback: should never happen, but safety net
        return SegmentationCandidate(
            tokens=[Token(text="", token_type="ROOT")],
            score=0.0,
            source="fallback",
        )

    best_score = max(c.score for c in candidates)
    tied = [c for c in candidates if c.score == best_score]

    def _tie_key(c: SegmentationCandidate) -> tuple[int, int]:
        # Smaller key wins: fewer tokens first, then the longer ROOT
        # (negated so that a longer root sorts lower).
        root_len = max(
            (len(t.text) for t in c.tokens if t.token_type == "ROOT"),
            default=0,
        )
        return (len(c.tokens), -root_len)

    # min() returns the first minimal element, so the original order of
    # fully-tied candidates is respected.
    return min(tied, key=_tie_key)


# ── Full word segmentation ───────────────────────────────────────────────────

def segment_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Turn one raw whitespace-delimited token into output token dicts.

    Per-word entry point: edge punctuation is split off first, then each
    remaining word part is routed either to the apostrophe handler or to
    candidate generation + selection.  Only the very first emitted token
    carries a leading space, marking the word boundary.

    Args:
        word: Raw word string (may include surrounding punctuation).
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        List of token dicts ready for inclusion in the output.
    """
    out: list[dict[str, object]] = []
    at_start = True  # True until the first part has been emitted

    for text, kind in _split_punctuation(word):
        if kind == "PUNCT":
            out.append({
                "token": (" " if at_start else "") + text,
                "token_type": "PUNCT",
                "morph_pos": 0,
                "_punct": True,
            })
        elif "'" in text or "\u2019" in text:
            # Word with an inner apostrophe (e.g. proper name + suffix).
            pieces = _segment_apostrophe_word(text, tdk, domain_roots, caps_set)
            if at_start and pieces:
                pieces[0]["token"] = f" {pieces[0]['token'].lstrip()}"
            out.extend(pieces)
        else:
            # Regular word: best-scoring segmentation wins.
            winner = select_best_candidate(
                generate_candidates(text, tdk, domain_roots, caps_set)
            )
            for pos, tok in enumerate(winner.tokens):
                entry = tok.to_dict()
                if pos == 0 and at_start:
                    entry["token"] = f" {entry['token'].lstrip()}"
                # morph_pos is simply the index within the word.
                entry["morph_pos"] = pos
                out.append(entry)

        at_start = False

    return out


def _segment_apostrophe_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Segment a word containing an apostrophe.

    Splits at the apostrophe and determines whether the base is Turkish
    (proper name) or foreign:

    * Turkish base → ROOT, PUNCT(``'``), then SUFFIX if anything follows.
    * Foreign base → FOREIGN, then SUFFIX if anything follows (no PUNCT
      token is emitted in this case).

    An ASCII apostrophe is looked for first; the typographic U+2019 is only
    used as the split point when no ASCII apostrophe is present.  The PUNCT
    token is always emitted as the ASCII ``'``.

    Args:
        word: Word text with edge punctuation already removed.
        tdk: TDK dictionary set (used only for the no-apostrophe fallback).
        domain_roots: Domain vocabulary set (fallback only).
        caps_set: Lower-cased forms of words that were originally ALL CAPS.

    Returns:
        Token dicts for the word, in order.  The base keeps its original
        casing.
    """
    from .apostrophe import is_turkish_base  # avoid circular at module level

    # Find the apostrophe position
    apo_pos = word.find("'")
    if apo_pos == -1:
        apo_pos = word.find("\u2019")
    if apo_pos == -1:
        # No apostrophe found (shouldn't happen) — treat as regular word
        best = select_best_candidate(
            generate_candidates(word, tdk, domain_roots, caps_set)
        )
        return [t.to_dict() for t in best.tokens]

    base = word[:apo_pos]
    suffix = word[apo_pos + 1:]

    tokens: list[dict[str, object]]
    if is_turkish_base(base):
        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
        root: dict[str, object] = {
            "token": base, "token_type": "ROOT", "morph_pos": 0,
        }
        if turkish_lower(base) in caps_set:
            root["_caps"] = True
        tokens = [
            root,
            {
                "token": "'", "token_type": "PUNCT", "morph_pos": 0,
                "_punct": True,
            },
        ]
    else:
        # Foreign word: FOREIGN + SUFFIX
        tokens = [
            {
                "token": base, "token_type": "FOREIGN", "morph_pos": 0,
                "_foreign": True,
            },
        ]

    if suffix:
        # Look the suffix up with the same Turkish-aware lower-casing that
        # is applied to the base.  Plain str.lower() turns "İ" into "i" +
        # U+0307 and "I" into "i" (not "ı"), so upper-case suffixes such as
        # "DAKİ" would miss SUFFIX_MAP and fall back to the generic label.
        suffix_label = SUFFIX_MAP.get(turkish_lower(suffix), "-SFX")
        tokens.append({
            "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
            "_apo_suffix": True, "_suffix_label": suffix_label,
        })
    return tokens