"""Word-level segmentation with candidate generation and selection.

This is the core of the tokenizer.  For each word it:

1. Generates multiple segmentation candidates (whole-word ROOT,
   suffix chains, foreign root, etc.)
2. Scores each candidate deterministically
3. Selects the highest-scoring segmentation

The scoring rules are transparent and tunable:

- TDK root match gives a large bonus
- Domain vocabulary match gives a moderate bonus
- Longer roots are preferred over shorter ones
- Each recognised suffix adds a small bonus
- Unknown / unvalidated roots get a low base score
"""

from __future__ import annotations

import re
from typing import Any

from ._domain_vocab import ALL_DOMAIN_ROOTS
from ._suffix_table import (
    SHORT_AMBIGUOUS_SUFFIXES,
    SUFFIX_ENTRIES,
    SUFFIX_MAP,
)
from .normalization import has_turkish_chars, turkish_lower
from .resources import load_proper_nouns, load_tdk_words
from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token

# ── Scoring constants ────────────────────────────────────────────────────────
# Why these values: TDK_BONUS dominates so that a TDK-validated root almost
# always wins over an unvalidated one.  SUFFIX_BONUS is small enough that
# over-segmentation (many tiny suffixes) doesn't beat a valid longer root.
_TDK_BONUS = 10          # Root found in TDK dictionary
_DOMAIN_BONUS = 8        # Root found in domain vocabulary
_SUFFIX_BONUS = 2        # Each recognised suffix
_ROOT_LEN_WEIGHT = 2     # Per-character bonus for root length (prefer longer roots)
_WHOLE_WORD_BONUS = 5    # Extra bonus when the *entire* unsplit word is in TDK
_FOREIGN_BASE = 3        # Base score for foreign root (intentionally low)
_UNKNOWN_BASE = 1        # Base score for unrecognised root
_SHORT_ROOT_PENALTY = 4  # Penalty when root is exactly _MIN_ROOT_LEN chars
_MIN_ROOT_LEN = 2        # Minimum root length for suffix stripping
_MAX_SUFFIX_DEPTH = 5    # Maximum number of suffixes to strip

# ── Known-intact words ───────────────────────────────────────────────────────
# Common Turkish words that *look* like root+suffix but must stay whole.
# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
# (past tense suffix) because both are individually valid.
#
# This set covers inflected forms of very short verb stems (de-, ye-) and
# common discourse particles that happen to end in suffix-like sequences.
KNOWN_INTACT: frozenset[str] = frozenset({
    # Forms of "demek" (to say) — stem "de" is a TDK conjunction,
    # causing false splits like de+di, de+miş, de+se, etc.
    "dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
    "demiş", "demişti", "demiştir",
    "dese", "desem", "desen", "desek",
    "der", "derim", "dersin", "deriz",
    "denir", "dendi", "denmiş",
    # Forms of "yemek" (to eat) — stem "ye" is in TDK
    "yemiş", "yese", "yesem", "yesen",
    "yer", "yerim", "yersin", "yeriz",
    "yenir", "yendi", "yenmiş",
    # Common particles / conjunctions that end in suffix-like sequences
    # (most already protected by TDK WHOLE_WORD_BONUS, but double-guarding)
    "diye", "niye", "nice",
})

# ── Punctuation splitting ────────────────────────────────────────────────────

# Matches a single apostrophe: ASCII (') or typographic (U+2019).
_APOSTROPHE_RE = re.compile(r"(['\u2019])")

# Match the run of non-word characters at the start / end of a token.
_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")


def _split_punctuation(word: str) -> list[tuple[str, str]]:
    """Split a raw word token into ``(text, type)`` pairs.

    Leading and trailing punctuation is separated from the core word, one
    pair per punctuation character.  For example:
    ``'"hello,'`` → ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``

    Args:
        word: A whitespace-delimited token, possibly with punctuation.

    Returns:
        Pairs whose type is ``"PUNCT"`` or ``"WORD"``, in surface order.
        Empty for an empty *word*.  A token that ``is_punct_token`` accepts
        is returned as a single ``"PUNCT"`` pair without being split.
    """
    if not word:
        return []

    # A token that is punctuation as a whole is kept as one PUNCT item.
    if is_punct_token(word):
        return [(word, "PUNCT")]

    parts: list[tuple[str, str]] = []

    # Peel off leading punctuation, one entry per character.
    lead_m = _LEADING_PUNCT_RE.match(word)
    if lead_m:
        parts.extend((ch, "PUNCT") for ch in lead_m.group(1))
        word = word[lead_m.end():]

    # Peel off trailing punctuation; it is appended after the core word.
    trailing: list[tuple[str, str]] = []
    trail_m = _TRAILING_PUNCT_RE.search(word)
    if trail_m:
        trailing.extend((ch, "PUNCT") for ch in trail_m.group(1))
        word = word[:trail_m.start()]

    if word:
        parts.append((word, "WORD"))
    parts.extend(trailing)
    return parts


# ── Word splitting ───────────────────────────────────────────────────────────

def split_into_words(text: str) -> list[str]:
    """Split text into whitespace-delimited word tokens.

    Preserves the original casing and punctuation within each token.
    Runs of whitespace collapse, so empty tokens are never produced.
    """
    return text.split()


# ── Candidate generation ─────────────────────────────────────────────────────

def _generate_suffix_candidates(
    word_lower: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    depth: int = 0,
) -> list[SegmentationCandidate]:
    """Recursively generate segmentation candidates by stripping suffixes.

    Every entry of ``SUFFIX_ENTRIES`` that ends *word_lower* is tried, in
    table order.  If the remainder is a valid root (TDK or domain), a
    ROOT + SUFFIX candidate is produced.  Independently of that, the
    remainder is searched recursively for further suffixes, and each
    accepted sub-result is extended with the current suffix.

    Args:
        word_lower: Lower-cased surface form still to be analysed.
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        depth: Current recursion depth (number of suffixes already stripped).

    Returns:
        Candidates in discovery order (not sorted by score).  Empty when the
        depth limit is reached or the word is shorter than ``_MIN_ROOT_LEN``.
    """
    if depth >= _MAX_SUFFIX_DEPTH or len(word_lower) < _MIN_ROOT_LEN:
        return []

    candidates: list[SegmentationCandidate] = []

    for suffix_surface, suffix_label in SUFFIX_ENTRIES:
        if not word_lower.endswith(suffix_surface):
            continue

        remainder = word_lower[: -len(suffix_surface)]
        if len(remainder) < _MIN_ROOT_LEN:
            continue

        # Extra caution for very short / ambiguous suffixes
        if suffix_surface in SHORT_AMBIGUOUS_SUFFIXES and len(remainder) < 3:
            continue

        suffix_token = Token(
            text=suffix_surface,
            token_type="SUFFIX",
            metadata={"_suffix_label": suffix_label},
        )

        root_in_tdk = remainder in tdk
        root_in_domain = remainder in domain_roots

        if root_in_tdk or root_in_domain:
            # Valid root found → create single-level candidate.
            root_score = len(remainder) * _ROOT_LEN_WEIGHT
            root_score += _TDK_BONUS if root_in_tdk else _DOMAIN_BONUS

            # Penalise very short roots: 2-char roots like "de", "ye", "al"
            # are valid TDK entries but produce many false splits on short
            # words (e.g. "dedi" → de+di).  The penalty makes it harder for
            # a 2-char root to beat the whole-word hypothesis.
            if len(remainder) <= _MIN_ROOT_LEN:
                root_score -= _SHORT_ROOT_PENALTY

            root_token = Token(
                text=remainder,
                token_type="ROOT",
                # Metadata is only attached for domain roots.
                metadata=(
                    {"_tdk": root_in_tdk, "_domain": root_in_domain}
                    if root_in_domain
                    else {}
                ),
            )
            candidates.append(SegmentationCandidate(
                tokens=[root_token, suffix_token],
                score=root_score + _SUFFIX_BONUS,
                source="suffix_chain",
            ))

        # Recurse: try stripping more suffixes from the remainder.
        if depth < _MAX_SUFFIX_DEPTH - 1:
            sub_candidates = _generate_suffix_candidates(
                remainder, tdk, domain_roots, depth + 1
            )
            # Only accept recursive results that score above the threshold
            # below.  NOTE(review): the threshold uses the unweighted
            # remainder length, not _ROOT_LEN_WEIGHT — confirm intent.
            threshold = len(remainder) + _UNKNOWN_BASE
            for sc in sub_candidates:
                if sc.score > threshold:
                    candidates.append(SegmentationCandidate(
                        tokens=sc.tokens + [suffix_token],
                        score=sc.score + _SUFFIX_BONUS,
                        source="suffix_chain",
                    ))

    return candidates


def generate_candidates(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[SegmentationCandidate]:
    """Generate all plausible segmentation candidates for a single word.

    Args:
        word: Word without surrounding punctuation; it is lower-cased with
            ``turkish_lower`` before any lookup.
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Lower-cased words to be flagged with ``_caps`` metadata.

    Returns:
        Candidates sorted by score, highest first.  Words in
        ``KNOWN_INTACT`` yield exactly one whole-word candidate.
    """
    wl = turkish_lower(word)
    is_caps = wl in caps_set
    is_tr_chars = has_turkish_chars(wl)

    # ── Fast path: known-intact words bypass candidate generation ────────
    # These are common words that look splittable but must stay whole.
    if wl in KNOWN_INTACT:
        root_meta_intact: dict[str, Any] = {}
        if is_caps:
            root_meta_intact["_caps"] = True
        return [SegmentationCandidate(
            tokens=[Token(text=wl, token_type="ROOT", metadata=root_meta_intact)],
            score=len(wl) * _ROOT_LEN_WEIGHT + _TDK_BONUS + _WHOLE_WORD_BONUS,
            source="known_intact",
        )]

    candidates: list[SegmentationCandidate] = []

    # ── Candidate 1: whole word as ROOT ──────────────────────────────────
    in_tdk = wl in tdk
    in_proper = wl in load_proper_nouns()
    in_domain = wl in domain_roots

    whole_score = len(wl) * _ROOT_LEN_WEIGHT
    if in_tdk or in_proper:
        # Whole-word TDK/proper-noun match gets an extra bonus to prevent
        # over-segmenting valid dictionary words like "dünya" into
        # "dün" + "ya".
        whole_score += _TDK_BONUS + _WHOLE_WORD_BONUS
    elif in_domain:
        whole_score += _DOMAIN_BONUS + _WHOLE_WORD_BONUS
    else:
        whole_score += _UNKNOWN_BASE

    root_meta: dict[str, Any] = {}
    if is_caps:
        root_meta["_caps"] = True
    if in_domain:
        root_meta["_domain"] = True

    candidates.append(SegmentationCandidate(
        tokens=[Token(text=wl, token_type="ROOT", metadata=root_meta)],
        score=whole_score,
        source="whole_word",
    ))

    # ── Candidate 2+: suffix stripping ───────────────────────────────────
    for sc in _generate_suffix_candidates(wl, tdk, domain_roots):
        # Propagate caps flag to the root token
        if is_caps and sc.tokens:
            sc.tokens[0].metadata["_caps"] = True
        candidates.append(sc)

    # ── Candidate N: foreign root ────────────────────────────────────────
    if not in_tdk and not in_proper and not is_tr_chars and len(wl) >= 2:
        foreign_token = Token(
            text=wl,
            token_type="FOREIGN",
            metadata={"_foreign": True},
        )
        # Foreign score uses flat weight 1 (not ROOT_LEN_WEIGHT) so that
        # valid suffix chains with a TDK root always beat FOREIGN.
        # NOTE(review): with the current constants the whole-word ROOT
        # candidate scores at least 2*len + 1, which is >= len + 3 for every
        # len >= 2, and the tie at len == 2 is broken in favour of ROOT.  The
        # FOREIGN candidate therefore never ranks first here — confirm that
        # this is intended.
        candidates.append(SegmentationCandidate(
            tokens=[foreign_token],
            score=_FOREIGN_BASE + len(wl),
            source="foreign",
        ))

    # Sort by score descending (highest first)
    candidates.sort(key=lambda c: c.score, reverse=True)
    return candidates


# ── Candidate selection ──────────────────────────────────────────────────────

def select_best_candidate(
    candidates: list[SegmentationCandidate],
) -> SegmentationCandidate:
    """Select the best segmentation among candidates.

    Picks the highest-scoring candidate.  The input does not have to be
    sorted.  Ties are broken by:

    1. Fewer tokens (less fragmentation)
    2. Longer root token

    If a tie remains, the candidate that appears first in *candidates* wins.

    Args:
        candidates: Candidates to choose from, in any order.

    Returns:
        The winning candidate, or an empty-ROOT ``"fallback"`` candidate
        with score ``0.0`` when *candidates* is empty.
    """
    if not candidates:
        # Fallback: should never happen, but safety net
        return SegmentationCandidate(
            tokens=[Token(text="", token_type="ROOT")],
            score=0.0,
            source="fallback",
        )

    # Compute the maximum explicitly instead of trusting candidates[0], so
    # callers that pass an unsorted list still get the top-scoring entry.
    best_score = max(c.score for c in candidates)
    tied = [c for c in candidates if c.score == best_score]
    if len(tied) == 1:
        return tied[0]

    # Tie-breaking: fewer tokens first; then longer root
    def _tie_key(c: SegmentationCandidate) -> tuple[int, int]:
        root_len = max(
            (len(t.text) for t in c.tokens if t.token_type == "ROOT"),
            default=0,
        )
        return (len(c.tokens), -root_len)

    # min() returns the first minimal element, like a stable sort would.
    return min(tied, key=_tie_key)


# ── Full word segmentation ───────────────────────────────────────────────────

def segment_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Segment a single word into token dicts.

    This is the main entry point for per-word segmentation.  It handles
    punctuation splitting, candidate generation, and selection.  The first
    emitted token of the word gets a single leading space.

    Args:
        word: Raw word string (may include surrounding punctuation).
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        List of token dicts ready for inclusion in the output.
    """
    result: list[dict[str, object]] = []
    is_first = True  # True until the first token of this word is emitted

    for text, part_type in _split_punctuation(word):
        if part_type == "PUNCT":
            prefix = " " if is_first else ""
            result.append({
                "token": f"{prefix}{text}",
                "token_type": "PUNCT",
                "morph_pos": 0,
                "_punct": True,
            })
            is_first = False
            continue

        # part_type == "WORD"
        # Words with an inner apostrophe get their own segmentation path.
        if "'" in text or "\u2019" in text:
            apo_tokens = _segment_apostrophe_word(text, tdk, domain_roots, caps_set)
            for i, t in enumerate(apo_tokens):
                if i == 0 and is_first:
                    t["token"] = f" {t['token'].lstrip()}"
                result.append(t)
            is_first = False
            continue

        # Standard word segmentation via candidate generation
        candidates = generate_candidates(text, tdk, domain_roots, caps_set)
        best = select_best_candidate(candidates)

        for i, token in enumerate(best.tokens):
            tok_dict = token.to_dict()
            # Add leading space to the first token of this word
            if i == 0 and is_first:
                tok_dict["token"] = f" {tok_dict['token'].lstrip()}"
            # morph_pos is the token's index within the word (root is 0).
            tok_dict["morph_pos"] = i
            result.append(tok_dict)

        is_first = False

    return result


def _segment_apostrophe_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Segment a word containing an apostrophe.

    Splits at the first apostrophe (ASCII ``'`` or U+2019) and determines
    whether the base is Turkish (proper name) or foreign:

    - Turkish base: ROOT + PUNCT(') + SUFFIX
    - Foreign base: FOREIGN + SUFFIX (no apostrophe token is emitted)

    The SUFFIX token is omitted when nothing follows the apostrophe.  Its
    label is looked up in ``SUFFIX_MAP`` and defaults to ``"-SFX"``.

    Args:
        word: Word without surrounding punctuation.
        tdk: TDK dictionary set (used only by the no-apostrophe fallback).
        domain_roots: Domain vocabulary set (fallback only).
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        Token dicts in surface order.
    """
    from .apostrophe import is_turkish_base  # avoid circular at module level

    # Split at whichever apostrophe comes first, so the base never keeps one.
    apo_match = _APOSTROPHE_RE.search(word)
    if apo_match is None:
        # No apostrophe found (shouldn't happen) — treat as regular word
        candidates = generate_candidates(word, tdk, domain_roots, caps_set)
        best = select_best_candidate(candidates)
        fallback: list[dict[str, object]] = []
        for i, token in enumerate(best.tokens):
            tok_dict = token.to_dict()
            tok_dict["morph_pos"] = i  # same convention as segment_word
            fallback.append(tok_dict)
        return fallback

    apo_pos = apo_match.start()
    base = word[:apo_pos]
    suffix = word[apo_pos + 1:]

    tokens: list[dict[str, object]]
    if is_turkish_base(base):
        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
        root: dict[str, object] = {
            "token": base,
            "token_type": "ROOT",
            "morph_pos": 0,
        }
        if turkish_lower(base) in caps_set:
            root["_caps"] = True
        tokens = [
            root,
            {
                "token": "'",
                "token_type": "PUNCT",
                "morph_pos": 0,
                "_punct": True,
            },
        ]
    else:
        # Foreign word: FOREIGN + SUFFIX
        tokens = [
            {
                "token": base,
                "token_type": "FOREIGN",
                "morph_pos": 0,
                "_foreign": True,
            },
        ]

    if suffix:
        # Use Turkish-aware lower-casing for the lookup: plain str.lower()
        # maps "I" to "i" and "İ" to "i" + U+0307, so upper-case Turkish
        # suffixes would miss their SUFFIX_MAP entry.
        tokens.append({
            "token": suffix,
            "token_type": "SUFFIX",
            "morph_pos": 1,
            "_apo_suffix": True,
            "_suffix_label": SUFFIX_MAP.get(turkish_lower(suffix), "-SFX"),
        })

    return tokens