Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7 | """Word-level segmentation with candidate generation and selection. | |
| This is the core of the tokenizer. For each word it: | |
| 1. Generates multiple segmentation candidates (whole-word ROOT, suffix | |
| chains, foreign root, etc.) | |
| 2. Scores each candidate deterministically | |
| 3. Selects the highest-scoring segmentation | |
| The scoring rules are transparent and tunable: | |
| - TDK root match gives a large bonus | |
| - Domain vocabulary match gives a moderate bonus | |
| - Longer roots are preferred over shorter ones | |
| - Each recognised suffix adds a small bonus | |
| - Unknown / unvalidated roots get a low base score | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from typing import Any | |
| from ._domain_vocab import ALL_DOMAIN_ROOTS | |
| from ._suffix_table import ( | |
| SHORT_AMBIGUOUS_SUFFIXES, | |
| SUFFIX_ENTRIES, | |
| SUFFIX_MAP, | |
| ) | |
| from .normalization import has_turkish_chars, turkish_lower | |
| from .resources import load_proper_nouns, load_tdk_words | |
| from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token | |
# ── Scoring constants ────────────────────────────────────────────────────────
# Rationale: _TDK_BONUS is the dominant term, so a root validated against the
# TDK dictionary almost always outranks an unvalidated one. _SUFFIX_BONUS is
# kept small so that over-segmentation (a chain of many tiny suffixes) cannot
# outscore a valid longer root.

# Bonuses added to a candidate's score.
_TDK_BONUS = 10         # root is present in the TDK dictionary
_DOMAIN_BONUS = 8       # root is present in the domain vocabulary
_SUFFIX_BONUS = 2       # awarded once per recognised suffix
_ROOT_LEN_WEIGHT = 2    # multiplied by the root length (longer roots win)
_WHOLE_WORD_BONUS = 5   # extra when the *unsplit* word itself is a known entry

# Base scores for roots that could not be validated.
_FOREIGN_BASE = 3       # FOREIGN candidate (intentionally low)
_UNKNOWN_BASE = 1       # unrecognised root

# Limits and penalties for suffix stripping.
_SHORT_ROOT_PENALTY = 4  # subtracted when the root is only _MIN_ROOT_LEN long
_MIN_ROOT_LEN = 2        # shortest remainder accepted as a root
_MAX_SUFFIX_DEPTH = 5    # maximum number of suffixes stripped from one word
# ── Known-intact words ───────────────────────────────────────────────────────
# Common Turkish words that *look* like root+suffix but must stay whole.
# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
# (past tense suffix) because both are individually valid.
#
# This set covers inflected forms of very short verb stems (de-, ye-) and
# common discourse particles that happen to end in suffix-like sequences.
#
# Entries are compared against the Turkish-lowercased word, so they must be
# stored as real lowercase Turkish text (e.g. "ş", not a mis-decoded byte
# sequence) or the lookup can never succeed.
KNOWN_INTACT: frozenset[str] = frozenset({
    # Forms of "demek" (to say) — stem "de" is a TDK conjunction,
    # causing false splits like de+di, de+miş, de+se, etc.
    "dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
    "demiş", "demişti", "demiştir",
    "dese", "desem", "desen", "desek",
    "der", "derim", "dersin", "deriz",
    "denir", "dendi", "denmiş",
    # Forms of "yemek" (to eat) — stem "ye" is in TDK
    "yemiş", "yese", "yesem", "yesen",
    "yer", "yerim", "yersin", "yeriz",
    "yenir", "yendi", "yenmiş",
    # Common particles / conjunctions that end in suffix-like sequences
    # (most already protected by the TDK whole-word bonus; double-guarding)
    "diye", "niye", "nice",
})
# ── Punctuation splitting ────────────────────────────────────────────────────
# One apostrophe, either ASCII (') or typographic (U+2019). The capture group
# makes re.split() keep the apostrophe in its output.
_APOSTROPHE_RE = re.compile(r"(['\u2019])")

# Runs of non-word characters anchored at the start / end of a token; used to
# peel surrounding punctuation off the core word.
_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")
def _split_punctuation(word: str) -> list[tuple[str, str]]:
    """Break a raw token into ``(text, type)`` pairs.

    Leading and trailing runs of non-word characters are peeled off the core
    word and emitted one character per ``PUNCT`` pair; whatever is left in the
    middle becomes a single ``WORD`` pair.

    Example: ``'"hello,'`` -> ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``

    Args:
        word: One whitespace-delimited token.

    Returns:
        Pairs in surface order. An empty list for empty input; a single
        ``(word, "PUNCT")`` pair when ``is_punct_token(word)`` is true.
    """
    if not word:
        return []

    # A token that is punctuation in its entirety is kept as one unit.
    if is_punct_token(word):
        return [(word, "PUNCT")]

    pieces: list[tuple[str, str]] = []
    core = word

    # Peel the leading punctuation run, one character per pair.
    leading = _LEADING_PUNCT_RE.match(core)
    if leading is not None:
        pieces.extend((mark, "PUNCT") for mark in leading.group(1))
        core = core[leading.end():]

    # Peel the trailing run; held back so it lands after the WORD pair.
    tail: list[tuple[str, str]] = []
    trailing = _TRAILING_PUNCT_RE.search(core)
    if trailing is not None:
        tail = [(mark, "PUNCT") for mark in trailing.group(1)]
        core = core[: trailing.start()]

    if core:
        pieces.append((core, "WORD"))
    return pieces + tail
# ── Word splitting ───────────────────────────────────────────────────────────
def split_into_words(text: str) -> list[str]:
    """Break *text* into tokens on runs of whitespace.

    Tokens come back exactly as they appear in the input: casing is kept and
    punctuation stays attached to the word it touches. Leading and trailing
    whitespace never produce empty tokens.

    Args:
        text: Arbitrary input text.

    Returns:
        The whitespace-delimited tokens in order; ``[]`` for empty or
        whitespace-only input.
    """
    words = text.split()
    return words
# ── Candidate generation ─────────────────────────────────────────────────────
def _generate_suffix_candidates(
    word_lower: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    depth: int = 0,
) -> list[SegmentationCandidate]:
    """Build ROOT + SUFFIX... candidates by peeling suffixes off the end.

    Every entry of ``SUFFIX_ENTRIES`` is tried in table order (presumably
    longest first -- verify in ``_suffix_table``). For each suffix that
    matches the end of the word:

    * if the remaining stem is in ``tdk`` or ``domain_roots``, a two-token
      candidate (ROOT, SUFFIX) is emitted;
    * independently, the stem itself is searched recursively for further
      suffixes, and each accepted inner chain is extended with this suffix.

    Args:
        word_lower: Lowercased word (or stem, on recursive calls).
        tdk: TDK dictionary entries.
        domain_roots: Domain vocabulary roots.
        depth: Number of suffixes already stripped by outer calls.

    Returns:
        Candidates in discovery order (unsorted). Empty when the word is
        shorter than ``_MIN_ROOT_LEN`` or the depth limit is reached.
    """
    if len(word_lower) < _MIN_ROOT_LEN or depth >= _MAX_SUFFIX_DEPTH:
        return []

    may_recurse = depth < _MAX_SUFFIX_DEPTH - 1
    found: list[SegmentationCandidate] = []

    for surface, label in SUFFIX_ENTRIES:
        if not word_lower.endswith(surface):
            continue

        stem = word_lower[: -len(surface)]
        stem_len = len(stem)
        if stem_len < _MIN_ROOT_LEN:
            continue
        # Very short / ambiguous suffixes need at least a 3-character stem.
        if stem_len < 3 and surface in SHORT_AMBIGUOUS_SUFFIXES:
            continue

        suffix_tok = Token(
            text=surface,
            token_type="SUFFIX",
            metadata={"_suffix_label": label},
        )

        stem_in_tdk = stem in tdk
        stem_in_domain = stem in domain_roots

        if stem_in_tdk or stem_in_domain:
            # TDK takes precedence over the domain vocabulary when both hit.
            lexical_bonus = _TDK_BONUS if stem_in_tdk else _DOMAIN_BONUS
            score = stem_len * _ROOT_LEN_WEIGHT + lexical_bonus
            # Minimum-length roots ("de", "ye", "al", ...) are valid entries
            # but cause many false splits on short words (e.g. "dedi" ->
            # de+di), so they are handicapped against the whole-word reading.
            if stem_len <= _MIN_ROOT_LEN:
                score -= _SHORT_ROOT_PENALTY

            root_meta = (
                {"_tdk": stem_in_tdk, "_domain": stem_in_domain}
                if stem_in_domain
                else {}
            )
            root_tok = Token(text=stem, token_type="ROOT", metadata=root_meta)
            found.append(SegmentationCandidate(
                tokens=[root_tok, suffix_tok],
                score=score + _SUFFIX_BONUS,
                source="suffix_chain",
            ))

        if not may_recurse:
            continue

        # Look for deeper chains inside the stem and append this suffix.
        # NOTE(review): every inner candidate already carries a validated
        # root, so this threshold effectively drops chains whose score does
        # not exceed len(stem) + _UNKNOWN_BASE -- confirm that is the intent.
        threshold = stem_len + _UNKNOWN_BASE
        for inner in _generate_suffix_candidates(
            stem, tdk, domain_roots, depth + 1
        ):
            if inner.score > threshold:
                found.append(SegmentationCandidate(
                    tokens=[*inner.tokens, suffix_tok],
                    score=inner.score + _SUFFIX_BONUS,
                    source="suffix_chain",
                ))

    return found
def generate_candidates(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[SegmentationCandidate]:
    """Produce every plausible segmentation of one word, best first.

    Candidates considered, in this order before sorting:

    1. the whole word as a single ROOT,
    2. every suffix chain from ``_generate_suffix_candidates``,
    3. the whole word as FOREIGN (only when it is not in TDK, not a proper
       noun, has no Turkish-specific characters and is at least 2 long).

    Words listed in ``KNOWN_INTACT`` short-circuit all of the above and
    yield exactly one whole-word candidate.

    Args:
        word: The word without surrounding punctuation.
        tdk: TDK dictionary entries.
        domain_roots: Domain vocabulary roots.
        caps_set: Lowercased forms of words that were originally ALL CAPS.

    Returns:
        Candidates sorted by descending score; equal scores keep the
        generation order listed above.
    """
    lowered = turkish_lower(word)
    all_caps = lowered in caps_set
    turkish_chars = has_turkish_chars(lowered)

    # Fast path: words that look splittable but must stay whole.
    if lowered in KNOWN_INTACT:
        intact_root = Token(
            text=lowered,
            token_type="ROOT",
            metadata={"_caps": True} if all_caps else {},
        )
        return [SegmentationCandidate(
            tokens=[intact_root],
            score=len(lowered) * _ROOT_LEN_WEIGHT + _TDK_BONUS + _WHOLE_WORD_BONUS,
            source="known_intact",
        )]

    in_tdk = lowered in tdk
    in_proper = lowered in load_proper_nouns()
    in_domain = lowered in domain_roots

    # ── Candidate 1: whole word as ROOT ──────────────────────────────────
    if in_tdk or in_proper:
        # The whole-word bonus keeps valid dictionary words such as "dünya"
        # from being over-segmented into "dün" + "ya".
        lexical_bonus = _TDK_BONUS + _WHOLE_WORD_BONUS
    elif in_domain:
        lexical_bonus = _DOMAIN_BONUS + _WHOLE_WORD_BONUS
    else:
        lexical_bonus = _UNKNOWN_BASE

    whole_meta: dict[str, Any] = {}
    if all_caps:
        whole_meta["_caps"] = True
    if in_domain:
        whole_meta["_domain"] = True

    ranked: list[SegmentationCandidate] = [SegmentationCandidate(
        tokens=[Token(text=lowered, token_type="ROOT", metadata=whole_meta)],
        score=len(lowered) * _ROOT_LEN_WEIGHT + lexical_bonus,
        source="whole_word",
    )]

    # ── Candidate 2+: suffix stripping ───────────────────────────────────
    for chain in _generate_suffix_candidates(lowered, tdk, domain_roots):
        # The ALL-CAPS flag belongs on the root (first) token.
        if all_caps and chain.tokens:
            chain.tokens[0].metadata["_caps"] = True
        ranked.append(chain)

    # ── Candidate N: foreign root ────────────────────────────────────────
    if len(lowered) >= 2 and not (in_tdk or in_proper or turkish_chars):
        # Length counts with weight 1 here (not _ROOT_LEN_WEIGHT) so that a
        # suffix chain on a TDK root always outranks FOREIGN.
        # NOTE(review): with the current constants the whole-word ROOT
        # candidate scores at least len*2 + 1 >= len + 3 for len >= 2 and
        # wins the tie at len == 2, so this candidate is never selected --
        # confirm whether that is intended.
        ranked.append(SegmentationCandidate(
            tokens=[Token(
                text=lowered,
                token_type="FOREIGN",
                metadata={"_foreign": True},
            )],
            score=_FOREIGN_BASE + len(lowered),
            source="foreign",
        ))

    # Stable sort: highest score first, generation order within a score.
    ranked.sort(key=lambda cand: cand.score, reverse=True)
    return ranked
# ── Candidate selection ──────────────────────────────────────────────────────
def select_best_candidate(
    candidates: list[SegmentationCandidate],
) -> SegmentationCandidate:
    """Select the best segmentation among candidates.

    The highest-scoring candidate wins regardless of the order in which the
    candidates are supplied. Ties on score are broken by:

    1. fewer tokens (less fragmentation), then
    2. a longer ROOT token, then
    3. earlier position in ``candidates``.

    Args:
        candidates: Candidate segmentations; need not be sorted.

    Returns:
        The winning candidate. For an empty list, a fallback candidate with
        a single empty ROOT token, score ``0.0`` and source ``"fallback"``.
    """
    if not candidates:
        # Fallback: should never happen, but acts as a safety net.
        return SegmentationCandidate(
            tokens=[Token(text="", token_type="ROOT")],
            score=0.0,
            source="fallback",
        )

    # Compute the maximum explicitly instead of trusting candidates[0], so an
    # unsorted list cannot make a lower-scoring candidate win.
    top_score = max(c.score for c in candidates)
    tied = [c for c in candidates if c.score == top_score]

    def _tie_key(c: SegmentationCandidate) -> tuple[int, int]:
        # Candidates without any ROOT token (e.g. FOREIGN) count as length 0.
        root_len = max(
            (len(t.text) for t in c.tokens if t.token_type == "ROOT"),
            default=0,
        )
        return (len(c.tokens), -root_len)

    # min() returns the first minimal element, matching a stable sort.
    return min(tied, key=_tie_key)
# ── Full word segmentation ───────────────────────────────────────────────────
def segment_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Turn one raw word into its list of output token dicts.

    Main per-word entry point. The word is first separated from its
    surrounding punctuation; each resulting piece is then emitted as a PUNCT
    dict, routed through apostrophe handling, or segmented by candidate
    generation and selection. The very first token emitted for the word gets
    a single leading space in its ``"token"`` text.

    Args:
        word: Raw word string (may include surrounding punctuation).
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        List of token dicts ready for inclusion in the output.
    """
    output: list[dict[str, object]] = []
    at_start = True  # True until the first token of this word is emitted

    for piece, kind in _split_punctuation(word):
        if kind == "PUNCT":
            output.append({
                "token": f" {piece}" if at_start else piece,
                "token_type": "PUNCT",
                "morph_pos": 0,
                "_punct": True,
            })
        elif "'" in piece or "\u2019" in piece:
            # Apostrophe-bearing words have their own segmentation rules.
            apo_tokens = _segment_apostrophe_word(
                piece, tdk, domain_roots, caps_set
            )
            if at_start and apo_tokens:
                head = apo_tokens[0]
                head["token"] = f" {head['token'].lstrip()}"
            output.extend(apo_tokens)
        else:
            # Regular word: generate candidates and keep the winner.
            winner = select_best_candidate(
                generate_candidates(piece, tdk, domain_roots, caps_set)
            )
            for pos, tok in enumerate(winner.tokens):
                entry = tok.to_dict()
                if pos == 0 and at_start:
                    entry["token"] = f" {entry['token'].lstrip()}"
                # morph_pos is the token's index within the word: 0 for the
                # root, 1..n for the suffixes that follow.
                entry["morph_pos"] = pos
                output.append(entry)
        at_start = False

    return output
def _segment_apostrophe_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Segment a word containing an apostrophe.

    The word is split at its first apostrophe, ASCII ``'`` or typographic
    U+2019, whichever comes earliest. The text before it is the base and the
    text after it is treated as a single suffix.

    * Turkish base (per ``is_turkish_base``): ROOT, PUNCT, then SUFFIX. The
      PUNCT token is always the ASCII apostrophe, even if the input used
      U+2019.
    * Otherwise: FOREIGN, then SUFFIX (no apostrophe token is emitted).

    The SUFFIX token is omitted when nothing follows the apostrophe. Its
    ``_suffix_label`` is looked up in ``SUFFIX_MAP`` under the
    Turkish-lowercased suffix, defaulting to ``"-SFX"``.

    Args:
        word: Word text without surrounding punctuation.
        tdk: TDK dictionary set (used only by the no-apostrophe fallback).
        domain_roots: Domain vocabulary set (fallback only).
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        Token dicts in surface order.
    """
    from .apostrophe import is_turkish_base  # avoid circular at module level

    # Search for both apostrophe kinds at once so the split happens at the
    # earliest one rather than preferring the ASCII form.
    apo_match = _APOSTROPHE_RE.search(word)
    if apo_match is None:
        # No apostrophe (shouldn't happen) -- treat as a regular word and,
        # as segment_word does, number the tokens by position.
        best = select_best_candidate(
            generate_candidates(word, tdk, domain_roots, caps_set)
        )
        plain: list[dict[str, object]] = []
        for pos, tok in enumerate(best.tokens):
            entry = tok.to_dict()
            entry["morph_pos"] = pos
            plain.append(entry)
        return plain

    apo_pos = apo_match.start()
    base = word[:apo_pos]
    suffix = word[apo_pos + 1:]

    tokens: list[dict[str, object]]
    if is_turkish_base(base):
        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
        is_caps = turkish_lower(base) in caps_set
        tokens = [
            {
                "token": base, "token_type": "ROOT", "morph_pos": 0,
                **({"_caps": True} if is_caps else {}),
            },
            {
                "token": "'", "token_type": "PUNCT", "morph_pos": 0,
                "_punct": True,
            },
        ]
    else:
        # Foreign word: FOREIGN + SUFFIX
        tokens = [
            {
                "token": base, "token_type": "FOREIGN", "morph_pos": 0,
                "_foreign": True,
            },
        ]

    if suffix:
        # Use Turkish-aware lowercasing: plain str.lower() maps "I" to "i"
        # and "İ" to "i" + combining dot, neither of which is the Turkish
        # lowercase form, so upper-case suffixes would miss their entry.
        suffix_label = SUFFIX_MAP.get(turkish_lower(suffix), "-SFX")
        tokens.append({
            "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
            "_apo_suffix": True, "_suffix_label": suffix_label,
        })
    return tokens