Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7
"""Word-level segmentation with candidate generation and selection.
This is the core of the tokenizer. For each word it:
1. Generates multiple segmentation candidates (whole-word ROOT, suffix
chains, foreign root, etc.)
2. Scores each candidate deterministically
3. Selects the highest-scoring segmentation
The scoring rules are transparent and tunable:
- TDK root match gives a large bonus
- Domain vocabulary match gives a moderate bonus
- Longer roots are preferred over shorter ones
- Each recognised suffix adds a small bonus
- Unknown / unvalidated roots get a low base score
"""
from __future__ import annotations
import re
from typing import Any
from ._domain_vocab import ALL_DOMAIN_ROOTS
from ._suffix_table import (
SHORT_AMBIGUOUS_SUFFIXES,
SUFFIX_ENTRIES,
SUFFIX_MAP,
)
from .normalization import has_turkish_chars, turkish_lower
from .resources import load_proper_nouns, load_tdk_words
from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token
# ── Scoring constants ────────────────────────────────────────────────────────
# Why these values: TDK_BONUS dominates so that a TDK-validated root almost
# always wins over an unvalidated one. SUFFIX_BONUS is small enough that
# over-segmentation (many tiny suffixes) doesn't beat a valid longer root.
# Added when a root (stripped or whole-word) is found in the TDK dictionary;
# also granted to whole-word proper-noun matches and known-intact words.
_TDK_BONUS = 10
# Added when a root (stripped or whole-word) is found in the domain vocabulary
# but not in TDK.
_DOMAIN_BONUS = 8
# Added once for every recognised suffix in a suffix chain.
_SUFFIX_BONUS = 2
# Per-character bonus for root length (prefer longer roots).
_ROOT_LEN_WEIGHT = 2
# Extra bonus when the *entire* unsplit word is a known entry (TDK, proper
# noun, domain vocabulary or known-intact list).
_WHOLE_WORD_BONUS = 5
# Base score for a FOREIGN candidate (intentionally low); the word length is
# added to it with weight 1 rather than _ROOT_LEN_WEIGHT.
_FOREIGN_BASE = 3
# Base score for a whole word that is found in no lexicon.
_UNKNOWN_BASE = 1
# Subtracted from a stripped root's score when the root is only
# _MIN_ROOT_LEN characters long.
_SHORT_ROOT_PENALTY = 4
# Minimum root length for suffix stripping.
_MIN_ROOT_LEN = 2
# Maximum recursion depth of suffix stripping, i.e. the maximum number of
# suffixes that can be stripped from one word.
_MAX_SUFFIX_DEPTH = 5
# ── Known-intact words ───────────────────────────────────────────────────────
# Common Turkish words that *look* like root+suffix but must stay whole.
# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
# (past tense suffix) because both are individually valid.
#
# This set covers inflected forms of very short verb stems (de-, ye-) and
# common discourse particles that happen to end in suffix-like sequences.
KNOWN_INTACT: frozenset[str] = frozenset({
    # Forms of "demek" (to say) — stem "de" is a TDK conjunction,
    # causing false splits like de+di, de+miş, de+se, etc.
    "dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
    "demiş", "demişti", "demiştir",
    "dese", "desem", "desen", "desek",
    "der", "derim", "dersin", "deriz",
    "denir", "dendi", "denmiş",
    # Forms of "yemek" (to eat) — stem "ye" is in TDK
    "yemiş", "yese", "yesem", "yesen",
    "yer", "yerim", "yersin", "yeriz",
    "yenir", "yendi", "yenmiş",
    # Common particles / conjunctions that end in suffix-like sequences
    # (most already protected by TDK WHOLE_WORD_BONUS, but double-guarding)
    "diye", "niye", "nice",
})
# ── Punctuation splitting ────────────────────────────────────────────────────
# Matches a single apostrophe, either ASCII (') or typographic (U+2019).
# The capture group means re.split() would keep the apostrophe as its own
# item.  NOTE(review): not referenced by the code visible in this section —
# confirm whether it is still used elsewhere.
_APOSTROPHE_RE = re.compile(r"(['\u2019])")
# Leading / trailing runs of non-word characters.  For str patterns ``\w`` is
# Unicode-aware, so Turkish letters, digits and the underscore all count as
# word characters and are never stripped as punctuation.
_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")
def _split_punctuation(word: str) -> list[tuple[str, str]]:
    """Break a raw token into ``(text, kind)`` pairs.

    ``kind`` is ``"PUNCT"`` or ``"WORD"``.  A token that ``is_punct_token``
    recognises is returned as a single PUNCT pair.  Otherwise every leading
    and trailing non-word character becomes its own PUNCT pair, and whatever
    remains in the middle (if anything) becomes one WORD pair.

    For example: ``'"hello,'`` → ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``

    Returns an empty list for an empty string.
    """
    if not word:
        return []
    if is_punct_token(word):
        return [(word, "PUNCT")]

    # Peel the leading run of non-word characters.
    head = _LEADING_PUNCT_RE.match(word)
    leading_chars = head.group(1) if head else ""
    core = word[head.end():] if head else word

    # Peel the trailing run from what is left.
    tail = _TRAILING_PUNCT_RE.search(core)
    trailing_chars = tail.group(1) if tail else ""
    if tail:
        core = core[:tail.start()]

    pieces: list[tuple[str, str]] = [(ch, "PUNCT") for ch in leading_chars]
    if core:
        pieces.append((core, "WORD"))
    pieces.extend((ch, "PUNCT") for ch in trailing_chars)
    return pieces
# ── Word splitting ───────────────────────────────────────────────────────────
def split_into_words(text: str) -> list[str]:
    """Return the whitespace-separated chunks of *text*.

    Runs of any whitespace act as a single separator and leading/trailing
    whitespace is ignored, so empty or blank input yields ``[]``.  Casing and
    any punctuation attached to a chunk are left untouched.
    """
    chunks = text.split()
    return chunks
# ── Candidate generation ────────────────────────────────────────────────────
def _generate_suffix_candidates(
    word_lower: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    depth: int = 0,
) -> list[SegmentationCandidate]:
    """Enumerate ROOT + SUFFIX... segmentations by peeling suffixes off the end.

    Every entry of ``SUFFIX_ENTRIES`` that ``word_lower`` ends with is tried
    in table order (presumably longest first — confirm in ``_suffix_table``).
    For each usable split:

    * if the remaining stem is in ``tdk`` or ``domain_roots`` a two-token
      candidate (ROOT, SUFFIX) is emitted;
    * independently of that, the stem is searched recursively for further
      suffixes and each sufficiently-scored inner chain is extended with the
      current suffix.

    Args:
        word_lower: Lower-cased word (or stem, when recursing).
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        depth: Current recursion depth; callers normally leave it at 0.

    Returns:
        Unsorted list of candidates, all with ``source="suffix_chain"``.
        Empty when the depth limit is reached or the word is too short.
    """
    if depth >= _MAX_SUFFIX_DEPTH or len(word_lower) < _MIN_ROOT_LEN:
        return []

    may_recurse = depth < _MAX_SUFFIX_DEPTH - 1
    found: list[SegmentationCandidate] = []

    for surface, label in SUFFIX_ENTRIES:
        if not word_lower.endswith(surface):
            continue

        stem = word_lower[: -len(surface)]
        stem_len = len(stem)
        if stem_len < _MIN_ROOT_LEN:
            continue
        # Very short / ambiguous suffixes need at least a 3-char stem.
        if stem_len < 3 and surface in SHORT_AMBIGUOUS_SUFFIXES:
            continue

        suffix_tok = Token(
            text=surface,
            token_type="SUFFIX",
            metadata={"_suffix_label": label},
        )

        stem_in_tdk = stem in tdk
        stem_in_domain = stem in domain_roots
        if stem_in_tdk or stem_in_domain:
            # TDK takes precedence over the domain vocabulary for the bonus.
            lexicon_bonus = _TDK_BONUS if stem_in_tdk else _DOMAIN_BONUS
            score = stem_len * _ROOT_LEN_WEIGHT + lexicon_bonus
            # Penalise very short roots: 2-char roots like "de", "ye", "al"
            # are valid TDK entries but produce many false splits on short
            # words (e.g. "dedi" → de+di), so make it harder for them to
            # beat the whole-word hypothesis.
            if stem_len <= _MIN_ROOT_LEN:
                score -= _SHORT_ROOT_PENALTY
            score += _SUFFIX_BONUS

            # Root metadata is only populated for domain-vocabulary stems.
            root_meta = (
                {"_tdk": stem_in_tdk, "_domain": stem_in_domain}
                if stem_in_domain
                else {}
            )
            found.append(SegmentationCandidate(
                tokens=[
                    Token(text=stem, token_type="ROOT", metadata=root_meta),
                    suffix_tok,
                ],
                score=score,
                source="suffix_chain",
            ))

        if not may_recurse:
            continue

        # Keep only inner chains whose score clears a stem-length-based floor.
        floor = stem_len + _UNKNOWN_BASE
        found.extend(
            SegmentationCandidate(
                tokens=inner.tokens + [suffix_tok],
                score=inner.score + _SUFFIX_BONUS,
                source="suffix_chain",
            )
            for inner in _generate_suffix_candidates(
                stem, tdk, domain_roots, depth + 1
            )
            if inner.score > floor
        )

    return found
def generate_candidates(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[SegmentationCandidate]:
    """Build every segmentation hypothesis for one punctuation-free word.

    The word is lower-cased with ``turkish_lower`` first.  Hypotheses are:

    * a single ``known_intact`` ROOT if the word is in ``KNOWN_INTACT``
      (returned immediately, nothing else is generated);
    * the whole word as one ROOT (``whole_word``);
    * every suffix chain found by ``_generate_suffix_candidates``;
    * a FOREIGN token, when the word is at least two characters long, has no
      Turkish-specific characters and is in neither TDK nor the proper-noun
      list.

    Args:
        word: The word to segment, in its original casing.
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Lower-cased words to be flagged with ``_caps`` metadata.

    Returns:
        Candidates ordered by descending score; equal scores keep their
        generation order.
    """
    wl = turkish_lower(word)
    is_caps = wl in caps_set
    is_tr_chars = has_turkish_chars(wl)
    length_score = len(wl) * _ROOT_LEN_WEIGHT

    # ── Fast path: known-intact words bypass candidate generation ────────
    if wl in KNOWN_INTACT:
        intact_meta: dict[str, Any] = {"_caps": True} if is_caps else {}
        intact_root = Token(text=wl, token_type="ROOT", metadata=intact_meta)
        return [SegmentationCandidate(
            tokens=[intact_root],
            score=length_score + _TDK_BONUS + _WHOLE_WORD_BONUS,
            source="known_intact",
        )]

    # ── Candidate 1: whole word as ROOT ──────────────────────────────────
    in_tdk = wl in tdk
    in_proper = wl in load_proper_nouns()
    in_domain = wl in domain_roots

    if in_tdk or in_proper:
        # The whole-word bonus keeps valid dictionary words such as "dünya"
        # from being over-segmented into "dün" + "ya".
        lexicon_score = _TDK_BONUS + _WHOLE_WORD_BONUS
    elif in_domain:
        lexicon_score = _DOMAIN_BONUS + _WHOLE_WORD_BONUS
    else:
        lexicon_score = _UNKNOWN_BASE

    whole_meta: dict[str, Any] = {}
    if is_caps:
        whole_meta["_caps"] = True
    if in_domain:
        whole_meta["_domain"] = True

    candidates: list[SegmentationCandidate] = [
        SegmentationCandidate(
            tokens=[Token(text=wl, token_type="ROOT", metadata=whole_meta)],
            score=length_score + lexicon_score,
            source="whole_word",
        )
    ]

    # ── Candidate 2+: suffix stripping ───────────────────────────────────
    for chain in _generate_suffix_candidates(wl, tdk, domain_roots):
        # The first token of a chain is its root; carry the caps flag there.
        if is_caps and chain.tokens:
            chain.tokens[0].metadata["_caps"] = True
        candidates.append(chain)

    # ── Candidate N: foreign root ────────────────────────────────────────
    looks_foreign = not (in_tdk or in_proper or is_tr_chars) and len(wl) >= 2
    if looks_foreign:
        # Length is weighted by 1 (not _ROOT_LEN_WEIGHT) so that valid suffix
        # chains with a TDK root always beat FOREIGN.
        # NOTE(review): with the current constants this score (len + 3) is
        # never strictly above the whole-word score (at least 2*len + 1) for
        # len >= 2 — confirm that is intended.
        candidates.append(SegmentationCandidate(
            tokens=[Token(
                text=wl,
                token_type="FOREIGN",
                metadata={"_foreign": True},
            )],
            score=_FOREIGN_BASE + len(wl),
            source="foreign",
        ))

    # Stable sort: highest score first, ties stay in generation order.
    return sorted(candidates, key=lambda cand: cand.score, reverse=True)
# ── Candidate selection ──────────────────────────────────────────────────────
def select_best_candidate(
    candidates: list[SegmentationCandidate],
) -> SegmentationCandidate:
    """Select the best segmentation among candidates.

    Picks the highest-scoring candidate.  The input does not need to be
    sorted.  Ties on score are broken by, in order:

    1. Fewer tokens (less fragmentation)
    2. Longer ROOT token (a candidate without any ROOT token counts as
       root length 0)
    3. Position in ``candidates`` (earlier wins)

    Args:
        candidates: Segmentation hypotheses for one word.

    Returns:
        The winning candidate.  For an empty list, a ``fallback`` candidate
        holding a single empty ROOT token with score ``0.0`` is returned.
    """
    if not candidates:
        # Fallback: should never happen, but safety net
        return SegmentationCandidate(
            tokens=[Token(text="", token_type="ROOT")],
            score=0.0,
            source="fallback",
        )

    # Compute the maximum explicitly instead of trusting candidates[0], so an
    # unsorted list still yields the highest-scoring candidate.
    best_score = max(c.score for c in candidates)
    tied = [c for c in candidates if c.score == best_score]
    if len(tied) == 1:
        return tied[0]

    def _tie_key(c: SegmentationCandidate) -> tuple[int, int]:
        root_len = max(
            (len(t.text) for t in c.tokens if t.token_type == "ROOT"),
            default=0,
        )
        # Smaller key wins: fewer tokens first, then the longer root.
        return (len(c.tokens), -root_len)

    # min() returns the first minimal element, matching a stable sort's [0].
    return min(tied, key=_tie_key)
# ── Full word segmentation ───────────────────────────────────────────────────
def segment_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Turn one whitespace-delimited token into a list of token dicts.

    The raw token is first split into punctuation and word parts.  Each
    punctuation character becomes a PUNCT dict; a word part containing an
    apostrophe is delegated to ``_segment_apostrophe_word``; any other word
    part goes through candidate generation and selection.  The very first
    dict emitted for the raw token gets a single leading space on its
    ``"token"`` text.

    Args:
        word: Raw word string (may include surrounding punctuation).
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        List of token dicts ready for inclusion in the output.
    """
    pieces = _split_punctuation(word)
    out: list[dict[str, object]] = []
    at_start = True  # True until the first dict for this raw token is emitted

    for text, kind in pieces:
        if kind == "PUNCT":
            out.append({
                "token": f" {text}" if at_start else f"{text}",
                "token_type": "PUNCT",
                "morph_pos": 0,
                "_punct": True,
            })
            at_start = False
            continue

        # kind == "WORD"
        if "'" in text or "\u2019" in text:
            # Apostrophe inside the word: handled by the dedicated splitter.
            apo_tokens = _segment_apostrophe_word(text, tdk, domain_roots, caps_set)
            if at_start and apo_tokens:
                head = apo_tokens[0]
                head["token"] = f" {head['token'].lstrip()}"
            out.extend(apo_tokens)
            at_start = False
            continue

        # Standard word segmentation via candidate generation
        winner = select_best_candidate(
            generate_candidates(text, tdk, domain_roots, caps_set)
        )
        for pos, token in enumerate(winner.tokens):
            entry = token.to_dict()
            if pos == 0 and at_start:
                entry["token"] = f" {entry['token'].lstrip()}"
            # morph_pos is simply the token's index within the word.
            entry["morph_pos"] = pos
            out.append(entry)
        at_start = False

    return out
def _segment_apostrophe_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Segment a word containing an apostrophe.

    The word is split at its first apostrophe, ASCII ``'`` or typographic
    U+2019, whichever comes first.  What follows the apostrophe is emitted
    as one SUFFIX token (omitted when empty) whose label is looked up in
    ``SUFFIX_MAP`` using Turkish-aware lower-casing, with ``"-SFX"`` as the
    default.

    * If ``is_turkish_base`` accepts the base, the result is
      ROOT + PUNCT(``'``) + SUFFIX.  The PUNCT token is always the ASCII
      apostrophe, whichever apostrophe the input used.
    * Otherwise the result is FOREIGN + SUFFIX with no apostrophe token.

    The base keeps its original casing in the output.

    Args:
        word: Word text without surrounding punctuation.
        tdk: TDK dictionary set (used only by the no-apostrophe fallback).
        domain_roots: Domain vocabulary set (fallback only).
        caps_set: Set of lower-cased words to flag with ``_caps``.

    Returns:
        List of token dicts, each carrying ``morph_pos``.
    """
    from .apostrophe import is_turkish_base  # avoid circular at module level

    # Earliest apostrophe of either kind, so a word containing both is split
    # at the one that actually comes first.
    hits = [
        pos for pos in (word.find("'"), word.find("\u2019")) if pos != -1
    ]
    if not hits:
        # No apostrophe found (shouldn't happen) — treat as regular word,
        # numbering morph_pos the same way segment_word does.
        best = select_best_candidate(
            generate_candidates(word, tdk, domain_roots, caps_set)
        )
        plain: list[dict[str, object]] = []
        for pos, tok in enumerate(best.tokens):
            entry = tok.to_dict()
            entry["morph_pos"] = pos
            plain.append(entry)
        return plain

    apo_pos = min(hits)
    base = word[:apo_pos]
    suffix = word[apo_pos + 1:]
    is_caps = turkish_lower(base) in caps_set

    tokens: list[dict[str, object]]
    if is_turkish_base(base):
        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
        root: dict[str, object] = {
            "token": base, "token_type": "ROOT", "morph_pos": 0,
        }
        if is_caps:
            root["_caps"] = True
        tokens = [
            root,
            {
                "token": "'", "token_type": "PUNCT", "morph_pos": 0,
                "_punct": True,
            },
        ]
    else:
        # Foreign word: FOREIGN + SUFFIX
        tokens = [
            {
                "token": base, "token_type": "FOREIGN", "morph_pos": 0,
                "_foreign": True,
            },
        ]

    if suffix:
        # str.lower() maps "İ" to "i" + combining dot and "I" to "i", which
        # breaks lookups for upper-case Turkish suffixes; use the same
        # Turkish-aware lower-casing as the rest of the module.
        suffix_label = SUFFIX_MAP.get(turkish_lower(suffix), "-SFX")
        tokens.append({
            "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
            "_apo_suffix": True, "_suffix_label": suffix_label,
        })
    return tokens