Ethosoft

Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper

edec8b7 about 1 month ago

17.7 kB

	"""Word-level segmentation with candidate generation and selection.

	This is the core of the tokenizer. For each word it:
	1. Generates multiple segmentation candidates (whole-word ROOT, suffix
	chains, foreign root, etc.)
	2. Scores each candidate deterministically
	3. Selects the highest-scoring segmentation

	The scoring rules are transparent and tunable:
	- TDK root match gives a large bonus
	- Domain vocabulary match gives a moderate bonus
	- Longer roots are preferred over shorter ones
	- Each recognised suffix adds a small bonus
	- Unknown / unvalidated roots get a low base score
	"""

	from __future__ import annotations

	import re
	from typing import Any

	from ._domain_vocab import ALL_DOMAIN_ROOTS
	from ._suffix_table import (
	SHORT_AMBIGUOUS_SUFFIXES,
	SUFFIX_ENTRIES,
	SUFFIX_MAP,
	)
	from .normalization import has_turkish_chars, turkish_lower
	from .resources import load_proper_nouns, load_tdk_words
	from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token

	# ── Scoring constants ────────────────────────────────────────────────────────
	# Why these values: TDK_BONUS dominates so that a TDK-validated root almost
	# always wins over an unvalidated one. SUFFIX_BONUS is small enough that
	# over-segmentation (many tiny suffixes) doesn't beat a valid longer root.
	#
	# Every constant below is an additive term (or a limit) used by the candidate
	# generators in this module; a candidate's score is the sum of the terms that
	# apply to it.

	_TDK_BONUS = 10 # Root found in TDK dictionary (also applied to proper-noun and known-intact whole words)
	_DOMAIN_BONUS = 8 # Root found in domain vocabulary (only when the TDK bonus did not apply)
	_SUFFIX_BONUS = 2 # Each recognised suffix (added once per stripped suffix)
	_ROOT_LEN_WEIGHT = 2 # Per-character bonus for root length (prefer longer roots)
	_WHOLE_WORD_BONUS = 5 # Extra bonus when the entire unsplit word is in TDK, the proper-noun list or the domain vocabulary
	_FOREIGN_BASE = 3 # Base score for foreign root (intentionally low); one point per character is added on top
	_UNKNOWN_BASE = 1 # Base score for unrecognised root
	_SHORT_ROOT_PENALTY = 4 # Penalty when root is exactly _MIN_ROOT_LEN chars
	_MIN_ROOT_LEN = 2 # Minimum root length for suffix stripping (shorter remainders are skipped)
	_MAX_SUFFIX_DEPTH = 5 # Maximum number of suffixes to strip (recursion depth limit)

	# ── Known-intact words ───────────────────────────────────────────────────────
	# Common Turkish words that look like root+suffix but must stay whole.
	# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
	# (past tense suffix) because both are individually valid.
	#
	# This set covers inflected forms of very short verb stems (de-, ye-) and
	# common discourse particles that happen to end in suffix-like sequences.
	#
	# Membership is tested against the Turkish-lowercased word, so every entry
	# must be lowercase. A match short-circuits candidate generation: the word
	# is returned as a single ROOT candidate and no suffix stripping is tried.

	KNOWN_INTACT: frozenset[str] = frozenset({
	# Forms of "demek" (to say) — stem "de" is a TDK conjunction,
	# causing false splits like de+di, de+miş, de+se, etc.
	"dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
	"demiş", "demişti", "demiştir",
	"dese", "desem", "desen", "desek",
	"der", "derim", "dersin", "deriz",
	"denir", "dendi", "denmiş",
	# Forms of "yemek" (to eat) — stem "ye" is in TDK
	"yemiş", "yese", "yesem", "yesen",
	"yer", "yerim", "yersin", "yeriz",
	"yenir", "yendi", "yenmiş",
	# Common particles / conjunctions that end in suffix-like sequences
	# (most already protected by TDK WHOLE_WORD_BONUS, but double-guarding)
	"diye", "niye", "nice",
	})


	# ── Punctuation splitting ────────────────────────────────────────────────────

	# Regex to split a word at apostrophes (keeping the apostrophe).
	# Matches the ASCII apostrophe and the right single quotation mark (U+2019).
	# NOTE(review): not referenced anywhere in the visible part of this module —
	# confirm whether other code still uses it.
	_APOSTROPHE_RE = re.compile(r"(['\u2019])")

	# Regexes to split leading/trailing punctuation from a word. Each captures
	# the run of non-word characters at the start / end of the string. For str
	# patterns ``\w`` is Unicode-aware, so Turkish letters, digits and the
	# underscore all count as word characters and are never peeled off.
	_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
	_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")


	def _split_punctuation(word: str) -> list[tuple[str, str]]:
	    """Break one raw token into ``(text, type)`` pairs.

	    Punctuation at the start and at the end of the token is peeled off one
	    character at a time; whatever is left in the middle becomes a single
	    ``WORD`` entry. Punctuation inside the core (e.g. an apostrophe) is
	    left in place.
	    For example: ``'"hello,'`` → ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``

	    Returns an empty list for an empty token, and a single ``PUNCT`` pair
	    when ``is_punct_token`` says the whole token is punctuation.
	    """
	    if not word:
	        return []

	    # A token made up only of punctuation is kept as one piece.
	    if is_punct_token(word):
	        return [(word, "PUNCT")]

	    core = word
	    head = ""
	    tail = ""

	    # Leading run of non-word characters.
	    leading = _LEADING_PUNCT_RE.match(core)
	    if leading is not None:
	        head = leading.group(1)
	        core = core[leading.end():]

	    # Trailing run, searched in what remains after the leading run is gone.
	    trailing = _TRAILING_PUNCT_RE.search(core)
	    if trailing is not None:
	        tail = trailing.group(1)
	        core = core[:trailing.start()]

	    pieces: list[tuple[str, str]] = [(ch, "PUNCT") for ch in head]
	    if core:
	        pieces.append((core, "WORD"))
	    pieces.extend((ch, "PUNCT") for ch in tail)
	    return pieces


	# ── Word splitting ───────────────────────────────────────────────────────────

	def split_into_words(text: str) -> list[str]:
	    """Return the whitespace-delimited tokens of *text*.

	    Casing and any punctuation attached to a token are left untouched.
	    """
	    # Argument-less split(): any run of whitespace is one separator and
	    # leading/trailing whitespace never produces empty tokens.
	    words: list[str] = text.split()
	    return words


	# ── Candidate generation ────────────────────────────────────────────────────

	def _generate_suffix_candidates(
	    word_lower: str,
	    tdk: set[str],
	    domain_roots: frozenset[str],
	    depth: int = 0,
	) -> list[SegmentationCandidate]:
	    """Build root+suffix-chain candidates by peeling suffixes off the end.

	    Every entry of ``SUFFIX_ENTRIES`` that the word ends with is tried, in
	    table order. When the remaining stem is in the TDK set or the domain
	    vocabulary, a ``[ROOT, SUFFIX]`` candidate is emitted. Independently of
	    that, the stem itself is searched recursively (bounded by
	    ``_MAX_SUFFIX_DEPTH``) and every deeper candidate that clears the score
	    threshold is extended with the current suffix.

	    Candidates are returned in discovery order, not sorted by score.
	    """
	    if depth >= _MAX_SUFFIX_DEPTH or len(word_lower) < _MIN_ROOT_LEN:
	        return []

	    may_recurse = depth < _MAX_SUFFIX_DEPTH - 1
	    found: list[SegmentationCandidate] = []

	    for surface, label in SUFFIX_ENTRIES:
	        if not word_lower.endswith(surface):
	            continue

	        stem = word_lower[: -len(surface)]
	        stem_len = len(stem)
	        if stem_len < _MIN_ROOT_LEN:
	            continue

	        # Very short / ambiguous suffixes need at least a 3-character stem.
	        if stem_len < 3 and surface in SHORT_AMBIGUOUS_SUFFIXES:
	            continue

	        # One suffix token is shared by every candidate built in this pass.
	        suffix_tok = Token(
	            text=surface,
	            token_type="SUFFIX",
	            metadata={"_suffix_label": label},
	        )

	        in_tdk = stem in tdk
	        in_domain = stem in domain_roots

	        if in_tdk or in_domain:
	            # TDK membership takes precedence over the domain vocabulary.
	            lexicon_bonus = _TDK_BONUS if in_tdk else _DOMAIN_BONUS
	            stem_score = stem_len * _ROOT_LEN_WEIGHT + lexicon_bonus

	            # 2-char roots such as "de", "ye", "al" are valid TDK entries
	            # but cause many false splits on short words ("dedi" → de+di),
	            # so they are handicapped against the whole-word hypothesis.
	            if stem_len <= _MIN_ROOT_LEN:
	                stem_score -= _SHORT_ROOT_PENALTY

	            stem_meta = (
	                {"_tdk": in_tdk, "_domain": in_domain} if in_domain else {}
	            )
	            found.append(SegmentationCandidate(
	                tokens=[
	                    Token(text=stem, token_type="ROOT", metadata=stem_meta),
	                    suffix_tok,
	                ],
	                score=stem_score + _SUFFIX_BONUS,
	                source="suffix_chain",
	            ))

	        if not may_recurse:
	            continue

	        # Deeper chains: strip further suffixes from the stem and keep only
	        # results whose score exceeds the stem-length-based threshold.
	        threshold = stem_len + _UNKNOWN_BASE
	        found.extend(
	            SegmentationCandidate(
	                tokens=deeper.tokens + [suffix_tok],
	                score=deeper.score + _SUFFIX_BONUS,
	                source="suffix_chain",
	            )
	            for deeper in _generate_suffix_candidates(
	                stem, tdk, domain_roots, depth + 1
	            )
	            if deeper.score > threshold
	        )

	    return found


	def generate_candidates(
	    word: str,
	    tdk: set[str],
	    domain_roots: frozenset[str],
	    caps_set: frozenset[str],
	) -> list[SegmentationCandidate]:
	    """Produce every plausible segmentation of one word, best first.

	    The word is Turkish-lowercased, then up to three kinds of candidate are
	    collected: the whole word as a single ROOT, every root+suffix chain
	    found by ``_generate_suffix_candidates``, and (for words without
	    Turkish-specific characters that are in neither TDK nor the proper-noun
	    list) a single FOREIGN token. Words listed in ``KNOWN_INTACT`` skip all
	    of that and yield exactly one ROOT candidate.

	    Returns the candidates sorted by score, highest first; equal scores
	    keep their generation order.
	    """
	    lowered = turkish_lower(word)
	    all_caps = lowered in caps_set
	    turkish_letters = has_turkish_chars(lowered)

	    # ── Fast path: known-intact words are never split ────────────────────
	    if lowered in KNOWN_INTACT:
	        intact_meta: dict[str, Any] = {"_caps": True} if all_caps else {}
	        intact = SegmentationCandidate(
	            tokens=[Token(text=lowered, token_type="ROOT", metadata=intact_meta)],
	            score=len(lowered) * _ROOT_LEN_WEIGHT + _TDK_BONUS + _WHOLE_WORD_BONUS,
	            source="known_intact",
	        )
	        return [intact]

	    # ── Whole word as ROOT ───────────────────────────────────────────────
	    in_tdk = lowered in tdk
	    in_proper = lowered in load_proper_nouns()
	    in_domain = lowered in domain_roots

	    if in_tdk or in_proper:
	        # The extra whole-word bonus keeps valid dictionary words such as
	        # "dünya" from being over-segmented into "dün" + "ya".
	        lexicon_bonus = _TDK_BONUS + _WHOLE_WORD_BONUS
	    elif in_domain:
	        lexicon_bonus = _DOMAIN_BONUS + _WHOLE_WORD_BONUS
	    else:
	        lexicon_bonus = _UNKNOWN_BASE

	    whole_meta: dict[str, Any] = {}
	    if all_caps:
	        whole_meta["_caps"] = True
	    if in_domain:
	        whole_meta["_domain"] = True

	    collected: list[SegmentationCandidate] = [
	        SegmentationCandidate(
	            tokens=[Token(text=lowered, token_type="ROOT", metadata=whole_meta)],
	            score=len(lowered) * _ROOT_LEN_WEIGHT + lexicon_bonus,
	            source="whole_word",
	        )
	    ]

	    # ── Root + suffix chains ─────────────────────────────────────────────
	    for chain in _generate_suffix_candidates(lowered, tdk, domain_roots):
	        # The caps flag lives on the first (root) token of the chain.
	        if all_caps and chain.tokens:
	            chain.tokens[0].metadata["_caps"] = True
	        collected.append(chain)

	    # ── Foreign root ─────────────────────────────────────────────────────
	    known_or_turkish = in_tdk or in_proper or turkish_letters
	    if not known_or_turkish and len(lowered) >= 2:
	        # Flat weight of 1 per character (not _ROOT_LEN_WEIGHT) so that a
	        # valid suffix chain on a TDK root always outranks FOREIGN.
	        collected.append(SegmentationCandidate(
	            tokens=[Token(
	                text=lowered, token_type="FOREIGN",
	                metadata={"_foreign": True},
	            )],
	            score=_FOREIGN_BASE + len(lowered),
	            source="foreign",
	        ))

	    # Stable sort, highest score first.
	    return sorted(collected, key=lambda cand: cand.score, reverse=True)


	# ── Candidate selection ──────────────────────────────────────────────────────

	def select_best_candidate(
	    candidates: list[SegmentationCandidate],
	) -> SegmentationCandidate:
	    """Select the best segmentation among candidates.

	    Picks the highest-scoring candidate. The input does not have to be
	    sorted: the maximum score is found by scanning the whole list. Ties on
	    score are broken by:
	    1. Fewer tokens (less fragmentation)
	    2. Longer root token
	    Candidates that are still tied keep their input order (the earliest one
	    wins).

	    Args:
	        candidates: Candidate segmentations for one word, in any order.

	    Returns:
	        The winning candidate. For an empty list, a placeholder candidate
	        holding a single empty ROOT token with score 0.0 and source
	        ``"fallback"``.
	    """
	    if not candidates:
	        # Fallback: should never happen, but safety net
	        return SegmentationCandidate(
	            tokens=[Token(text="", token_type="ROOT")],
	            score=0.0,
	            source="fallback",
	        )

	    # Scan for the maximum rather than trusting candidates[0]: callers are
	    # not required to pass a list that is already sorted by score.
	    best_score = max(c.score for c in candidates)
	    tied = [c for c in candidates if c.score == best_score]

	    # Tie-breaking: fewer tokens first; then longer root
	    def _tie_key(c: SegmentationCandidate) -> tuple[int, int]:
	        root_len = max(
	            (len(t.text) for t in c.tokens if t.token_type == "ROOT"),
	            default=0,
	        )
	        return (len(c.tokens), -root_len)

	    # min() returns the first minimal element, so fully tied candidates
	    # resolve to the one that appeared first in the input.
	    return min(tied, key=_tie_key)


	# ── Full word segmentation ───────────────────────────────────────────────────

	def segment_word(
	    word: str,
	    tdk: set[str],
	    domain_roots: frozenset[str],
	    caps_set: frozenset[str],
	) -> list[dict[str, object]]:
	    """Turn one raw word into its list of output token dicts.

	    Per-word entry point: surrounding punctuation is split off first, then
	    each remaining word part is segmented either by the apostrophe handler
	    (when it contains ``'`` or U+2019) or by candidate generation and
	    selection. The very first token emitted for the word carries a leading
	    space in its ``"token"`` text.

	    Args:
	        word: Raw word string (may include surrounding punctuation).
	        tdk: TDK dictionary set.
	        domain_roots: Domain vocabulary set.
	        caps_set: Set of words that were originally ALL CAPS (looked up by
	            their Turkish-lowercased form).

	    Returns:
	        List of token dicts ready for inclusion in the output.
	    """
	    result: list[dict[str, object]] = []
	    at_start = True  # True until the first part of this word is emitted

	    for text, part_type in _split_punctuation(word):
	        if part_type == "PUNCT":
	            result.append({
	                "token": f" {text}" if at_start else f"{text}",
	                "token_type": "PUNCT",
	                "morph_pos": 0,
	                "_punct": True,
	            })
	            at_start = False
	            continue

	        # part_type == "WORD"
	        if "'" in text or "\u2019" in text:
	            # Apostrophe inside the word: dedicated splitter.
	            pieces = _segment_apostrophe_word(text, tdk, domain_roots, caps_set)
	        else:
	            # Standard path: generate candidates and keep the winner.
	            best = select_best_candidate(
	                generate_candidates(text, tdk, domain_roots, caps_set)
	            )
	            pieces = []
	            for position, token in enumerate(best.tokens):
	                tok_dict = token.to_dict()
	                # morph_pos is the token's index inside the word (root = 0).
	                tok_dict["morph_pos"] = position
	                pieces.append(tok_dict)

	        # Only the first token of the whole word gets the leading space.
	        if at_start and pieces:
	            first = pieces[0]
	            first["token"] = f" {first['token'].lstrip()}"

	        result.extend(pieces)
	        at_start = False

	    return result


	def _segment_apostrophe_word(
	    word: str,
	    tdk: set[str],
	    domain_roots: frozenset[str],
	    caps_set: frozenset[str],
	) -> list[dict[str, object]]:
	    """Segment a word containing an apostrophe.

	    Splits at the apostrophe and determines whether the base is Turkish
	    (proper name) or foreign:

	    * Turkish base → ``ROOT`` + ``PUNCT`` (always emitted as ASCII ``'``)
	      + ``SUFFIX``
	    * otherwise → ``FOREIGN`` + ``SUFFIX`` (no apostrophe token is emitted)

	    The split point is the first ASCII apostrophe; U+2019 is only
	    considered when the word contains no ASCII apostrophe. The ``SUFFIX``
	    token is omitted when nothing follows the apostrophe.

	    Args:
	        word: Word text with surrounding punctuation already removed.
	        tdk: TDK dictionary set (used only by the no-apostrophe fallback).
	        domain_roots: Domain vocabulary set (fallback only).
	        caps_set: Set checked with the Turkish-lowercased base to decide
	            whether the ROOT token gets ``"_caps": True``.

	    Returns:
	        Token dicts in surface order, each with a ``morph_pos`` entry.
	    """
	    from .apostrophe import is_turkish_base # avoid circular at module level

	    # Find the apostrophe position
	    apo_pos = word.find("'")
	    if apo_pos == -1:
	        apo_pos = word.find("\u2019")
	    if apo_pos == -1:
	        # No apostrophe found (shouldn't happen) — treat as regular word.
	        # morph_pos is filled in here the same way segment_word does for
	        # regular words, so callers always receive it.
	        best = select_best_candidate(
	            generate_candidates(word, tdk, domain_roots, caps_set)
	        )
	        plain_tokens: list[dict[str, object]] = []
	        for position, token in enumerate(best.tokens):
	            tok_dict = token.to_dict()
	            tok_dict["morph_pos"] = position
	            plain_tokens.append(tok_dict)
	        return plain_tokens

	    base = word[:apo_pos]
	    suffix = word[apo_pos + 1:]

	    is_caps = turkish_lower(base) in caps_set

	    # Look the suffix up via turkish_lower, the same lowercasing this module
	    # uses before every other lexicon lookup. Plain str.lower() turns "İ"
	    # into "i" + U+0307 and "I" into "i", so upper-case suffixes such as
	    # "NIN" or "DAKİ" would otherwise be looked up under the wrong key.
	    suffix_label = SUFFIX_MAP.get(turkish_lower(suffix), "-SFX")

	    tokens: list[dict[str, object]]
	    if is_turkish_base(base):
	        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
	        root_token: dict[str, object] = {
	            "token": base, "token_type": "ROOT", "morph_pos": 0,
	        }
	        if is_caps:
	            root_token["_caps"] = True
	        tokens = [
	            root_token,
	            {
	                "token": "'", "token_type": "PUNCT", "morph_pos": 0,
	                "_punct": True,
	            },
	        ]
	    else:
	        # Foreign word: FOREIGN + SUFFIX
	        tokens = [
	            {
	                "token": base, "token_type": "FOREIGN", "morph_pos": 0,
	                "_foreign": True,
	            },
	        ]

	    if suffix:
	        tokens.append({
	            "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
	            "_apo_suffix": True, "_suffix_label": suffix_label,
	        })
	    return tokens