Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7
"""Tokenization engine β€” orchestrates the full pipeline.
This is the central pipeline that ties together all modules:
1. Text normalization (Unicode, whitespace)
2. ALL CAPS detection and lowercasing
3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
4. Word-level segmentation with candidate generation/selection
5. Post-annotation (allomorph labels, compound info, acronym expansion)
6. Number/unit reclassification safety net
"""
from __future__ import annotations
from ._domain_vocab import ALL_DOMAIN_ROOTS
from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
from .normalization import detect_all_caps, normalize_text
from .resources import load_tdk_words
from .segmentation import segment_word, split_into_words
from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens
class TokenizationEngine:
    """Pipeline driver that turns raw text into a list of token dicts.

    The constructor loads the TDK word set and the domain root vocabulary
    once; ``tokenize`` only reads them afterwards.

    This class is an internal component, NOT the public API. Callers are
    expected to go through ``NedoTurkishTokenizer``, which delegates here.
    """

    def __init__(self) -> None:
        # Both vocabularies are fetched a single time and reused by every
        # subsequent ``tokenize`` call.
        self._tdk: set[str] = load_tdk_words()
        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS

    def tokenize(self, text: str) -> list[dict[str, object]]:
        """Tokenize *text* and return the resulting token dicts.

        Empty or whitespace-only input yields an empty list. Otherwise each
        returned dict carries at least ``token``, ``token_type`` and
        ``morph_pos``.
        """
        if not (text and text.strip()):
            return []

        # ── 1. Normalize ─────────────────────────────────────────────────
        normalized = normalize_text(text)

        # ── 2. ALL CAPS detection ────────────────────────────────────────
        working, caps_set = detect_all_caps(normalized)

        # ── 3. Special spans interleaved with plain-text segments ────────
        collected: list[dict[str, object]] = []
        cursor = 0
        for span_start, span_end, span_type, span_text in find_special_spans(working):
            # Plain text sitting between the previous span and this one.
            if cursor < span_start:
                gap = working[cursor:span_start]
                if gap.strip():
                    collected += self._tokenize_segment(gap, caps_set)
            # The special span itself is emitted without word segmentation.
            collected += make_special_tokens(span_type, span_text)
            cursor = span_end

        # Whatever follows the last special span (an empty slice when the
        # cursor already sits at or past the end of the text).
        tail = working[cursor:]
        if tail.strip():
            collected += self._tokenize_segment(tail, caps_set)

        # ── 4. Post-processing passes, applied strictly in this order ────
        # The final two passes recompute ``morph_pos`` and then drop the
        # internal leading spaces, which are only needed up to that point
        # as word-boundary markers.
        passes = (
            reclassify_numbers_in_tokens,
            annotate_canonical,
            annotate_compounds,
            annotate_acronyms,
            _compute_morph_pos,
            _strip_token_text,
        )
        for apply_pass in passes:
            collected = apply_pass(collected)
        return collected

    def _tokenize_segment(
        self, segment: str, caps_set: frozenset[str]
    ) -> list[dict[str, object]]:
        """Segment a stretch of plain text (no special spans) word by word."""
        return [
            piece
            for word in split_into_words(segment)
            for piece in segment_word(
                word, self._tdk, self._domain_roots, caps_set
            )
        ]
# ── Helper: compute morph_pos across the full token stream ───────────────────
def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
"""Recompute ``morph_pos`` consistently across the token stream.
Rules:
- Word-initial tokens (leading space, special types, PUNCT) β†’ morph_pos = 0
- SUFFIX tokens increment the position counter
- Apostrophe suffixes continue from the previous word
"""
result: list[dict[str, object]] = []
word_pos = 0
for tok in tokens:
raw = str(tok["token"])
token_type = str(tok["token_type"])
is_word_start = raw.startswith(" ") or raw.strip().startswith("<")
# Apostrophe suffixes continue the previous word
if tok.get("_apo_suffix"):
is_word_start = False
if is_word_start or token_type in (
"NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM", "PUNCT"
):
word_pos = 0
morph_pos = 0
elif token_type == "SUFFIX":
word_pos += 1
morph_pos = word_pos
else:
# ROOT or FOREIGN within a word (shouldn't normally happen)
word_pos = 0
morph_pos = 0
result.append({**tok, "morph_pos": morph_pos})
return result
def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
"""Remove internal leading whitespace from all token text strings.
During pipeline processing, a leading space in ``token`` signals
a word-initial token. Once ``morph_pos`` has been computed, this
space is no longer needed and must be stripped so the public API
returns clean text.
"""
return [{**tok, "token": str(tok["token"]).lstrip()} for tok in tokens]