File size: 6,217 Bytes

edec8b7

"""Tokenization engine — orchestrates the full pipeline.

This is the central pipeline that ties together all modules:
1. Text normalization (Unicode, whitespace)
2. ALL CAPS detection and lowercasing
3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
4. Word-level segmentation with candidate generation/selection
5. Post-annotation (allomorph labels, compound info, acronym expansion)
6. Number/unit reclassification safety net
"""

from __future__ import annotations

from ._domain_vocab import ALL_DOMAIN_ROOTS
from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
from .normalization import detect_all_caps, normalize_text
from .resources import load_tdk_words
from .segmentation import segment_word, split_into_words
from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens


class TokenizationEngine:
    """Pipeline driver that turns raw text into annotated token dicts.

    The constructor loads the TDK word set and picks up the domain-root
    vocabulary once; ``tokenize`` and ``_tokenize_segment`` only read
    those attributes and never assign to the instance.

    This class is NOT the public API.  Use ``NedoTurkishTokenizer``
    instead, which delegates to this engine.
    """

    def __init__(self) -> None:
        """Load the word list and domain roots used by every call."""
        self._tdk: set[str] = load_tdk_words()
        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS

    def tokenize(self, text: str) -> list[dict[str, object]]:
        """Tokenize *text* and return the finished token stream.

        Empty or whitespace-only input yields ``[]`` without running any
        pipeline stage.  Otherwise the text is normalized, ALL-CAPS
        handling is applied, special spans are located, and the plain
        text around them is segmented word by word.  The combined stream
        then goes through the reclassification/annotation passes, gets
        its ``morph_pos`` recomputed, and finally has the leading
        whitespace removed from every ``token`` string.

        Each returned dict carries at least ``token``, ``token_type``
        and ``morph_pos``.
        """
        if not (text and text.strip()):
            return []

        # Normalization first, then ALL-CAPS detection on the result.
        normalized = normalize_text(text)
        prepared, caps_set = detect_all_caps(normalized)

        stream: list[dict[str, object]] = []
        cursor = 0  # index in `prepared` just past the last consumed span

        # Walk the special spans, emitting the plain text in front of
        # each span before the span's own tokens.
        for start, end, span_type, original in find_special_spans(prepared):
            gap = prepared[cursor:start] if cursor < start else ""
            if gap.strip():
                stream.extend(self._tokenize_segment(gap, caps_set))
            stream.extend(make_special_tokens(span_type, original))
            cursor = end

        # Whatever follows the final span (or the whole text when there
        # were no spans at all).
        tail = prepared[cursor:]
        if tail.strip():
            stream.extend(self._tokenize_segment(tail, caps_set))

        # Whole-stream passes, applied in this fixed order.
        for stream_pass in (
            reclassify_numbers_in_tokens,
            annotate_canonical,
            annotate_compounds,
            annotate_acronyms,
        ):
            stream = stream_pass(stream)

        # morph_pos relies on the leading-space word-boundary marker, so
        # it has to be computed before that marker is stripped for output.
        stream = _compute_morph_pos(stream)
        return _strip_token_text(stream)

    def _tokenize_segment(
        self, segment: str, caps_set: frozenset[str]
    ) -> list[dict[str, object]]:
        """Segment a stretch of plain text that contains no special spans.

        The segment is split into words and every word is handed to
        ``segment_word`` together with the TDK set, the domain roots and
        *caps_set*; the per-word results are concatenated in order.
        """
        return [
            piece
            for word in split_into_words(segment)
            for piece in segment_word(
                word, self._tdk, self._domain_roots, caps_set
            )
        ]


# ── Helper: compute morph_pos across the full token stream ───────────────────

def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Return copies of *tokens* with a freshly computed ``morph_pos``.

    A running counter tracks how many suffixes have been seen in the
    current word:

    - A ``SUFFIX`` token that does not open a word bumps the counter and
      receives the new value (1, 2, ...).
    - Every other token resets the counter and receives 0.  That covers
      all non-``SUFFIX`` types as well as a ``SUFFIX`` whose text opens
      a word.
    - A token opens a word when its text begins with a space or, after
      stripping whitespace, with ``<`` — unless it carries a truthy
      ``_apo_suffix`` flag, in which case it is treated as continuing
      the preceding token.

    The input dicts are not modified; each output dict is a shallow copy
    with ``morph_pos`` set or overwritten.
    """
    annotated: list[dict[str, object]] = []
    suffix_depth = 0

    for tok in tokens:
        surface = str(tok["token"])
        kind = str(tok["token_type"])

        # The apostrophe-suffix flag overrides the textual boundary test.
        if tok.get("_apo_suffix"):
            opens_word = False
        else:
            opens_word = surface.startswith(" ") or surface.strip().startswith("<")

        if kind == "SUFFIX" and not opens_word:
            suffix_depth += 1
        else:
            suffix_depth = 0

        annotated.append(dict(tok, morph_pos=suffix_depth))

    return annotated


def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Return copies of *tokens* whose ``token`` text has no leading whitespace.

    Inside the pipeline a leading space on ``token`` marks a word-initial
    token.  After ``morph_pos`` has been computed that marker has served
    its purpose, so it is removed here to give callers clean text.  The
    ``token`` value is converted with ``str`` before stripping; all other
    keys are copied unchanged and the input dicts are left untouched.
    """
    cleaned: list[dict[str, object]] = []
    for tok in tokens:
        entry = dict(tok)
        entry["token"] = str(tok["token"]).lstrip()
        cleaned.append(entry)
    return cleaned