File size: 6,217 Bytes
edec8b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Tokenization engine β€” orchestrates the full pipeline.

This is the central pipeline that ties together all modules:
1. Text normalization (Unicode, whitespace)
2. ALL CAPS detection and lowercasing
3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
4. Word-level segmentation with candidate generation/selection
5. Post-annotation (allomorph labels, compound info, acronym expansion)
6. Number/unit reclassification safety net
"""

from __future__ import annotations

from ._domain_vocab import ALL_DOMAIN_ROOTS
from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
from .normalization import detect_all_caps, normalize_text
from .resources import load_tdk_words
from .segmentation import segment_word, split_into_words
from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens


class TokenizationEngine:
    """Pipeline driver that turns raw text into a list of token dicts.

    The constructor loads the TDK word list and binds the domain root
    vocabulary once; ``tokenize`` only reads those attributes afterwards.

    This class is NOT the public API.  Use ``NedoTurkishTokenizer``
    instead, which delegates to this engine.
    """

    def __init__(self) -> None:
        self._tdk: set[str] = load_tdk_words()
        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS

    def tokenize(self, text: str) -> list[dict[str, object]]:
        """Tokenize *text* and return the resulting token dicts.

        Empty or whitespace-only input short-circuits to ``[]``.  Every
        returned dict carries a ``morph_pos`` entry (added by the final
        passes) alongside the ``token`` and ``token_type`` entries that
        those passes read.
        """
        if not (text and text.strip()):
            return []

        # ── 1-2. Normalize, then detect ALL CAPS words ───────────────────
        prepared, caps_set = detect_all_caps(normalize_text(text))

        collected: list[dict[str, object]] = []
        cursor = 0  # index of the first character not yet consumed

        # ── 3-4. Walk special spans, segmenting the plain text between ───
        for start, end, span_type, original in find_special_spans(prepared):
            if cursor < start:
                gap = prepared[cursor:start]
                if gap.strip():
                    collected.extend(self._tokenize_segment(gap, caps_set))

            # Special spans bypass word segmentation entirely.
            collected.extend(make_special_tokens(span_type, original))
            cursor = end

        # Plain text after the last special span; the slice is simply
        # empty when the cursor already sits at (or past) the end.
        tail = prepared[cursor:]
        if tail.strip():
            collected.extend(self._tokenize_segment(tail, caps_set))

        # ── 5-7. Post-annotation, morph_pos, public text clean-up ────────
        # Order matters: morph_pos relies on the leading spaces that the
        # final pass strips from the token text.
        passes = (
            reclassify_numbers_in_tokens,
            annotate_canonical,
            annotate_compounds,
            annotate_acronyms,
            _compute_morph_pos,
            _strip_token_text,
        )
        for run_pass in passes:
            collected = run_pass(collected)

        return collected

    def _tokenize_segment(
        self, segment: str, caps_set: frozenset[str]
    ) -> list[dict[str, object]]:
        """Segment every word of a span-free text chunk into tokens.

        The chunk is split with ``split_into_words`` and each word is
        handed to ``segment_word``; the per-word results are concatenated
        in order.
        """
        return [
            piece
            for word in split_into_words(segment)
            for piece in segment_word(
                word, self._tdk, self._domain_roots, caps_set
            )
        ]


# ── Helper: compute morph_pos across the full token stream ───────────────────

def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Recompute ``morph_pos`` consistently across the token stream.

    Rules:
    - Word-initial tokens (leading space, special types, PUNCT) β†’ morph_pos = 0
    - SUFFIX tokens increment the position counter
    - Apostrophe suffixes continue from the previous word
    """
    result: list[dict[str, object]] = []
    word_pos = 0

    for tok in tokens:
        raw = str(tok["token"])
        token_type = str(tok["token_type"])

        is_word_start = raw.startswith(" ") or raw.strip().startswith("<")

        # Apostrophe suffixes continue the previous word
        if tok.get("_apo_suffix"):
            is_word_start = False

        if is_word_start or token_type in (
            "NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM", "PUNCT"
        ):
            word_pos = 0
            morph_pos = 0
        elif token_type == "SUFFIX":
            word_pos += 1
            morph_pos = word_pos
        else:
            # ROOT or FOREIGN within a word (shouldn't normally happen)
            word_pos = 0
            morph_pos = 0

        result.append({**tok, "morph_pos": morph_pos})

    return result


def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Remove internal leading whitespace from all token text strings.

    During pipeline processing, a leading space in ``token`` signals
    a word-initial token.  Once ``morph_pos`` has been computed, this
    space is no longer needed and must be stripped so the public API
    returns clean text.
    """
    return [{**tok, "token": str(tok["token"]).lstrip()} for tok in tokens]