File size: 11,674 Bytes
ca41c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
"""
TurkTokenizer — production-ready Turkish morphological tokenizer.

Applies 12 sequential fixes on top of the base turkish-tokenizer:
  1.  ALL CAPS inflation fix
  2.  Apostrophe / code-switching split
  3.  BPE→SUFFIX reclassification
  4.  Zemberek root validation & correction
  5.  Punctuation → PUNCT type
  6.  Domain vocabulary (medical / sports / tourism)
  7.  TDK-based FOREIGN word detection
  8.  Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)
  9.  Allomorph canonicalization
  10. Compound word decomposition
  11. Acronym expansion
  12. Context-aware Zemberek disambiguation

Output fields per token:
    token       : str  — token string (leading space = word-initial)
    token_type  : str  — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
                         NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
    morph_pos   : int  — 0=root/word-initial, 1=first suffix, 2=second suffix…
    (+ optional _* metadata fields)
"""

from __future__ import annotations

import os
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from ._java_check import ensure_java
from ._preprocessor import preprocess, postprocess
from ._suffix_expander import reclassify_bpe_suffixes
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
from ._medical_vocab import ALL_DOMAIN_ROOTS
from ._tdk_vocab import reclassify_foreign_words
from ._normalizer import (
    preprocess_special_tokens,
    restore_special_tokens,
    reclassify_numbers_in_tokens,
)
from ._allomorph import add_canonical_labels
from ._compound import add_compound_info
from ._acronym_dict import reclassify_acronyms
from ._context_aware import annotate_with_context

# Optional handle to the ``_morphology`` object exposed by ``_root_validator``;
# it is forwarded to ``add_compound_info`` during tokenization.  If the name
# cannot be imported for any reason, fall back to ``None``.
try:
    from ._root_validator import _morphology as _zemb_morphology
except Exception:
    _zemb_morphology = None

# Lower-cased copy of the domain vocabulary, built once at import time and
# passed to ``_reclassify_domain_roots`` for per-token membership tests.
_DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}

# ── Token types ───────────────────────────────────────────────────────────────

# Token types that always reset the in-word suffix counter and are counted as
# "special" in the coverage statistics.
_SPECIAL_TYPES = frozenset({
    "NUM",
    "DATE",
    "UNIT",
    "URL",
    "MENTION",
    "HASHTAG",
    "EMOJI",
})

# One-character symbol for every token type.
_TYPE_SYM = {
    # morphological pieces and punctuation
    "ROOT": "R",
    "SUFFIX": "S",
    "FOREIGN": "F",
    "BPE": "B",
    "PUNCT": "P",
    # numeric-like special tokens
    "NUM": "N",
    "DATE": "D",
    "UNIT": "U",
    # remaining special tokens
    "URL": "L",
    "MENTION": "@",
    "HASHTAG": "#",
    "EMOJI": "E",
}


# ── Parallel worker helpers ───────────────────────────────────────────────────

# Per-process tokenizer instance used by the pool workers; it is ``None``
# until ``_init_worker`` has run in that process.
_worker_tok: "TurkTokenizer | None" = None


def _init_worker() -> None:
    """Create this process's ``TurkTokenizer`` and store it in ``_worker_tok``.

    Passed as the ``initializer`` of the ``ProcessPoolExecutor`` created in
    ``TurkTokenizer.batch_tokenize``.
    """
    global _worker_tok
    _worker_tok = TurkTokenizer()


def _tokenize_one(text: str) -> list[dict]:
    """Tokenize ``text`` with the tokenizer held in ``_worker_tok``.

    Submitted to the process pool by ``TurkTokenizer.batch_tokenize``.  An
    assertion fails when no tokenizer has been installed in this process.
    """
    tokenizer = _worker_tok
    assert tokenizer is not None
    return tokenizer.tokenize(text)


# ══════════════════════════════════════════════════════════════════════════════

class TurkTokenizer:
    """
    Turkish morphological tokenizer with HuggingFace-compatible interface.

    Wraps ``turkish_tokenizer.TurkishTokenizer`` and passes its output through
    the chain of fixes applied in :meth:`tokenize`.

    Example::

        from turk_tokenizer import TurkTokenizer

        tok = TurkTokenizer()
        tokens = tok("İstanbul'da meeting'e katılamadım")
        for t in tokens:
            print(t["token"], t["token_type"], t["morph_pos"])
    """

    def __init__(self) -> None:
        """Run the ``ensure_java`` pre-flight check and build the base tokenizer."""
        ensure_java()
        # Imported lazily so that ``ensure_java`` runs before the base package loads.
        from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
        self._base = TurkishTokenizer()
        self.zemberek_available = ZEMBEREK_AVAILABLE

    # ── Public API ────────────────────────────────────────────────────────────

    def __call__(self, text: str) -> list[dict]:
        """Shorthand for :meth:`tokenize`."""
        return self.tokenize(text)

    def tokenize(self, text: str) -> list[dict]:
        """Tokenize a single text string.

        The text is masked for special tokens, pre-processed, run through the
        base tokenizer and then passed through the reclassification and
        annotation steps in the order written below.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token dicts, each with ``token``, ``token_type``,
            ``morph_pos``, and optional ``_*`` fields.
        """
        # Fix 8 pre: replace URLs, mentions, numbers etc. with placeholders
        text_norm, specials = preprocess_special_tokens(text)

        # Fix 1 & 2 pre: ALL CAPS + apostrophe
        processed, caps_map = preprocess(text_norm)

        # Base tokenizer
        raw = self._base.tokenize_text(processed)

        # Fix 8 post: restore placeholders
        tokens = restore_special_tokens(raw, specials)

        # Fix 1 & 2 post
        tokens = postprocess(tokens, caps_map)

        # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
        tokens = reclassify_bpe_suffixes(tokens)

        # Fix 8b: remaining numbers / units
        tokens = reclassify_numbers_in_tokens(tokens)

        # Fix 6: domain vocabulary (medical / sports / tourism)
        tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER)

        # Fix 7: TDK FOREIGN detection
        tokens = reclassify_foreign_words(tokens)

        # Fix 11: acronym expansions
        tokens = reclassify_acronyms(tokens)

        # Fix 9: allomorph canonical labels
        tokens = add_canonical_labels(tokens)

        # Fix 10: compound word annotation
        tokens = add_compound_info(tokens, morphology=_zemb_morphology)

        # Fix 12: context-aware Zemberek disambiguation (sees the original text)
        tokens = annotate_with_context(tokens, text)

        # Fix 4: Zemberek root validation & correction (sees the original words)
        tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base)

        # Add public output fields
        return _add_output_fields(tokens)

    def batch_tokenize(
        self,
        texts: list[str],
        workers: int | None = None,
        chunk_size: int = 64,
    ) -> list[list[dict]]:
        """Tokenize a list of texts, in parallel when the batch is large enough.

        Args:
            texts: List of strings to tokenize.
            workers: Number of worker processes. ``None`` (or ``0``) means
                ``os.cpu_count()``, or 4 when that is unavailable.
            chunk_size: Batches with at most this many texts are tokenized
                sequentially in the calling process to avoid pool overhead.

        Returns:
            List of token lists, in the same order as ``texts``.

        If a worker task raises, a warning is logged and that text is
        tokenized again in the calling process (see ``_tokenize_fallback``),
        so every entry of the result carries the same public token fields.
        """
        if not texts:
            return []

        n = workers or os.cpu_count() or 4

        if len(texts) <= chunk_size or n == 1:
            return [self.tokenize(t) for t in texts]

        import logging  # noqa: PLC0415

        log = logging.getLogger(__name__)
        results: list[list[dict] | None] = [None] * len(texts)

        with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool:
            # Map each future back to its input index so order is preserved
            # even though futures complete out of order.
            futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)}
            for fut in as_completed(futs):
                i = futs[fut]
                try:
                    results[i] = fut.result()
                except Exception as exc:  # noqa: BLE001
                    log.warning("[TurkTokenizer] fallback at idx=%d: %s", i, exc)
                    results[i] = self._tokenize_fallback(texts[i])

        return results  # type: ignore[return-value]

    def _tokenize_fallback(self, text: str) -> list[dict]:
        """Tokenize ``text`` in this process after a worker task failed.

        The full pipeline is tried first.  If it raises as well, the base
        tokenizer's output is passed through ``_add_output_fields`` so the
        tokens still expose ``token_type`` and ``morph_pos``.
        """
        try:
            return self.tokenize(text)
        except Exception:  # noqa: BLE001
            # Best effort: degrade to base tokens rather than fail the batch.
            return _add_output_fields(self._base.tokenize_text(text))

    # ── HuggingFace-style helpers ─────────────────────────────────────────────

    @classmethod
    def from_pretrained(cls, _model_id: str = "Ethosoft/turk-tokenizer") -> "TurkTokenizer":
        """Load tokenizer (rules-based, no weights to download).

        ``_model_id`` is accepted for interface compatibility and is not used.
        """
        return cls()

    def save_pretrained(self, save_directory: str) -> None:
        """Save tokenizer config to a directory (for HF Hub compatibility).

        Creates ``save_directory`` (and parents) if needed and writes
        ``tokenizer_config.json`` into it as UTF-8 JSON.
        """
        import json
        path = Path(save_directory)
        path.mkdir(parents=True, exist_ok=True)
        config = {
            "tokenizer_class": "TurkTokenizer",
            "model_type": "turk-tokenizer",
            "version": "1.0.0",
            "zemberek_available": self.zemberek_available,
        }
        (path / "tokenizer_config.json").write_text(
            json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
        )

    # ── Utility ───────────────────────────────────────────────────────────────

    def stats(self, tokens: list[dict]) -> dict:
        """Compute morphological coverage statistics for a token list.

        Args:
            tokens: Token dicts as returned by :meth:`tokenize`; each must
                have ``token`` and ``token_type``.

        Returns:
            A dict with the counts ``total``, ``roots``, ``suffixes``,
            ``foreign``, ``bpe``, ``punct``, ``special`` and two percentages
            rounded to two decimals: ``tr_pct`` (every type except BPE) and
            ``pure_pct`` (ROOT / SUFFIX / FOREIGN tokens whose text does not
            start with ``<``).  All values are 0 for an empty list.
        """
        from collections import Counter  # noqa: PLC0415

        total = len(tokens)
        if total == 0:
            return {k: 0 for k in ("total", "roots", "suffixes", "foreign",
                                    "bpe", "punct", "special", "tr_pct", "pure_pct")}

        # One pass over the tokens instead of one pass per type.
        counts = Counter(t["token_type"] for t in tokens)
        roots    = counts["ROOT"]
        suffixes = counts["SUFFIX"]
        foreign  = counts["FOREIGN"]
        punct    = counts["PUNCT"]
        bpe      = counts["BPE"]
        special  = sum(counts[k] for k in _SPECIAL_TYPES)
        tr       = roots + suffixes + foreign + punct + special
        pure     = sum(
            1 for t in tokens
            if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
            and not t["token"].strip().startswith("<")
        )
        return {
            "total":    total,
            "roots":    roots,
            "suffixes": suffixes,
            "foreign":  foreign,
            "bpe":      bpe,
            "punct":    punct,
            "special":  special,
            "tr_pct":   round(tr / total * 100, 2),
            "pure_pct": round(pure / total * 100, 2),
        }


# ── Internal helpers ──────────────────────────────────────────────────────────

def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]:
    """Promote word-initial BPE tokens found in the domain vocabulary to ROOT.

    Args:
        tokens: Token dicts with at least a ``type`` key (and ``token`` for
            BPE entries).
        domain_lower: Lower-cased domain vocabulary to match against.

    Returns:
        A new list.  Matching tokens are replaced by a copy with
        ``type="ROOT"`` and ``_domain=True``; every other token is passed
        through as the same object.
    """

    def _promote(entry: dict) -> dict:
        # Only BPE pieces are candidates for promotion.
        if entry["type"] != "BPE":
            return entry
        text = entry["token"]
        word = text.lstrip()
        # No leading whitespace → the piece is not word-initial.
        if word == text:
            return entry
        if word.lower() not in domain_lower:
            return entry
        return {**entry, "type": "ROOT", "_domain": True}

    return [_promote(entry) for entry in tokens]


def _add_output_fields(tokens: list[dict]) -> list[dict]:
    """Attach the public ``token_type`` and ``morph_pos`` fields to each token.

    ``token_type`` is the internal ``type``, except that a ROOT flagged with a
    truthy ``_foreign`` is reported as ``FOREIGN``.  ``morph_pos`` is 0 for
    every token except a SUFFIX that continues a word, which gets one more
    than the previous token's position.

    Args:
        tokens: Token dicts with ``token`` and ``type`` keys.

    Returns:
        A new list of copies with the two extra keys; inputs are not mutated.
    """
    annotated = []
    suffix_depth = 0  # suffixes seen since the counter was last reset

    for entry in tokens:
        text = entry["token"]
        kind = entry["type"]
        bare = text.strip()

        public_type = "FOREIGN" if kind == "ROOT" and entry.get("_foreign") else kind

        # A leading space or a "<...>"-style token marks the start of a word.
        starts_word = text.startswith(" ") or bare.startswith("<")
        resets = starts_word or kind in _SPECIAL_TYPES or kind == "PUNCT"

        if not resets and kind == "SUFFIX":
            suffix_depth += 1
        else:
            # Word starts, special tokens, punctuation, and ROOT/BPE pieces
            # inside a word all restart the count.
            suffix_depth = 0

        annotated.append(
            {**entry, "token_type": public_type, "morph_pos": suffix_depth}
        )

    return annotated